From a9cacf4eb71697351ee658a570636f04bbf31ad5 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Mon, 22 Jun 2026 15:59:33 -0400 Subject: [PATCH 01/10] =?UTF-8?q?feat(harness):=20unified=20harness=20surf?= =?UTF-8?q?ace=20=E2=80=94=20foundation=20(span=20derivation,=20delivery?= =?UTF-8?q?=20adapters,=20emitter)=20(#412)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/harness-integration.yml | 40 ++ src/agentex/lib/core/harness/__init__.py | 30 ++ src/agentex/lib/core/harness/auto_send.py | 156 ++++++ src/agentex/lib/core/harness/emitter.py | 80 +++ .../lib/core/harness/span_derivation.py | 154 ++++++ src/agentex/lib/core/harness/tracer.py | 88 ++++ src/agentex/lib/core/harness/types.py | 93 ++++ .../lib/core/harness/yield_delivery.py | 31 ++ tests/lib/core/harness/__init__.py | 0 .../lib/core/harness/conformance/__init__.py | 0 tests/lib/core/harness/conformance/runner.py | 48 ++ .../harness/conformance/test_conformance.py | 43 ++ tests/lib/core/harness/test_auto_send.py | 490 ++++++++++++++++++ tests/lib/core/harness/test_emitter.py | 148 ++++++ .../lib/core/harness/test_span_derivation.py | 286 ++++++++++ tests/lib/core/harness/test_tracer.py | 93 ++++ tests/lib/core/harness/test_types.py | 53 ++ tests/lib/core/harness/test_yield_delivery.py | 89 ++++ 18 files changed, 1922 insertions(+) create mode 100644 .github/workflows/harness-integration.yml create mode 100644 src/agentex/lib/core/harness/__init__.py create mode 100644 src/agentex/lib/core/harness/auto_send.py create mode 100644 src/agentex/lib/core/harness/emitter.py create mode 100644 src/agentex/lib/core/harness/span_derivation.py create mode 100644 src/agentex/lib/core/harness/tracer.py create mode 100644 src/agentex/lib/core/harness/types.py create mode 100644 src/agentex/lib/core/harness/yield_delivery.py create mode 100644 tests/lib/core/harness/__init__.py create mode 100644 tests/lib/core/harness/conformance/__init__.py create mode 
100644 tests/lib/core/harness/conformance/runner.py create mode 100644 tests/lib/core/harness/conformance/test_conformance.py create mode 100644 tests/lib/core/harness/test_auto_send.py create mode 100644 tests/lib/core/harness/test_emitter.py create mode 100644 tests/lib/core/harness/test_span_derivation.py create mode 100644 tests/lib/core/harness/test_tracer.py create mode 100644 tests/lib/core/harness/test_types.py create mode 100644 tests/lib/core/harness/test_yield_delivery.py diff --git a/.github/workflows/harness-integration.yml b/.github/workflows/harness-integration.yml new file mode 100644 index 000000000..51893f10f --- /dev/null +++ b/.github/workflows/harness-integration.yml @@ -0,0 +1,40 @@ +name: Harness Integration + +on: + push: + branches: [main] + pull_request: + paths: + - "src/agentex/lib/core/harness/**" + - "src/agentex/lib/adk/_modules/**" + - ".github/workflows/harness-integration.yml" + +jobs: + conformance: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install uv + uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2 + with: + version: '0.10.2' + + - name: Bootstrap + run: ./scripts/bootstrap + + # Defer to scripts/test so the harness suite runs under the exact same + # invocation as the main CI test job: DEFER_PYDANTIC_BUILD=false and + # `uv run --isolated --all-packages --all-extras pytest`, across the + # min/max supported Python versions. Running `uv run pytest` directly + # would risk an all-extras-only dep passing locally but failing in CI. + - name: Conformance suite + run: ./scripts/test tests/lib/core/harness/ -v + + # Live integration matrix (harness x {sync, async, temporal}) is added per-harness + # in the migration plans. Placeholder job keeps the workflow valid until then. 
+ live-matrix: + runs-on: ubuntu-latest + if: false # enabled once the first harness's test agents land + steps: + - run: echo "populated by migration PRs" # TODO(harness-migration): enable per-harness; see migration PRs 4-8 diff --git a/src/agentex/lib/core/harness/__init__.py b/src/agentex/lib/core/harness/__init__.py new file mode 100644 index 000000000..067751d63 --- /dev/null +++ b/src/agentex/lib/core/harness/__init__.py @@ -0,0 +1,30 @@ +"""Shared, harness-independent machinery for the unified harness surface. + +The Agentex StreamTaskMessage* stream is the single source of truth; this +package derives spans from it and delivers it (yield or auto-send), so every +harness tap gets streaming + tracing + turn usage uniformly. +""" + +from agentex.lib.core.harness.types import ( + OpenSpan, + CloseSpan, + TurnUsage, + SpanSignal, + TurnResult, + HarnessTurn, + StreamTaskMessage, +) +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter + +__all__ = [ + "UnifiedEmitter", + "SpanTracer", + "OpenSpan", + "CloseSpan", + "SpanSignal", + "StreamTaskMessage", + "TurnUsage", + "TurnResult", + "HarnessTurn", +] diff --git a/src/agentex/lib/core/harness/auto_send.py b/src/agentex/lib/core/harness/auto_send.py new file mode 100644 index 000000000..2ecd6b583 --- /dev/null +++ b/src/agentex/lib/core/harness/auto_send.py @@ -0,0 +1,156 @@ +"""Auto-send delivery: canonical stream -> adk.streaming side effects + tracing.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator +from datetime import datetime + +from agentex.types.text_delta import TextDelta +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import TurnUsage, TurnResult, StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + 
async def auto_send(
    events: AsyncIterator[StreamTaskMessage],
    task_id: str,
    tracer: SpanTracer | None = None,
    streaming: Any = None,
    usage: TurnUsage | None = None,
    created_at: datetime | None = None,
) -> TurnResult:
    """Push the canonical stream to the task stream via adk.streaming.

    Opens a streaming context per message (keyed by index), streams deltas via
    ctx.stream_update, and closes via ctx.close() on Done. Posts Full messages
    by opening a context with the content and closing it immediately (no
    deltas). Derives and traces spans from the same stream when a tracer is
    supplied. Returns the last text segment's text plus usage.

    Index-keyed routing:
    - Start(index=i) opens a context stored in ctx_map[i]. If index i is
      already open (a harness re-used the index without sending Done), the
      superseded context is closed first so its task message is finalized
      instead of being silently dropped from the map and leaked.
    - Delta(index=i) routes to ctx_map.get(i); deltas for unknown indices are
      ignored.
    - Done(index=i) closes and removes ctx_map[i].
    - Events with `index is None` are skipped (Full events are not keyed by
      index and are always posted).
    - The finally block closes every context that is still open.

    final_text last-segment semantics: a new Start(TextContent) resets
    final_text_parts so multi-step turns return the LAST text segment;
    Full(TextContent) overwrites it the same way.

    AGX1-378: created_at is forwarded to every streaming_task_message_context
    call so callers can back-date message timestamps.

    Mirrors the open/close/stream_update pattern from
    src/agentex/lib/adk/_modules/_langgraph_async.py: the context is opened via
    __aenter__(), closed via ctx.close() (not __aexit__), and deltas are
    re-built with parent_task_message taken from ctx.task_message.

    Args:
        events: Canonical stream to deliver; consumed exactly once.
        task_id: Task whose stream receives the messages.
        tracer: Optional span tracer; when None no spans are derived.
        streaming: Streaming module override; defaults to `adk.streaming`.
        usage: Usage to attach to the result; defaults to an empty TurnUsage.
        created_at: Optional timestamp forwarded to every streaming context.

    Returns:
        TurnResult carrying the last text segment and the usage.

    For async + temporal agents (call from inside an activity).
    """
    if streaming is None:
        from agentex.lib import adk

        streaming = adk.streaming

    deriver = SpanDeriver() if tracer is not None else None
    final_text_parts: list[str] = []
    ctx_map: dict[int, Any] = {}

    async def _close_quietly(ctx: Any, log_format: str) -> None:
        # Best-effort close: a failure on one context (e.g. a backend hiccup)
        # must not prevent the remaining contexts from being finalized.
        try:
            await ctx.close()
        except Exception as exc:
            logger.warning(log_format, exc)

    async def _close_all() -> None:
        # Guard each close independently so every open task message gets a
        # chance to be finalized during teardown.
        for ctx in list(ctx_map.values()):
            await _close_quietly(ctx, "[harness.auto_send] context close failed during teardown: %s")
        ctx_map.clear()

    try:
        async for event in events:
            # Tracing is a side effect of observing the very same stream.
            if deriver is not None and tracer is not None:
                for signal in deriver.observe(event):
                    await tracer.handle(signal)

            if isinstance(event, StreamTaskMessageStart):
                if event.index is None:
                    continue
                i = event.index
                # A re-used index would otherwise overwrite ctx_map[i] and leave
                # the earlier context open forever; finalize it first.
                superseded = ctx_map.pop(i, None)
                if superseded is not None:
                    await _close_quietly(
                        superseded,
                        "[harness.auto_send] context close failed for superseded index: %s",
                    )
                # Reset final_text_parts when a new text segment starts
                if isinstance(event.content, TextContent):
                    final_text_parts = []
                ctx = streaming.streaming_task_message_context(
                    task_id=task_id,
                    initial_content=event.content,
                    created_at=created_at,
                )
                ctx_map[i] = await ctx.__aenter__()

            elif isinstance(event, StreamTaskMessageDelta):
                if event.index is None:
                    continue
                ctx = ctx_map.get(event.index)
                if ctx is not None and event.delta is not None:
                    # Rebuild the delta with parent_task_message taken from the
                    # context's task_message (mirrors _langgraph_async.py).
                    delta_with_parent = StreamTaskMessageDelta(
                        parent_task_message=ctx.task_message,
                        delta=event.delta,
                        type="delta",
                        index=event.index,
                    )
                    await ctx.stream_update(delta_with_parent)
                    if isinstance(event.delta, TextDelta) and event.delta.text_delta:
                        final_text_parts.append(event.delta.text_delta)

            elif isinstance(event, StreamTaskMessageDone):
                if event.index is None:
                    continue
                ctx = ctx_map.pop(event.index, None)
                if ctx is not None:
                    await ctx.close()

            elif isinstance(event, StreamTaskMessageFull):
                # Post the full message by opening a context with the content
                # and leaving it immediately (no deltas). `async with` ensures
                # the context is exited even if the body raises.
                # Full(TextContent) also overwrites final_text_parts for
                # last-segment semantics.
                if isinstance(event.content, TextContent):
                    final_text_parts = [event.content.content]
                async with streaming.streaming_task_message_context(
                    task_id=task_id,
                    initial_content=event.content,
                    created_at=created_at,
                ):
                    pass

    finally:
        await _close_all()
        # Anything the deriver still considers open is closed as incomplete.
        if deriver is not None and tracer is not None:
            for signal in deriver.flush():
                await tracer.handle(signal)

    return TurnResult(final_text="".join(final_text_parts), usage=usage or TurnUsage())
agentex.lib.core.harness.auto_send import auto_send +from agentex.lib.core.harness.yield_delivery import yield_events + + +class UnifiedEmitter: + """Ties trace context + chosen delivery together. + + Tracing modes (the `tracer` arg): + - tracer=None (default): auto-construct a SpanTracer if `trace_id` is present. + - tracer=False: disable tracing entirely, regardless of `trace_id`. + - tracer=: use the supplied instance. + + `tracing` and `streaming` are injection escape-hatches for tests/advanced + use; leave them None in production so the real adk modules are used. + """ + + tracer: SpanTracer | None + + def __init__( + self, + task_id: str, + trace_id: str | None, + parent_span_id: str | None, + tracer: SpanTracer | bool | None = None, + tracing: object | None = None, + streaming: object | None = None, + ): + self.task_id = task_id + self.trace_id = trace_id + self.parent_span_id = parent_span_id + self._streaming = streaming + if tracer is False: + self.tracer = None + elif isinstance(tracer, SpanTracer): + self.tracer = tracer + elif trace_id: + self.tracer = SpanTracer( + trace_id=trace_id, + parent_span_id=parent_span_id, + task_id=task_id, + tracing=tracing, + ) + else: + self.tracer = None + + async def yield_turn(self, turn: HarnessTurn) -> AsyncGenerator[StreamTaskMessage, None]: + """Sync HTTP ACP delivery: forward events, trace as side effect.""" + async for event in yield_events(turn.events, tracer=self.tracer): + yield event + + async def auto_send_turn(self, turn: HarnessTurn, created_at: datetime | None = None) -> TurnResult: + """Async/temporal delivery: push to the task stream, return TurnResult. + + Pass `created_at` (e.g. `workflow.now()` under Temporal) to stamp the + turn's messages with a deterministic timestamp; it is forwarded to the + streaming contexts. Default None preserves server-side timestamps. 
+ """ + # `turn.usage()` is only valid AFTER `turn.events` is exhausted (the + # HarnessTurn single-pass contract: real turns populate usage while the + # stream is consumed). So drive delivery first, then read usage — do NOT + # pass `usage=turn.usage()` eagerly here (that would capture the empty + # default before the stream runs). + result = await auto_send( + turn.events, + task_id=self.task_id, + tracer=self.tracer, + streaming=self._streaming, + created_at=created_at, + ) + result.usage = turn.usage() + return result diff --git a/src/agentex/lib/core/harness/span_derivation.py b/src/agentex/lib/core/harness/span_derivation.py new file mode 100644 index 000000000..cecb24bcc --- /dev/null +++ b/src/agentex/lib/core/harness/span_derivation.py @@ -0,0 +1,154 @@ +"""Pure reducer: canonical StreamTaskMessage* stream -> span open/close signals. + +Has no dependency on adk; unit-testable in isolation. Delivery adapters feed it +every event and act on the returned signals. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass + +from agentex.lib.core.harness.types import OpenSpan, CloseSpan, SpanSignal, StreamTaskMessage +from agentex.types.tool_request_delta import ToolRequestDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent + + +@dataclass +class _ToolReqMeta: + tool_call_id: str + name: str + arguments: dict[str, object] + args_buf: str = "" # accumulated streamed argument fragments + + +class SpanDeriver: + """Stateful reducer over the canonical stream. + + Tool span: open on Done of a ToolRequestContent index; close on matching + ToolResponseContent by tool_call_id. Reasoning span: open on + Start(ReasoningContent); close on that index's Done. 
+ + Deliberate contracts: + - A `Full(ToolResponseContent)` whose tool_call_id was never opened is + ignored (no CloseSpan emitted). + - A `Done` for an index that was never a tool_request/reasoning Start is + ignored (no signal emitted). + - Events with `index is None` are skipped entirely; without a stable index + they cannot be reliably paired, and aliasing them to a sentinel would + let unrelated None-indexed events cross-match. + - `flush()` closes anything still open as incomplete; unclosed tool spans + are emitted in the order they were opened. + """ + + def __init__(self) -> None: + self._tool_by_index: dict[int, _ToolReqMeta] = {} + self._reasoning_index_open: set[int] = set() + # insertion-ordered set of open tool_call_ids (dict keys preserve order) + self._open_tool_ids: dict[str, None] = {} + + def observe(self, event: StreamTaskMessage) -> list[SpanSignal]: + if isinstance(event, StreamTaskMessageStart): + return self._on_start(event) + if isinstance(event, StreamTaskMessageDelta): + return self._on_delta(event) + if isinstance(event, StreamTaskMessageFull): + return self._on_full(event) + if isinstance(event, StreamTaskMessageDone): + return self._on_done(event) + return [] + + def flush(self) -> list[SpanSignal]: + """Close anything still open at end of stream, marked incomplete.""" + signals: list[SpanSignal] = [] + for tcid in list(self._open_tool_ids): + signals.append(CloseSpan(key=tcid, output=None, is_complete=False)) + self._open_tool_ids.clear() + for idx in sorted(self._reasoning_index_open): + signals.append(CloseSpan(key=f"reasoning:{idx}", output=None, is_complete=False)) + self._reasoning_index_open.clear() + return signals + + def _on_start(self, event: StreamTaskMessageStart) -> list[SpanSignal]: + if event.index is None: + return [] + idx = event.index + content = event.content + if isinstance(content, ToolRequestContent): + self._tool_by_index[idx] = _ToolReqMeta( + tool_call_id=content.tool_call_id, + name=content.name, + 
arguments=dict(content.arguments or {}), + ) + return [] + if content.type == "reasoning": + self._reasoning_index_open.add(idx) + return [OpenSpan(key=f"reasoning:{idx}", kind="reasoning", name="reasoning", input={})] + return [] + + def _on_delta(self, event: StreamTaskMessageDelta) -> list[SpanSignal]: + if event.index is None: + return [] + idx = event.index + delta = event.delta + if isinstance(delta, ToolRequestDelta): + meta = self._tool_by_index.get(idx) + if meta is not None and delta.arguments_delta: + meta.args_buf += delta.arguments_delta + return [] + + def _on_full(self, event: StreamTaskMessageFull) -> list[SpanSignal]: + """Handle a Full event. + + A `Full(ToolRequestContent)` opens a tool span (keyed by tool_call_id) + if it is not already open; the matching `Full(ToolResponseContent)` + closes it. This handles harnesses (e.g. LangGraph) that emit tool calls + as a single Full rather than Start+Done. + """ + content = event.content + if isinstance(content, ToolRequestContent): + tcid = content.tool_call_id + if tcid not in self._open_tool_ids: + self._open_tool_ids[tcid] = None + args = dict(content.arguments or {}) + return [OpenSpan(key=tcid, kind="tool", name=content.name, input=args)] + return [] + if isinstance(content, ToolResponseContent): + tcid = content.tool_call_id + if tcid in self._open_tool_ids: + self._open_tool_ids.pop(tcid, None) + return [ + CloseSpan( + key=tcid, + output=content.content, + is_complete=True, + is_error=content.is_error, + ) + ] + return [] + + def _on_done(self, event: StreamTaskMessageDone) -> list[SpanSignal]: + if event.index is None: + return [] + idx = event.index + meta = self._tool_by_index.pop(idx, None) + if meta is not None: + args = meta.arguments + if meta.args_buf: + try: + args = json.loads(meta.args_buf) + except json.JSONDecodeError: + args = {"_raw": meta.args_buf} + self._open_tool_ids[meta.tool_call_id] = None + return [OpenSpan(key=meta.tool_call_id, kind="tool", name=meta.name, input=args)] + 
if idx in self._reasoning_index_open: + self._reasoning_index_open.discard(idx) + return [CloseSpan(key=f"reasoning:{idx}", output=None, is_complete=True)] + return [] diff --git a/src/agentex/lib/core/harness/tracer.py b/src/agentex/lib/core/harness/tracer.py new file mode 100644 index 000000000..4ca4d628b --- /dev/null +++ b/src/agentex/lib/core/harness/tracer.py @@ -0,0 +1,88 @@ +"""Adapter from SpanSignals to adk.tracing spans (best-effort, overridable).""" + +from __future__ import annotations + +from typing import Any + +from agentex.lib.core.harness.types import OpenSpan, CloseSpan, SpanSignal + +try: + from agentex.lib.utils.logging import make_logger + + logger = make_logger(__name__) +except Exception: # ddtrace may be absent in some envs; fall back to stdlib + import logging + + logger = logging.getLogger(__name__) + + +class SpanTracer: + """Opens/closes adk.tracing child spans in response to span signals. + + `tracing` defaults to the real `adk.tracing` module; inject a fake in tests + or a custom tracer to override. No-op when `trace_id` is falsy. Never raises. + + The real TracingModule.end_span does NOT accept an output kwarg — output is + recorded by mutating span.output before calling end_span, matching the pattern + used throughout the codebase (see _langgraph_tracing.py on_tool_end etc.). + + Span-lifecycle contract: the `_open` dict (span key -> span object) is scoped + to a single turn. Pairing is by `key`: + - A duplicate OpenSpan for a key already in `_open` silently replaces the + earlier span; the earlier span is then orphaned (never closed / leaked). + - A CloseSpan for an unknown key is a no-op. + - Unpaired opens accumulate in `_open` for the lifetime of the tracer; since + a tracer is expected to live for one turn, this is bounded and acceptable. 
+ """ + + def __init__( + self, + trace_id: str | None, + parent_span_id: str | None, + tracing: Any = None, + task_id: str | None = None, + ): + self.trace_id = trace_id + self.parent_span_id = parent_span_id + self.task_id = task_id + if tracing is None: + from agentex.lib import adk + + tracing = adk.tracing + self._tracing = tracing + self._open: dict[str, Any] = {} # span key -> span object + + async def handle(self, signal: SpanSignal) -> None: + if not self.trace_id: + return + try: + if isinstance(signal, OpenSpan): + span = await self._tracing.start_span( + trace_id=self.trace_id, + name=signal.name, + input=signal.input, + parent_id=self.parent_span_id, + task_id=self.task_id, + ) + if span is not None: + self._open[signal.key] = span + elif isinstance(signal, CloseSpan): + span = self._open.pop(signal.key, None) + if span is not None: + # Output is recorded by mutating span.output before end_span. + # The real TracingModule.end_span signature is: + # end_span(trace_id, span, start_to_close_timeout, heartbeat_timeout, retry_policy) + # It does not accept an output= kwarg. + span.output = signal.output + # Tool failure status (ToolResponseContent.is_error) is recorded + # on span.data when the harness reports one; Span has no dedicated + # error field. None means no status was reported, so leave data alone. 
+ if signal.is_error is not None: + data = span.data if isinstance(span.data, dict) else {} + span.data = {**data, "is_error": signal.is_error} + await self._tracing.end_span( + trace_id=self.trace_id, + span=span, + ) + except Exception as exc: # best-effort: tracing never breaks delivery + logger.warning("[harness.tracer] span signal failed: %s", exc) diff --git a/src/agentex/lib/core/harness/types.py b/src/agentex/lib/core/harness/types.py new file mode 100644 index 000000000..b37dc1e51 --- /dev/null +++ b/src/agentex/lib/core/harness/types.py @@ -0,0 +1,93 @@ +"""Types for the unified harness surface.""" + +from __future__ import annotations + +from typing import Any, Union, Literal, Protocol, AsyncIterator, runtime_checkable +from dataclasses import field, dataclass + +from pydantic import BaseModel, ConfigDict + +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) + +# The canonical stream element. Taps yield these; delivery adapters consume them. +StreamTaskMessage = Union[ + StreamTaskMessageStart, + StreamTaskMessageDelta, + StreamTaskMessageFull, + StreamTaskMessageDone, +] + +SpanKind = Literal["tool", "reasoning", "subagent"] + + +@dataclass +class OpenSpan: + """Signal to open a child span. `key` pairs an open with its close.""" + + key: str + kind: SpanKind + name: str + input: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class CloseSpan: + """Signal to close the span previously opened with the same `key`.""" + + key: str + output: Any = None + is_complete: bool = True # False when closed by flush() without a result + is_error: bool | None = None # tool failure status; None when the harness reports no status + + +SpanSignal = Union[OpenSpan, CloseSpan] + + +class TurnUsage(BaseModel): + """Harness-independent turn usage/cost, attached to the turn span. + + Token field names align with agentex.lib.core.observability.llm_metrics. 
async def yield_events(
    events: AsyncIterator[StreamTaskMessage],
    tracer: SpanTracer | None = None,
) -> AsyncGenerator[StreamTaskMessage, None]:
    """Relay the canonical stream unchanged, tracing spans along the way.

    Intended for sync HTTP ACP agents that yield events back over the
    response. Without a tracer the events are passed straight through. With a
    tracer, every event is first fed to a fresh SpanDeriver and the resulting
    signals are handed to `tracer.handle` before the event is yielded; when
    the stream ends (normally, by error, or because the consumer stopped
    early) whatever the deriver still holds open is flushed to the tracer.
    """
    if tracer is None:
        # Pure passthrough: nothing to derive, nothing to flush.
        async for item in events:
            yield item
        return

    reducer = SpanDeriver()
    try:
        async for item in events:
            for span_signal in reducer.observe(item):
                await tracer.handle(span_signal)
            yield item
    finally:
        for span_signal in reducer.flush():
            await tracer.handle(span_signal)
@pytest.mark.parametrize("fixture", all_fixtures(), ids=lambda fx: fx.name)
def test_span_derivation_is_deterministic(fixture):
    """Span derivation over a fixture's events is repeatable.

    Yield and auto-send delivery both observe the same event stream, so the
    cross-channel guarantee reduces to this property: running a fresh deriver
    over the same events twice must produce identical signals.
    """
    first_pass = derive_all(fixture.events)
    second_pass = derive_all(fixture.events)
    assert first_pass == second_pass
+ """ + + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + # Real TaskMessage so StreamTaskMessageDelta(parent_task_message=...) passes validation + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + # __aexit__ delegates to close in the real impl; keep for safety + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + """Mirrors StreamingService: streaming_task_message_context returns a context object.""" + + def __init__(self): + self.sink = [] + self.recorded_created_at: list[datetime | None] = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + self.recorded_created_at.append(created_at) + return _FakeCtx(self.sink, ctype, initial_content) + + +async def _gen(events): + for e in events: + yield e + + +# --------------------------------------------------------------------------- +# Test 1: text streaming — open, stream deltas, close; return accumulated text +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_streams_text_and_returns_final_text(): + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hel"), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="lo"), + ), + 
StreamTaskMessageDone(type="done", index=0), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "Hello" + + kinds = [s[0] for s in streaming.sink] + # A context was created for the text content + assert kinds[0] == "ctx" + # It was opened and closed + assert "open" in kinds + assert "close" in kinds + # Exactly two updates were streamed (one per delta) + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 2 + + +# --------------------------------------------------------------------------- +# Test 2: tool_request Full + tool_response Full — each posts one full message +# (open context with the content, no deltas, close immediately) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_posts_full_tool_messages(): + streaming = _FakeStreaming() + events = [ + # Two Full events post two messages (open+close immediately, no deltas). + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c1", + name="Bash", + arguments={"cmd": "ls"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="c1", + name="Bash", + content="file.py", + ), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "" + + # Each Full event opens and closes exactly one context. 
+ ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 2 + content_types = [s[1] for s in ctx_events] + assert content_types == ["tool_request", "tool_response"] + + # Each context is opened and closed + opens = [s for s in streaming.sink if s[0] == "open"] + closes = [s for s in streaming.sink if s[0] == "close"] + assert len(opens) == 2 + assert len(closes) == 2 + + # No stream_update calls (full messages have no deltas) + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 0 + + +# --------------------------------------------------------------------------- +# Test 3: tracing — spans are derived and handed to the tracer +# --------------------------------------------------------------------------- + + +class _RecordTracing: + def __init__(self): + self.started, self.ended = [], [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append(name) + return _types.SimpleNamespace() + + async def end_span(self, *, trace_id, span): + self.ended.append(getattr(span, "output", None)) + + +@pytest.mark.asyncio +async def test_auto_send_derives_tool_spans_via_tracer(): + fake_tracing = _RecordTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake_tracing) + streaming = _FakeStreaming() + + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c1", + name="Bash", + arguments={}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="c1", + name="Bash", + content="ok", + ), + ), + ] + + result = await auto_send(_gen(events), task_id="task1", tracer=tracer, streaming=streaming) + + assert result.final_text == "" + assert fake_tracing.started == ["Bash"] + assert fake_tracing.ended == 
["ok"] + + +# --------------------------------------------------------------------------- +# Test 4: text followed by a tool Full — text context is closed before Full +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_closes_text_context_before_full_message(): + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hi"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c2", + name="read_file", + arguments={}, + ), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "Hi" + + # Verify ordering: text ctx opens, updates, closes; then tool_request ctx opens, closes + event_sequence = [(s[0], s[1]) for s in streaming.sink] + text_open_idx = next(i for i, s in enumerate(event_sequence) if s == ("open", "text")) + text_close_idx = next(i for i, s in enumerate(event_sequence) if s == ("close", "text")) + tool_open_idx = next(i for i, s in enumerate(event_sequence) if s == ("open", "tool_request")) + assert text_open_idx < text_close_idx < tool_open_idx + + +# --------------------------------------------------------------------------- +# Test 5: midstream error — propagates AND the open context is closed (finally) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_open_context_closed_on_midstream_error(): + streaming = _FakeStreaming() + + async def _exploding_gen(): + yield StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ) + raise 
RuntimeError("boom") + + with pytest.raises(RuntimeError, match="boom"): + await auto_send(_exploding_gen(), task_id="task1", tracer=None, streaming=streaming) + + # The text context that was opened mid-stream was closed by the finally block. + assert ("open", "text") in [(s[0], s[1]) for s in streaming.sink] + assert ("close", "text") in [(s[0], s[1]) for s in streaming.sink] + + +# --------------------------------------------------------------------------- +# Test 6: streamed tool_request delivered (AGX1-377 core) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_streams_tool_request(): + """A Start(ToolRequestContent) MUST open a streaming context (AGX1-377).""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c_tool", + name="Bash", + arguments={}, + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta( + type="tool_request", + tool_call_id="c_tool", + name="Bash", + arguments_delta='{"cmd": "ls"}', + ), + ), + StreamTaskMessageDone(type="done", index=0), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + assert result.final_text == "" + + ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 1 + assert ctx_events[0][1] == "tool_request" + + opens = [s for s in streaming.sink if s[0] == "open"] + closes = [s for s in streaming.sink if s[0] == "close"] + assert len(opens) == 1 + assert len(closes) == 1 + + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 1 + + +# --------------------------------------------------------------------------- +# Test 7: interleaved indexes route correctly +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def 
test_auto_send_interleaved_indexes_route_correctly(): + """Deltas must be routed to the correct index-keyed context.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="A"), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="B"), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageDone(type="done", index=1), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + + ctx_events = [s for s in streaming.sink if s[0] == "ctx"] + assert len(ctx_events) == 2 + + opens = [s for s in streaming.sink if s[0] == "open"] + assert len(opens) == 2 + + updates = [s for s in streaming.sink if s[0] == "update"] + assert len(updates) == 2 + + update_deltas = [s[1].delta for s in streaming.sink if s[0] == "update"] + text_deltas = [d.text_delta for d in update_deltas if isinstance(d, TextDelta)] + assert set(text_deltas) == {"A", "B"} + + +# --------------------------------------------------------------------------- +# Test 8: final_text returns last text segment for multi-step +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_final_text_last_segment(): + """final_text must be the LAST text segment, not accumulated across all turns.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="First"), + ), + StreamTaskMessageDone(type="done", index=0), + 
StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="Second"), + ), + StreamTaskMessageDone(type="done", index=1), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "Second" + + +# --------------------------------------------------------------------------- +# Test 9: Full(TextContent) contributes to final_text +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_full_text_content_sets_final_text(): + """A Full(TextContent) must contribute its text to final_text.""" + streaming = _FakeStreaming() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=TextContent(type="text", author="agent", content="hello"), + ), + ] + result = await auto_send(_gen(events), task_id="task1", tracer=None, streaming=streaming) + assert result.final_text == "hello" + + +# --------------------------------------------------------------------------- +# Test 10: created_at is forwarded to streaming context (AGX1-378) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_created_at_forwarded(): + """created_at must be forwarded to every streaming_task_message_context call.""" + streaming = _FakeStreaming() + dt = datetime(2025, 1, 15, 12, 0, 0) + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="c_ts", + name="Bash", + arguments={}, + ), + ), + ] + await auto_send(_gen(events), task_id="task1", tracer=None, 
streaming=streaming, created_at=dt) + + assert all(ts == dt for ts in streaming.recorded_created_at) diff --git a/tests/lib/core/harness/test_emitter.py b/tests/lib/core/harness/test_emitter.py new file mode 100644 index 000000000..df155ec44 --- /dev/null +++ b/tests/lib/core/harness/test_emitter.py @@ -0,0 +1,148 @@ +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import TurnUsage +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) + + +class _FakeTracing: + async def start_span(self, **kw): + return None + + async def end_span(self, **kw): + pass + + +class _FakeCtx: + """Minimal StreamingTaskMessageContext fake (see test_auto_send.py).""" + + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +class _Turn: + def __init__(self, events_list, usage): + self._events_list = events_list + self._usage = usage + + @property + async def events(self): + for e in 
self._events_list: + yield e + + def usage(self): + return self._usage + + +@pytest.mark.asyncio +async def test_emitter_yield_mode_passes_through(): + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _Turn(events, TurnUsage(model="m")) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(turn)] + assert out == events + + +@pytest.mark.asyncio +async def test_emitter_tracing_default_on_when_trace_id_present(): + # Inject a fake tracing backend so the test env doesn't need temporalio. + # This exercises the default-on path (tracer=None) when trace_id is truthy. + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", tracing=_FakeTracing()) + assert emitter.tracer is not None + + +@pytest.mark.asyncio +async def test_emitter_tracing_overridable_off(): + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id="p", tracer=False) + assert emitter.tracer is None + + +@pytest.mark.asyncio +async def test_emitter_auto_send_turn_returns_usage(): + usage = TurnUsage(model="m", input_tokens=5) + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hello")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _Turn(events, usage) + fake = _FakeStreaming() + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None, streaming=fake) + result = await emitter.auto_send_turn(turn) + assert result.usage == usage + assert result.final_text == "Hello" + + +class _ContractTurn: + """A turn that honors the single-pass contract: usage() is the empty default + UNTIL `events` is exhausted, then the real usage (this is how real harness + turns behave — they populate usage 
while the stream is consumed).""" + + def __init__(self, events_list, real_usage): + self._events_list = events_list + self._real_usage = real_usage + self._exhausted = False + + @property + async def events(self): + for e in self._events_list: + yield e + self._exhausted = True + + def usage(self): + return self._real_usage if self._exhausted else TurnUsage(model="m") + + +@pytest.mark.asyncio +async def test_emitter_auto_send_turn_reads_usage_after_exhaustion(): + # Regression: auto_send_turn must read turn.usage() AFTER consuming the + # stream, not eagerly when building the auto_send call (which would capture + # the empty default and lose real token usage on the auto_send path). + real_usage = TurnUsage(model="m", input_tokens=11, output_tokens=22, total_tokens=33, num_llm_calls=2) + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = _ContractTurn(events, real_usage) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None, streaming=_FakeStreaming()) + result = await emitter.auto_send_turn(turn) + assert result.usage == real_usage + assert result.usage.input_tokens == 11 and result.usage.total_tokens == 33 diff --git a/tests/lib/core/harness/test_span_derivation.py b/tests/lib/core/harness/test_span_derivation.py new file mode 100644 index 000000000..51e2ede2c --- /dev/null +++ b/tests/lib/core/harness/test_span_derivation.py @@ -0,0 +1,286 @@ +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.tool_request_delta import ToolRequestDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + 
StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.core.harness.span_derivation import SpanDeriver + + +def _signals(deriver, events): + out = [] + for e in events: + out.extend(deriver.observe(e)) + out.extend(deriver.flush()) + return out + + +def _tool_req(idx, tcid, name, args): + return StreamTaskMessageStart( + type="start", + index=idx, + content=ToolRequestContent(type="tool_request", author="agent", tool_call_id=tcid, name=name, arguments=args), + ) + + +def test_text_only_yields_no_spans(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=None), + StreamTaskMessageDone(type="done", index=0), + ] + assert _signals(d, events) == [] + + +def test_single_tool_opens_on_done_closes_on_response(): + d = SpanDeriver() + events = [ + _tool_req(0, "call_1", "Bash", {"cmd": "ls"}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="call_1", name="Bash", content="files" + ), + ), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}), + CloseSpan(key="call_1", output="files", is_complete=True), + ] + # No status reported -> CloseSpan carries is_error=None. 
+ assert sigs[1].is_error is None + + +def test_tool_response_is_error_propagates_to_close_span(): + """ToolResponseContent.is_error flows onto the CloseSpan so a derived tool + span can be marked as a failure (AGX1-371).""" + d = SpanDeriver() + events = [ + _tool_req(0, "call_err", "Bash", {"cmd": "false"}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_err", + name="Bash", + content="boom", + is_error=True, + ), + ), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_err", kind="tool", name="Bash", input={"cmd": "false"}), + CloseSpan(key="call_err", output="boom", is_complete=True, is_error=True), + ] + + +def test_reasoning_opens_on_start_closes_on_done(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", index=0, content=ReasoningContent(type="reasoning", author="agent", summary=[], content=[]) + ), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="reasoning:0", kind="reasoning", name="reasoning", input={}) + assert sigs[1] == CloseSpan(key="reasoning:0", output=None, is_complete=True) + + +def test_parallel_tools_pair_by_tool_call_id(): + d = SpanDeriver() + events = [ + _tool_req(0, "a", "T1", {}), + _tool_req(1, "b", "T2", {}), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageDone(type="done", index=1), + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="b", name="T2", content="rb" + ), + ), + StreamTaskMessageFull( + type="full", + index=3, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="a", name="T1", content="ra" + ), + ), + ] + sigs = _signals(d, events) + opens = [s for s in sigs if isinstance(s, OpenSpan)] + closes = [s for s in sigs if isinstance(s, CloseSpan)] 
+ assert {o.key for o in opens} == {"a", "b"} + assert [c.key for c in closes] == ["b", "a"] + assert all(c.is_complete for c in closes) + + +def test_streamed_args_accumulate_into_open_input(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", arguments_delta='{"cmd":'), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ToolRequestDelta(type="tool_request", tool_call_id="c", name="Bash", arguments_delta='"ls"}'), + ), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="c", kind="tool", name="Bash", input={"cmd": "ls"}) + + +def test_unclosed_tool_closed_incomplete_on_flush(): + d = SpanDeriver() + events = [ + _tool_req(0, "x", "Bash", {}), + StreamTaskMessageDone(type="done", index=0), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="x", kind="tool", name="Bash", input={}) + assert sigs[1] == CloseSpan(key="x", output=None, is_complete=False) + + +def test_none_index_is_skipped(): + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", + index=None, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="n", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=None), + ] + assert _signals(d, events) == [] + + +def test_orphan_tool_response_ignored(): + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="z", name="Bash", content="r" + ), + ), + ] + assert _signals(d, events) == [] + + +def test_full_tool_request_opens_span(): + """Full(ToolRequestContent) must open a tool span (for LangGraph-style 
harnesses).""" + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_x", + name="Bash", + arguments={"cmd": "ls"}, + ), + ), + ] + sigs = _signals(d, events) + assert sigs[0] == OpenSpan(key="call_x", kind="tool", name="Bash", input={"cmd": "ls"}) + assert sigs[1] == CloseSpan(key="call_x", output=None, is_complete=False) + + +def test_full_tool_request_and_response_paired(): + """Full(ToolRequestContent) + Full(ToolResponseContent) produces a complete span pair.""" + d = SpanDeriver() + events = [ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_y", + name="Grep", + arguments={}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_y", + name="Grep", + content="result", + ), + ), + ] + sigs = _signals(d, events) + assert sigs == [ + OpenSpan(key="call_y", kind="tool", name="Grep", input={}), + CloseSpan(key="call_y", output="result", is_complete=True), + ] + + +def test_full_tool_request_does_not_double_open(): + """A Full(ToolRequestContent) for an already-open tool_call_id is a no-op.""" + d = SpanDeriver() + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_z", + name="X", + arguments={}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_z", + name="X", + arguments={}, + ), + ), + ] + sigs = _signals(d, events) + opens = [s for s in sigs if isinstance(s, OpenSpan)] + assert len(opens) == 1 + assert opens[0].key == "call_z" diff --git a/tests/lib/core/harness/test_tracer.py 
b/tests/lib/core/harness/test_tracer.py new file mode 100644 index 000000000..ed40cf595 --- /dev/null +++ b/tests/lib/core/harness/test_tracer.py @@ -0,0 +1,93 @@ +from typing import override + +import pytest + +from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.lib.core.harness.tracer import SpanTracer + + +class _FakeSpan: + def __init__(self, name): + self.name = name + self.output = None + self.data = None + + +class _FakeTracing: + def __init__(self): + self.started = [] + self.ended = [] + self.ended_spans = [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append((name, parent_id, input)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id, span): + self.ended.append((span.name, span.output)) + self.ended_spans.append(span) + + +@pytest.mark.asyncio +async def test_open_then_close_starts_and_ends_span(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"})) + await tracer.handle(CloseSpan(key="call_1", output="files", is_complete=True)) + assert fake.started == [("Bash", "p1", {"cmd": "ls"})] + assert fake.ended == [("Bash", "files")] + + +@pytest.mark.asyncio +async def test_close_records_is_error_on_span_data(): + """A CloseSpan carrying is_error records the status on span.data (AGX1-371).""" + fake = _FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_err", kind="tool", name="Bash", input={})) + await tracer.handle(CloseSpan(key="call_err", output="boom", is_complete=True, is_error=True)) + assert fake.ended_spans[0].data == {"is_error": True} + + +@pytest.mark.asyncio +async def test_close_without_status_leaves_span_data_untouched(): + """is_error=None (no status reported) must not write to span.data.""" + fake = _FakeTracing() + tracer 
= SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="call_1", kind="tool", name="Bash", input={})) + await tracer.handle(CloseSpan(key="call_1", output="files", is_complete=True)) + assert fake.ended_spans[0].data is None + + +@pytest.mark.asyncio +async def test_no_trace_id_is_noop(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="", parent_span_id=None, tracing=fake) + await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) + await tracer.handle(CloseSpan(key="k")) + assert fake.started == [] and fake.ended == [] + + +@pytest.mark.asyncio +async def test_tracing_failure_is_swallowed(): + class _Boom(_FakeTracing): + @override + async def start_span(self, **kw): + raise RuntimeError("backend down") + + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=_Boom()) + # Must not raise. + await tracer.handle(OpenSpan(key="k", kind="tool", name="X")) + await tracer.handle(CloseSpan(key="k")) + assert tracer._open == {} + + +@pytest.mark.asyncio +async def test_duplicate_open_replaces_silently(): + fake = _FakeTracing() + tracer = SpanTracer(trace_id="t1", parent_span_id="p1", tracing=fake) + await tracer.handle(OpenSpan(key="k", kind="tool", name="A")) + await tracer.handle(OpenSpan(key="k", kind="tool", name="B")) + await tracer.handle(CloseSpan(key="k")) + # Both opens started spans, but only the second ("B") is closed. 
+ assert [name for name, _, _ in fake.started] == ["A", "B"] + assert fake.ended == [("B", None)] diff --git a/tests/lib/core/harness/test_types.py b/tests/lib/core/harness/test_types.py new file mode 100644 index 000000000..68bc89ce2 --- /dev/null +++ b/tests/lib/core/harness/test_types.py @@ -0,0 +1,53 @@ +from typing import AsyncIterator + +from agentex.lib.core.harness.types import ( + OpenSpan, + CloseSpan, + TurnUsage, + TurnResult, + HarnessTurn, + StreamTaskMessage, +) + + +def test_open_close_span_construct(): + o = OpenSpan(key="call_1", kind="tool", name="Bash", input={"cmd": "ls"}) + c = CloseSpan(key="call_1", output="files", is_complete=True) + assert o.key == c.key == "call_1" + assert o.kind == "tool" + assert c.is_complete is True + + +def test_turn_usage_defaults_are_none(): + u = TurnUsage(model="claude-opus-4-6") + assert u.model == "claude-opus-4-6" + assert u.input_tokens is None + assert u.num_tool_calls == 0 + + +def test_turn_result_wraps_usage(): + r = TurnResult(final_text="hi", usage=TurnUsage(model="m")) + assert r.final_text == "hi" + assert r.usage.model == "m" + + +def test_close_span_defaults(): + c = CloseSpan(key="x") + assert c.output is None + assert c.is_complete is True + + +def test_harness_turn_runtime_check(): + class _Turn: + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + async def _gen() -> AsyncIterator[StreamTaskMessage]: + if False: + yield # pragma: no cover + + return _gen() + + def usage(self) -> TurnUsage: + return TurnUsage(model="m") + + assert isinstance(_Turn(), HarnessTurn) is True diff --git a/tests/lib/core/harness/test_yield_delivery.py b/tests/lib/core/harness/test_yield_delivery.py new file mode 100644 index 000000000..f3f491d84 --- /dev/null +++ b/tests/lib/core/harness/test_yield_delivery.py @@ -0,0 +1,89 @@ +import types as _types + +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + 
StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.core.harness.yield_delivery import yield_events + + +class _RecordTracing: + def __init__(self): + self.started, self.ended = [], [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append(name) + return _types.SimpleNamespace() # supports arbitrary attribute assignment (span.output = ...) + + async def end_span(self, *, trace_id, span): + self.ended.append(getattr(span, "output", None)) + + +async def _gen(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_yield_passes_events_through_and_traces(): + fake = _RecordTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) + events = [ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", author="agent", tool_call_id="c", name="Bash", content="ok" + ), + ), + ] + out = [e async for e in yield_events(_gen(events), tracer=tracer)] + assert out == events # passthrough unchanged + assert fake.started == ["Bash"] # span derived + opened + assert fake.ended == ["ok"] # span closed with response + + +@pytest.mark.asyncio +async def test_yield_without_tracer_is_pure_passthrough(): + events = [ + StreamTaskMessageDone(type="done", index=0), + ] + out = [e async for e in yield_events(_gen(events), tracer=None)] + assert out == events + + +@pytest.mark.asyncio +async def test_flush_runs_on_early_close(): + fake = _RecordTracing() + tracer = SpanTracer(trace_id="t", parent_span_id="p", tracing=fake) + events = [ + StreamTaskMessageStart( 
+ type="start", + index=0, + content=ToolRequestContent( + type="tool_request", author="agent", tool_call_id="c", name="Bash", arguments={} + ), + ), + StreamTaskMessageDone(type="done", index=0), + # response intentionally never arrives + ] + gen = yield_events(_gen(events), tracer=tracer) + first = await gen.__anext__() # Start + second = await gen.__anext__() # Done -> tool span opens here + await gen.aclose() # triggers the finally -> flush() + assert fake.started == ["Bash"] + assert fake.ended == [None] # flush closed the unpaired span (incomplete, no output) From c8de1d4c9c3b5b3c16ad4aaf9644c1ba0d618757 Mon Sep 17 00:00:00 2001 From: Vijay Kalmath <158184866+vkalmathscale@users.noreply.github.com> Date: Mon, 22 Jun 2026 16:02:41 -0400 Subject: [PATCH 02/10] feat(streaming): stream tool call argument deltas in TemporalStreamingModel (#355) Co-authored-by: stainless-app[bot] <142633134+stainless-app[bot]@users.noreply.github.com> Co-authored-by: Declan Brady Co-authored-by: Nitesh Dhanpal Co-authored-by: Claude Opus 4.8 --- .../models/temporal_streaming_model.py | 93 ++++++++- .../tests/test_streaming_model.py | 194 ++++++++++++++++++ 2 files changed, 280 insertions(+), 7 deletions(-) diff --git a/src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py b/src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py index 7ccc6627a..75dc0f053 100644 --- a/src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py +++ b/src/agentex/lib/core/temporal/plugins/openai_agents/models/temporal_streaming_model.py @@ -1,6 +1,7 @@ """Custom Temporal Model Provider with streaming support for OpenAI agents.""" from __future__ import annotations +import json import time import uuid from typing import Any, List, Union, Optional, override @@ -63,9 +64,9 @@ from agentex.lib import adk from agentex.lib.utils.logging import make_logger from agentex.lib.core.tracing.tracer import AsyncTracer -from 
agentex.types.task_message_delta import TextDelta, ReasoningContentDelta, ReasoningSummaryDelta +from agentex.types.task_message_delta import TextDelta, ToolRequestDelta, ReasoningContentDelta, ReasoningSummaryDelta from agentex.types.task_message_update import StreamTaskMessageFull, StreamTaskMessageDelta -from agentex.types.task_message_content import TextContent, ReasoningContent +from agentex.types.task_message_content import TextContent, ReasoningContent, ToolRequestContent from agentex.lib.adk.utils._modules.client import create_async_agentex_client from agentex.lib.core.temporal.plugins.openai_agents.interceptors.context_interceptor import ( streaming_task_id, @@ -722,12 +723,27 @@ async def get_response( streaming_mode=self.streaming_mode, ).__aenter__() elif item and getattr(item, 'type', None) == 'function_call': - # Track the function call being streamed + # Open a streaming context per function call so argument + # deltas can be published incrementally. Coalescing and + # mode dispatch are handled by the streaming layer. 
+ call_id = getattr(item, 'call_id', '') + tool_name = getattr(item, 'name', '') + call_context = await adk.streaming.streaming_task_message_context( + task_id=task_id, + initial_content=ToolRequestContent( + author="agent", + tool_call_id=call_id, + name=tool_name, + arguments={}, + ), + streaming_mode=self.streaming_mode, + ).__aenter__() function_calls_in_progress[output_index] = { 'id': getattr(item, 'id', ''), - 'call_id': getattr(item, 'call_id', ''), - 'name': getattr(item, 'name', ''), + 'call_id': call_id, + 'name': tool_name, 'arguments': getattr(item, 'arguments', ''), + 'context': call_context, } logger.debug(f"[TemporalStreamingModel] Starting function call: {item.name}") @@ -748,8 +764,24 @@ async def get_response( output_index = getattr(event, 'output_index', 0) delta = getattr(event, 'delta', '') - if output_index in function_calls_in_progress: - function_calls_in_progress[output_index]['arguments'] += delta + call_data = function_calls_in_progress.get(output_index) + if call_data is not None: + call_data['arguments'] += delta + call_context = call_data.get('context') + if call_context is not None: + try: + await call_context.stream_update(StreamTaskMessageDelta( + parent_task_message=call_context.task_message, + delta=ToolRequestDelta( + tool_call_id=call_data['call_id'], + name=call_data['name'], + arguments_delta=delta, + type="tool_request", + ), + type="delta", + )) + except Exception as e: + logger.warning(f"Failed to send tool request delta: {e}") logger.debug(f"[TemporalStreamingModel] Function call args delta: {delta[:50]}...") elif isinstance(event, ResponseFunctionCallArgumentsDoneEvent): @@ -874,6 +906,42 @@ async def get_response( ) output_items.append(tool_call) + # Emit the final ToolRequestContent and close the + # per-call streaming context. If the model produced + # invalid JSON args (truncation, hallucination), fall + # back to an empty dict so the streaming layer can + # still persist a message. 
+ call_context = call_data.get('context') + if call_context is not None: + raw_args = call_data['arguments'] or '' + try: + parsed_args = json.loads(raw_args) if raw_args else {} + except json.JSONDecodeError: + logger.warning( + f"Failed to parse tool call arguments for {call_data['name']} " + f"(raw_args_bytes={len(raw_args)})" + ) + parsed_args = {} + try: + await call_context.stream_update(StreamTaskMessageFull( + parent_task_message=call_context.task_message, + content=ToolRequestContent( + author="agent", + tool_call_id=call_data['call_id'], + name=call_data['name'], + arguments=parsed_args, + ), + type="full", + )) + except Exception as e: + logger.warning(f"Failed to send tool request full update: {e}") + try: + await call_context.close() + except Exception as e: + logger.warning(f"Failed to close tool request context: {e}") + finally: + call_data['context'] = None + elif isinstance(event, ResponseReasoningSummaryPartAddedEvent): # New reasoning part/summary started - reset accumulator part = getattr(event, 'part', None) @@ -907,6 +975,17 @@ async def get_response( await streaming_context.close() streaming_context = None + # Defensive: close any function call contexts that didn't see a + # ResponseOutputItemDoneEvent (truncated stream, error mid-call). 
+ for call_data in function_calls_in_progress.values(): + call_context = call_data.get('context') + if call_context is not None: + try: + await call_context.close() + except Exception as e: + logger.warning(f"Failed to close orphaned tool request context: {e}") + call_data['context'] = None + # Build the response from output items collected during streaming # Create output from the items we collected response_output = [] diff --git a/src/agentex/lib/core/temporal/plugins/openai_agents/tests/test_streaming_model.py b/src/agentex/lib/core/temporal/plugins/openai_agents/tests/test_streaming_model.py index 97dda0e61..26c0b7c4b 100644 --- a/src/agentex/lib/core/temporal/plugins/openai_agents/tests/test_streaming_model.py +++ b/src/agentex/lib/core/temporal/plugins/openai_agents/tests/test_streaming_model.py @@ -12,8 +12,11 @@ from openai.types.responses import ( ResponseCompletedEvent, ResponseTextDeltaEvent, + ResponseOutputItemDoneEvent, ResponseOutputItemAddedEvent, + ResponseFunctionCallArgumentsDoneEvent, ResponseReasoningSummaryTextDeltaEvent, + ResponseFunctionCallArgumentsDeltaEvent, ) @@ -851,6 +854,197 @@ async def test_missing_task_id_error(self, streaming_model): ) +class TestStreamingModelFunctionCallArgsStreaming: + """Verify ``ResponseFunctionCallArgumentsDeltaEvent``s are surfaced as + ``ToolRequestDelta`` updates and that a final ``ToolRequestContent`` Full is + emitted on ``ResponseOutputItemDoneEvent``. + + Without this, write-heavy tools (``write_file``, ``apply_patch``) buffer their + entire argument body inside ``invoke_model_activity`` and the UI sees a + multi-second freeze while the model is actively producing tokens. + """ + + @staticmethod + def _build_function_call_stream(arguments_text: str): + """Construct a streaming event sequence for a single function_call. + + Mirrors the production order: Added → N × ArgumentsDelta → ArgumentsDone + → OutputItemDone → ResponseCompleted. 
``spec=...`` makes ``isinstance`` + dispatch in production work without triggering pydantic validation. + """ + call_item = MagicMock() + call_item.type = "function_call" + call_item.id = "fc_abc" + call_item.call_id = "call_abc" + call_item.name = "write_file" + call_item.arguments = "" + + item_added = MagicMock(spec=ResponseOutputItemAddedEvent) + item_added.item = call_item + item_added.output_index = 0 + + # Split the argument text into a few chunks to exercise the per-delta loop + chunk_size = max(1, len(arguments_text) // 3) if arguments_text else 1 + chunks = [arguments_text[i:i + chunk_size] for i in range(0, len(arguments_text), chunk_size)] or [""] + delta_events = [] + for chunk in chunks: + ev = MagicMock(spec=ResponseFunctionCallArgumentsDeltaEvent) + ev.delta = chunk + ev.output_index = 0 + delta_events.append(ev) + + args_done = MagicMock(spec=ResponseFunctionCallArgumentsDoneEvent) + args_done.arguments = arguments_text + args_done.output_index = 0 + + item_done = MagicMock(spec=ResponseOutputItemDoneEvent) + item_done.item = call_item + item_done.output_index = 0 + + completed = MagicMock(spec=ResponseCompletedEvent) + completed.response = MagicMock(output=[], usage=MagicMock(), id=None) + + return [item_added, *delta_events, args_done, item_done, completed], chunks + + @staticmethod + def _install_real_task_message(mock_adk_streaming, task_id: str): + """Replace the autouse fixture's MagicMock ``task_message`` with a real + ``TaskMessage`` so production's ``StreamTaskMessageDelta(parent_task_message=...)`` + construction passes pydantic validation. The default mock works for tests + that only assert on the context's ``__aenter__`` call but breaks tests + that exercise ``stream_update`` end-to-end. 
+ """ + from agentex.types.task_message import TaskMessage + from agentex.types.task_message_content import ToolRequestContent + + ctx = mock_adk_streaming.streaming_task_message_context.return_value + ctx.task_message = TaskMessage( + id="msg_test", + task_id=task_id, + content=ToolRequestContent( + author="agent", + tool_call_id="call_abc", + name="write_file", + arguments={}, + ), + streaming_status="IN_PROGRESS", + ) + return ctx + + @pytest.mark.asyncio + async def test_function_call_emits_argument_deltas_and_final_full( + self, streaming_model, mock_adk_streaming, _streaming_context_vars, sample_task_id + ): + """A function_call with well-formed JSON args should produce: + (1) one streaming context opened with ``ToolRequestContent`` initial_content, + (2) one ``StreamTaskMessageDelta`` per ``ArgumentsDelta`` event carrying a + ``ToolRequestDelta`` with the right ``tool_call_id`` and ``arguments_delta``, + (3) one final ``StreamTaskMessageFull`` with ``ToolRequestContent`` whose + ``arguments`` is the parsed JSON dict. + """ + from agentex.types.task_message_delta import ToolRequestDelta + from agentex.types.task_message_update import StreamTaskMessageFull, StreamTaskMessageDelta + from agentex.types.task_message_content import ToolRequestContent + + ctx = self._install_real_task_message(mock_adk_streaming, sample_task_id) + + args_text = '{"path": "/tmp/foo.txt", "contents": "hello world"}' + events, chunks = self._build_function_call_stream(args_text) + + mock_stream = AsyncMock() + mock_stream.__aiter__.return_value = iter(events) + streaming_model.client.responses.create = AsyncMock(return_value=mock_stream) + + await streaming_model.get_response( + system_instructions=None, + input="please write foo", + model_settings=ModelSettings(), + tools=[], + output_schema=None, + handoffs=[], + tracing=None, + ) + + # 1. A streaming context was opened with ToolRequestContent. 
+ opens = [ + c for c in mock_adk_streaming.streaming_task_message_context.call_args_list + if isinstance(c.kwargs.get("initial_content"), ToolRequestContent) + ] + assert len(opens) == 1, f"expected one ToolRequest context, got {len(opens)}" + initial = opens[0].kwargs["initial_content"] + assert initial.tool_call_id == "call_abc" + assert initial.name == "write_file" + + # 2. One StreamTaskMessageDelta(ToolRequestDelta) was streamed per + # ArgumentsDelta event, preserving the delta text exactly. + delta_updates = [ + call.args[0] if call.args else call.kwargs.get("update") + for call in ctx.stream_update.call_args_list + if (call.args and isinstance(call.args[0], StreamTaskMessageDelta) + and isinstance(call.args[0].delta, ToolRequestDelta)) + ] + assert len(delta_updates) == len(chunks) + for update, expected_chunk in zip(delta_updates, chunks): + assert update.delta.tool_call_id == "call_abc" + assert update.delta.name == "write_file" + assert update.delta.arguments_delta == expected_chunk + + # 3. A final StreamTaskMessageFull(ToolRequestContent) was streamed with + # parsed args. + full_updates = [ + call.args[0] if call.args else call.kwargs.get("update") + for call in ctx.stream_update.call_args_list + if (call.args and isinstance(call.args[0], StreamTaskMessageFull) + and isinstance(call.args[0].content, ToolRequestContent)) + ] + assert len(full_updates) == 1 + final = full_updates[0].content + assert final.tool_call_id == "call_abc" + assert final.name == "write_file" + assert final.arguments == {"path": "/tmp/foo.txt", "contents": "hello world"} + + @pytest.mark.asyncio + async def test_function_call_malformed_args_fall_back_to_empty_dict( + self, streaming_model, mock_adk_streaming, _streaming_context_vars, sample_task_id, caplog + ): + """If the model produces invalid JSON for the args, the final + ``ToolRequestContent`` should carry ``arguments={}`` and a warning should + be logged. The raw delta stream is preserved either way. 
+ """ + from agentex.types.task_message_update import StreamTaskMessageFull + from agentex.types.task_message_content import ToolRequestContent + + ctx = self._install_real_task_message(mock_adk_streaming, sample_task_id) + + # Missing closing brace — invalid JSON. + events, _ = self._build_function_call_stream('{"path": "/tmp/foo.txt", "contents":') + + mock_stream = AsyncMock() + mock_stream.__aiter__.return_value = iter(events) + streaming_model.client.responses.create = AsyncMock(return_value=mock_stream) + + with caplog.at_level("WARNING"): + await streaming_model.get_response( + system_instructions=None, + input="please write foo", + model_settings=ModelSettings(), + tools=[], + output_schema=None, + handoffs=[], + tracing=None, + ) + + full_updates = [ + call.args[0] if call.args else call.kwargs.get("update") + for call in ctx.stream_update.call_args_list + if (call.args and isinstance(call.args[0], StreamTaskMessageFull) + and isinstance(call.args[0].content, ToolRequestContent)) + ] + assert len(full_updates) == 1 + assert full_updates[0].content.arguments == {} + assert any("Failed to parse tool call arguments" in r.getMessage() for r in caplog.records) + + class TestStreamingModelUsageResponseIdAndCacheKey: """Cover real-Usage capture, real response_id, span emission, and opt-in prompt_cache_key.""" From 694960f913b8ba521d9236e876e5e00f57a3a3ff Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Mon, 22 Jun 2026 16:09:20 -0400 Subject: [PATCH 03/10] fix(harness): assert cross-channel (yield vs auto-send) conformance equivalence [AGX1-373] (#414) --- .github/workflows/agentex-tutorials-test.yml | 28 ++ tests/lib/core/harness/conformance/runner.py | 472 +++++++++++++++++- .../harness/conformance/test_conformance.py | 258 +++++++++- 3 files changed, 747 insertions(+), 11 deletions(-) diff --git a/.github/workflows/agentex-tutorials-test.yml b/.github/workflows/agentex-tutorials-test.yml index f19c58d4d..41b495d71 100644 --- 
a/.github/workflows/agentex-tutorials-test.yml +++ b/.github/workflows/agentex-tutorials-test.yml @@ -49,6 +49,29 @@ jobs: curl -LsSf https://astral.sh/uv/install.sh | sh echo "$HOME/.local/bin" >> $GITHUB_PATH + # Subprocess-CLI harnesses: install the relevant CLI only for the + # claude-code / codex tutorials (no-op for every other tutorial). npm is + # preinstalled on ubuntu runners. Versions mirror the golden agent's + # sandbox image (teams/sgp/agents/golden_agent/sandbox/Dockerfile): claude-code + # is pinned to the same CLAUDE_CODE_VERSION; codex is left unpinned there, + # so it is left unpinned here too. Bump CLAUDE_CODE_VERSION in lockstep + # with the sandbox Dockerfile. + - name: Install harness CLI (claude-code / codex only) + if: ${{ contains(matrix.tutorial, 'claude_code') || contains(matrix.tutorial, 'codex') }} + env: + CLAUDE_CODE_VERSION: "2.1.142" + run: | + if [[ "${{ matrix.tutorial }}" == *claude_code* ]]; then + echo "📦 Installing Claude Code CLI (v${CLAUDE_CODE_VERSION})..." + npm install -g "@anthropic-ai/claude-code@${CLAUDE_CODE_VERSION}" + claude --version || true + fi + if [[ "${{ matrix.tutorial }}" == *codex* ]]; then + echo "📦 Installing Codex CLI..." + npm install -g @openai/codex + codex --version || true + fi + - name: Pull latest AgentEx image run: | echo "🐳 Pulling latest Scale AgentEx Docker image..." @@ -136,6 +159,11 @@ jobs: working-directory: ./examples/tutorials env: OPENAI_API_KEY: ${{ secrets.TUTORIAL_OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.TUTORIAL_ANTHROPIC_API_KEY }} + # Enable the gated live tests only for the matching subprocess-CLI + # harness tutorial (the CLI is installed for it in the step above). 
+ CLAUDE_LIVE_TESTS: ${{ contains(matrix.tutorial, 'claude_code') && '1' || '' }} + CODEX_LIVE_TESTS: ${{ contains(matrix.tutorial, 'codex') && '1' || '' }} HEALTH_CHECK_PORT: 8080 # Use non-privileged port for temporal worker health checks run: | echo "Testing tutorial: ${{ matrix.tutorial }}" diff --git a/tests/lib/core/harness/conformance/runner.py b/tests/lib/core/harness/conformance/runner.py index 81a74860c..84e84fa51 100644 --- a/tests/lib/core/harness/conformance/runner.py +++ b/tests/lib/core/harness/conformance/runner.py @@ -1,8 +1,30 @@ """Shared conformance engine: every harness tap registers fixtures here. -A fixture is (name, list[StreamTaskMessage]). The runner asserts that span -derivation over the events is identical regardless of delivery channel, which is -the cross-channel guarantee from the spec. +A fixture is (name, list[StreamTaskMessage]). The runner asserts two things: + +1. **Cross-channel logical equivalence**: yield_events and auto_send produce the + same *logical* sequence of delivered message contents. "Logical" means we + normalise away the streaming-envelope difference: + - yield channel delivers StreamTaskMessageFull(ToolResponseContent) verbatim. + - auto_send channel delivers the same tool-response by opening a streaming + context with the full content and closing it immediately (Start+Done on the + wire), not a Full event. + Both reduce to the same LogicalDelivery(type, identity, payload) tuple; the + conformance test compares those normalised sequences. 
+ + `payload` carries the content that callers actually consume: + - text: initial_content.content prepended, then accumulated delta string + - reasoning: initial_content.summary joined, then accumulated delta string + - tool_request: the arguments dict (JSON-sorted), from Start content + - tool_response: the content value (str) + This catches a channel that delivers the right structural shape but corrupts, + drops, or omits initial_content (including reasoning summary) or payload. + +2. **Span signal equivalence**: each channel is driven with its own recording + tracer that captures every SpanSignal it actually receives in handle(); the + two channels' recorded signal lists must be identical. Comparing what each + channel genuinely emitted (rather than re-deriving from the events) catches a + regression where a channel skips deriver.observe() for some event type. Registry shared-state hazard: `_REGISTRY` is process-global. Every `test_*.py` module that calls `register()` at import time contributes to it, so a module @@ -12,13 +34,51 @@ module should register and parametrize over its OWN fixtures (e.g. keep a module-local list it both registers and parametrizes), rather than relying on cross-module global accumulation via `all_fixtures()`. + +Design decision — Full-message handling in auto_send +---------------------------------------------------- +auto_send posts a StreamTaskMessageFull (tool_request or tool_response) by +opening a streaming context with the full content and closing it immediately, +rather than calling adk.messages.create. This open+close approach is retained +because: + - StreamingTaskMessageContext.close() persists initial_content when no deltas + have been streamed, so the message IS correctly persisted. + - It mirrors the pattern already used by the real _langgraph_async.py harness, + keeping behavioural parity. 
+ - Switching to adk.messages.create would require an additional injectable + dependency, adding surface area for no observable benefit. +The conformance test treats this as an ACCEPTABLE envelope difference: at the +logical-content level, Full(ToolResponseContent) from yield and +Start(content)+Done from auto_send are equivalent. The recorded span signals are +identical because both adapters drive the same SpanDeriver.observe() call +sequence and forward every signal to their tracer. + +AGX1-377 fix: auto_send now DELIVERS streamed tool-request messages (Start+Done) +instead of dropping them. The conformance normaliser previously suppressed the +delivery for Start(tool_request)+Done on the yield channel to match auto_send's +old drop behaviour. That suppression is now removed: both channels produce a +LogicalDelivery for a streamed tool_request, and the cross-channel assertion +verifies it is delivered on both. """ from __future__ import annotations +import json +import types as _types +from typing import Any, NamedTuple, override from dataclasses import dataclass +from agentex.types.text_delta import TextDelta +from agentex.types.task_message import TaskMessage from agentex.lib.core.harness.types import SpanSignal, StreamTaskMessage +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.reasoning_content_delta import ReasoningContentDelta from agentex.lib.core.harness.span_derivation import SpanDeriver @@ -46,3 +106,409 @@ def derive_all(events: list[StreamTaskMessage]) -> list[SpanSignal]: out.extend(d.observe(e)) out.extend(d.flush()) return out + + +# --------------------------------------------------------------------------- +# Logical delivery normalisation +# --------------------------------------------------------------------------- + + +class LogicalDelivery(NamedTuple): + """A 
single logically-delivered message, channel-agnostic. + + `content_type` is the .type of the content (e.g. "text", "reasoning", + "tool_request", "tool_response"). `identity` is a frozenset of key=value + pairs that uniquely identify the content (e.g. tool_call_id for tool + messages, or index for text/reasoning). `payload` is a stable string + representation of the content callers actually consume: + - text: initial_content.content prepended to accumulated delta strings + - reasoning: initial_content.summary joined, prepended to accumulated + reasoning-content delta strings + - tool_request: JSON-sorted arguments from Start content + - tool_response: str(content) from Full event + """ + + content_type: str + identity: frozenset[tuple[str, Any]] + payload: str = "" + + +def _yield_logical_deliveries(events: list[StreamTaskMessage]) -> list[LogicalDelivery]: + """Extract logical deliveries from the yield channel's event list. + + The yield channel forwards events verbatim. A logical delivery is: + - A Full event (tool_request / tool_response): content delivered as-is. + - A Start + ... + Done sequence for text/reasoning/tool_request content. 
+ + The `payload` field captures the content callers consume: + - text: initial_content.content (from Start) prepended to accumulated deltas + - reasoning: initial_content.summary joined (from Start) prepended to + accumulated reasoning-content deltas (this catches a channel that drops + the summary) + - tool_request: JSON-sorted arguments from the Start content (AGX1-377: now + delivered on both channels, no longer suppressed) + - tool_response: str(content) from Full event + """ + from agentex.types.text_content import TextContent + from agentex.types.reasoning_content import ReasoningContent + from agentex.types.tool_request_content import ToolRequestContent + + deliveries: list[LogicalDelivery] = [] + # Track which indices had a Start so we can pair with Done + started: dict[int, Any] = {} # index -> initial content + # Accumulate delta text per index (seed with initial_content text if present) + accumulated: dict[int, list[str]] = {} # index -> list of delta strings + + for event in events: + if isinstance(event, StreamTaskMessageStart): + if event.index is not None: + started[event.index] = event.content + # Seed accumulator with initial_content so a channel that drops + # initial_content but delivers deltas correctly will fail. 
+ seed: list[str] = [] + if isinstance(event.content, TextContent) and event.content.content: + seed = [event.content.content] + elif isinstance(event.content, ReasoningContent) and event.content.summary: + seed = list(event.content.summary) + accumulated[event.index] = seed + elif isinstance(event, StreamTaskMessageDelta): + if event.index is not None and event.delta is not None: + if isinstance(event.delta, TextDelta) and event.delta.text_delta: + accumulated.setdefault(event.index, []).append(event.delta.text_delta) + elif isinstance(event.delta, ReasoningContentDelta) and event.delta.content_delta: + accumulated.setdefault(event.index, []).append(event.delta.content_delta) + elif isinstance(event, StreamTaskMessageDone): + if event.index is not None and event.index in started: + content = started.pop(event.index) + deltas = accumulated.pop(event.index, []) + ctype = getattr(content, "type", None) or "" + if ctype in ("text", "reasoning"): + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset({("index", event.index)}), + payload="".join(deltas), + ) + ) + elif ctype == "tool_request" and isinstance(content, ToolRequestContent): + # AGX1-377 fix: auto_send now delivers streamed tool-request + # messages. Emit a delivery here so the cross-channel + # assertion verifies it is present on both channels. 
+ deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset( + { + ("tool_call_id", content.tool_call_id), + ("name", content.name), + } + ), + payload=json.dumps(content.arguments, sort_keys=True), + ) + ) + elif isinstance(event, StreamTaskMessageFull): + content = event.content + ctype = getattr(content, "type", None) or "" + if ctype == "tool_response": + from agentex.types.tool_response_content import ToolResponseContent + + if isinstance(content, ToolResponseContent): + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset( + { + ("tool_call_id", content.tool_call_id), + ("name", content.name), + } + ), + payload=str(content.content), + ) + ) + elif ctype == "tool_request": + from agentex.types.tool_request_content import ToolRequestContent + + if isinstance(content, ToolRequestContent): + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset( + { + ("tool_call_id", content.tool_call_id), + ("name", content.name), + } + ), + payload=json.dumps(content.arguments, sort_keys=True), + ) + ) + + return deliveries + + +# --------------------------------------------------------------------------- +# Fake streaming backend for auto_send conformance runner +# --------------------------------------------------------------------------- + + +class _FakeCtx: + """Mirrors StreamingTaskMessageContext: __aenter__ opens, close() closes.""" + + def __init__(self, sink: list[Any], content_type: str, initial_content: Any) -> None: + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage( + id="msg-conformance", + task_id="conformance-task", + content=initial_content, + ) + + async def __aenter__(self) -> "_FakeCtx": + self.sink.append(("open", self.content_type, self.task_message.content)) + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.sink.append(("close", self.content_type)) + 
+ async def stream_update(self, update: Any) -> Any: + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + """Fake streaming backend; records every context lifecycle event.""" + + def __init__(self) -> None: + self.sink: list[Any] = [] + + def streaming_task_message_context( + self, + task_id: str, + initial_content: Any, + streaming_mode: str = "coalesced", + created_at: Any = None, + ) -> _FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + self.sink.append(("ctx", ctype, initial_content)) + return _FakeCtx(self.sink, ctype, initial_content) + + +class _FakeTracing: + """Minimal tracing backend: records started/ended span names + outputs.""" + + def __init__(self) -> None: + self.started: list[str] = [] + self.ended: list[Any] = [] + + async def start_span( + self, + *, + trace_id: str, + name: str, + input: Any = None, + parent_id: Any = None, + data: Any = None, + task_id: Any = None, + ) -> Any: + self.started.append(name) + return _types.SimpleNamespace() + + async def end_span(self, *, trace_id: str, span: Any) -> None: + self.ended.append(getattr(span, "output", None)) + + +class _RecordingTracer(SpanTracer): + """SpanTracer that records every SpanSignal it actually receives. + + Each delivery channel calls `tracer.handle(signal)` for every signal it + derives from the stream, so `received_signals` captures what the channel + genuinely emitted — not a re-derivation. Comparing the two channels' + recorded lists catches regressions where a channel skips + `deriver.observe(event)` for some event type. 
+ """ + + def __init__(self, tracing: Any) -> None: + super().__init__( + trace_id="conformance-trace", + parent_span_id="conformance-parent", + tracing=tracing, + ) + self.received_signals: list[SpanSignal] = [] + + @override + async def handle(self, signal: SpanSignal) -> None: + self.received_signals.append(signal) + await super().handle(signal) + + +async def _gen(events: list[StreamTaskMessage]): # type: ignore[return] + for e in events: + yield e + + +def _auto_send_logical_deliveries(sink: list[Any]) -> list[LogicalDelivery]: + """Extract logical deliveries from the auto_send fake streaming sink. + + Each context lifecycle in the sink looks like: + ("ctx", ctype, content) -- context created + ("open", ctype, content) -- context __aenter__ + [("update", delta), ...] -- optional deltas (StreamTaskMessageDelta) + ("close", ctype) -- context closed + + A logical delivery corresponds to each open+close pair. For text/reasoning + we identify by sequential position and build the payload by prepending the + initial_content text (TextContent.content) or summary (ReasoningContent.summary) + to accumulated deltas. This matches _yield_logical_deliveries so a channel + that drops initial_content or reasoning summary fails the comparison. + For tool messages we use tool_call_id + name and capture arguments/content. + """ + from agentex.types.text_content import TextContent + from agentex.types.reasoning_content import ReasoningContent + from agentex.types.tool_request_content import ToolRequestContent + from agentex.types.tool_response_content import ToolResponseContent + + deliveries: list[LogicalDelivery] = [] + open_idx = 0 + while open_idx < len(sink): + entry = sink[open_idx] + if entry[0] == "ctx": + ctype: str = entry[1] + content: Any = entry[2] + found_open = False + delta_parts: list[str] = [] + # Seed delta_parts with initial_content so payload comparison + # catches a channel that drops initial_content but delivers deltas. 
+ if isinstance(content, TextContent) and content.content: + delta_parts = [content.content] + elif isinstance(content, ReasoningContent) and content.summary: + delta_parts = list(content.summary) + for j in range(open_idx + 1, len(sink)): + if sink[j][0] == "open" and sink[j][1] == ctype and not found_open: + found_open = True + elif found_open and sink[j][0] == "update": + # Accumulate delta content from StreamTaskMessageDelta + update = sink[j][1] + if isinstance(update, StreamTaskMessageDelta) and update.delta is not None: + if isinstance(update.delta, TextDelta) and update.delta.text_delta: + delta_parts.append(update.delta.text_delta) + elif isinstance(update.delta, ReasoningContentDelta) and update.delta.content_delta: + delta_parts.append(update.delta.content_delta) + elif sink[j][0] == "close" and sink[j][1] == ctype and found_open: + # Matched open+close: emit logical delivery with payload + if ctype in ("text", "reasoning"): + count = sum(1 for k in range(open_idx) if sink[k][0] == "ctx" and sink[k][1] == ctype) + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset({("seq", count)}), + payload="".join(delta_parts), + ) + ) + elif ctype == "tool_response": + if isinstance(content, ToolResponseContent): + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset( + { + ("tool_call_id", content.tool_call_id), + ("name", content.name), + } + ), + payload=str(content.content), + ) + ) + elif ctype == "tool_request": + if isinstance(content, ToolRequestContent): + deliveries.append( + LogicalDelivery( + content_type=ctype, + identity=frozenset( + { + ("tool_call_id", content.tool_call_id), + ("name", content.name), + } + ), + payload=json.dumps(content.arguments, sort_keys=True), + ) + ) + open_idx = j + 1 + break + else: + open_idx += 1 + else: + open_idx += 1 + + return deliveries + + +def _yield_text_reasoning_seq(deliveries: list[LogicalDelivery]) -> list[LogicalDelivery]: + """Re-key text/reasoning 
deliveries from index-based to seq-based identity. + + The yield channel uses event.index as identity; auto_send uses a sequential + counter. To compare across channels, normalise both to sequential position + within each content type. + """ + result: list[LogicalDelivery] = [] + counts: dict[str, int] = {} + for d in deliveries: + if d.content_type in ("text", "reasoning"): + seq = counts.get(d.content_type, 0) + counts[d.content_type] = seq + 1 + result.append( + LogicalDelivery( + content_type=d.content_type, + identity=frozenset({("seq", seq)}), + payload=d.payload, + ) + ) + else: + result.append(d) + return result + + +async def run_cross_channel_conformance( + fixture: Fixture, +) -> tuple[list[LogicalDelivery], list[LogicalDelivery], list[SpanSignal], list[SpanSignal]]: + """Run both channels over a fixture; return (yield_deliveries, auto_deliveries, + yield_spans, auto_spans). + + The caller asserts yield_deliveries == auto_deliveries and + yield_spans == auto_spans. The span signals are the ones each channel's + tracer ACTUALLY recorded while delivering (not a re-derivation), so a + regression where a channel skips deriver.observe() for some event type is + caught. 
+ """ + from agentex.lib.core.harness.auto_send import auto_send + from agentex.lib.core.harness.yield_delivery import yield_events + + # --- yield channel --- + tracer_yield = _RecordingTracer(tracing=_FakeTracing()) + yield_out = [e async for e in yield_events(_gen(fixture.events), tracer=tracer_yield)] + + # Span signals the yield channel actually emitted to its tracer + yield_spans = tracer_yield.received_signals + + # Logical deliveries from yield output + yield_deliveries = _yield_text_reasoning_seq(_yield_logical_deliveries(yield_out)) + + # --- auto_send channel --- + tracer_auto = _RecordingTracer(tracing=_FakeTracing()) + fake_streaming = _FakeStreaming() + await auto_send( + _gen(fixture.events), + task_id="conformance-task", + tracer=tracer_auto, + streaming=fake_streaming, + ) + + # Span signals the auto_send channel actually emitted to its tracer + auto_spans = tracer_auto.received_signals + + # Logical deliveries from what the streaming backend received + auto_deliveries = _auto_send_logical_deliveries(fake_streaming.sink) + + return yield_deliveries, auto_deliveries, yield_spans, auto_spans diff --git a/tests/lib/core/harness/conformance/test_conformance.py b/tests/lib/core/harness/conformance/test_conformance.py index d9eec1c15..6d5f8ca66 100644 --- a/tests/lib/core/harness/conformance/test_conformance.py +++ b/tests/lib/core/harness/conformance/test_conformance.py @@ -1,16 +1,68 @@ +"""Cross-channel conformance tests: yield_events vs auto_send. + +What is asserted +---------------- +For each fixture the conformance runner drives BOTH delivery channels and +verifies two guarantees: + +1. **Logical-delivery equivalence**: the sequence of logically-delivered + messages is identical across channels. "Logical" normalises away the + streaming-envelope difference: + - yield channel delivers StreamTaskMessageFull(ToolResponseContent) as-is. 
+ - auto_send delivers the same tool-response by opening a streaming context + with the full content and closing it immediately. + Both collapse to LogicalDelivery(content_type, identity, payload) tuples + that compare equal. The payload includes initial_content (TextContent.content + and ReasoningContent.summary) so a channel that drops initial content fails. + +2. **Span signal equivalence**: both channels feed the same pure SpanDeriver + over the same event sequence, so the derived span signals must be identical. + +What is NOT asserted +-------------------- +Raw wire-level event shapes are NOT compared (that would fail by design: the +Full vs Start+Done envelope difference is a documented, acceptable choice in +auto_send — see runner.py for the rationale). + +AGX1-377 fix: auto_send now delivers streamed tool-request messages. The +suppression that previously prevented the yield normaliser from emitting a +LogicalDelivery for Start(tool_request)+Done is removed. Both channels now +produce a delivery for streamed tool_request, verified by the +"streamed-tool-request" fixture. 
+""" + +from __future__ import annotations + import pytest +from agentex.types.text_delta import TextDelta +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent from agentex.types.task_message_update import ( StreamTaskMessageDone, StreamTaskMessageFull, + StreamTaskMessageDelta, StreamTaskMessageStart, ) from agentex.types.tool_request_content import ToolRequestContent from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta + +from .runner import ( + Fixture, + register, + derive_all, + all_fixtures, + run_cross_channel_conformance, +) -from .runner import Fixture, register, derive_all, all_fixtures +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- -register( +_FIXTURES: list[Fixture] = [ + # fixture 1: single tool call — tool_request delivered via Full (classic path) + # plus a streamed tool_response via Full. Both channels should deliver both. Fixture( name="builtin-single-tool", events=[ @@ -30,14 +82,204 @@ ), ), ], + ), + # fixture 2: streaming text — exercises the text start/delta/done path. + # Uses non-empty initial_content so the payload comparison catches a channel + # that drops StreamTaskMessageStart.content (Greptile id 3438655533, P1). + Fixture( + name="streaming-text", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content="Init"), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hello"), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta=" world"), + ), + StreamTaskMessageDone(type="done", index=0), + ], + ), + # fixture 3: reasoning block — exercises reasoning span open/close + delivery. 
+ # ReasoningContent.summary is included in the payload so a channel that drops + # the reasoning-summary fails (Greptile id 3438655533, P1). + Fixture( + name="reasoning-block", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=["Thinking..."], + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta="step 1", + ), + ), + StreamTaskMessageDone(type="done", index=0), + ], + ), + # fixture 4: streamed tool_request (AGX1-377 fix) — tool_request delivered + # via Start+Done (no Full). auto_send now delivers this instead of dropping + # it. Both channels must produce a LogicalDelivery for this fixture. + Fixture( + name="streamed-tool-request", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="tr-1", + name="Read", + arguments={"path": "/tmp/foo"}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="tr-1", + name="Read", + content="file contents", + ), + ), + ], + ), + # fixture 5: parallel tool calls + a tool that errors (AGX1-373 review, + # danielmillerp). The earlier fixtures only exercise one tool at a time, so + # equivalence is proven over trivially-orderable streams. This stresses the + # representative case: two tool spans open SIMULTANEOUSLY (p-ls opens via the + # streamed Start+Done path, p-read opens via Full while p-ls is still open), + # then close in a different order than they opened, and one of them returns + # an error. It guards against the two channels agreeing with each other while + # both mishandling interleaved/parallel spans or a failing tool. 
+ # + # The failing tool sets ToolResponseContent.is_error=True (AGX1-371), which + # the span deriver threads onto the closed tool span's CloseSpan.is_error. + # Both channels feed the same deriver, so the recorded span signals — error + # status included — must match. + Fixture( + name="parallel-tools-with-error", + events=[ + # p-ls: streamed tool_request (opens its span at Done). + StreamTaskMessageStart( + type="start", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="p-ls", + name="Bash", + arguments={"command": "ls /nope"}, + ), + ), + StreamTaskMessageDone(type="done", index=0), + # p-read: Full tool_request opens a second span while p-ls is open. + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="p-read", + name="Read", + arguments={"path": "/etc/hosts"}, + ), + ), + # p-ls errors and closes first (close order != open order). + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="p-ls", + name="Bash", + content="Error: ls: /nope: No such file or directory", + is_error=True, + ), + ), + # p-read succeeds and closes second. 
+ StreamTaskMessageFull( + type="full", + index=3, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="p-read", + name="Read", + content="127.0.0.1 localhost", + ), + ), + ], + ), +] + +# Register all fixtures for backward-compatible use via all_fixtures() +for _f in _FIXTURES: + register(_f) + + +# --------------------------------------------------------------------------- +# Cross-channel conformance: logical equivalence + span equivalence +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _FIXTURES, ids=lambda f: f.name) +@pytest.mark.asyncio +async def test_cross_channel_equivalence(fixture: Fixture) -> None: + """Assert that yield_events and auto_send produce equivalent logical + deliveries and identical span signals for every fixture. + + This is the real cross-channel guarantee: the two delivery adapters + agree on WHAT was delivered (logical content) and HOW spans were derived, + even though their streaming-envelope shapes differ (Full vs Start+Done for + tool messages). + + The span signals are the ones each channel's tracer ACTUALLY recorded while + delivering, not a re-derivation, so a regression where one channel skips + deriver.observe() for some event type is caught here. 
+ """ + yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture) + + assert yield_deliveries == auto_deliveries, ( + f"[{fixture.name}] logical deliveries differ:\n yield: {yield_deliveries}\n auto_send: {auto_deliveries}" ) -) + assert yield_spans == auto_spans, ( + f"[{fixture.name}] span signals differ:\n yield: {yield_spans}\n auto_send: {auto_spans}" + ) + + +# --------------------------------------------------------------------------- +# Backward-compatible determinism test (kept for regression coverage) +# --------------------------------------------------------------------------- @pytest.mark.parametrize("fixture", all_fixtures(), ids=lambda f: f.name) -def test_span_derivation_is_deterministic(fixture): - """Exercises the cross-channel guarantee: yield and auto-send observe the - same event stream, so span derivation must be deterministic/idempotent.""" - # Deriving twice over the same events yields identical signals (the property - # that makes yield vs auto-send equivalent, since both observe the same stream). +def test_span_derivation_is_deterministic(fixture: Fixture) -> None: + """Span derivation over the same event list is idempotent. + + Retained as a lightweight regression guard. The primary cross-channel + guarantee is asserted in test_cross_channel_equivalence above. 
+ """ assert derive_all(fixture.events) == derive_all(fixture.events) From 5ec62c20781d24fc3e0b92734fcd444b1e791d70 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Mon, 22 Jun 2026 18:21:13 -0400 Subject: [PATCH 04/10] feat(pydantic-ai): migrate onto unified harness surface (PR4) (#415) --- .github/workflows/harness-integration.yml | 27 +- ...unified-harness-surface-pr4-pydantic-ai.md | 246 +++++++++++ .../00_sync/harness_pydantic_ai/.dockerignore | 43 ++ .../00_sync/harness_pydantic_ai/Dockerfile | 50 +++ .../00_sync/harness_pydantic_ai/README.md | 54 +++ .../00_sync/harness_pydantic_ai/manifest.yaml | 58 +++ .../harness_pydantic_ai/project/__init__.py | 0 .../harness_pydantic_ai/project/acp.py | 92 +++++ .../harness_pydantic_ai/project/agent.py | 39 ++ .../harness_pydantic_ai/project/tools.py | 20 + .../harness_pydantic_ai/pyproject.toml | 36 ++ .../harness_pydantic_ai/tests/test_agent.py | 138 +++++++ .../00_base/harness_pydantic_ai/.dockerignore | 43 ++ .../00_base/harness_pydantic_ai/Dockerfile | 50 +++ .../00_base/harness_pydantic_ai/README.md | 54 +++ .../00_base/harness_pydantic_ai/manifest.yaml | 58 +++ .../harness_pydantic_ai/project/__init__.py | 0 .../harness_pydantic_ai/project/acp.py | 159 +++++++ .../harness_pydantic_ai/project/agent.py | 39 ++ .../harness_pydantic_ai/project/tools.py | 20 + .../harness_pydantic_ai/pyproject.toml | 36 ++ .../harness_pydantic_ai/tests/test_agent.py | 118 ++++++ .../harness_pydantic_ai/.dockerignore | 43 ++ .../harness_pydantic_ai/Dockerfile | 43 ++ .../10_temporal/harness_pydantic_ai/README.md | 61 +++ .../harness_pydantic_ai/manifest.yaml | 62 +++ .../harness_pydantic_ai/project/__init__.py | 0 .../harness_pydantic_ai/project/acp.py | 35 ++ .../harness_pydantic_ai/project/agent.py | 111 +++++ .../harness_pydantic_ai/project/run_worker.py | 48 +++ .../harness_pydantic_ai/project/tools.py | 24 ++ .../harness_pydantic_ai/project/workflow.py | 137 +++++++ .../harness_pydantic_ai/pyproject.toml | 38 ++ 
.../harness_pydantic_ai/tests/test_agent.py | 114 +++++ .../lib/adk/_modules/_pydantic_ai_async.py | 249 +---------- .../lib/adk/_modules/_pydantic_ai_sync.py | 33 +- .../lib/adk/_modules/_pydantic_ai_tracing.py | 39 ++ .../lib/adk/_modules/_pydantic_ai_turn.py | 134 ++++++ tests/lib/adk/test_pydantic_ai_async.py | 311 +++++++++++--- tests/lib/adk/test_pydantic_ai_sync.py | 74 ++++ .../lib/adk/test_pydantic_ai_sync_unified.py | 209 ++++++++++ tests/lib/adk/test_pydantic_ai_turn.py | 276 +++++++++++++ .../test_pydantic_ai_conformance.py | 194 +++++++++ .../harness/test_harness_pydantic_ai_async.py | 361 ++++++++++++++++ .../harness/test_harness_pydantic_ai_sync.py | 388 ++++++++++++++++++ .../test_harness_pydantic_ai_temporal.py | 370 +++++++++++++++++ 46 files changed, 4439 insertions(+), 295 deletions(-) create mode 100644 docs/superpowers/plans/2026-06-18-unified-harness-surface-pr4-pydantic-ai.md create mode 100644 examples/tutorials/00_sync/harness_pydantic_ai/.dockerignore create mode 100644 examples/tutorials/00_sync/harness_pydantic_ai/Dockerfile create mode 100644 examples/tutorials/00_sync/harness_pydantic_ai/README.md create mode 100644 examples/tutorials/00_sync/harness_pydantic_ai/manifest.yaml create mode 100644 examples/tutorials/00_sync/harness_pydantic_ai/project/__init__.py create mode 100644 examples/tutorials/00_sync/harness_pydantic_ai/project/acp.py create mode 100644 examples/tutorials/00_sync/harness_pydantic_ai/project/agent.py create mode 100644 examples/tutorials/00_sync/harness_pydantic_ai/project/tools.py create mode 100644 examples/tutorials/00_sync/harness_pydantic_ai/pyproject.toml create mode 100644 examples/tutorials/00_sync/harness_pydantic_ai/tests/test_agent.py create mode 100644 examples/tutorials/10_async/00_base/harness_pydantic_ai/.dockerignore create mode 100644 examples/tutorials/10_async/00_base/harness_pydantic_ai/Dockerfile create mode 100644 examples/tutorials/10_async/00_base/harness_pydantic_ai/README.md create mode 
100644 examples/tutorials/10_async/00_base/harness_pydantic_ai/manifest.yaml create mode 100644 examples/tutorials/10_async/00_base/harness_pydantic_ai/project/__init__.py create mode 100644 examples/tutorials/10_async/00_base/harness_pydantic_ai/project/acp.py create mode 100644 examples/tutorials/10_async/00_base/harness_pydantic_ai/project/agent.py create mode 100644 examples/tutorials/10_async/00_base/harness_pydantic_ai/project/tools.py create mode 100644 examples/tutorials/10_async/00_base/harness_pydantic_ai/pyproject.toml create mode 100644 examples/tutorials/10_async/00_base/harness_pydantic_ai/tests/test_agent.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/.dockerignore create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/Dockerfile create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/README.md create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/manifest.yaml create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/__init__.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/acp.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/agent.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/run_worker.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/tools.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/workflow.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/pyproject.toml create mode 100644 examples/tutorials/10_async/10_temporal/harness_pydantic_ai/tests/test_agent.py create mode 100644 src/agentex/lib/adk/_modules/_pydantic_ai_turn.py create mode 100644 tests/lib/adk/test_pydantic_ai_sync_unified.py create mode 100644 tests/lib/adk/test_pydantic_ai_turn.py create mode 100644 
tests/lib/core/harness/conformance/test_pydantic_ai_conformance.py create mode 100644 tests/lib/core/harness/test_harness_pydantic_ai_async.py create mode 100644 tests/lib/core/harness/test_harness_pydantic_ai_sync.py create mode 100644 tests/lib/core/harness/test_harness_pydantic_ai_temporal.py diff --git a/.github/workflows/harness-integration.yml b/.github/workflows/harness-integration.yml index 51893f10f..11b5239dc 100644 --- a/.github/workflows/harness-integration.yml +++ b/.github/workflows/harness-integration.yml @@ -7,6 +7,7 @@ on: paths: - "src/agentex/lib/core/harness/**" - "src/agentex/lib/adk/_modules/**" + - "tests/lib/core/harness/test_harness_pydantic_ai_*.py" - ".github/workflows/harness-integration.yml" jobs: @@ -31,10 +32,28 @@ jobs: - name: Conformance suite run: ./scripts/test tests/lib/core/harness/ -v - # Live integration matrix (harness x {sync, async, temporal}) is added per-harness - # in the migration plans. Placeholder job keeps the workflow valid until then. + # Offline pydantic-ai integration tests (sync / async / temporal channels). + # These use pydantic-ai TestModel + fake streaming/tracing and require no live + # infrastructure. Enabled here for PR 4 (pydantic-ai migration). Future harness + # migration PRs (5-8) should add their integration-test paths to this matrix. 
live-matrix: runs-on: ubuntu-latest - if: false # enabled once the first harness's test agents land + strategy: + matrix: + channel: [sync, async, temporal] + fail-fast: false + name: pydantic-ai-${{ matrix.channel }} steps: - - run: echo "populated by migration PRs" # TODO(harness-migration): enable per-harness; see migration PRs 4-8 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Install uv + uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2 + with: + version: '0.10.2' + + - name: Bootstrap + run: ./scripts/bootstrap + + - name: pydantic-ai ${{ matrix.channel }} integration tests (offline, TestModel) + run: | + ./scripts/test tests/lib/core/harness/test_harness_pydantic_ai_${{ matrix.channel }}.py -v diff --git a/docs/superpowers/plans/2026-06-18-unified-harness-surface-pr4-pydantic-ai.md b/docs/superpowers/plans/2026-06-18-unified-harness-surface-pr4-pydantic-ai.md new file mode 100644 index 000000000..2fa1892fe --- /dev/null +++ b/docs/superpowers/plans/2026-06-18-unified-harness-surface-pr4-pydantic-ai.md @@ -0,0 +1,246 @@ +# Unified Harness Surface — PR 4: pydantic-ai Migration Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Migrate the pydantic-ai harness onto the unified harness surface so it emits streaming + persisted messages + tracing + turn usage through ONE source of truth, over both delivery channels (yield + auto-send), with no public regression — and ship its 3 integration test agents (sync/async/temporal). + +**Architecture:** Wrap a pydantic-ai run as a `HarnessTurn` (canonical `StreamTaskMessage*` stream + normalized `TurnUsage`). Reuse the existing `convert_pydantic_ai_to_agentex_events` mapping as the tap. 
Reimplement the existing public auto-send helper on top of `UnifiedEmitter.auto_send_turn`, and route sync ACP agents through `UnifiedEmitter.yield_turn`. Retire the bespoke `_pydantic_ai_tracing` handler in favor of the surface's derived spans (keep the old symbol as a deprecated shim). + +**Tech Stack:** Python 3, pydantic-ai (`pydantic_ai`), pydantic v2, pytest + pytest-asyncio, the `agentex.lib.core.harness` package from PRs 1-3. + +**Foundation:** `src/agentex/lib/core/harness/` (`UnifiedEmitter`, `SpanTracer`, `SpanDeriver`, `HarnessTurn`, `TurnUsage`, `TurnResult`, `yield_events`, `auto_send`, conformance scaffold). Design: `docs/superpowers/specs/2026-06-18-unified-harness-surface-design.md`. + +--- + +## Dependencies (must land first) + +- **AGX1-373** — cross-channel conformance equivalence + `Full` wire reconciliation. PR 4's conformance fixtures register into the upgraded cross-channel runner. **Do not start Task 6 until 373 is merged into the foundation branch.** +- **AGX1-375** — public `adk` import path for the harness surface. If merged, import the surface via the public path in this PR; if not, import from `agentex.lib.core.harness` and add a follow-up note. (Tasks below assume `from agentex.lib.core.harness import UnifiedEmitter, TurnUsage, ...`; swap to the public path if 375 landed.) + +This is one PR (target < 1000 lines code, excluding any recorded fixtures). The 3 test agents are the largest chunk; if the diff exceeds budget, split the test agents into a follow-up PR 4b (note in the PR description). + +--- + +## File Structure + +- Modify `src/agentex/lib/adk/_modules/_pydantic_ai_sync.py` — add an optional `on_result` callback to `convert_pydantic_ai_to_agentex_events` (additive) so usage can be captured. Behavior unchanged when omitted. +- Create `src/agentex/lib/adk/_modules/_pydantic_ai_turn.py` — `PydanticAITurn(HarnessTurn)` + `pydantic_ai_usage_to_turn_usage(...)`. 
+- Modify `src/agentex/lib/adk/_modules/_pydantic_ai_async.py` — reimplement `stream_pydantic_ai_events` on `UnifiedEmitter.auto_send_turn`, preserving signature + return. +- Modify `src/agentex/lib/adk/_modules/_pydantic_ai_tracing.py` — mark `create_pydantic_ai_tracing_handler` / `AgentexPydanticAITracingHandler` deprecated (docstring + `DeprecationWarning`); keep importable. +- Create `tests/lib/core/harness/conformance/test_pydantic_ai_conformance.py` — register pydantic-ai fixtures into the cross-channel conformance runner. +- Create `examples/tutorials/harness-pydantic-ai-{sync,async,temporal}/` — 3 test agents (modeled on the `sync-pydantic-ai` / `default-pydantic-ai` / `temporal-pydantic-ai` CLI templates) using the unified surface. +- Modify `.github/workflows/harness-integration.yml` — enable the pydantic-ai rows of the `live-matrix` job. +- Modify `.github/workflows/agentex-tutorials-test.yml` (or its agent list) — include the 3 new test agents if that workflow enumerates agents. + +--- + +## Task 1: Expose the pydantic-ai run result for usage capture + +**Files:** +- Modify: `src/agentex/lib/adk/_modules/_pydantic_ai_sync.py` +- Test: `tests/lib/adk/test_pydantic_ai_sync.py` (create if absent) + +The converter already iterates the pydantic-ai event stream and currently *ignores* `AgentRunResultEvent` (the terminal event carrying the run result + usage). Add an optional callback so a caller can capture it without changing existing behavior. 
+ +- [ ] **Step 1: Write the failing test.** + +```python +import pytest +from agentex.lib.adk._modules._pydantic_ai_sync import convert_pydantic_ai_to_agentex_events + + +class _FakeResultEvent: # stand-in for pydantic_ai.run.AgentRunResultEvent + def __init__(self, result): + self.result = result + + +async def _stream(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_on_result_callback_receives_terminal_event(monkeypatch): + # When the stream ends with an AgentRunResultEvent, on_result is invoked with it, + # and the converter still yields no extra events for it. + captured = {} + # Use a real AgentRunResultEvent if constructable; otherwise patch isinstance check. + # (Implementer: see Step 3 note — match the real terminal event type.) + ... +``` + +Implementer note: the exact terminal event type is `pydantic_ai.run.AgentRunResultEvent` (already imported in `_pydantic_ai_sync.py`). Write the test to feed a stream ending in a real `AgentRunResultEvent` (construct it as the installed pydantic-ai version requires; inspect `python -c "import pydantic_ai.run, inspect; print(inspect.signature(pydantic_ai.run.AgentRunResultEvent))"`). Assert `on_result` is called once with that event and that the converter yields the same `StreamTaskMessage*` sequence as without the callback (no behavior change for the streaming output). + +- [ ] **Step 2: Run** `uv run pytest tests/lib/adk/test_pydantic_ai_sync.py -v` — expect FAIL (no `on_result` param). + +- [ ] **Step 3: Implement.** Add `on_result: Callable[[AgentRunResultEvent], None] | None = None` (and an async-callable variant if needed) to `convert_pydantic_ai_to_agentex_events`. In the existing `elif isinstance(event, (FunctionToolCallEvent, FinalResultEvent, AgentRunResultEvent))` branch, when the event is an `AgentRunResultEvent` and `on_result` is set, call it (await if it's a coroutine). Keep yielding nothing for it. No other change. 
+ +- [ ] **Step 4: Run** the test — expect PASS, plus run the existing `_pydantic_ai_sync` tests if any to confirm no regression. + +- [ ] **Step 5: Commit** `feat(pydantic-ai): optional on_result callback to expose run result for usage capture`. + +--- + +## Task 2: Normalize pydantic-ai usage to `TurnUsage` + +**Files:** +- Create: `src/agentex/lib/adk/_modules/_pydantic_ai_turn.py` +- Test: `tests/lib/adk/test_pydantic_ai_turn.py` + +- [ ] **Step 1: Verify the real usage shape FIRST.** Run `uv run python -c "from pydantic_ai.usage import RunUsage; import inspect; print([f for f in RunUsage.model_fields])"` (the type/name may be `RunUsage` or `Usage` depending on the installed version). Record the exact field names (commonly: `input_tokens`, `output_tokens`, `total_tokens`, `requests`, and a cache/`details` field). The mapping in Step 3 MUST use the real field names. + +- [ ] **Step 2: Write the failing test.** + +```python +from agentex.lib.adk._modules._pydantic_ai_turn import pydantic_ai_usage_to_turn_usage + + +def test_usage_normalization_maps_fields(): + # Build a usage object matching the installed pydantic-ai RunUsage shape + # (see Task 2 Step 1 for the real fields), then assert the mapping. + usage_obj = ... # construct RunUsage(input_tokens=10, output_tokens=20, requests=2, ...) + tu = pydantic_ai_usage_to_turn_usage(usage_obj, model="openai:gpt-4o") + assert tu.model == "openai:gpt-4o" + assert tu.input_tokens == 10 + assert tu.output_tokens == 20 + assert tu.num_llm_calls == 2 +``` + +- [ ] **Step 3: Implement** `pydantic_ai_usage_to_turn_usage(usage, model) -> TurnUsage` mapping the verified RunUsage fields onto `TurnUsage` (`input_tokens`, `output_tokens`, `total_tokens`, `cached_input_tokens` if available, `num_llm_calls` ← `requests`). Use `getattr(usage, "<field>", None)` defensively so a version field rename degrades to `None` rather than crashing.
Then implement `PydanticAITurn`: + +```python +class PydanticAITurn: + """A pydantic-ai run as a HarnessTurn: canonical event stream + normalized usage.""" + + def __init__(self, stream, model: str | None = None): + self._stream = stream + self._model = model + self._usage = TurnUsage(model=model) + + @property + async def events(self): + def _capture(result_event): + run_result = getattr(result_event, "result", None) + usage_obj = run_result.usage() if run_result is not None else None + if usage_obj is not None: + self._usage = pydantic_ai_usage_to_turn_usage(usage_obj, self._model) + async for ev in convert_pydantic_ai_to_agentex_events(self._stream, on_result=_capture): + yield ev + + def usage(self) -> TurnUsage: + return self._usage +``` + +(Verify `run_result.usage()` is the correct accessor for the installed version; adjust if it's an attribute.) + +- [ ] **Step 4: Add a `PydanticAITurn` test** that feeds a small stream ending in an `AgentRunResultEvent` whose `result.usage()` returns a known usage, drives `turn.events` to exhaustion, then asserts `turn.usage()` reflects the normalized values and that `events` yielded the expected `StreamTaskMessage*`. Confirm `usage()` BEFORE exhaustion returns the default (documented single-pass contract). + +- [ ] **Step 5: Run** the tests — expect PASS. + +- [ ] **Step 6: Commit** `feat(pydantic-ai): PydanticAITurn HarnessTurn + usage normalization`. + +--- + +## Task 3: Reimplement the auto-send helper on the unified surface + +**Files:** +- Modify: `src/agentex/lib/adk/_modules/_pydantic_ai_async.py` +- Test: `tests/lib/adk/test_pydantic_ai_async.py` + +`stream_pydantic_ai_events(stream, task_id, ...)` currently hand-drives `adk.streaming`. Reimplement it to delegate to `UnifiedEmitter.auto_send_turn(PydanticAITurn(stream, model))`, preserving its signature and return value (the accumulated final text). Feature-add: traces by default. 
+ +- [ ] **Step 1: Capture current behavior as a characterization test.** Before changing anything, write a test that runs the CURRENT `stream_pydantic_ai_events` over a fixture stream with a fake `adk.streaming` and records the messages produced (text, tool request/response). This is the backward-compat baseline ("equivalent messages before/after" from the design). + +- [ ] **Step 2: Run** it green against the current implementation. Commit the test alone: `test(pydantic-ai): characterize stream_pydantic_ai_events output`. + +- [ ] **Step 3: Reimplement** `stream_pydantic_ai_events` to build a `PydanticAITurn` and call `UnifiedEmitter(task_id=task_id, trace_id=<trace_id>, parent_span_id=<parent_span_id>, streaming=<streaming>).auto_send_turn(turn)`, returning `result.final_text`. Resolve `trace_id`/`parent_span_id` the same way the module does today (from the streaming/tracing context vars it already reads). Preserve the exact public signature and return type. + +- [ ] **Step 4: Run** the characterization test — it must still pass (same messages). Adjust the test only if AGX1-373 deliberately changed the tool-message wire shape; in that case assert the post-373 shape and note it. Confirm tracing now occurs by default (assert spans via a fake tracer). + +- [ ] **Step 5: Commit** `refactor(pydantic-ai): reimplement stream_pydantic_ai_events on UnifiedEmitter (default tracing)`. + +--- + +## Task 4: Route sync ACP delivery through the surface + deprecate the bespoke tracing handler + +**Files:** +- Modify: `src/agentex/lib/adk/_modules/_pydantic_ai_tracing.py` +- (Reference) the sync ACP usage pattern in the pydantic-ai docs/templates. + +- [ ] **Step 1: Deprecate the bespoke tracing handler.** Add a `DeprecationWarning` (via `warnings.warn(...)`) and a docstring note to `create_pydantic_ai_tracing_handler` / `AgentexPydanticAITracingHandler` stating the unified surface (`UnifiedEmitter`, which derives spans from the canonical stream) supersedes it. 
Keep the symbols importable and functional (no removal — backward compat). + +- [ ] **Step 2: Confirm the sync path.** The sync tap remains `convert_pydantic_ai_to_agentex_events`. Document (in the module docstring of `_pydantic_ai_sync.py`) the recommended sync ACP usage: + +```python +turn = PydanticAITurn(agent.run_stream_events(...), model=...) +async for event in emitter.yield_turn(turn): + yield event +``` + +No code change beyond the docstring (the sync converter already yields the canonical stream; `yield_turn` adds tracing). Add a test that `emitter.yield_turn(PydanticAITurn(...))` forwards the same events the bare converter would and derives spans. + +- [ ] **Step 3: Run** tests; **Commit** `refactor(pydantic-ai): deprecate bespoke tracing handler; document unified sync path`. + +--- + +## Task 5: pydantic-ai cross-channel conformance fixtures + +**Files:** +- Create: `tests/lib/core/harness/conformance/test_pydantic_ai_conformance.py` + +**Blocked by AGX1-373** (the cross-channel conformance runner). Once 373 is merged into the foundation branch: + +- [ ] **Step 1: Record canonical fixtures.** For 3-4 representative pydantic-ai runs (text-only; single tool; reasoning/thinking; multi-step text+tool), capture the `StreamTaskMessage*` sequence the tap produces (run `convert_pydantic_ai_to_agentex_events` over recorded `AgentStreamEvent` inputs, or hand-author the canonical sequences). Store as `Fixture(name=..., events=[...])`. + +- [ ] **Step 2: Register** each fixture with the conformance runner and let the cross-channel parametrized test (from AGX1-373) assert yield-vs-auto-send equivalence + span equivalence for each. Register/parametrize within THIS module (per the runner's documented per-module registry semantics). + +- [ ] **Step 3: Run** `./scripts/test tests/lib/core/harness/ -v` — all green. **Commit** `test(pydantic-ai): cross-channel conformance fixtures`. 
+ +--- + +## Task 6: Three integration test agents (sync / async / temporal) + +**Files:** +- Create: `examples/tutorials/harness-pydantic-ai-sync/` , `…-async/` , `…-temporal/` (each a minimal Agentex agent). +- Modify: `.github/workflows/harness-integration.yml` (enable pydantic-ai `live-matrix` rows). +- Modify: `.github/workflows/agentex-tutorials-test.yml` if it enumerates agents. + +Each agent is the smallest agent that exercises one delivery channel through the unified surface with the pydantic-ai harness. + +- [ ] **Step 1: Scaffold from the existing templates.** Base each agent on the corresponding CLI template: `sync-pydantic-ai`, `default-pydantic-ai` (async), `temporal-pydantic-ai` (under `src/agentex/lib/cli/templates/`). In each, the message handler builds `PydanticAITurn(agent.run_stream_events(params.content.content), model=...)` and: + - sync agent: `async for ev in emitter.yield_turn(turn): yield ev` + - async + temporal agents: `await emitter.auto_send_turn(turn)` (temporal: inside the activity, as the template already structures it). + Use a tiny pydantic-ai agent with ONE trivial tool so the run exercises text + a tool call + tool response. + +- [ ] **Step 2: Write an integration test per agent** that drives it with a fixed prompt and asserts: valid ordered messages (text + tool request + tool response) and a well-formed span tree. Use the repo's existing tutorial-agent test harness pattern (see `agentex-tutorials-test.yml` and how current tutorial agents are tested). + +- [ ] **Step 3: Wire CI.** In `.github/workflows/harness-integration.yml`, replace the `if: false` placeholder `live-matrix` job (or add a real matrix) with the pydantic-ai × {sync, async, temporal} entries, each running its agent's integration test. If `agentex-tutorials-test.yml` enumerates agents, add the three there too. `log`/document any agent-type not covered (none expected for pydantic-ai). 
+ +- [ ] **Step 4: Run** the integration tests locally (as far as the env allows) and the conformance + unit suites. **Commit** `test(pydantic-ai): sync/async/temporal integration agents + enable CI live-matrix rows`. + +--- + +## Task 7: Full suite, type check, and backward-compat audit + +- [ ] **Step 1:** `./scripts/test tests/lib/core/harness/ tests/lib/adk/ -v` — all green on 3.12 + 3.13. +- [ ] **Step 2:** `uv run pyright src/agentex/lib/` (or the harness + pydantic modules) — 0 new errors. +- [ ] **Step 3: Backward-compat audit.** Confirm the public signatures are unchanged: `convert_pydantic_ai_to_agentex_events` (only gained an optional kwarg), `stream_pydantic_ai_events` (same signature + return), `create_pydantic_ai_tracing_handler` (still importable, now warns). Grep the repo + templates for callers and confirm none broke. +- [ ] **Step 4:** If any fix was needed, **Commit** `chore(pydantic-ai): type/back-compat fixes`. + +--- + +## Self-Review checklist (run before opening the PR) + +- Every public symbol that existed before still exists with the same signature (additive-only): `convert_pydantic_ai_to_agentex_events`, `stream_pydantic_ai_events`, `create_pydantic_ai_tracing_handler`. +- The auto-send helper returns the same final text as before (characterization test passes, or the post-373 shape is asserted with a note). +- Tracing is now on by default for both channels and is overridable (emitter `tracer=False`). +- Usage normalization uses the REAL pydantic-ai usage field names (verified in Task 2 Step 1), with defensive `getattr`. +- Conformance fixtures register per-module and pass the cross-channel assertion from AGX1-373. +- 3 test agents exist and their CI rows are enabled. +- No `# type: ignore` added without justification. + +## Notes for the PR description + +- Link AGX1-373 (dependency) and AGX1-375 (import path); note AGX1-374 (reasoning/mixed-ordering auto_send tests) is foundation-level and orthogonal. 
+- State the diff size; if test agents pushed it over budget, note the PR 4b split. +- This is the template the langgraph (PR 5) and openai (PR 6) migrations follow. diff --git a/examples/tutorials/00_sync/harness_pydantic_ai/.dockerignore b/examples/tutorials/00_sync/harness_pydantic_ai/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/00_sync/harness_pydantic_ai/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/00_sync/harness_pydantic_ai/Dockerfile b/examples/tutorials/00_sync/harness_pydantic_ai/Dockerfile new file mode 100644 index 000000000..3a9412fa9 --- /dev/null +++ b/examples/tutorials/00_sync/harness_pydantic_ai/Dockerfile @@ -0,0 +1,50 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +# Copy pyproject.toml and README.md to install dependencies +COPY 00_sync/harness_pydantic_ai/pyproject.toml /app/harness_pydantic_ai/pyproject.toml +COPY 00_sync/harness_pydantic_ai/README.md /app/harness_pydantic_ai/README.md + +WORKDIR /app/harness_pydantic_ai + +# Copy the project code +COPY 00_sync/harness_pydantic_ai/project /app/harness_pydantic_ai/project + +# Copy the test files +COPY 00_sync/harness_pydantic_ai/tests 
/app/harness_pydantic_ai/tests + +# Copy shared test utilities +COPY test_utils /app/test_utils + +# Install the required Python packages with dev dependencies +RUN uv pip install --system .[dev] + +# Set environment variables +ENV PYTHONPATH=/app + +# Set test environment variables +ENV AGENT_NAME=s-harness-pydantic-ai + +# Run the agent using uvicorn +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/00_sync/harness_pydantic_ai/README.md b/examples/tutorials/00_sync/harness_pydantic_ai/README.md new file mode 100644 index 000000000..1466bc4e7 --- /dev/null +++ b/examples/tutorials/00_sync/harness_pydantic_ai/README.md @@ -0,0 +1,54 @@ +# Sync Pydantic AI Harness Test Agent + +A minimal **synchronous** Pydantic AI agent that drives the **unified harness +surface** (`UnifiedEmitter.yield_turn` + `PydanticAITurn`) on the sync +(HTTP-yield) channel. + +## Why this agent exists + +The `00_sync/040_pydantic_ai` tutorial streams via the bare +`convert_pydantic_ai_to_agentex_events` converter and does **not** exercise the +unified `yield_turn` path. This harness test agent is the sync coverage for the +unified surface: it proves an agent author can wire the sync channel through +`UnifiedEmitter` and get automatic span derivation (tool spans nested under the +per-turn span) for free, exactly like the async/temporal channels. + +## How it wires the unified surface + +In `project/acp.py`: + +```python +emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, +) +async with agent.run_stream_events(user_message) as stream: + turn = PydanticAITurn(stream, model=MODEL_NAME) # coalesce off: stream tool-call arg tokens + async for ev in emitter.yield_turn(turn): + yield ev +``` + +- `coalesce_tool_requests=False` (the default) preserves token-by-token + tool-call argument streaming on the sync channel. 
+- The `UnifiedEmitter` is constructed from the ACP/streaming context + (`task_id` + `trace_id` + `parent_span_id`) so tool spans nest under the + per-turn `AGENT_WORKFLOW` span automatically. + +## Files + +- `project/acp.py` — sync ACP handler using `emitter.yield_turn(...)`. +- `project/agent.py` — builds the `pydantic_ai.Agent` with one tool. +- `project/tools.py` — `get_weather(city)` returning a constant. +- `tests/test_agent.py` — live integration test (requires a running agent). + +## Tools + +- `get_weather(city: str) -> str`: returns a fixed "sunny and 72°F" string so a + run deterministically exercises text + a tool call + a tool response. + +## Offline coverage + +Offline integration tests for the same wiring (pydantic-ai `TestModel` + fake +streaming/tracing, no network) live in the SDK repo at +`tests/lib/core/harness/test_harness_pydantic_ai_sync.py`. diff --git a/examples/tutorials/00_sync/harness_pydantic_ai/manifest.yaml b/examples/tutorials/00_sync/harness_pydantic_ai/manifest.yaml new file mode 100644 index 000000000..55d8f5d2b --- /dev/null +++ b/examples/tutorials/00_sync/harness_pydantic_ai/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../ + include_paths: + - 00_sync/harness_pydantic_ai + - test_utils + dockerfile: 00_sync/harness_pydantic_ai/Dockerfile + dockerignore: 00_sync/harness_pydantic_ai/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: sync + name: s-harness-pydantic-ai + description: A sync Pydantic AI harness test agent using the unified emitter surface + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: 
account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "s-harness-pydantic-ai" + description: "A sync Pydantic AI harness test agent using the unified emitter surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/00_sync/harness_pydantic_ai/project/__init__.py b/examples/tutorials/00_sync/harness_pydantic_ai/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/00_sync/harness_pydantic_ai/project/acp.py b/examples/tutorials/00_sync/harness_pydantic_ai/project/acp.py new file mode 100644 index 000000000..f23cd7960 --- /dev/null +++ b/examples/tutorials/00_sync/harness_pydantic_ai/project/acp.py @@ -0,0 +1,92 @@ +"""ACP handler for the sync harness Pydantic AI test agent. + +This agent exercises the UNIFIED HARNESS SURFACE on the sync (HTTP-yield) +channel — ``UnifiedEmitter.yield_turn(PydanticAITurn(...))`` — rather than the +bare ``convert_pydantic_ai_to_agentex_events`` converter used by the +``040_pydantic_ai`` tutorial. The unified surface gives the sync channel the +same tracing (span derivation) the async/temporal channels get for free. + +Flow: +1. Open a per-turn AGENT_WORKFLOW span via ``adk.tracing.span``. +2. Construct a ``UnifiedEmitter`` from the ACP/streaming context (task_id + + trace_id + parent_span_id) so tool spans nest under the turn span. +3. Wrap ``agent.run_stream_events(...)`` in a ``PydanticAITurn`` and forward + events with ``emitter.yield_turn(turn)`` — yielding each to the client. 
+""" + +from __future__ import annotations + +import os +from typing import AsyncGenerator + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from project.agent import MODEL_NAME, create_agent +from agentex.lib.types.acp import SendMessageParams +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.types.task_message_update import TaskMessageUpdate +from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create(acp_type="sync") + +_agent = None + + +def get_agent(): + """Get or create the Pydantic AI agent instance.""" + global _agent + if _agent is None: + _agent = create_agent() + return _agent + + +@acp.on_message_send +async def handle_message_send( + params: SendMessageParams, +) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]: + """Handle incoming messages, streaming events through the unified surface.""" + agent = get_agent() + task_id = params.task.id + + user_message = params.content.content + logger.info(f"Processing message for task {task_id}") + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + # Construct the UnifiedEmitter from the ACP/streaming context so tracing + # is automatic: tool spans nest under this turn's 
span. + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + + async with agent.run_stream_events(user_message) as stream: + # PydanticAITurn preserves token-by-token tool-call argument + # streaming (Start+Delta+Done) on the sync/HTTP channel. + turn = PydanticAITurn(stream, model=MODEL_NAME) + async for ev in emitter.yield_turn(turn): + yield ev diff --git a/examples/tutorials/00_sync/harness_pydantic_ai/project/agent.py b/examples/tutorials/00_sync/harness_pydantic_ai/project/agent.py new file mode 100644 index 000000000..72fd74173 --- /dev/null +++ b/examples/tutorials/00_sync/harness_pydantic_ai/project/agent.py @@ -0,0 +1,39 @@ +"""Pydantic AI agent definition for the sync harness test agent. + +The Agent is the boundary between this module and the API layer (acp.py). +Pydantic AI handles its own tool-call loop internally — no graph required. +""" + +from __future__ import annotations + +from datetime import datetime + +from pydantic_ai import Agent + +from project.tools import get_weather + +__all__ = ["create_agent", "MODEL_NAME"] + +MODEL_NAME = "openai:gpt-4o-mini" +SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. 
+ +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use tools when they would help answer the user's question +- If you're unsure, ask clarifying questions +- Always provide accurate information +""" + + +def create_agent() -> Agent: + """Build and return the Pydantic AI agent with tools registered.""" + agent = Agent( + MODEL_NAME, + system_prompt=SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")), + ) + + agent.tool_plain(get_weather) + + return agent diff --git a/examples/tutorials/00_sync/harness_pydantic_ai/project/tools.py b/examples/tutorials/00_sync/harness_pydantic_ai/project/tools.py new file mode 100644 index 000000000..d649c75f1 --- /dev/null +++ b/examples/tutorials/00_sync/harness_pydantic_ai/project/tools.py @@ -0,0 +1,20 @@ +"""Tool definitions for the sync harness Pydantic AI agent. + +Pydantic AI tools are registered directly on the Agent via decorators +(see project.agent). This module hosts the bare function so it is easy to +unit-test in isolation. +""" + +from __future__ import annotations + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. + + Returns: + A string describing the weather conditions. 
+ """ + return f"The weather in {city} is sunny and 72°F" diff --git a/examples/tutorials/00_sync/harness_pydantic_ai/pyproject.toml b/examples/tutorials/00_sync/harness_pydantic_ai/pyproject.toml new file mode 100644 index 000000000..08f709a4a --- /dev/null +++ b/examples/tutorials/00_sync/harness_pydantic_ai/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "s-harness-pydantic-ai" +version = "0.1.0" +description = "A sync Pydantic AI harness test agent using the unified emitter surface" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "pydantic-ai-slim[openai]>=1.0,<2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git a/examples/tutorials/00_sync/harness_pydantic_ai/tests/test_agent.py b/examples/tutorials/00_sync/harness_pydantic_ai/tests/test_agent.py new file mode 100644 index 000000000..96da95fdc --- /dev/null +++ b/examples/tutorials/00_sync/harness_pydantic_ai/tests/test_agent.py @@ -0,0 +1,138 @@ +"""Live tests for the sync harness Pydantic AI agent. + +These tests require a running agent (server + deployed agent) and exercise the +unified-surface sync handler end-to-end over the wire. They mirror the +``040_pydantic_ai`` tutorial tests but target this harness agent. + +Offline coverage of the same wiring (TestModel + fake streaming/tracing) lives +in ``tests/lib/core/harness/test_harness_pydantic_ai_sync.py`` in the SDK repo. + +To run these tests: +1. Make sure the agent is running (via docker-compose or `agentex agents run`) +2. Set the AGENTEX_API_BASE_URL environment variable if not using default +3. 
Run: pytest test_agent.py -v + +Configuration: +- AGENTEX_API_BASE_URL: Base URL for the AgentEx server (default: http://localhost:5003) +- AGENT_NAME: Name of the agent to test (default: s-harness-pydantic-ai) +""" + +import os + +import pytest +from test_utils.sync import validate_text_in_string, collect_streaming_response + +from agentex import Agentex +from agentex.types import TextContentParam +from agentex.types.agent_rpc_params import ParamsSendMessageRequest + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "s-harness-pydantic-ai") + + +@pytest.fixture +def client(): + """Create an AgentEx client instance for testing.""" + return Agentex(base_url=AGENTEX_API_BASE_URL) + + +@pytest.fixture +def agent_name(): + """Return the agent name for testing.""" + return AGENT_NAME + + +@pytest.fixture +def agent_id(client, agent_name): + """Retrieve the agent ID based on the agent name.""" + agents = client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent with name {agent_name} not found.") + + +class TestNonStreamingMessages: + """Test non-streaming message sending with the unified-surface sync agent.""" + + def test_send_simple_message(self, client: Agentex, agent_name: str): + """Test sending a simple message and receiving a response.""" + response = client.agents.send_message( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="Hello! 
What can you help me with?", + type="text", + ) + ), + ) + result = response.result + assert result is not None + assert len(result) >= 1 + + def test_tool_calling(self, client: Agentex, agent_name: str): + """Test that the agent can use tools (e.g., weather tool).""" + response = client.agents.send_message( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="What's the weather in San Francisco?", + type="text", + ) + ), + ) + result = response.result + assert result is not None + assert len(result) >= 1 + + +class TestStreamingMessages: + """Test streaming message sending through the unified yield_turn path.""" + + def test_stream_simple_message(self, client: Agentex, agent_name: str): + """Test streaming a simple message response.""" + stream = client.agents.send_message_stream( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="Tell me a short joke.", + type="text", + ) + ), + ) + + aggregated_content, chunks = collect_streaming_response(stream) + + assert aggregated_content is not None + assert len(chunks) > 1, "No chunks received in streaming response." + + def test_stream_tool_calling(self, client: Agentex, agent_name: str): + """Test streaming with tool calls through the unified surface. + + Exercises token-by-token tool-call argument streaming (coalesce off), + which the unified yield_turn path preserves on the sync channel. + """ + stream = client.agents.send_message_stream( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="What's the weather in New York? Respond with the temperature.", + type="text", + ) + ), + ) + + aggregated_content, chunks = collect_streaming_response(stream) + + assert aggregated_content is not None + assert len(chunks) > 0, "No chunks received in streaming response." 
+ # The weather tool always returns "72°F", so the agent's reply should mention it. + validate_text_in_string("72", aggregated_content) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/00_base/harness_pydantic_ai/.dockerignore b/examples/tutorials/10_async/00_base/harness_pydantic_ai/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_pydantic_ai/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/00_base/harness_pydantic_ai/Dockerfile b/examples/tutorials/10_async/00_base/harness_pydantic_ai/Dockerfile new file mode 100644 index 000000000..3c1b9dfea --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_pydantic_ai/Dockerfile @@ -0,0 +1,50 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +# Copy pyproject.toml and README.md to install dependencies +COPY 10_async/00_base/harness_pydantic_ai/pyproject.toml /app/harness_pydantic_ai/pyproject.toml +COPY 10_async/00_base/harness_pydantic_ai/README.md /app/harness_pydantic_ai/README.md + +WORKDIR /app/harness_pydantic_ai + +# Copy the project code +COPY 
10_async/00_base/harness_pydantic_ai/project /app/harness_pydantic_ai/project + +# Copy the test files +COPY 10_async/00_base/harness_pydantic_ai/tests /app/harness_pydantic_ai/tests + +# Copy shared test utilities +COPY test_utils /app/test_utils + +# Install the required Python packages with dev dependencies +RUN uv pip install --system .[dev] pytest-asyncio httpx + +# Set environment variables +ENV PYTHONPATH=/app + +# Set test environment variables +ENV AGENT_NAME=ab-harness-pydantic-ai + +# Run the agent using uvicorn +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/10_async/00_base/harness_pydantic_ai/README.md b/examples/tutorials/10_async/00_base/harness_pydantic_ai/README.md new file mode 100644 index 000000000..51acb62bd --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_pydantic_ai/README.md @@ -0,0 +1,54 @@ +# Async Pydantic AI Harness Test Agent + +A minimal **async** (Redis-streaming) Pydantic AI agent that drives the +**unified harness surface** (`UnifiedEmitter.auto_send_turn` + `PydanticAITurn`) +directly. + +## Why this agent exists + +The `10_async/00_base/110_pydantic_ai` tutorial streams via the +`stream_pydantic_ai_events` helper (which uses the unified surface internally). +This harness test agent calls `emitter.auto_send_turn(...)` **explicitly** at the +agent-author level, making the unified-surface wiring visible and giving the +async channel direct coverage. 
+ +## How it wires the unified surface + +In `project/acp.py`: + +```python +emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, +) +async with agent.run_stream_events(user_message, message_history=previous_messages) as stream: + turn = PydanticAITurn(tee_messages(stream), model=MODEL_NAME, coalesce_tool_requests=True) + result = await emitter.auto_send_turn(turn) +``` + +- `coalesce_tool_requests=True` is required on the async/auto_send path until + AGX1-377 lands: tool requests are delivered as a single `Full(tool_request)` + rather than streamed `Start + Delta + Done`. +- The `UnifiedEmitter` is constructed from the ACP context (`task_id` + + `trace_id` + `parent_span_id`) so messages auto-send to the task stream + (Redis) and tracing is automatic. +- Multi-turn memory is persisted via `adk.state` (pydantic-ai message history + round-tripped through `ModelMessagesTypeAdapter`). + +## Files + +- `project/acp.py` — async ACP handler using `emitter.auto_send_turn(...)`. +- `project/agent.py` — builds the `pydantic_ai.Agent` with one tool. +- `project/tools.py` — `get_weather(city)` returning a constant. +- `tests/test_agent.py` — live integration test (requires a running agent). + +## Tools + +- `get_weather(city: str) -> str`: returns a fixed "sunny and 72°F" string. + +## Offline coverage + +Offline integration tests for the same wiring (pydantic-ai `TestModel` + fake +streaming/tracing, no network) live in the SDK repo at +`tests/lib/core/harness/test_harness_pydantic_ai_async.py`. 
diff --git a/examples/tutorials/10_async/00_base/harness_pydantic_ai/manifest.yaml b/examples/tutorials/10_async/00_base/harness_pydantic_ai/manifest.yaml new file mode 100644 index 000000000..f9e50f329 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_pydantic_ai/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/00_base/harness_pydantic_ai + - test_utils + dockerfile: 10_async/00_base/harness_pydantic_ai/Dockerfile + dockerignore: 10_async/00_base/harness_pydantic_ai/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: async + name: ab-harness-pydantic-ai + description: An async Pydantic AI harness test agent using the unified emitter surface + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "ab-harness-pydantic-ai" + description: "An async Pydantic AI harness test agent using the unified emitter surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/__init__.py b/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/acp.py b/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/acp.py new file mode 100644 index 
000000000..95b638f8b --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/acp.py @@ -0,0 +1,159 @@ +"""ACP handler for the async harness Pydantic AI test agent. + +This agent exercises the UNIFIED HARNESS SURFACE on the async (Redis-streaming) +channel — ``UnifiedEmitter.auto_send_turn(PydanticAITurn(...))`` +— calling it directly rather than via the ``stream_pydantic_ai_events`` helper +(which the ``110_pydantic_ai`` tutorial uses). This makes the unified-surface +wiring explicit at the agent-author level. + +Multi-turn memory is persisted via ``adk.state``: on each turn we load the +previous pydantic-ai ``message_history`` from state, run the agent with it, +then save the updated history back. +""" + +from __future__ import annotations + +import os +from typing import Any, AsyncIterator + +from dotenv import load_dotenv + +load_dotenv() + +from pydantic_ai.run import AgentRunResultEvent +from pydantic_ai.messages import ModelMessagesTypeAdapter + +import agentex.lib.adk as adk +from project.agent import MODEL_NAME, create_agent +from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.types.fastacp import AsyncACPConfig +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.utils.model_utils import BaseModel +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create( + acp_type="async", + 
config=AsyncACPConfig(type="base"), +) + +_agent = None + + +def get_agent(): + global _agent + if _agent is None: + _agent = create_agent() + return _agent + + +class ConversationState(BaseModel): + """Per-task conversation state persisted via ``adk.state``. + + ``history_json`` holds the pydantic-ai message history serialized by + ``ModelMessagesTypeAdapter`` — pydantic-ai's official way to round-trip + ``ModelMessage`` objects through JSON. + """ + + history_json: str = "[]" + turn_number: int = 0 + + +@acp.on_task_create +async def handle_task_create(params: CreateTaskParams): + """Initialize per-task state on task creation.""" + logger.info(f"Task created: {params.task.id}") + await adk.state.create( + task_id=params.task.id, + agent_id=params.agent.id, + state=ConversationState(), + ) + + +@acp.on_task_event_send +async def handle_task_event_send(params: SendEventParams): + """Handle each user message through the unified auto_send_turn path.""" + agent = get_agent() + task_id = params.task.id + agent_id = params.agent.id + user_message = params.event.content.content + + logger.info(f"Processing message for thread {task_id}") + + # Echo the user's message into the task history. + await adk.messages.create(task_id=task_id, content=params.event.content) + + # Load the previous conversation history from state (fall back to fresh). 
+ task_state = await adk.state.get_by_task_and_agent(task_id=task_id, agent_id=agent_id) + if task_state is None: + state = ConversationState() + task_state = await adk.state.create(task_id=task_id, agent_id=agent_id, state=state) + else: + state = ConversationState.model_validate(task_state.state) + + state.turn_number += 1 + previous_messages = ModelMessagesTypeAdapter.validate_json(state.history_json) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name=f"Turn {state.turn_number}", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + # Construct the UnifiedEmitter from the ACP context so tracing is + # automatic and messages are auto-sent to the task stream (Redis). + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + + # Capture the terminal AgentRunResultEvent to persist message history. + captured_messages: list[Any] = [] + + async def tee_messages(upstream) -> AsyncIterator[Any]: + async for event in upstream: + if isinstance(event, AgentRunResultEvent): + captured_messages[:] = list(event.result.all_messages()) + yield event + + async with agent.run_stream_events(user_message, message_history=previous_messages) as stream: + # The unified auto_send path delivers streamed tool requests natively + # (Start+Delta+Done), so no coalescing workaround is needed. + turn = PydanticAITurn( + tee_messages(stream), + model=MODEL_NAME, + ) + result = await emitter.auto_send_turn(turn) + + # Save the updated message history so the next turn picks up here. 
+ if captured_messages: + state.history_json = ModelMessagesTypeAdapter.dump_json(captured_messages).decode() + await adk.state.update( + state_id=task_state.id, + task_id=task_id, + agent_id=agent_id, + state=state, + ) + + if turn_span: + turn_span.output = {"final_output": result.final_text} + + +@acp.on_task_cancel +async def handle_task_canceled(params: CancelTaskParams): + logger.info(f"Task canceled: {params.task.id}") diff --git a/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/agent.py b/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/agent.py new file mode 100644 index 000000000..e7b764d82 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/agent.py @@ -0,0 +1,39 @@ +"""Pydantic AI agent definition for the async harness test agent. + +The Agent is the boundary between this module and the API layer (acp.py). +Pydantic AI handles its own tool-call loop internally — no graph required. +""" + +from __future__ import annotations + +from datetime import datetime + +from pydantic_ai import Agent + +from project.tools import get_weather + +__all__ = ["create_agent", "MODEL_NAME"] + +MODEL_NAME = "openai:gpt-4o-mini" +SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. 
+ +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use tools when they would help answer the user's question +- If you're unsure, ask clarifying questions +- Always provide accurate information +""" + + +def create_agent() -> Agent: + """Build and return the Pydantic AI agent with tools registered.""" + agent = Agent( + MODEL_NAME, + system_prompt=SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")), + ) + + agent.tool_plain(get_weather) + + return agent diff --git a/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/tools.py b/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/tools.py new file mode 100644 index 000000000..0f16a7cb0 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_pydantic_ai/project/tools.py @@ -0,0 +1,20 @@ +"""Tool definitions for the async harness Pydantic AI agent. + +Pydantic AI tools are registered directly on the Agent via decorators +(see project.agent). This module hosts the bare function so it is easy to +unit-test in isolation. +""" + +from __future__ import annotations + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. + + Returns: + A string describing the weather conditions. 
+ """ + return f"The weather in {city} is sunny and 72°F" diff --git a/examples/tutorials/10_async/00_base/harness_pydantic_ai/pyproject.toml b/examples/tutorials/10_async/00_base/harness_pydantic_ai/pyproject.toml new file mode 100644 index 000000000..3dc1e0e41 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_pydantic_ai/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "ab-harness-pydantic-ai" +version = "0.1.0" +description = "An async Pydantic AI harness test agent using the unified emitter surface" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "pydantic-ai-slim[openai]>=1.0,<2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git a/examples/tutorials/10_async/00_base/harness_pydantic_ai/tests/test_agent.py b/examples/tutorials/10_async/00_base/harness_pydantic_ai/tests/test_agent.py new file mode 100644 index 000000000..11098c7d5 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_pydantic_ai/tests/test_agent.py @@ -0,0 +1,118 @@ +"""Live tests for the async harness Pydantic AI agent. + +These tests require a running agent (server + deployed agent) and exercise the +unified-surface async handler end-to-end over the wire. They mirror the +``110_pydantic_ai`` async tutorial tests but target this harness agent. + +Offline coverage of the same wiring (TestModel + fake streaming/tracing) lives +in ``tests/lib/core/harness/test_harness_pydantic_ai_async.py`` in the SDK repo. + +To run these tests: +1. Make sure the agent is running (via docker-compose or `agentex agents run`) +2. 
Set the AGENTEX_API_BASE_URL environment variable if not using default +3. Run: pytest test_agent.py -v + +Configuration: +- AGENTEX_API_BASE_URL: Base URL for the AgentEx server (default: http://localhost:5003) +- AGENT_NAME: Name of the agent to test (default: ab-harness-pydantic-ai) +""" + +import os + +import pytest +import pytest_asyncio + +from agentex import AsyncAgentex +from agentex.types import TextContentParam +from agentex.types.agent_rpc_params import ParamsCreateTaskRequest +from agentex.lib.sdk.fastacp.base.base_acp_server import uuid + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "ab-harness-pydantic-ai") + + +@pytest_asyncio.fixture +async def client(): + """Create an AsyncAgentex client instance for testing.""" + client = AsyncAgentex(base_url=AGENTEX_API_BASE_URL) + yield client + await client.close() + + +@pytest.fixture +def agent_name(): + """Return the agent name for testing.""" + return AGENT_NAME + + +@pytest_asyncio.fixture +async def agent_id(client, agent_name): + """Retrieve the agent ID based on the agent name.""" + agents = await client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent with name {agent_name} not found.") + + +class TestNonStreamingEvents: + """Test non-streaming event sending through the unified auto_send_turn path.""" + + @pytest.mark.asyncio + async def test_send_event(self, client: AsyncAgentex, agent_id: str): + """Test sending an event to the async harness Pydantic AI agent.""" + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + event_content = TextContentParam( + type="text", + author="user", + content="Hello! 
What can you help me with?", + ) + await client.agents.send_event( + agent_id=agent_id, + params={"task_id": task.id, "content": event_content}, + ) + + @pytest.mark.asyncio + async def test_tool_calling(self, client: AsyncAgentex, agent_id: str): + """Test that the agent can use tools (e.g., weather tool).""" + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + event_content = TextContentParam( + type="text", + author="user", + content="What's the weather in San Francisco?", + ) + await client.agents.send_event( + agent_id=agent_id, + params={"task_id": task.id, "content": event_content}, + ) + + +class TestStreamingEvents: + """Test streaming event sending.""" + + @pytest.mark.asyncio + async def test_send_event_and_stream(self, client: AsyncAgentex, agent_id: str): + """Test sending an event and streaming the response.""" + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + event_content = TextContentParam( + type="text", + author="user", + content="Tell me a short joke.", + ) + await client.agents.send_event( + agent_id=agent_id, + params={"task_id": task.id, "content": event_content}, + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/.dockerignore b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ 
+env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/Dockerfile b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/Dockerfile new file mode 100644 index 000000000..98c74c6e8 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/Dockerfile @@ -0,0 +1,43 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/10_temporal/harness_pydantic_ai/pyproject.toml /app/harness_pydantic_ai/pyproject.toml +COPY 10_async/10_temporal/harness_pydantic_ai/README.md /app/harness_pydantic_ai/README.md + +WORKDIR /app/harness_pydantic_ai + +COPY 10_async/10_temporal/harness_pydantic_ai/project /app/harness_pydantic_ai/project +COPY 10_async/10_temporal/harness_pydantic_ai/tests /app/harness_pydantic_ai/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=at-harness-pydantic-ai + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] + +# When we deploy the worker, we will replace the CMD with the following +# CMD ["python", "-m", "run_worker"] diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/README.md b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/README.md new file mode 100644 index 000000000..3e5fef4c6 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/README.md @@ -0,0 +1,61 @@ +# Temporal Pydantic AI Harness Test Agent + +A minimal 
**Temporal-backed** Pydantic AI agent that drives the **unified +harness surface** (`UnifiedEmitter.auto_send_turn` + `PydanticAITurn`) from +inside the model activity's `event_stream_handler`. + +## Why this agent exists + +The `10_async/10_temporal/110_pydantic_ai` tutorial streams via the +`stream_pydantic_ai_events` helper (which uses the unified surface internally). +This harness test agent calls `emitter.auto_send_turn(...)` **explicitly** inside +the `event_stream_handler`, making the unified-surface wiring visible and giving +the temporal channel direct coverage. + +## How it wires the unified surface + +In `project/agent.py`, the `event_stream_handler` runs inside the model activity +and constructs a `UnifiedEmitter` from `RunContext.deps`: + +```python +async def event_handler(run_context, events): + emitter = UnifiedEmitter( + task_id=run_context.deps.task_id, + trace_id=run_context.deps.task_id, + parent_span_id=run_context.deps.parent_span_id, + ) + turn = PydanticAITurn(events, model=MODEL_NAME) + await emitter.auto_send_turn(turn) +``` + +- The handler runs inside a Temporal activity, so it can freely make + non-deterministic Redis + tracing writes. +- `coalesce_tool_requests` is not passed: the auto_send path delivers streamed + tool requests natively, so no coalescing workaround is needed. +- `deps` (set by `project/workflow.py`) threads the `task_id` and the per-turn + `parent_span_id` into the handler so tool spans nest under the workflow's turn + span. + +## Structure + +- `project/acp.py` — thin ACP server; FastACP auto-wires HTTP routes to the + workflow when `TemporalACPConfig` is used. +- `project/agent.py` — base `Agent` + `TemporalAgent` + the unified-surface + `event_stream_handler`. +- `project/workflow.py` — durable workflow; each turn delegates to + `temporal_agent.run(...)`. +- `project/run_worker.py` — Temporal worker entry point. +- `project/tools.py` — async `get_weather(city)` returning a constant.
+- `tests/test_agent.py` — live integration test (requires Temporal + Redis + + ACP server + worker). + +## Tools + +- `get_weather(city: str) -> str` (async): returns a fixed "sunny and 72°F" + string. Each tool call becomes its own Temporal activity. + +## Offline coverage + +Offline integration tests for the same wiring (pydantic-ai `TestModel` + fake +streaming/tracing, no Temporal server) live in the SDK repo at +`tests/lib/core/harness/test_harness_pydantic_ai_temporal.py`. diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/manifest.yaml b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/manifest.yaml new file mode 100644 index 000000000..9efbff918 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/manifest.yaml @@ -0,0 +1,62 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/10_temporal/harness_pydantic_ai + - test_utils + dockerfile: 10_async/10_temporal/harness_pydantic_ai/Dockerfile + dockerignore: 10_async/10_temporal/harness_pydantic_ai/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + worker: project/run_worker.py + +agent: + acp_type: async + name: at-harness-pydantic-ai + description: A Temporal-backed Pydantic AI harness test agent using the unified emitter surface + + temporal: + enabled: true + workflows: + - name: at-harness-pydantic-ai + queue_name: at_harness_pydantic_ai_queue + + credentials: + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + 
name: "at-harness-pydantic-ai" + description: "A Temporal-backed Pydantic AI harness test agent using the unified emitter surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/__init__.py b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/acp.py b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/acp.py new file mode 100644 index 000000000..c142dcf70 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/acp.py @@ -0,0 +1,35 @@ +"""ACP server for the Temporal harness Pydantic AI test agent. + +This file is intentionally thin. When ``acp_type="async"`` is combined with +``TemporalACPConfig(type="temporal", ...)``, FastACP auto-wires: + + HTTP task/create → @workflow.run on the workflow class + HTTP task/event/send → @workflow.signal(SignalName.RECEIVE_EVENT) + HTTP task/cancel → workflow cancellation via the Temporal client + +so we don't define any handlers here. The actual agent code lives in +``project/workflow.py`` and is executed by the Temporal worker +(``project/run_worker.py``), not by this HTTP process. 
+""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from pydantic_ai.durable_exec.temporal import PydanticAIPlugin + +from agentex.lib.types.fastacp import TemporalACPConfig +from agentex.lib.sdk.fastacp.fastacp import FastACP + +acp = FastACP.create( + acp_type="async", + config=TemporalACPConfig( + type="temporal", + temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), + plugins=[PydanticAIPlugin()], + ), +) diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/agent.py b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/agent.py new file mode 100644 index 000000000..5e8697264 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/agent.py @@ -0,0 +1,111 @@ +"""Pydantic AI agent definition for the Temporal harness test agent. + +This module constructs the base ``pydantic_ai.Agent`` once at import time, +registers tools on it, and wraps it in ``TemporalAgent`` from +``pydantic_ai.durable_exec.temporal``. + +The ``TemporalAgent`` wrapper makes every model call and every tool call run as +a Temporal activity automatically. The workflow stays deterministic; the +non-deterministic work (LLM HTTP calls, tool execution) moves into recorded +activities. + +Streaming back to Agentex happens via ``event_stream_handler``, which receives +Pydantic AI ``AgentStreamEvent``s from inside the model activity and forwards +them through the UNIFIED HARNESS SURFACE (``UnifiedEmitter.auto_send_turn`` + +``PydanticAITurn``) — called directly rather than via ``stream_pydantic_ai_events``. +The ``task_id`` and per-turn ``parent_span_id`` are threaded into the handler +via ``deps``. 
+""" + +from __future__ import annotations + +from datetime import datetime +from collections.abc import AsyncIterable + +from pydantic import BaseModel +from pydantic_ai import Agent, RunContext +from pydantic_ai.messages import AgentStreamEvent +from pydantic_ai.durable_exec.temporal import TemporalAgent + +from project.tools import get_weather +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + +__all__ = ["TaskDeps", "temporal_agent", "base_agent", "MODEL_NAME"] + +MODEL_NAME = "openai:gpt-4o-mini" +SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. + +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use tools when they would help answer the user's question +- If you're unsure, ask clarifying questions +- Always provide accurate information +""" + + +class TaskDeps(BaseModel): + """Per-run dependencies passed into the agent via ``deps=``. + + Pydantic AI's ``RunContext.deps`` is the canonical place to thread + request-scoped data (like the Agentex task_id) into tools and event + handlers — including code that runs inside Temporal activities. + """ + + task_id: str + # When set, the event handler nests per-tool-call spans under this span. + # Typically the ID of the per-turn span opened by the workflow. + parent_span_id: str | None = None + + +def _build_base_agent() -> Agent[TaskDeps, str]: + """Build the underlying Pydantic AI agent with tools registered. + + Tools must be registered BEFORE the agent is wrapped in TemporalAgent; + changes to tool registration after wrapping are not reflected. 
+ """ + agent: Agent[TaskDeps, str] = Agent( + MODEL_NAME, + deps_type=TaskDeps, + system_prompt=SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")), + ) + agent.tool_plain(get_weather) + return agent + + +async def event_handler( + run_context: RunContext[TaskDeps], + events: AsyncIterable[AgentStreamEvent], +) -> None: + """Stream Pydantic AI events to Agentex via the unified surface. + + Pydantic AI calls this with the live event stream as soon as the model + activity begins emitting parts. Because the handler runs inside the activity + (not the workflow), it can freely make non-deterministic Redis + tracing + writes. + + The UnifiedEmitter is constructed from ``deps`` (task_id + parent_span_id), + so tool spans nest under the workflow's per-turn span and messages auto-send + to the task stream. The auto_send path delivers streamed tool requests + natively, so no coalescing workaround is needed. + """ + emitter = UnifiedEmitter( + task_id=run_context.deps.task_id, + trace_id=run_context.deps.task_id, + parent_span_id=run_context.deps.parent_span_id, + ) + turn = PydanticAITurn(events, model=MODEL_NAME) + await emitter.auto_send_turn(turn) + + +# Construct the durable agent at module load time so that the PydanticAIPlugin +# can auto-discover its activities via the workflow's ``__pydantic_ai_agents__`` +# attribute. +base_agent = _build_base_agent() +temporal_agent: TemporalAgent[TaskDeps, str] = TemporalAgent( + base_agent, + name="harness_pydantic_ai_agent", + event_stream_handler=event_handler, +) diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/run_worker.py b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/run_worker.py new file mode 100644 index 000000000..4b4d43d19 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/run_worker.py @@ -0,0 +1,48 @@ +"""Temporal worker for the harness Pydantic AI test agent. 
+ +Run as a separate long-lived process alongside the ACP HTTP server. The worker +polls Temporal for workflow + activity tasks and executes them. + +The ``PydanticAIPlugin`` reads ``__pydantic_ai_agents__`` off the workflow class +and registers every model/tool activity the TemporalAgent needs — so we don't +have to enumerate activities by hand here. +""" + +import asyncio + +from pydantic_ai.durable_exec.temporal import PydanticAIPlugin + +from project.workflow import HarnessPydanticAiWorkflow +from agentex.lib.utils.debug import setup_debug_if_enabled +from agentex.lib.utils.logging import make_logger +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.activities import get_all_activities +from agentex.lib.core.temporal.workers.worker import AgentexWorker + +environment_variables = EnvironmentVariables.refresh() +logger = make_logger(__name__) + + +async def main(): + setup_debug_if_enabled() + + task_queue_name = environment_variables.WORKFLOW_TASK_QUEUE + if task_queue_name is None: + raise ValueError("WORKFLOW_TASK_QUEUE is not set") + + # get_all_activities() returns the built-in Agentex activities (state, + # messages, streaming, tracing). Pydantic AI's TemporalAgent activities are + # auto-registered by PydanticAIPlugin via __pydantic_ai_agents__. + worker = AgentexWorker( + task_queue=task_queue_name, + plugins=[PydanticAIPlugin()], + ) + + await worker.run( + activities=get_all_activities(), + workflow=HarnessPydanticAiWorkflow, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/tools.py b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/tools.py new file mode 100644 index 000000000..bbd6c5200 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/tools.py @@ -0,0 +1,24 @@ +"""Tool definitions for the Temporal harness Pydantic AI agent. 
+ +These functions are registered on the base Pydantic AI agent. When the agent +is wrapped in ``TemporalAgent``, each tool call becomes its own Temporal +activity automatically — independently retryable and observable. + +Tools must be ``async`` because Pydantic AI's Temporal integration requires +it: non-async tools would run in threads, which is non-deterministic and +unsafe for Temporal replay. +""" + +from __future__ import annotations + + +async def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. + + Returns: + A string describing the weather conditions. + """ + return f"The weather in {city} is sunny and 72°F" diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/workflow.py b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/workflow.py new file mode 100644 index 000000000..9a01be7de --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/project/workflow.py @@ -0,0 +1,137 @@ +"""Temporal workflow for the harness Pydantic AI test agent. + +The workflow holds task state durably across crashes. Its signal handler +delegates the actual agent run to ``temporal_agent.run(...)`` — which internally +schedules model and tool activities, each independently durable. The +``event_stream_handler`` registered on ``temporal_agent`` (see project.agent) +pushes streaming deltas through the unified harness surface while the model +activity runs. + +Multi-turn memory is kept on the workflow instance itself +(``self._message_history``). Temporal's workflow state is already durable and +replay-safe, so unlike the async-base agent we don't need an external +``adk.state`` round-trip. 
+""" + +from __future__ import annotations + +import os +import json +from typing import TYPE_CHECKING + +from temporalio import workflow + +from agentex.lib import adk +from project.agent import TaskDeps, temporal_agent +from agentex.lib.types.acp import SendEventParams, CreateTaskParams +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.types.text_content import TextContent +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.types.workflow import SignalName +from agentex.lib.core.temporal.workflows.workflow import BaseWorkflow +from agentex.lib.core.tracing.tracing_processor_manager import ( + add_tracing_processor_config, +) + +if TYPE_CHECKING: + from pydantic_ai.messages import ModelMessage + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +environment_variables = EnvironmentVariables.refresh() + +if environment_variables.WORKFLOW_NAME is None: + raise ValueError("Environment variable WORKFLOW_NAME is not set") +if environment_variables.AGENT_NAME is None: + raise ValueError("Environment variable AGENT_NAME is not set") + +logger = make_logger(__name__) + + +@workflow.defn(name=environment_variables.WORKFLOW_NAME) +class HarnessPydanticAiWorkflow(BaseWorkflow): + """Long-running Temporal workflow that delegates each turn to a Pydantic AI TemporalAgent. + + The ``__pydantic_ai_agents__`` attribute is the marker the + ``PydanticAIPlugin`` looks for at worker startup: it pulls + ``temporal_agent.temporal_activities`` off this list and registers them on + the worker automatically — so we don't have to list activities by hand in + ``run_worker.py``. 
+ """ + + __pydantic_ai_agents__ = [temporal_agent] + + def __init__(self): + super().__init__(display_name=environment_variables.AGENT_NAME) + self._complete_task = False + self._turn_number = 0 + # Conversation history accumulated across turns. Each entry is a + # pydantic-ai ``ModelMessage``. Temporal replays the activity that + # produced these messages, so the list is rebuilt deterministically if + # the workflow ever recovers from a crash. + self._message_history: list["ModelMessage"] = [] + + @workflow.signal(name=SignalName.RECEIVE_EVENT) + async def on_task_event_send(self, params: SendEventParams) -> None: + """Handle a new user message: echo it, then run the agent durably.""" + logger.info(f"Received task event: {params.task.id}") + self._turn_number += 1 + + # Echo the user's message so it shows up in the UI as a chat bubble. + await adk.messages.create(task_id=params.task.id, content=params.event.content) + + async with adk.tracing.span( + trace_id=params.task.id, + task_id=params.task.id, + name=f"Turn {self._turn_number}", + input={"message": params.event.content.content}, + ) as span: + # temporal_agent.run() schedules a model activity, per-tool + # activities, and the event_stream_handler activity (which pushes + # deltas through the unified surface). Passing ``message_history`` + # makes the run remember prior turns. + result = await temporal_agent.run( + params.event.content.content, + message_history=self._message_history, + deps=TaskDeps( + task_id=params.task.id, + parent_span_id=span.id if span else None, + ), + ) + # Persist the new full history (user + assistant + any tool rounds) + # so the next turn picks up from here. 
+ self._message_history = list(result.all_messages()) + if span: + span.output = {"final_output": result.output} + + @workflow.run + async def on_task_create(self, params: CreateTaskParams) -> str: + """Workflow entry point — keep the conversation alive for incoming signals.""" + logger.info(f"Task created: {params.task.id}") + + await adk.messages.create( + task_id=params.task.id, + content=TextContent( + author="agent", + content=( + f"Task initialized with params:\n{json.dumps(params.params, indent=2)}\n" + f"Send me a message and I'll respond using a Pydantic AI agent backed by Temporal." + ), + ), + ) + + await workflow.wait_condition(lambda: self._complete_task, timeout=None) + return "Task completed" + + @workflow.signal + async def complete_task_signal(self) -> None: + """Graceful workflow shutdown signal.""" + logger.info("Received complete_task signal") + self._complete_task = True diff --git a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/pyproject.toml b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/pyproject.toml new file mode 100644 index 000000000..4d9039640 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "at-harness-pydantic-ai" +version = "0.1.0" +description = "A Temporal-backed Pydantic AI harness test agent using the unified emitter surface" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "temporalio>=1.18.2", + "pydantic-ai-slim[openai]>=1.0,<2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", + "debugpy>=1.8.15", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git 
a/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/tests/test_agent.py new file mode 100644 index 000000000..a5b90ca34 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_pydantic_ai/tests/test_agent.py @@ -0,0 +1,114 @@ +"""Live tests for the Temporal harness Pydantic AI agent. + +These tests require a running agent (Temporal + Redis + ACP server + worker) and +exercise the unified-surface event_stream_handler end-to-end over the wire. They +mirror the ``at110`` temporal tutorial tests but target this harness agent. + +Offline coverage of the same wiring (TestModel + fake streaming/tracing) lives +in ``tests/lib/core/harness/test_harness_pydantic_ai_temporal.py`` in the SDK repo. + +To run these tests: +1. Make sure the agent is running (worker + ACP server) +2. Set AGENTEX_API_BASE_URL if not using the default +3. Run: pytest tests/test_agent.py -v +""" + +import os +import uuid + +import pytest +import pytest_asyncio +from test_utils.async_utils import poll_messages, send_event_and_poll_yielding + +from agentex import AsyncAgentex +from agentex.types.task_message import TaskMessage +from agentex.types.agent_rpc_params import ParamsCreateTaskRequest + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "at-harness-pydantic-ai") + + +@pytest_asyncio.fixture +async def client(): + client = AsyncAgentex(base_url=AGENTEX_API_BASE_URL) + yield client + await client.close() + + +@pytest.fixture +def agent_name(): + return AGENT_NAME + + +@pytest_asyncio.fixture +async def agent_id(client, agent_name): + agents = await client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent with name {agent_name} not found.") + + +class TestNonStreamingEvents: + """Test that the Temporal-backed harness agent responds and uses tools.""" + + 
@pytest.mark.asyncio + async def test_send_event_and_poll(self, client: AsyncAgentex, agent_id: str): + """Drive a full turn: create task, send a weather question, verify tool round-trip.""" + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + # Wait for the welcome message from on_task_create + task_creation_found = False + async for message in poll_messages( + client=client, + task_id=task.id, + timeout=30, + sleep_interval=1.0, + ): + assert isinstance(message, TaskMessage) + if message.content and message.content.type == "text" and message.content.author == "agent": + task_creation_found = True + break + assert task_creation_found, "Task creation welcome message not found" + + # Ask about weather — the agent should call get_weather + seen_tool_request = False + seen_tool_response = False + final_message = None + async for message in send_event_and_poll_yielding( + client=client, + agent_id=agent_id, + task_id=task.id, + user_message="What is the weather in San Francisco?", + timeout=60, + sleep_interval=1.0, + ): + assert isinstance(message, TaskMessage) + + if message.content and message.content.type == "tool_request": + seen_tool_request = True + if message.content and message.content.type == "tool_response": + seen_tool_response = True + if final_message and getattr(final_message, "streaming_status", None) == "DONE": + break + + if message.content and message.content.type == "text" and message.content.author == "agent": + final_message = message + content_length = len(getattr(message.content, "content", "") or "") + if message.streaming_status == "DONE" and content_length > 0: + if not seen_tool_request or seen_tool_response: + break + + assert seen_tool_request, "Expected a tool_request (agent calling get_weather)" + assert seen_tool_response, "Expected a tool_response (get_weather result)" + assert final_message is not None, "Expected a 
final agent text message" + final_text = getattr(final_message.content, "content", None) if final_message.content else None + assert isinstance(final_text, str) and len(final_text) > 0 + # The get_weather tool always returns "72°F" — the response should mention it. + assert "72" in final_text, "Expected weather response to mention 72°F" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/src/agentex/lib/adk/_modules/_pydantic_ai_async.py b/src/agentex/lib/adk/_modules/_pydantic_ai_async.py index 0bbb5b19d..85abfb845 100644 --- a/src/agentex/lib/adk/_modules/_pydantic_ai_async.py +++ b/src/agentex/lib/adk/_modules/_pydantic_ai_async.py @@ -6,11 +6,10 @@ HTTP yields. Text and thinking tokens stream as deltas inside coalesced streaming -contexts. Tool requests and tool results are emitted as full -``adk.messages.create(...)`` calls (Option A — matches the async LangGraph -helper's convention). To stream tool-call argument tokens, see the sync -converter at ``agentex.lib.adk._modules._pydantic_ai_sync`` which yields -``ToolRequestDelta`` events. +contexts. Tool requests are also delivered on a streaming context (any streamed +argument tokens arrive as ``ToolRequestDelta`` updates); tool results are posted as +open+close pairs (the unified surface persists ``initial_content`` when a context is +closed without deltas). This matches the ``auto_send`` convention used by all other async/Temporal harnesses. Tracing is opt-in via a ``tracing_handler`` parameter — see ``create_pydantic_ai_tracing_handler`` in @@ -19,7 +18,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING if TYPE_CHECKING: from agentex.lib.adk._modules._pydantic_ai_tracing import ( @@ -49,230 +48,18 @@ async def stream_pydantic_ai_events( more text) return only the final text segment, matching the ``stream_langgraph_events`` convention. """ - # Lazy imports so pydantic-ai isn't required at module load time. 
- import json + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn - from pydantic_ai.messages import ( - TextPart, - PartEndEvent, - ThinkingPart, - ToolCallPart, - TextPartDelta, - PartDeltaEvent, - PartStartEvent, - ThinkingPartDelta, - FunctionToolResultEvent, + turn = PydanticAITurn( + stream, + model=None, + tracing_handler=tracing_handler, ) - - from agentex.lib import adk - from agentex.types.text_content import TextContent - from agentex.types.reasoning_content import ReasoningContent - from agentex.types.task_message_delta import TextDelta - from agentex.types.task_message_update import StreamTaskMessageDelta - from agentex.types.tool_request_content import ToolRequestContent - from agentex.types.tool_response_content import ToolResponseContent - from agentex.types.reasoning_content_delta import ReasoningContentDelta - - text_context = None - reasoning_context = None - final_text = "" - - # Per Pydantic-AI part-index bookkeeping. Part indices restart at 0 on - # each new model response, so we overwrite on PartStartEvent. 
- part_kind: dict[int, str] = {} - tool_call_info: dict[int, tuple[str, str]] = {} - - async def _close_text(): - nonlocal text_context - if text_context: - await text_context.close() - text_context = None - - async def _close_reasoning(): - nonlocal reasoning_context - if reasoning_context: - await reasoning_context.close() - reasoning_context = None - - try: - async for event in stream: - if isinstance(event, PartStartEvent): - if isinstance(event.part, TextPart): - await _close_reasoning() - await _close_text() - - final_text = "" - text_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=TextContent( - author="agent", - content="", - format="markdown", - ), - ).__aenter__() - part_kind[event.index] = "text" - - # Pydantic AI puts the first streaming chunk in - # PartStartEvent.part.content; surface it as a Delta so it - # actually renders (Start.content is initialization, not body). - if event.part.content: - final_text += event.part.content - await text_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=text_context.task_message, - delta=TextDelta(type="text", text_delta=event.part.content), - type="delta", - ) - ) - - elif isinstance(event.part, ThinkingPart): - await _close_text() - await _close_reasoning() - - reasoning_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=ReasoningContent( - author="agent", - summary=[], - content=[], - type="reasoning", - style="active", - ), - ).__aenter__() - part_kind[event.index] = "reasoning" - - if event.part.content: - await reasoning_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=reasoning_context.task_message, - delta=ReasoningContentDelta( - type="reasoning_content", - content_index=0, - content_delta=event.part.content, - ), - type="delta", - ) - ) - - elif isinstance(event.part, ToolCallPart): - await _close_text() - await _close_reasoning() - tool_call_info[event.index] = ( - 
event.part.tool_call_id, - event.part.tool_name, - ) - part_kind[event.index] = "tool_call" - - elif isinstance(event, PartDeltaEvent): - kind = part_kind.get(event.index) - if kind == "text" and isinstance(event.delta, TextPartDelta) and text_context: - final_text += event.delta.content_delta - await text_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=text_context.task_message, - delta=TextDelta(type="text", text_delta=event.delta.content_delta), - type="delta", - ) - ) - elif ( - kind == "reasoning" - and isinstance(event.delta, ThinkingPartDelta) - and reasoning_context - and event.delta.content_delta - ): - await reasoning_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=reasoning_context.task_message, - delta=ReasoningContentDelta( - type="reasoning_content", - content_index=0, - content_delta=event.delta.content_delta, - ), - type="delta", - ) - ) - # Tool-call arg deltas: Pydantic AI accumulates them; we - # surface the final args on PartEndEvent below (Option A). 
- - elif isinstance(event, PartEndEvent): - kind = part_kind.get(event.index) - if kind == "text": - await _close_text() - elif kind == "reasoning": - await _close_reasoning() - elif kind == "tool_call" and isinstance(event.part, ToolCallPart): - tool_call_id, tool_name = tool_call_info.get(event.index, ("", "")) - args = event.part.args - if isinstance(args, str): - try: - args = json.loads(args) if args else {} - except json.JSONDecodeError: - args = {"_raw": args} - elif args is None: - args = {} - await adk.messages.create( - task_id=task_id, - content=ToolRequestContent( - tool_call_id=tool_call_id, - name=tool_name, - arguments=args, - author="agent", - ), - ) - if tracing_handler is not None and tool_call_id: - await tracing_handler.on_tool_start( - tool_call_id=tool_call_id, - tool_name=tool_name, - arguments=args, - ) - - elif isinstance(event, FunctionToolResultEvent): - await _close_text() - await _close_reasoning() - - result = event.part - tool_call_id = result.tool_call_id - tool_name = getattr(result, "tool_name", "") or "" - # Preserve structure for dicts / lists / Pydantic models so the - # UI can render them as JSON, not as Python repr. Matches the - # sync converter's ``_tool_return_content`` helper exactly — - # ``str(content)`` on a dict produces ``"{'k': 'v'}"`` which is - # invalid JSON and unreadable in the UI. 
- content = getattr(result, "content", None) - content_payload: Any - if content is None: - content_payload = str(result) - elif isinstance(content, (str, int, float, bool, list, dict)): - content_payload = content - elif hasattr(content, "model_dump"): - try: - content_payload = content.model_dump() - except Exception: - content_payload = str(content) - else: - content_payload = str(content) - await adk.messages.create( - task_id=task_id, - content=ToolResponseContent( - tool_call_id=tool_call_id, - name=tool_name, - content=content_payload, - author="agent", - ), - ) - if tracing_handler is not None and tool_call_id: - await tracing_handler.on_tool_end( - tool_call_id=tool_call_id, - result=content_payload, - ) - - # FunctionToolCallEvent / FinalResultEvent / AgentRunResultEvent - # are intentionally ignored — same as the sync converter. - - finally: - if text_context: - await text_context.close() - if reasoning_context: - await reasoning_context.close() - - return final_text + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=None, + parent_span_id=None, + ) + result = await emitter.auto_send_turn(turn) + return result.final_text diff --git a/src/agentex/lib/adk/_modules/_pydantic_ai_sync.py b/src/agentex/lib/adk/_modules/_pydantic_ai_sync.py index d94c0ae12..e4ac31e7e 100644 --- a/src/agentex/lib/adk/_modules/_pydantic_ai_sync.py +++ b/src/agentex/lib/adk/_modules/_pydantic_ai_sync.py @@ -16,12 +16,32 @@ async def handle_message_send(params): async with agent.run_stream_events(params.content.content) as stream: async for event in convert_pydantic_ai_to_agentex_events(stream): yield event + +Recommended: unified surface +----------------------------- +For new handlers, prefer ``UnifiedEmitter`` + ``PydanticAITurn`` over the +bare converter. 
The unified surface wires tracing automatically when a +``trace_id`` is provided, so tool and reasoning spans are derived from the +same event stream with no extra setup: + + from agentex.lib.core.harness import UnifiedEmitter + from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + + emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=parent_span_id) + turn = PydanticAITurn(agent.run_stream_events(prompt), model="openai:gpt-4o") + async for event in emitter.yield_turn(turn): + yield event # forwarded over the ACP streaming response; spans derived automatically + +``convert_pydantic_ai_to_agentex_events`` remains the low-level tap for +callers that manage their own tracing or need direct access to the raw +converted stream. """ from __future__ import annotations import json -from typing import TYPE_CHECKING, Any, AsyncIterator +import inspect +from typing import TYPE_CHECKING, Any, Callable, AsyncIterator from pydantic_ai.run import AgentRunResultEvent @@ -105,6 +125,7 @@ def _tool_return_content(result: ToolReturnPart | Any) -> Any: async def convert_pydantic_ai_to_agentex_events( stream_response: AsyncIterator[Any], tracing_handler: "AgentexPydanticAITracingHandler | None" = None, + on_result: Callable[[AgentRunResultEvent], Any] | None = None, ) -> AsyncIterator[StreamTaskMessageStart | StreamTaskMessageDelta | StreamTaskMessageFull | StreamTaskMessageDone]: """Convert a Pydantic AI agent event stream into Agentex stream events. @@ -132,6 +153,12 @@ async def convert_pydantic_ai_to_agentex_events( tool call in the run is also recorded as an Agentex child span beneath the handler's configured ``parent_span_id``. Streaming behavior is unchanged when omitted. + on_result: Optional callback invoked with the terminal + ``AgentRunResultEvent`` when the run completes. Both sync and + async callables are accepted. No ``StreamTaskMessage*`` events are + yielded for this terminal event; the callback is the only side + effect. 
Useful for capturing run-level usage without altering the + streaming output. Yields: Agentex ``StreamTaskMessage*`` events suitable for forwarding back over @@ -328,6 +355,10 @@ async def convert_pydantic_ai_to_agentex_events( # Already covered by PartStart/PartDelta/PartEnd events above, or # informational only (FinalResultEvent / AgentRunResultEvent signal # run-level state, not new content to surface). + if isinstance(event, AgentRunResultEvent) and on_result is not None: + ret = on_result(event) + if inspect.iscoroutine(ret): + await ret continue else: diff --git a/src/agentex/lib/adk/_modules/_pydantic_ai_tracing.py b/src/agentex/lib/adk/_modules/_pydantic_ai_tracing.py index aa9d906eb..e199d0a8c 100644 --- a/src/agentex/lib/adk/_modules/_pydantic_ai_tracing.py +++ b/src/agentex/lib/adk/_modules/_pydantic_ai_tracing.py @@ -1,5 +1,29 @@ """Tracing handler that records Agentex spans for tool calls in a pydantic-ai agent run. +.. deprecated:: + ``AgentexPydanticAITracingHandler`` and ``create_pydantic_ai_tracing_handler`` + are superseded by the unified harness surface (``UnifiedEmitter`` in + ``agentex.lib.core.harness``). The unified surface derives tool and + reasoning spans directly from the canonical ``StreamTaskMessage*`` stream, + so no separate handler is required. Both symbols remain fully importable + and functional; they will be removed in a future release. New code should + construct a ``UnifiedEmitter`` with a ``trace_id`` instead: + + from agentex.lib.core.harness import UnifiedEmitter + from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + + emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=parent_span_id) + turn = PydanticAITurn(agent.run_stream_events(prompt), model="openai:gpt-4o") + async for event in emitter.yield_turn(turn): + yield event + +# NOTE: A runtime ``warnings.warn(..., DeprecationWarning)`` is intentionally +# omitted here. 
The repo's pyproject ``filterwarnings = ["error"]`` would turn +# it into a test/caller failure, and the async helper (``stream_pydantic_ai_events``) +# still threads this handler through for existing callers that lack a ``trace_id`` +# on the async path. The runtime warning and caller migration are deferred until +# ``trace_id`` threading lands on the async helper in a future API-versioning change. + Mirrors the LangGraph tracing handler pattern: the caller creates a handler bound to a ``trace_id`` and a ``parent_span_id``, then hands it to ``stream_pydantic_ai_events(..., tracing_handler=handler)``. The streamer @@ -63,6 +87,14 @@ def _tool_span_id(trace_id: str, tool_call_id: str) -> str: class AgentexPydanticAITracingHandler: """Records Agentex tracing spans for tool calls observed in a pydantic-ai event stream. + .. deprecated:: + Superseded by ``UnifiedEmitter`` (``agentex.lib.core.harness``), which + derives tool and reasoning spans from the canonical ``StreamTaskMessage*`` + stream automatically when ``trace_id`` is provided. This class remains + fully functional but will be removed in a future release. New code should + use ``UnifiedEmitter`` with a trace context instead of constructing this + handler directly. + Pass an instance to ``stream_pydantic_ai_events(..., tracing_handler=...)`` or call ``on_tool_start`` / ``on_tool_end`` yourself if you're consuming the event stream by hand. @@ -165,6 +197,13 @@ def create_pydantic_ai_tracing_handler( ) -> AgentexPydanticAITracingHandler: """Create a tracing handler that records Agentex spans for pydantic-ai tool calls. + .. deprecated:: + Superseded by ``UnifiedEmitter`` (``agentex.lib.core.harness``), which + derives tool and reasoning spans from the canonical ``StreamTaskMessage*`` + stream automatically when ``trace_id`` is provided. This function remains + fully functional but will be removed in a future release. New code should + construct a ``UnifiedEmitter`` with a trace context instead. 
+ Args: trace_id: The trace ID. Typically the Agentex task ID. parent_span_id: Optional parent span ID to nest tool spans under. If diff --git a/src/agentex/lib/adk/_modules/_pydantic_ai_turn.py b/src/agentex/lib/adk/_modules/_pydantic_ai_turn.py new file mode 100644 index 000000000..b06172e7f --- /dev/null +++ b/src/agentex/lib/adk/_modules/_pydantic_ai_turn.py @@ -0,0 +1,134 @@ +"""PydanticAITurn: a HarnessTurn wrapping a pydantic-ai event stream. + +Adapts a pydantic-ai ``AgentStreamEvent`` stream into the canonical +``StreamTaskMessage*`` stream while capturing run-level usage from the +terminal ``AgentRunResultEvent``. + +Typical usage:: + + async with agent.run_stream_events(user_msg) as stream: + turn = PydanticAITurn(stream, model="openai:gpt-4o") + async for event in turn.events: + yield event + span.set_attributes(turn.usage().model_dump()) +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, AsyncIterator + +from pydantic_ai.run import AgentRunResultEvent + +from agentex.lib.core.harness.types import TurnUsage +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.adk._modules._pydantic_ai_sync import convert_pydantic_ai_to_agentex_events + +if TYPE_CHECKING: + from agentex.lib.adk._modules._pydantic_ai_tracing import AgentexPydanticAITracingHandler + +StreamTaskMessage = StreamTaskMessageStart | StreamTaskMessageDelta | StreamTaskMessageFull | StreamTaskMessageDone + + +def pydantic_ai_usage_to_turn_usage(usage: Any, model: str | None) -> TurnUsage: + """Map a pydantic-ai ``RunUsage`` onto ``TurnUsage``. + + Uses defensive ``getattr(..., None)`` so a future field rename in + pydantic-ai degrades to ``None`` rather than raising ``AttributeError``. 
+ + RunUsage fields (verified against pydantic-ai in this repo): + input_tokens, cache_write_tokens, cache_read_tokens, output_tokens, + input_audio_tokens, cache_audio_read_tokens, output_audio_tokens, + details, requests, tool_calls. + ``total_tokens`` is a computed property. + + Mapping: + requests -> num_llm_calls + input_tokens -> input_tokens + output_tokens -> output_tokens + cache_read_tokens -> cached_input_tokens + total_tokens -> total_tokens + + getattr results pass straight through: a MISSING attribute degrades to + None (defensive; the one exception is ``requests``, where a missing value becomes ``num_llm_calls=0``), while a real 0 stays 0 (a cache-hit with 0 output + tokens is a genuine zero, not "unknown") and a real N stays N. + """ + raw_input = getattr(usage, "input_tokens", None) + raw_output = getattr(usage, "output_tokens", None) + raw_cache_read = getattr(usage, "cache_read_tokens", None) + raw_total = getattr(usage, "total_tokens", None) + raw_requests = getattr(usage, "requests", None) + + return TurnUsage( + model=model, + input_tokens=raw_input, + output_tokens=raw_output, + cached_input_tokens=raw_cache_read, + total_tokens=raw_total, + num_llm_calls=raw_requests if raw_requests is not None else 0, + ) + + +class PydanticAITurn: + """A single harness turn backed by a pydantic-ai event stream. + + Satisfies the ``HarnessTurn`` protocol: ``events`` async-generates the + canonical ``StreamTaskMessage*`` stream; ``usage()`` returns a normalized + ``TurnUsage`` (valid only after ``events`` is exhausted). + + ``events`` is identical to the bare ``convert_pydantic_ai_to_agentex_events`` + output (tool calls stream as ``Start + ToolRequestDelta + Done``, preserving + argument-token streaming on the sync/yield channel). The foundation + ``auto_send`` delivers the streamed tool-request shape natively (AGX1-377), + so no coalescing is needed on either channel. 
+ """ + + def __init__( + self, + stream: AsyncIterator[Any], + model: str | None = None, + tracing_handler: "AgentexPydanticAITracingHandler | None" = None, + ) -> None: + self._stream = stream + self._model = model + self._tracing_handler = tracing_handler + self._usage = TurnUsage(model=model) + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + return self._generate_events() + + async def _generate_events(self) -> AsyncIterator[StreamTaskMessage]: + def _capture(result_event: AgentRunResultEvent) -> None: + run_result = getattr(result_event, "result", None) + if run_result is None: + return + usage_attr = getattr(run_result, "usage", None) + if usage_attr is None: + return + # In newer pydantic-ai, .usage is a DeprecatedCallableRunUsage — + # it's both a property value and callable (emitting a deprecation + # warning when called). Access it as a plain attribute to avoid the + # warning; it already IS the RunUsage instance. + usage_obj = usage_attr + self._usage = pydantic_ai_usage_to_turn_usage(usage_obj, self._model) + + raw_stream = convert_pydantic_ai_to_agentex_events( + self._stream, + tracing_handler=self._tracing_handler, + on_result=_capture, + ) + async for ev in raw_stream: + yield ev + + def usage(self) -> TurnUsage: + """Return the normalized usage for this turn. + + Valid only after ``events`` is exhausted (single-pass contract). + Before exhaustion the model field is set but token fields are None. 
+ """ + return self._usage diff --git a/tests/lib/adk/test_pydantic_ai_async.py b/tests/lib/adk/test_pydantic_ai_async.py index dadda5914..49cb6054c 100644 --- a/tests/lib/adk/test_pydantic_ai_async.py +++ b/tests/lib/adk/test_pydantic_ai_async.py @@ -82,7 +82,9 @@ class FakeStreamingModule: def __init__(self) -> None: self.contexts: list[FakeContext] = [] - def streaming_task_message_context(self, *, task_id: str, initial_content: Any) -> FakeContext: + def streaming_task_message_context( + self, *, task_id: str, initial_content: Any, streaming_mode: str = "coalesced", created_at: Any = None + ) -> FakeContext: tm = TaskMessage( id=f"m{len(self.contexts) + 1}", task_id=task_id, @@ -255,16 +257,36 @@ async def test_empty_thinking_delta_is_skipped( class TestToolCallEmission: - async def test_tool_call_emits_full_tool_request_message_on_part_end( + async def test_tool_call_opens_streaming_context_with_identity( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - """Async helper uses Option A: tool requests are full messages, not delta streams.""" + """Tool requests are delivered as a streaming context (Start+Delta+Done). + + AGX1-377 fix: auto_send now delivers streamed tool-request messages + natively (Start+ToolRequestDelta+Done). The streaming context is opened + at the Start event with the initial ToolRequestContent (tool_call_id + + name + empty arguments), argument tokens are streamed as deltas, and the + context is closed on Done. + + This test uses a realistic pydantic-ai event sequence: args arrive as a + PartDeltaEvent fragment (the way OpenAI/Anthropic actually stream JSON + tool-call arguments). + """ + from pydantic_ai.messages import ToolCallPartDelta + + from agentex.types.tool_request_delta import ToolRequestDelta + streaming, messages = fake_adk events = [ PartStartEvent( index=1, part=ToolCallPart(tool_name="get_weather", args=None, tool_call_id="c1"), ), + # Realistic: args arrive as delta tokens (JSON string fragments). 
+ PartDeltaEvent( + index=1, + delta=ToolCallPartDelta(args_delta='{"city":"Paris"}'), + ), PartEndEvent( index=1, part=ToolCallPart(tool_name="get_weather", args='{"city":"Paris"}', tool_call_id="c1"), @@ -272,21 +294,28 @@ async def test_tool_call_emits_full_tool_request_message_on_part_end( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert streaming.contexts == [], "Tool calls do not open a streaming context" - assert len(messages.created) == 1 - msg = messages.created[0] - assert msg["task_id"] == TASK_ID - content = msg["content"] + # AGX1-373: tool messages arrive via streaming_task_message_context. + assert messages.created == [], "adk.messages.create must not be called" + assert len(streaming.contexts) == 1, "tool_request opens a streaming context" + ctx = streaming.contexts[0] + assert ctx.closed is True + content = ctx.initial_content assert isinstance(content, ToolRequestContent) assert content.tool_call_id == "c1" assert content.name == "get_weather" - assert content.arguments == {"city": "Paris"} assert content.author == "agent" + # AGX1-377 streamed shape: initial_content has empty args (args come via delta) + assert content.arguments == {} + # The arg delta is delivered as a stream_update + assert len(ctx.updates) == 1 + assert isinstance(ctx.updates[0].delta, ToolRequestDelta) + assert ctx.updates[0].delta.arguments_delta == '{"city":"Paris"}' async def test_tool_call_with_dict_args_passes_through( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - _, messages = fake_adk + """When args arrive pre-populated as a dict in PartStart, they're in initial_content.""" + streaming, messages = fake_adk events = [ PartStartEvent( index=0, @@ -299,23 +328,40 @@ async def test_tool_call_with_dict_args_passes_through( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(messages.created) == 1 - assert messages.created[0]["content"].arguments == {"q": "weather"} + # AGX1-373: tool messages via 
streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + # Dict args present at PartStart land directly in initial_content.arguments + assert streaming.contexts[0].initial_content.arguments == {"q": "weather"} + assert streaming.contexts[0].updates == [], "no delta for pre-populated dict args" async def test_tool_call_with_invalid_json_args_surfaces_raw( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - """Don't drop the tool call when the model emits malformed JSON args. + """Malformed JSON arg delta is surfaced as a ToolRequestDelta with the raw string. + + The argument delta is delivered as-is by auto_send; the client-side + accumulator or the streaming backend handles malformed JSON gracefully. - The arguments field is preserved under ``_raw`` so the failure is - visible to the UI rather than silently truncated. + Parts-manager invariant: PartEnd.part is the accumulated snapshot; real + pydantic-ai conveys args via PartStart + PartDeltaEvent, so a + PartStart(None)+PartEnd(json) with no delta is not realizable. """ - _, messages = fake_adk + from pydantic_ai.messages import ToolCallPartDelta + + from agentex.types.tool_request_delta import ToolRequestDelta + + streaming, messages = fake_adk events = [ PartStartEvent( index=0, part=ToolCallPart(tool_name="t", args=None, tool_call_id="c"), ), + # Malformed JSON arrives as a delta token. 
+ PartDeltaEvent( + index=0, + delta=ToolCallPartDelta(args_delta="not-json{"), + ), PartEndEvent( index=0, part=ToolCallPart(tool_name="t", args="not-json{", tool_call_id="c"), @@ -323,13 +369,21 @@ async def test_tool_call_with_invalid_json_args_surfaces_raw( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(messages.created) == 1 - assert messages.created[0]["content"].arguments == {"_raw": "not-json{"} + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + ctx = streaming.contexts[0] + # Initial content has empty args (args come via delta) + assert ctx.initial_content.arguments == {} + # The malformed JSON is surfaced verbatim in the ToolRequestDelta + assert len(ctx.updates) == 1 + assert isinstance(ctx.updates[0].delta, ToolRequestDelta) + assert ctx.updates[0].delta.arguments_delta == "not-json{" async def test_tool_call_with_none_args_defaults_to_empty_dict( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - _, messages = fake_adk + streaming, messages = fake_adk events = [ PartStartEvent( index=0, @@ -342,15 +396,20 @@ async def test_tool_call_with_none_args_defaults_to_empty_dict( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(messages.created) == 1 - assert messages.created[0]["content"].arguments == {} + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + assert streaming.contexts[0].initial_content.arguments == {} + assert streaming.contexts[0].updates == [], "no delta when args are absent" class TestToolResult: async def test_tool_return_emits_full_tool_response_message( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - _, messages = fake_adk + # AGX1-373: tool responses arrive via streaming_task_message_context + # (open+close pair), NOT via adk.messages.create. 
+ streaming, messages = fake_adk events = [ FunctionToolResultEvent( part=ToolReturnPart(tool_name="get_weather", content="Sunny, 72F", tool_call_id="c1"), @@ -358,13 +417,17 @@ async def test_tool_return_emits_full_tool_response_message( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(messages.created) == 1 - content = messages.created[0]["content"] + assert messages.created == [], "adk.messages.create must not be called after reimplementation" + assert len(streaming.contexts) == 1 + ctx = streaming.contexts[0] + assert ctx.closed is True + content = ctx.initial_content assert isinstance(content, ToolResponseContent) assert content.tool_call_id == "c1" assert content.name == "get_weather" assert content.content == "Sunny, 72F" assert content.author == "agent" + assert ctx.updates == [], "open+close only — no deltas for tool messages" async def test_tool_return_with_dict_content_preserves_structure( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] @@ -377,7 +440,7 @@ async def test_tool_return_with_dict_content_preserves_structure( and divergent from the sync converter which uses ``_tool_return_content`` to return dicts as-is. """ - _, messages = fake_adk + streaming, messages = fake_adk events = [ FunctionToolResultEvent( part=ToolReturnPart(tool_name="t", content={"temp": 72, "sky": "clear"}, tool_call_id="c"), @@ -385,7 +448,10 @@ async def test_tool_return_with_dict_content_preserves_structure( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - out = messages.created[0]["content"].content + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + out = streaming.contexts[0].initial_content.content assert out == {"temp": 72, "sky": "clear"}, ( f"Expected the dict to survive verbatim; got {out!r}. " "If this is a Python repr string, the helper regressed to str(content)." 
@@ -402,7 +468,7 @@ class WeatherResult(BaseModel): temp: int sky: str - _, messages = fake_adk + streaming, messages = fake_adk events = [ FunctionToolResultEvent( part=ToolReturnPart( @@ -414,13 +480,16 @@ class WeatherResult(BaseModel): ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - out = messages.created[0]["content"].content + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + out = streaming.contexts[0].initial_content.content assert out == {"temp": 72, "sky": "clear"} async def test_retry_prompt_part_surfaces_as_tool_response( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - _, messages = fake_adk + streaming, messages = fake_adk events = [ FunctionToolResultEvent( part=RetryPromptPart( @@ -432,8 +501,10 @@ async def test_retry_prompt_part_surfaces_as_tool_response( ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(messages.created) == 1 - content = messages.created[0]["content"] + # AGX1-373: tool messages via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 1 + content = streaming.contexts[0].initial_content assert isinstance(content, ToolResponseContent) assert content.tool_call_id == "c1" # RetryPromptPart.content stringifies to the error description @@ -446,9 +517,9 @@ async def test_text_then_tool_then_text_uses_separate_contexts_in_order( ) -> None: """End-to-end multi-step shape: text → tool call → tool result → more text. - Each text/reasoning segment must get its own streaming context that is - closed before the next one opens, and tool messages must interleave - correctly via ``adk.messages.create``. + AGX1-373 envelope change: tool messages now arrive via + streaming_task_message_context (open+close pairs) instead of + adk.messages.create. All four message types open streaming contexts. 
""" streaming, messages = fake_adk events = [ @@ -474,18 +545,30 @@ async def test_text_then_tool_then_text_uses_separate_contexts_in_order( ] final = await stream_pydantic_ai_events(_aiter(events), TASK_ID) - assert len(streaming.contexts) == 2, "One context per text part — tool calls don't open streaming contexts" + # AGX1-373: all 4 messages (text, tool_request, tool_response, text) + # arrive via streaming_task_message_context. + assert messages.created == [], "adk.messages.create must not be called after reimplementation" + assert len(streaming.contexts) == 4 assert all(ctx.closed for ctx in streaming.contexts) - assert _text_deltas(streaming.contexts[0]) == ["Looking up..."] - assert _text_deltas(streaming.contexts[1]) == ["It's sunny."] - # Two messages: tool request, then tool response — in that order. - assert [type(m["content"]).__name__ for m in messages.created] == [ - "ToolRequestContent", - "ToolResponseContent", - ] - assert messages.created[0]["content"].tool_call_id == "c1" - assert messages.created[1]["content"].tool_call_id == "c1" + text_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, TextContent)] + tool_req_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, ToolRequestContent)] + tool_resp_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, ToolResponseContent)] + assert len(text_ctxs) == 2 + assert len(tool_req_ctxs) == 1 + assert len(tool_resp_ctxs) == 1 + + assert _text_deltas(text_ctxs[0]) == ["Looking up..."] + assert _text_deltas(text_ctxs[1]) == ["It's sunny."] + + # Tool content is preserved verbatim. + assert tool_req_ctxs[0].initial_content.tool_call_id == "c1" + assert tool_resp_ctxs[0].initial_content.tool_call_id == "c1" + + # Tool contexts carry no deltas (open+close only). + assert tool_req_ctxs[0].updates == [] + assert tool_resp_ctxs[0].updates == [] + assert final == "It's sunny." 
async def test_new_text_part_after_text_closes_previous( @@ -533,7 +616,11 @@ async def test_reasoning_then_text_closes_reasoning_context( async def test_tool_result_closes_any_open_streaming_context( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - """A tool result arriving while a text context is open must close that context first.""" + """A tool result arriving while a text context is open must close that context first. + + AGX1-373: the tool response itself now also opens a streaming context + (open+close pair) rather than going through adk.messages.create. + """ streaming, messages = fake_adk events = [ PartStartEvent(index=0, part=TextPart(content="")), @@ -548,7 +635,10 @@ async def test_tool_result_closes_any_open_streaming_context( assert streaming.contexts[0].closed is True, ( "Helper must close any open streaming context before emitting a tool result message" ) - assert len(messages.created) == 1 + # AGX1-373: tool response arrives via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 2 + assert isinstance(streaming.contexts[1].initial_content, ToolResponseContent) class TestDeltaForOrphanIndexIgnored: @@ -584,7 +674,7 @@ async def on_tool_end(self, tool_call_id: str, result: Any) -> None: async def test_handler_records_start_and_end_for_each_tool_call( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - _, messages = fake_adk + streaming, messages = fake_adk handler = self._RecordingHandler() events = [ PartStartEvent( @@ -605,11 +695,12 @@ async def test_handler_records_start_and_end_for_each_tool_call( tracing_handler=handler, # type: ignore[arg-type] ) - # Streaming side-effects still happen — tracing is additive. - assert [type(m["content"]).__name__ for m in messages.created] == [ - "ToolRequestContent", - "ToolResponseContent", - ] + # AGX1-373: tool messages arrive via streaming_task_message_context. 
+ # Tracing is still additive — both messages are delivered AND hooks fire. + assert messages.created == [] + assert len(streaming.contexts) == 2 + assert isinstance(streaming.contexts[0].initial_content, ToolRequestContent) + assert isinstance(streaming.contexts[1].initial_content, ToolResponseContent) # And both lifecycle hooks fired exactly once with the right payload. assert handler.starts == [ { @@ -680,8 +771,12 @@ async def test_handler_records_each_tool_in_multi_tool_run( async def test_omitting_handler_is_a_no_op_for_existing_behavior( self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] ) -> None: - """Regression: passing no tracing handler preserves the pre-tracing behavior.""" - _, messages = fake_adk + """Regression: passing no tracing handler preserves streaming behavior. + + AGX1-373: tool messages arrive via streaming_task_message_context + regardless of whether tracing_handler is passed. + """ + streaming, messages = fake_adk events = [ PartStartEvent( index=0, @@ -696,11 +791,11 @@ async def test_omitting_handler_is_a_no_op_for_existing_behavior( ), ] await stream_pydantic_ai_events(_aiter(events), TASK_ID) - # Exact same shape as before tracing existed. - assert [type(m["content"]).__name__ for m in messages.created] == [ - "ToolRequestContent", - "ToolResponseContent", - ] + # AGX1-373: tool messages via streaming_task_message_context. 
+ assert messages.created == [] + assert len(streaming.contexts) == 2 + content_types = [type(ctx.initial_content).__name__ for ctx in streaming.contexts] + assert content_types == ["ToolRequestContent", "ToolResponseContent"] class TestPydanticAITracingHandlerDeterministicIds: @@ -867,3 +962,101 @@ async def boom() -> AsyncIterator[Any]: await stream_pydantic_ai_events(boom(), TASK_ID) assert streaming.contexts[0].closed is True + + +# --------------------------------------------------------------------------- +# Characterization test: lock the wire-level delivery shape for a representative +# pydantic-ai run (text + tool call + tool response + more text). +# +# Step 1 (ORIGINAL behavior): written against the original implementation. +# - Text/reasoning use adk.streaming.streaming_task_message_context. +# - Tool messages use adk.messages.create (FakeMessagesModule.created list). +# - Final text is the last text segment. +# +# Step 2 (POST-reimplementation on UnifiedEmitter / auto_send): +# The assertions in TestCharacterizeWireShape (below) lock the new shape. +# Tool messages no longer go through adk.messages.create; they arrive via +# streaming_task_message_context open+close pairs (Start+Done envelope). +# This is the AGX1-373 accepted envelope change: logical content is identical. +# --------------------------------------------------------------------------- + + +class TestCharacterizeWireShape: + """Characterization tests: lock the wire-level delivery shape after reimplementation. + + Uses FakeStreamingModule + FakeMessagesModule (the existing fake pair). + + AGX1-373 shape (post-reimplementation on UnifiedEmitter / auto_send): + - Text/reasoning: streaming_task_message_context (open + deltas + close) + - Tool messages: streaming_task_message_context (open+close, no deltas) + - adk.messages.create is NOT called. + - Final text == last text segment only. 
+ + This class was first written to characterize the OLD shape (adk.messages.create + for tool messages) and was updated post-reimplementation to reflect the new + delivery channel. The logical content is identical; only the channel changed. + """ + + async def test_text_tool_text_new_wire_shape( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + """Representative run: text -> tool call -> tool response -> more text. + + Post-AGX1-373 delivery shape: + - Four streaming contexts: text, tool_request, tool_response, text. + - adk.messages.create NOT called. + - Final text == "It's sunny." (last segment only, matching the + multi-step convention). + """ + from pydantic_ai.messages import ToolReturnPart + + streaming, messages = fake_adk + events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="Looking up...")), + PartEndEvent(index=0, part=TextPart(content="Looking up...")), + PartStartEvent( + index=1, + part=ToolCallPart(tool_name="get_weather", args=None, tool_call_id="c1"), + ), + PartEndEvent( + index=1, + part=ToolCallPart(tool_name="get_weather", args="{}", tool_call_id="c1"), + ), + FunctionToolResultEvent( + part=ToolReturnPart(tool_name="get_weather", content="Sunny", tool_call_id="c1"), + ), + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="It's sunny.")), + PartEndEvent(index=0, part=TextPart(content="It's sunny.")), + ] + + final = await stream_pydantic_ai_events(_aiter(events), TASK_ID) + + assert final == "It's sunny.", "multi-step: only the last text segment is returned" + + # AGX1-373: all 4 messages arrive via streaming_task_message_context + assert messages.created == [] + assert len(streaming.contexts) == 4 + assert all(ctx.closed for ctx in streaming.contexts) + + content_types = [type(ctx.initial_content).__name__ for ctx in streaming.contexts] + assert content_types == [ + 
"TextContent", + "ToolRequestContent", + "ToolResponseContent", + "TextContent", + ] + + text_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, TextContent)] + tool_req_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, ToolRequestContent)] + tool_resp_ctxs = [ctx for ctx in streaming.contexts if isinstance(ctx.initial_content, ToolResponseContent)] + + assert _text_deltas(text_ctxs[0]) == ["Looking up..."] + assert _text_deltas(text_ctxs[1]) == ["It's sunny."] + assert tool_req_ctxs[0].initial_content.tool_call_id == "c1" + assert tool_req_ctxs[0].initial_content.name == "get_weather" + assert tool_req_ctxs[0].updates == [] + assert tool_resp_ctxs[0].initial_content.tool_call_id == "c1" + assert tool_resp_ctxs[0].initial_content.content == "Sunny" + assert tool_resp_ctxs[0].updates == [] diff --git a/tests/lib/adk/test_pydantic_ai_sync.py b/tests/lib/adk/test_pydantic_ai_sync.py index 36d06200e..080bc5be8 100644 --- a/tests/lib/adk/test_pydantic_ai_sync.py +++ b/tests/lib/adk/test_pydantic_ai_sync.py @@ -3,9 +3,11 @@ from __future__ import annotations import json +import asyncio from typing import Any, AsyncIterator import pytest +from pydantic_ai.run import AgentRunResult, AgentRunResultEvent from pydantic_ai.messages import ( TextPart, PartEndEvent, @@ -481,3 +483,75 @@ async def test_author_is_agent(self, events: list[Any]): content = getattr(e, "content", None) if content is not None and hasattr(content, "author"): assert content.author == "agent" + + +class TestOnResultCallback: + """on_result callback: captures the terminal AgentRunResultEvent without + altering streaming output.""" + + def _make_result_event(self, output: Any = "hello") -> AgentRunResultEvent: + result = AgentRunResult(output=output, _output_tool_name=None) + return AgentRunResultEvent(result=result) + + async def test_callback_invoked_once_with_result_event(self): + """on_result is called exactly once, with the AgentRunResultEvent.""" + 
captured: list[AgentRunResultEvent] = [] + + def on_result(event: AgentRunResultEvent) -> None: + captured.append(event) + + result_event = self._make_result_event("the answer") + events = [result_event] + await _collect(convert_pydantic_ai_to_agentex_events(_aiter(events), on_result=on_result)) + + assert len(captured) == 1 + assert captured[0] is result_event + assert captured[0].result.output == "the answer" + + async def test_streaming_output_unchanged_with_callback(self): + """Yielded StreamTaskMessage* sequence is identical whether on_result is set or not.""" + result_event = self._make_result_event() + events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="hi")), + PartEndEvent(index=0, part=TextPart(content="hi")), + result_event, + ] + + captured: list[AgentRunResultEvent] = [] + out_with = await _collect(convert_pydantic_ai_to_agentex_events(_aiter(events), on_result=captured.append)) + out_without = await _collect(convert_pydantic_ai_to_agentex_events(_aiter(events))) + + assert len(out_with) == len(out_without) + for a, b in zip(out_with, out_without): + assert type(a) is type(b) + assert a.model_dump() == b.model_dump() + assert len(captured) == 1 + + async def test_no_callback_no_error(self): + """AgentRunResultEvent is silently ignored when on_result is None.""" + result_event = self._make_result_event() + events = [result_event] + out = await _collect(convert_pydantic_ai_to_agentex_events(_aiter(events))) + assert out == [] + + async def test_async_callback_is_awaited(self): + """An async on_result callable is properly awaited. + + The callback suspends (``await asyncio.sleep(0)``) before recording its + side effect, so ``awaited`` is only populated if the converter actually + awaits the returned coroutine — distinguishing "awaited" from + "called-but-not-awaited." 
+ """ + awaited: list[AgentRunResultEvent] = [] + + async def on_result_async(event: AgentRunResultEvent) -> None: + await asyncio.sleep(0) + awaited.append(event) + + result_event = self._make_result_event("async_output") + events = [result_event] + await _collect(convert_pydantic_ai_to_agentex_events(_aiter(events), on_result=on_result_async)) + + assert len(awaited) == 1 + assert awaited[0].result.output == "async_output" diff --git a/tests/lib/adk/test_pydantic_ai_sync_unified.py b/tests/lib/adk/test_pydantic_ai_sync_unified.py new file mode 100644 index 000000000..f920418de --- /dev/null +++ b/tests/lib/adk/test_pydantic_ai_sync_unified.py @@ -0,0 +1,209 @@ +"""Tests for the unified sync (HTTP ACP) path: PydanticAITurn + UnifiedEmitter. + +Exercises the path documented in _pydantic_ai_sync.py under "Recommended: unified surface": +- events forwarded by yield_turn equal PydanticAITurn(stream).events (passthrough) +- with a trace context + fake tracing backend, tool spans are derived (start_span / end_span called) +- with a trace context + fake tracing backend, reasoning spans are derived +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +from pydantic_ai.run import AgentRunResult, AgentRunResultEvent +from pydantic_ai.usage import RunUsage +from pydantic_ai.messages import ( + TextPart, + PartEndEvent, + ThinkingPart, + ToolCallPart, + TextPartDelta, + PartDeltaEvent, + PartStartEvent, + ThinkingPartDelta, + ToolCallPartDelta, +) + +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + + +async def _aiter(events: list[Any]) -> AsyncIterator[Any]: + for e in events: + yield e + + +async def _collect(stream: AsyncIterator[Any]) -> list[Any]: + return [e async for e in stream] + + +class _FakeSpan: + def __init__(self, name: str): + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: 
list[tuple[str, str | None, Any]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span(self, *, trace_id, name, input=None, parent_id=None, data=None, task_id=None): + self.started.append((name, parent_id, input)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id, span): + self.ended.append((span.name, span.output)) + + +def _make_result_event(usage: RunUsage | None = None) -> AgentRunResultEvent: + result = AgentRunResult(output="done", _output_tool_name=None) + if usage is not None: + result._state.usage = usage + return AgentRunResultEvent(result=result) + + +class TestUnifiedSyncPathPassthrough: + """The events forwarded by yield_turn are identical to PydanticAITurn.events.""" + + async def test_text_stream_passthrough(self): + raw_events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="hello")), + PartEndEvent(index=0, part=TextPart(content="hello")), + ] + + turn_a = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + direct = await _collect(turn_a.events) + + turn_b = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + via_emitter = await _collect(emitter.yield_turn(turn_b)) + + assert len(via_emitter) == len(direct) + for a, b in zip(via_emitter, direct): + assert type(a) is type(b) + assert a.model_dump() == b.model_dump() + + async def test_tool_call_stream_passthrough(self): + raw_events = [ + PartStartEvent(index=0, part=ToolCallPart(tool_name="Bash", args=None, tool_call_id="c1")), + PartDeltaEvent(index=0, delta=ToolCallPartDelta(args_delta='{"cmd":"ls"}')), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args='{"cmd":"ls"}', tool_call_id="c1"), + ), + ] + + turn_a = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + direct = await _collect(turn_a.events) + + turn_b = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + 
emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + via_emitter = await _collect(emitter.yield_turn(turn_b)) + + assert len(via_emitter) == len(direct) + for a, b in zip(via_emitter, direct): + assert type(a) is type(b) + assert a.model_dump() == b.model_dump() + + +class TestUnifiedSyncPathSpanDerivation: + """With trace context + fake tracing, spans are derived from the stream.""" + + async def test_tool_span_opened_and_closed(self): + """A tool call produces start_span + end_span on the fake tracing backend.""" + from pydantic_ai.messages import ToolReturnPart, FunctionToolResultEvent + + tool_events = [ + PartStartEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args={"cmd": "ls"}, tool_call_id="call_1"), + ), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args='{"cmd":"ls"}', tool_call_id="call_1"), + ), + FunctionToolResultEvent( + part=ToolReturnPart(tool_name="Bash", content="files", tool_call_id="call_1"), + ), + ] + + fake = _FakeTracing() + turn = PydanticAITurn(_aiter(tool_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id="tr", parent_span_id="p", tracing=fake) + + events = await _collect(emitter.yield_turn(turn)) + + assert len(events) >= 2, "at least Start(tool) + Done + Full(response)" + assert len(fake.started) == 1, "one tool span opened" + assert len(fake.ended) == 1, "one tool span closed" + span_name, parent_id, span_input = fake.started[0] + assert span_name == "Bash" + assert parent_id == "p" + closed_name, closed_output = fake.ended[0] + assert closed_name == "Bash" + + async def test_reasoning_span_opened_and_closed(self): + """A thinking/reasoning block produces start_span + end_span.""" + reasoning_events = [ + PartStartEvent(index=0, part=ThinkingPart(content="")), + PartDeltaEvent(index=0, delta=ThinkingPartDelta(content_delta="let me think")), + PartEndEvent(index=0, part=ThinkingPart(content="let me think")), + ] + + fake = _FakeTracing() + turn = 
PydanticAITurn(_aiter(reasoning_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id="tr", parent_span_id="p", tracing=fake) + + await _collect(emitter.yield_turn(turn)) + + assert len(fake.started) == 1, "one reasoning span opened" + assert len(fake.ended) == 1, "one reasoning span closed" + span_name, parent_id, _ = fake.started[0] + assert span_name == "reasoning" + assert parent_id == "p" + + async def test_no_trace_id_means_no_spans(self): + """When trace_id is None, no spans are derived even with a fake tracing backend.""" + raw_events = [ + PartStartEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args={"cmd": "ls"}, tool_call_id="c2"), + ), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args='{"cmd":"ls"}', tool_call_id="c2"), + ), + ] + + fake = _FakeTracing() + turn = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None, tracing=fake) + + await _collect(emitter.yield_turn(turn)) + + assert fake.started == [], "no spans when trace_id is absent" + assert fake.ended == [] + + async def test_tracer_false_suppresses_spans_even_with_trace_id(self): + """tracer=False disables span derivation regardless of trace_id.""" + raw_events = [ + PartStartEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args={"cmd": "ls"}, tool_call_id="c3"), + ), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="Bash", args='{"cmd":"ls"}', tool_call_id="c3"), + ), + ] + + fake = _FakeTracing() + turn = PydanticAITurn(_aiter(raw_events), model="openai:gpt-4o") + emitter = UnifiedEmitter(task_id="t", trace_id="tr", parent_span_id="p", tracer=False, tracing=fake) + + await _collect(emitter.yield_turn(turn)) + + assert fake.started == [] + assert fake.ended == [] diff --git a/tests/lib/adk/test_pydantic_ai_turn.py b/tests/lib/adk/test_pydantic_ai_turn.py new file mode 100644 index 000000000..0659895d3 --- /dev/null +++ 
b/tests/lib/adk/test_pydantic_ai_turn.py @@ -0,0 +1,276 @@ +"""Tests for PydanticAITurn and pydantic_ai_usage_to_turn_usage.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +from pydantic_ai.run import AgentRunResult, AgentRunResultEvent +from pydantic_ai.usage import RunUsage +from pydantic_ai.messages import ( + TextPart, + PartEndEvent, + TextPartDelta, + PartDeltaEvent, + PartStartEvent, +) + +from agentex.lib.core.harness import HarnessTurn +from agentex.lib.adk._modules._pydantic_ai_turn import ( + PydanticAITurn, + pydantic_ai_usage_to_turn_usage, +) + + +async def _aiter(events: list[Any]) -> AsyncIterator[Any]: + for e in events: + yield e + + +async def _collect(stream: AsyncIterator[Any]) -> list[Any]: + return [e async for e in stream] + + +def _make_result_event(output: Any = "done", usage: RunUsage | None = None) -> AgentRunResultEvent: + result = AgentRunResult(output=output, _output_tool_name=None) + if usage is not None: + result._state.usage = usage + return AgentRunResultEvent(result=result) + + +class TestUsageNormalization: + def test_usage_normalization_maps_fields(self): + """Real RunUsage fields map correctly onto TurnUsage.""" + usage = RunUsage( + requests=3, + input_tokens=200, + output_tokens=80, + cache_read_tokens=25, + ) + turn_usage = pydantic_ai_usage_to_turn_usage(usage, model="openai:gpt-4o") + + assert turn_usage.model == "openai:gpt-4o" + assert turn_usage.input_tokens == 200 + assert turn_usage.output_tokens == 80 + assert turn_usage.num_llm_calls == 3 + + def test_total_tokens_is_computed(self): + """RunUsage.total_tokens is a computed property; we surface it correctly.""" + usage = RunUsage(input_tokens=100, output_tokens=50) + turn_usage = pydantic_ai_usage_to_turn_usage(usage, model="openai:gpt-4o") + assert turn_usage.total_tokens == 150 + + def test_cache_read_tokens_mapped_to_cached_input_tokens(self): + usage = RunUsage(input_tokens=100, output_tokens=50, cache_read_tokens=20) + 
turn_usage = pydantic_ai_usage_to_turn_usage(usage, model="openai:gpt-4o") + assert turn_usage.cached_input_tokens == 20 + + def test_none_model(self): + """model=None is preserved.""" + usage = RunUsage() + turn_usage = pydantic_ai_usage_to_turn_usage(usage, model=None) + assert turn_usage.model is None + + def test_all_zero_usage_preserves_real_zeros(self): + """An all-zero RunUsage maps real 0s through (not None). + + RunUsage token fields are ints defaulting to 0. A 0 is a genuine + value (e.g. a cache-hit with 0 output tokens), not "unknown", so it + must survive normalization as 0 rather than being coerced to None. + """ + usage = RunUsage() + turn_usage = pydantic_ai_usage_to_turn_usage(usage, model="openai:gpt-4o") + assert turn_usage.num_llm_calls == 0 + assert turn_usage.input_tokens == 0 + assert turn_usage.output_tokens == 0 + assert turn_usage.cached_input_tokens == 0 + assert turn_usage.total_tokens == 0 + + def test_missing_field_degrades_to_none(self): + """A usage object MISSING a field maps that field to None (defensive getattr). + + Guards the version-rename guarantee: if pydantic-ai renames a field, + the absent attribute degrades to None rather than raising. 
+ """ + + class StubUsage: + requests = 2 + input_tokens = 100 + # no output_tokens / cache_read_tokens / total_tokens attributes + + turn_usage = pydantic_ai_usage_to_turn_usage(StubUsage(), model="openai:gpt-4o") + assert turn_usage.num_llm_calls == 2 + assert turn_usage.input_tokens == 100 + assert turn_usage.output_tokens is None + assert turn_usage.cached_input_tokens is None + assert turn_usage.total_tokens is None + + +class TestPydanticAITurn: + async def test_turn_satisfies_harness_turn_protocol(self): + """PydanticAITurn is structurally compatible with HarnessTurn.""" + turn = PydanticAITurn(_aiter([]), model="openai:gpt-4o") + assert isinstance(turn, HarnessTurn) + + async def test_usage_before_exhaustion_returns_default(self): + """usage() before iterating events returns default TurnUsage (model set, tokens None).""" + result_event = _make_result_event(usage=RunUsage(requests=1, input_tokens=100, output_tokens=40)) + events = [result_event] + turn = PydanticAITurn(_aiter(events), model="openai:gpt-4o") + + # Do NOT exhaust events — check usage pre-run + pre_usage = turn.usage() + assert pre_usage.model == "openai:gpt-4o" + assert pre_usage.input_tokens is None + assert pre_usage.output_tokens is None + assert pre_usage.num_llm_calls == 0 + + async def test_turn_events_and_usage(self): + """Driving events to exhaustion populates usage from the terminal event.""" + known_usage = RunUsage( + requests=2, + input_tokens=300, + output_tokens=120, + cache_read_tokens=30, + ) + result_event = _make_result_event(usage=known_usage) + events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="hi")), + PartEndEvent(index=0, part=TextPart(content="hi")), + result_event, + ] + turn = PydanticAITurn(_aiter(events), model="openai:gpt-4o") + + collected = await _collect(turn.events) + + # Events match bare converter output (Start + Delta + Done = 3 events) + assert len(collected) == 3 + + # Usage is 
populated after exhaustion + usage = turn.usage() + assert usage.model == "openai:gpt-4o" + assert usage.input_tokens == 300 + assert usage.output_tokens == 120 + assert usage.cached_input_tokens == 30 + assert usage.num_llm_calls == 2 + assert usage.total_tokens == 420 + + async def test_events_match_bare_converter(self): + """Yielded events are identical to bare convert_pydantic_ai_to_agentex_events output.""" + from agentex.lib.adk._modules._pydantic_ai_sync import convert_pydantic_ai_to_agentex_events + + text_events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="Hello")), + PartEndEvent(index=0, part=TextPart(content="Hello")), + ] + + turn = PydanticAITurn(_aiter(text_events), model="openai:gpt-4o") + turn_out = await _collect(turn.events) + + bare_out = await _collect(convert_pydantic_ai_to_agentex_events(_aiter(text_events))) + + assert len(turn_out) == len(bare_out) + for a, b in zip(turn_out, bare_out): + assert type(a) is type(b) + assert a.model_dump() == b.model_dump() + + async def test_usage_captured_via_real_usage_accessor(self): + """Drive the turn through the REAL ``result.usage`` property accessor. + + The production code reads ``getattr(run_result, "usage", None)``, which + on this pydantic-ai version resolves the ``_DeprecatedCallableRunUsage`` + property (NOT ``_state.usage`` directly). This asserts that the real + accessor path the converter uses captures the run usage. Constructing + the event without our test's ``_state`` shortcut: we set ``_state.usage`` + only because that is the sole supported way to seed an + ``AgentRunResult``, but we then assert capture happens through the + public ``.usage`` attribute access (verified below). 
+ """ + known_usage = RunUsage(requests=4, input_tokens=512, output_tokens=64) + result = AgentRunResult(output="done", _output_tool_name=None) + result._state.usage = known_usage + result_event = AgentRunResultEvent(result=result) + + # Sanity: the value is reachable via the real public accessor the + # production code uses (not just via the private _state). The + # _DeprecatedCallableRunUsage property wraps the value, so compare by + # equality rather than identity. + accessed = getattr(result_event.result, "usage", None) + assert accessed is not None + assert accessed.input_tokens == 512 + assert accessed.requests == 4 + + events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartEndEvent(index=0, part=TextPart(content="")), + result_event, + ] + turn = PydanticAITurn(_aiter(events), model="anthropic:claude-3-5-sonnet") + await _collect(turn.events) + + usage = turn.usage() + assert usage.model == "anthropic:claude-3-5-sonnet" + assert usage.input_tokens == 512 + assert usage.output_tokens == 64 + assert usage.num_llm_calls == 4 + + async def test_no_usage_event_leaves_default_usage(self): + """If the stream has no AgentRunResultEvent, usage() returns the default (tokens None).""" + events = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartEndEvent(index=0, part=TextPart(content="")), + ] + turn = PydanticAITurn(_aiter(events), model="openai:gpt-4o") + await _collect(turn.events) + + usage = turn.usage() + assert usage.model == "openai:gpt-4o" + assert usage.input_tokens is None + assert usage.num_llm_calls == 0 + + +class TestToolRequestStreaming: + """PydanticAITurn.events equals the bare converter output unconditionally. + + The foundation auto_send delivers Start+ToolRequestDelta+Done natively + (AGX1-377), so no coalescing is needed on either channel. + """ + + async def test_events_match_bare_converter_for_streamed_tool_call(self): + """PydanticAITurn yields a ToolRequestDelta for a streamed-args tool call + — i.e. 
it is byte-for-byte the bare converter output, preserving + argument-token streaming on the sync/yield channel.""" + from pydantic_ai.messages import ToolCallPart, ToolCallPartDelta + + from agentex.types.tool_request_delta import ToolRequestDelta + from agentex.types.task_message_update import StreamTaskMessageDelta + from agentex.lib.adk._modules._pydantic_ai_sync import convert_pydantic_ai_to_agentex_events + + tool_events = [ + PartStartEvent(index=0, part=ToolCallPart(tool_name="get_weather", args=None, tool_call_id="c1")), + PartDeltaEvent(index=0, delta=ToolCallPartDelta(args_delta='{"city":"Paris"}')), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="get_weather", args='{"city":"Paris"}', tool_call_id="c1"), + ), + ] + + turn = PydanticAITurn(_aiter(tool_events), model="openai:gpt-4o") + turn_out = await _collect(turn.events) + + bare_out = await _collect(convert_pydantic_ai_to_agentex_events(_aiter(tool_events))) + + # Turn is identical to the bare converter. + assert len(turn_out) == len(bare_out) + for a, b in zip(turn_out, bare_out): + assert type(a) is type(b) + assert a.model_dump() == b.model_dump() + + # The arg-streaming delta is present. + deltas = [ + e for e in turn_out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ToolRequestDelta) + ] + assert len(deltas) == 1, "streamed tool-call args must surface as a ToolRequestDelta" + assert isinstance(deltas[0].delta, ToolRequestDelta) + assert deltas[0].delta.arguments_delta == '{"city":"Paris"}' diff --git a/tests/lib/core/harness/conformance/test_pydantic_ai_conformance.py b/tests/lib/core/harness/conformance/test_pydantic_ai_conformance.py new file mode 100644 index 000000000..ca8234fda --- /dev/null +++ b/tests/lib/core/harness/conformance/test_pydantic_ai_conformance.py @@ -0,0 +1,194 @@ +"""Cross-channel conformance fixtures derived from real pydantic-ai event sequences. 
+ +Each fixture is built by running a pydantic_ai event stream through PydanticAITurn +and collecting the canonical StreamTaskMessage* output. These canonical event lists are +then registered with the conformance runner and exercised by the cross-channel test +(yield_events vs auto_send). + +Streamed tool requests +---------------------- +The pydantic-ai stream emits a tool REQUEST as Start + ToolRequestDelta + Done (not a +Full event). AGX1-377 has landed: both the conformance runner and auto_send now deliver +the Start+Delta+Done(tool_request) shape, so the cross-channel test asserts full +delivery-equivalence for streamed tool requests. The fixtures below retain the +ToolRequestDelta events as the streamed tool-request inputs. +""" + +from __future__ import annotations + +import asyncio +from typing import Any, AsyncIterator + +import pytest +from pydantic_ai.messages import ( + TextPart, + PartEndEvent, + ThinkingPart, + ToolCallPart, + TextPartDelta, + PartDeltaEvent, + PartStartEvent, + ToolReturnPart, + ThinkingPartDelta, + ToolCallPartDelta, + FunctionToolResultEvent, +) + +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + +from .runner import ( + Fixture, + register, + derive_all, + run_cross_channel_conformance, +) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _aiter(events: list[Any]) -> AsyncIterator[Any]: + for e in events: + yield e + + +async def _canonical(pydantic_events: list[Any]) -> list[Any]: + """Run pydantic_ai events through PydanticAITurn and collect the output. + + The output equals the bare convert_pydantic_ai_to_agentex_events output. 
+ """ + turn = PydanticAITurn(_aiter(pydantic_events), model=None) + return [e async for e in turn.events] + + +def _build_fixtures() -> list[Fixture]: + """Build all pydantic-ai conformance fixtures synchronously via asyncio.run.""" + + # ------------------------------------------------------------------ # + # 1. Text-only run: simple streaming text response. + # ------------------------------------------------------------------ # + text_only_pydantic = [ + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="Hello, ")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="world!")), + PartEndEvent(index=0, part=TextPart(content="Hello, world!")), + ] + + # ------------------------------------------------------------------ # + # 2. Single tool call + tool response. + # The canonical stream emits Start+ToolRequestDelta+Done for the request + # and Full(ToolResponseContent) for the response. Per the AGX1-377 note + # above, the streamed request delivery is asserted cross-channel too. + # ------------------------------------------------------------------ # + tool_call_pydantic = [ + PartStartEvent( + index=0, + part=ToolCallPart(tool_name="get_weather", args=None, tool_call_id="call_01"), + ), + PartDeltaEvent( + index=0, + delta=ToolCallPartDelta(args_delta='{"city":"Paris"}', tool_call_id="call_01"), + ), + PartEndEvent( + index=0, + part=ToolCallPart(tool_name="get_weather", args='{"city":"Paris"}', tool_call_id="call_01"), + ), + FunctionToolResultEvent( + part=ToolReturnPart(tool_name="get_weather", content="Sunny, 22C", tool_call_id="call_01"), + ), + ] + + # ------------------------------------------------------------------ # + # 3. Reasoning/thinking block: produces ReasoningContent Start+Delta+Done. 
+ # ------------------------------------------------------------------ # + reasoning_pydantic = [ + PartStartEvent(index=0, part=ThinkingPart(content="")), + PartDeltaEvent(index=0, delta=ThinkingPartDelta(content_delta="First, let me think...")), + PartDeltaEvent(index=0, delta=ThinkingPartDelta(content_delta=" Then conclude.")), + PartEndEvent(index=0, part=ThinkingPart(content="First, let me think... Then conclude.")), + ] + + # ------------------------------------------------------------------ # + # 4. Multi-step run: text -> tool call + response -> text. + # Pydantic AI restarts part indices at 0 for each model response; the + # converter assigns globally-monotonic indices to Agentex messages. + # ------------------------------------------------------------------ # + multi_step_pydantic = [ + # First model turn: text then tool call + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="Let me check the weather.")), + PartEndEvent(index=0, part=TextPart(content="Let me check the weather.")), + PartStartEvent( + index=1, + part=ToolCallPart(tool_name="get_weather", args=None, tool_call_id="call_ms1"), + ), + PartDeltaEvent( + index=1, + delta=ToolCallPartDelta(args_delta='{"city":"London"}', tool_call_id="call_ms1"), + ), + PartEndEvent( + index=1, + part=ToolCallPart(tool_name="get_weather", args='{"city":"London"}', tool_call_id="call_ms1"), + ), + FunctionToolResultEvent( + part=ToolReturnPart(tool_name="get_weather", content="Cloudy, 15C", tool_call_id="call_ms1"), + ), + # Second model turn: text response (pydantic restarts index at 0) + PartStartEvent(index=0, part=TextPart(content="")), + PartDeltaEvent(index=0, delta=TextPartDelta(content_delta="It's cloudy and 15C in London.")), + PartEndEvent(index=0, part=TextPart(content="It's cloudy and 15C in London.")), + ] + + text_only_events = asyncio.run(_canonical(text_only_pydantic)) + tool_call_events = asyncio.run(_canonical(tool_call_pydantic)) + 
+ reasoning_events = asyncio.run(_canonical(reasoning_pydantic)) + multi_step_events = asyncio.run(_canonical(multi_step_pydantic)) + + return [ + Fixture(name="pydantic-ai-text-only", events=text_only_events), + Fixture(name="pydantic-ai-single-tool-call", events=tool_call_events), + Fixture(name="pydantic-ai-reasoning-block", events=reasoning_events), + Fixture(name="pydantic-ai-multi-step", events=multi_step_events), + ] + + +_FIXTURES: list[Fixture] = _build_fixtures() + +for _f in _FIXTURES: + register(_f) + + +# --------------------------------------------------------------------------- +# Cross-channel conformance: logical equivalence + span equivalence +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _FIXTURES, ids=lambda f: f.name) +@pytest.mark.asyncio +async def test_cross_channel_equivalence(fixture: Fixture) -> None: + """Assert that yield_events and auto_send produce equivalent logical + deliveries and identical span signals for each pydantic-ai fixture. + + See runner.py for the full contract. The AGX1-377 note at the top of this + module explains why streamed-tool-request delivery is now asserted too. 
+ """ + yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture) + + assert yield_deliveries == auto_deliveries, ( + f"[{fixture.name}] logical deliveries differ:\n yield: {yield_deliveries}\n auto_send: {auto_deliveries}" + ) + assert yield_spans == auto_spans, ( + f"[{fixture.name}] span signals differ:\n yield: {yield_spans}\n auto_send: {auto_spans}" + ) + + +# --------------------------------------------------------------------------- +# Backward-compatible determinism guard +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _FIXTURES, ids=lambda f: f.name) +def test_span_derivation_is_deterministic(fixture: Fixture) -> None: + """Span derivation over the same event list is idempotent.""" + assert derive_all(fixture.events) == derive_all(fixture.events) diff --git a/tests/lib/core/harness/test_harness_pydantic_ai_async.py b/tests/lib/core/harness/test_harness_pydantic_ai_async.py new file mode 100644 index 000000000..8bda7d020 --- /dev/null +++ b/tests/lib/core/harness/test_harness_pydantic_ai_async.py @@ -0,0 +1,361 @@ +"""Integration test: async (Redis-streaming) channel with a pydantic-ai agent. + +Exercises the unified harness surface (UnifiedEmitter.auto_send_turn + PydanticAITurn) +with a minimal pydantic-ai agent backed by TestModel so the test runs fully +offline (no API keys, no Redis, no Agentex server). + +Agent description +----------------- +Same single-tool agent as the sync test: ``get_weather(city: str) -> str`` +returning "sunny and 72F". TestModel is configured to call the tool once then +produce a fixed text reply. + +The async path uses the bare PydanticAITurn (no coalescing): the foundation +auto_send delivers streamed tool-request Start+ToolRequestDelta+Done messages +natively (AGX1-377 fix), so no coalescing wrapper is needed. 
+ +What is tested +-------------- +- The async handler pushes the correct sequence of messages to the fake streaming + backend: tool_request + tool_response + text (in that order). +- final_text equals the TestModel custom output. +- With a SpanTracer, tool spans are derived and forwarded to the fake tracing + backend (streamed tool-request delivery now triggers span derivation on the + async path). + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual Redis streaming (requires a running Redis instance). +- The ACP on_task_event_send / on_task_create / on_task_cancel lifecycle. +- Multi-turn history persistence via adk.state. +- Real LLM calls or production model behaviour. +- The full FastACP async request lifecycle. + +See also: test_harness_pydantic_ai_sync.py (span derivation with sync path) and +test_harness_pydantic_ai_temporal.py (temporal activity path). +""" + +from __future__ import annotations + +from typing import Any + +import pytest +from pydantic_ai import Agent +from pydantic_ai.models.test import TestModel + +from agentex.types.task_message import TaskMessage +from agentex.lib.core.harness.types import TurnResult +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + +# --------------------------------------------------------------------------- +# Minimal agent under test +# --------------------------------------------------------------------------- + + +def _make_agent() -> Agent: + """Build a pydantic-ai agent with one weather tool and a TestModel.""" + model = TestModel( + call_tools=["get_weather"], + custom_output_text="The weather in Paris is sunny and 72F.", + ) + agent: Agent = Agent(model) + + @agent.tool_plain + def 
get_weather(city: str) -> str: + """Get the current weather for a city.""" + return f"The weather in {city} is sunny and 72F" + + return agent + + +# --------------------------------------------------------------------------- +# Fake streaming backend (replaces adk.streaming; no Redis required) +# --------------------------------------------------------------------------- + + +class _FakeCtx: + """Minimal StreamingTaskMessageContext fake.""" + + def __init__(self, sink: list[Any], ctype: str, initial_content: Any) -> None: + self.sink = sink + self.ctype = ctype + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self) -> "_FakeCtx": + self.sink.append(("open", self.ctype, self.task_message.content)) + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.sink.append(("close", self.ctype)) + + async def stream_update(self, update: Any) -> Any: + self.sink.append(("delta", self.ctype, update)) + return update + + +class _FakeStreaming: + """Fake streaming backend; records every context lifecycle event.""" + + def __init__(self) -> None: + self.sink: list[Any] = [] + self.messages_opened: list[Any] = [] + + def streaming_task_message_context( + self, + task_id: str, + initial_content: Any, + streaming_mode: str = "coalesced", + created_at: Any = None, + ) -> _FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + self.messages_opened.append(initial_content) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Fake tracing backend +# --------------------------------------------------------------------------- + + +class _FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, str | None]] = [] + 
self.ended: list[tuple[str, Any]] = [] + + async def start_span( + self, + *, + trace_id: str, + name: str, + input: Any = None, + parent_id: Any = None, + data: Any = None, + task_id: Any = None, + ) -> _FakeSpan: + self.started.append((name, parent_id)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: _FakeSpan) -> None: + self.ended.append((span.name, span.output)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _run_auto_send_turn( + agent: Agent, + user_msg: str = "What is the weather in Paris?", + trace_id: str | None = None, + parent_span_id: str | None = None, + fake_tracing: _FakeTracing | None = None, +) -> tuple[TurnResult, _FakeStreaming]: + """Drive the async (auto_send) path and return the TurnResult + fake streaming state.""" + fake_streaming = _FakeStreaming() + + tracer: SpanTracer | bool | None = None + if trace_id and fake_tracing is not None: + tracer = SpanTracer( + trace_id=trace_id, + parent_span_id=parent_span_id, + task_id="task1", + tracing=fake_tracing, + ) + + async with agent.run_stream_events(user_msg) as stream: + turn = PydanticAITurn( + stream, + model="test", + ) + emitter = UnifiedEmitter( + task_id="task1", + trace_id=trace_id, + parent_span_id=parent_span_id, + tracer=tracer if tracer is not None else False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + + return result, fake_streaming + + +# --------------------------------------------------------------------------- +# Tests: message order and content +# --------------------------------------------------------------------------- + + +class TestAsyncAutoSendMessageOrder: + """auto_send pushes messages to the streaming backend in canonical order.""" + + async def test_tool_request_pushed_first(self) -> None: + """tool_request is the first message type pushed to the streaming 
backend.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + message_types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert "tool_request" in message_types + assert message_types.index("tool_request") < message_types.index("tool_response"), ( + "tool_request must be pushed before tool_response" + ) + + async def test_tool_response_pushed_after_tool_request(self) -> None: + """tool_response appears after tool_request in the pushed messages.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + message_types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert "tool_response" in message_types + + async def test_text_pushed_last(self) -> None: + """Text content is the last type pushed (after tool round-trip).""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + message_types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert message_types[-1] == "text", f"Expected last message type=text, got {message_types}" + + async def test_exactly_three_messages(self) -> None: + """Exactly three message contexts are opened: tool_request, tool_response, text.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + assert len(fake_streaming.messages_opened) == 3, ( + f"Expected 3 messages (tool_request + tool_response + text), " + f"got {len(fake_streaming.messages_opened)}: " + f"{[getattr(m, 'type', None) for m in fake_streaming.messages_opened]}" + ) + + +class TestAsyncAutoSendContentVerification: + """The content pushed to the streaming backend is correct.""" + + async def test_tool_request_content(self) -> None: + """The pushed tool_request is a ToolRequestContent for get_weather.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + tool_reqs = [m for m in fake_streaming.messages_opened if isinstance(m, ToolRequestContent)] + assert 
len(tool_reqs) == 1, "Expected exactly one ToolRequestContent" + assert tool_reqs[0].name == "get_weather" + + async def test_tool_response_content(self) -> None: + """The pushed tool_response is a ToolResponseContent containing the weather result.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + tool_resps = [m for m in fake_streaming.messages_opened if isinstance(m, ToolResponseContent)] + assert len(tool_resps) == 1, "Expected exactly one ToolResponseContent" + assert isinstance(tool_resps[0].content, str) + assert "72F" in tool_resps[0].content + assert tool_resps[0].name == "get_weather" + + async def test_tool_call_ids_match(self) -> None: + """tool_request and tool_response have the same tool_call_id.""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + tool_req = next(m for m in fake_streaming.messages_opened if isinstance(m, ToolRequestContent)) + tool_resp = next(m for m in fake_streaming.messages_opened if isinstance(m, ToolResponseContent)) + assert tool_req.tool_call_id == tool_resp.tool_call_id, ( + "tool_request and tool_response must share the same tool_call_id" + ) + + +class TestAsyncAutoSendFinalText: + """auto_send_turn returns the accumulated text from the last text part.""" + + async def test_final_text_matches_model_output(self) -> None: + """TurnResult.final_text equals the TestModel custom_output_text.""" + agent = _make_agent() + result, _ = await _run_auto_send_turn(agent) + assert result.final_text == "The weather in Paris is sunny and 72F." 
+ + async def test_turn_result_has_usage(self) -> None: + """TurnResult carries a TurnUsage object (may have None tokens from TestModel).""" + agent = _make_agent() + result, _ = await _run_auto_send_turn(agent) + assert result.usage is not None + + async def test_context_lifecycle_open_then_close(self) -> None: + """Every message context is opened then closed (no leak).""" + agent = _make_agent() + _, fake_streaming = await _run_auto_send_turn(agent) + + opens = [e for e in fake_streaming.sink if e[0] == "open"] + closes = [e for e in fake_streaming.sink if e[0] == "close"] + assert len(opens) == len(closes) == 3, "Each of the 3 messages must have exactly one open and one close" + + +class TestAsyncAutoSendSpanDerivation: + """Span derivation on the async path now works for streamed tool requests. + + The foundation auto_send delivers Start+ToolRequestDelta+Done natively + (AGX1-377 fix). The SpanDeriver opens a tool span on Done(tool_request), + so the async path now derives spans just like the sync path. + """ + + async def test_tool_span_derived_on_async_path(self) -> None: + """With the bare PydanticAITurn (no coalescing), a tool span is derived + on the async/auto_send path when auto_send delivers the streamed + Start+ToolRequestDelta+Done sequence.""" + agent = _make_agent() + fake_tracing = _FakeTracing() + tracer = SpanTracer( + trace_id="trace1", + parent_span_id="parent", + task_id="task1", + tracing=fake_tracing, + ) + fake_streaming = _FakeStreaming() + + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id="trace1", + parent_span_id="parent", + tracer=tracer, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + + assert len(fake_tracing.started) == 1, ( + "Expected one tool span to be started for the get_weather call." 
+ ) + assert fake_tracing.started[0][0] == "get_weather" + assert len(fake_tracing.ended) == 1 + + +@pytest.mark.parametrize( + "user_msg", + [ + "What is the weather in Paris?", + "Tell me the weather in London.", + ], +) +async def test_async_handler_pushes_messages_for_various_inputs(user_msg: str) -> None: + """auto_send pushes at least tool_request + tool_response + text for any input.""" + agent = _make_agent() + result, fake_streaming = await _run_auto_send_turn(agent, user_msg=user_msg) + + message_types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert "tool_request" in message_types + assert "tool_response" in message_types + assert "text" in message_types + assert isinstance(result.final_text, str) + assert len(result.final_text) > 0 diff --git a/tests/lib/core/harness/test_harness_pydantic_ai_sync.py b/tests/lib/core/harness/test_harness_pydantic_ai_sync.py new file mode 100644 index 000000000..1557d0dd1 --- /dev/null +++ b/tests/lib/core/harness/test_harness_pydantic_ai_sync.py @@ -0,0 +1,388 @@ +"""Integration test: sync (HTTP-yield) channel with a pydantic-ai agent. + +Exercises the unified harness surface (UnifiedEmitter.yield_turn + PydanticAITurn) +with a minimal pydantic-ai agent backed by TestModel so the test runs fully +offline (no API keys, no live infrastructure). + +Agent description +----------------- +A single-tool agent with ``get_weather(city: str) -> str`` that always returns +"sunny and 72F". TestModel is configured to call that tool once then produce +a fixed text reply, giving a deterministic event sequence. + +What is tested +-------------- +- The sync handler correctly yields StreamTaskMessage* events in order: + tool_request (Start+Done) then tool_response (Full) then text (Start+Delta+Done). +- Final accumulated text equals the TestModel custom output. +- With a trace_id + fake tracing, a tool span is opened (OpenSpan) and + closed (CloseSpan) — proving the SpanDeriver is wired on the yield path. 
+ +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual HTTP streaming over the ACP sync endpoint (requires a running + Agentex server + deployed agent). +- Real LLM calls or production model behaviour. +- The full FastACP request/response lifecycle. + +See also: tests/lib/core/harness/test_harness_pydantic_ai_async.py and +test_harness_pydantic_ai_temporal.py for the other two channels. +""" + +from __future__ import annotations + +from typing import Any, override + +import pytest +from pydantic_ai import Agent +from pydantic_ai.models.test import TestModel + +from agentex.types.text_delta import TextDelta +from agentex.lib.core.harness.types import OpenSpan, CloseSpan +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + +# --------------------------------------------------------------------------- +# Minimal agent under test +# --------------------------------------------------------------------------- + + +def _make_agent() -> Agent: + """Build a pydantic-ai agent with one weather tool and a TestModel. + + TestModel is instantiated with call_tools=['get_weather'] so it always + invokes the tool once, then emits custom_output_text as the reply. 
+ """ + model = TestModel( + call_tools=["get_weather"], + custom_output_text="The weather in Paris is sunny and 72F.", + ) + agent: Agent = Agent(model) + + @agent.tool_plain + def get_weather(city: str) -> str: + """Get the current weather for a city.""" + return f"The weather in {city} is sunny and 72F" + + return agent + + +# --------------------------------------------------------------------------- +# Fake tracing backend (no network calls) +# --------------------------------------------------------------------------- + + +class _FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, str | None]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span( + self, + *, + trace_id: str, + name: str, + input: Any = None, + parent_id: Any = None, + data: Any = None, + task_id: Any = None, + ) -> _FakeSpan: + self.started.append((name, parent_id)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: _FakeSpan) -> None: + self.ended.append((span.name, span.output)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _run_yield_turn( + agent: Agent, + user_msg: str = "What is the weather in Paris?", + trace_id: str | None = None, + parent_span_id: str | None = None, + fake_tracing: _FakeTracing | None = None, +) -> list[Any]: + """Drive the sync (yield) path and collect all yielded events.""" + tracer: SpanTracer | bool | None = None + if trace_id and fake_tracing is not None: + tracer = SpanTracer( + trace_id=trace_id, + parent_span_id=parent_span_id, + task_id="task1", + tracing=fake_tracing, + ) + + events: list[Any] = [] + async with agent.run_stream_events(user_msg) as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + 
task_id="task1", + trace_id=trace_id, + parent_span_id=parent_span_id, + tracer=tracer if tracer is not None else False, + ) + events = [ev async for ev in emitter.yield_turn(turn)] + return events + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestSyncYieldEventOrder: + """The yield channel forwards events in canonical order.""" + + async def test_tool_request_precedes_tool_response(self) -> None: + """tool_request events appear before the tool_response Full event.""" + agent = _make_agent() + events = await _run_yield_turn(agent) + + content_types = [ + getattr(getattr(ev, "content", None), "type", None) + for ev in events + if isinstance(ev, (StreamTaskMessageStart, StreamTaskMessageFull)) + ] + assert "tool_request" in content_types + assert "tool_response" in content_types + tool_req_idx = content_types.index("tool_request") + tool_resp_idx = content_types.index("tool_response") + assert tool_req_idx < tool_resp_idx, "tool_request must appear before tool_response in the event stream" + + async def test_text_appears_after_tool_response(self) -> None: + """Text content (Start/Done) comes after the tool_response Full event.""" + agent = _make_agent() + events = await _run_yield_turn(agent) + + full_types = [ + getattr(getattr(ev, "content", None), "type", None) + for ev in events + if isinstance(ev, StreamTaskMessageFull) + ] + start_types = [ + getattr(getattr(ev, "content", None), "type", None) + for ev in events + if isinstance(ev, StreamTaskMessageStart) + ] + + assert "tool_response" in full_types + assert "text" in start_types + + tool_resp_pos = next( + i + for i, ev in enumerate(events) + if isinstance(ev, StreamTaskMessageFull) + and getattr(getattr(ev, "content", None), "type", None) == "tool_response" + ) + text_start_pos = next( + i + for i, ev in enumerate(events) + if isinstance(ev, StreamTaskMessageStart) and 
getattr(getattr(ev, "content", None), "type", None) == "text" + ) + assert tool_resp_pos < text_start_pos + + async def test_tool_response_carries_weather_result(self) -> None: + """The ToolResponseContent contains the get_weather return value.""" + agent = _make_agent() + events = await _run_yield_turn(agent) + + full_events = [ + ev + for ev in events + if isinstance(ev, StreamTaskMessageFull) and isinstance(getattr(ev, "content", None), ToolResponseContent) + ] + assert len(full_events) >= 1, "Expected at least one tool_response Full event" + tool_response = full_events[0].content + assert isinstance(tool_response, ToolResponseContent) + assert isinstance(tool_response.content, str) + assert "72F" in tool_response.content + assert tool_response.name == "get_weather" + + async def test_accumulated_text_matches_model_output(self) -> None: + """Accumulated text deltas equal the TestModel custom_output_text.""" + from agentex.types.task_message_update import StreamTaskMessageDelta + + agent = _make_agent() + events = await _run_yield_turn(agent) + + accumulated = "".join( + ev.delta.text_delta + for ev in events + if isinstance(ev, StreamTaskMessageDelta) and isinstance(ev.delta, TextDelta) and ev.delta.text_delta + ) + assert accumulated == "The weather in Paris is sunny and 72F." 
+ + async def test_every_start_has_matching_done(self) -> None: + """Every StreamTaskMessageStart has a corresponding StreamTaskMessageDone.""" + agent = _make_agent() + events = await _run_yield_turn(agent) + + starts = {ev.index for ev in events if isinstance(ev, StreamTaskMessageStart)} + dones = {ev.index for ev in events if isinstance(ev, StreamTaskMessageDone)} + assert starts == dones, f"Unmatched Start/Done indices: starts={starts} dones={dones}" + + +class TestSyncYieldSpanDerivation: + """SpanDeriver is wired on the yield path; tool spans are opened/closed.""" + + async def test_tool_span_opened_and_closed(self) -> None: + """One tool span is opened and closed per tool call.""" + agent = _make_agent() + fake_tracing = _FakeTracing() + tracer = SpanTracer( + trace_id="trace1", + parent_span_id="parent-span", + task_id="task1", + tracing=fake_tracing, + ) + + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id="trace1", + parent_span_id="parent-span", + tracer=tracer, + ) + await emitter.yield_turn(turn).__anext__.__self__ if False else None + [_ async for _ in emitter.yield_turn(turn)] + + assert len(fake_tracing.started) == 1, "Expected exactly one tool span opened" + assert len(fake_tracing.ended) == 1, "Expected exactly one tool span closed" + span_name, parent_id = fake_tracing.started[0] + assert span_name == "get_weather" + assert parent_id == "parent-span" + + async def test_tool_span_output_is_tool_result(self) -> None: + """The closed tool span's output equals the tool's return value.""" + agent = _make_agent() + fake_tracing = _FakeTracing() + tracer = SpanTracer( + trace_id="trace1", + parent_span_id="parent-span", + task_id="task1", + tracing=fake_tracing, + ) + + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + 
task_id="task1", + trace_id="trace1", + parent_span_id="parent-span", + tracer=tracer, + ) + [_ async for _ in emitter.yield_turn(turn)] + + name, output = fake_tracing.ended[0] + assert name == "get_weather" + assert output is not None + assert "72F" in str(output) + + async def test_no_trace_id_means_no_spans(self) -> None: + """With trace_id=None, no spans are derived (emitter disables tracing).""" + agent = _make_agent() + fake_tracing = _FakeTracing() + + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id=None, + parent_span_id=None, + tracing=fake_tracing, + ) + [_ async for _ in emitter.yield_turn(turn)] + + assert fake_tracing.started == [] + assert fake_tracing.ended == [] + + async def test_tracer_false_suppresses_spans(self) -> None: + """tracer=False disables span derivation regardless of trace_id.""" + agent = _make_agent() + fake_tracing = _FakeTracing() + + async with agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id="trace1", + parent_span_id="parent-span", + tracer=False, + tracing=fake_tracing, + ) + [_ async for _ in emitter.yield_turn(turn)] + + assert fake_tracing.started == [] + assert fake_tracing.ended == [] + + async def test_span_signal_types(self) -> None: + """The signals received by the tracer are OpenSpan then CloseSpan.""" + from agentex.lib.core.harness.tracer import SpanTracer as RealTracer + + received_signals: list[Any] = [] + + class _RecordingTracer(RealTracer): + @override + async def handle(self, signal: Any) -> None: + received_signals.append(signal) + await super().handle(signal) + + fake_tracing = _FakeTracing() + tracer = _RecordingTracer( + trace_id="trace1", + parent_span_id="parent", + task_id="task1", + tracing=fake_tracing, + ) + + agent = _make_agent() + async with 
agent.run_stream_events("What is the weather in Paris?") as stream: + turn = PydanticAITurn(stream, model="test") + emitter = UnifiedEmitter( + task_id="task1", + trace_id="trace1", + parent_span_id="parent", + tracer=tracer, + ) + [_ async for _ in emitter.yield_turn(turn)] + + assert len(received_signals) == 2 + assert isinstance(received_signals[0], OpenSpan) + assert isinstance(received_signals[1], CloseSpan) + assert received_signals[0].name == "get_weather" + + +@pytest.mark.parametrize( + "user_msg", + [ + "What is the weather in Paris?", + "Tell me the weather in London.", + ], +) +async def test_sync_handler_produces_events_for_various_inputs(user_msg: str) -> None: + """Yield path produces at least a tool_response Full for any user message.""" + agent = _make_agent() + events = await _run_yield_turn(agent, user_msg=user_msg) + + full_event_types = [ + getattr(getattr(ev, "content", None), "type", None) for ev in events if isinstance(ev, StreamTaskMessageFull) + ] + assert "tool_response" in full_event_types diff --git a/tests/lib/core/harness/test_harness_pydantic_ai_temporal.py b/tests/lib/core/harness/test_harness_pydantic_ai_temporal.py new file mode 100644 index 000000000..0ead8e832 --- /dev/null +++ b/tests/lib/core/harness/test_harness_pydantic_ai_temporal.py @@ -0,0 +1,370 @@ +"""Integration test: Temporal-backed pydantic-ai agent, offline. + +Exercises the core of the Temporal pydantic-ai harness path — the +event_stream_handler activity — with a TemporalAgent backed by TestModel so the +test runs fully offline (no Temporal server, no Redis, no API keys). + +Architecture overview +--------------------- +In a real Temporal deployment the pydantic-ai Temporal harness runs like this: + + HTTP POST /task/event/send + -> @workflow.signal on At110PydanticAiWorkflow + -> temporal_agent.run(user_message, deps=TaskDeps(...)) + internally schedules: + 1. request_activity (LLM HTTP call — recorded by Temporal) + 2. 
call_tool_activity (for each tool call — also recorded) + 3. event_stream_handler_activity (streams events to Redis) + +The third activity is what we test here: it receives a +``RunContext[TaskDeps]`` and an ``AsyncIterable[AgentStreamEvent]`` from +pydantic-ai, calls ``stream_pydantic_ai_events`` (which internally constructs +a ``UnifiedEmitter`` + ``PydanticAITurn`` and calls ``auto_send_turn``), and +pushes the resulting messages to Redis. + +What we test +----------- +Since ``TemporalAgent.run_stream_events`` works offline with TestModel (it does +not schedule Temporal activities — it runs in-process), we can: + +1. Build a TemporalAgent with TestModel. +2. Call ``run_stream_events`` on it directly, just as the event_stream_handler + would see the event iterable. +3. Feed that stream into ``stream_pydantic_ai_events`` backed by a fake streaming + backend, and assert the canonical message sequence. + +This covers the full inner harness chain that the Temporal workflow exercises, +minus the Temporal scheduling/durability layer itself. + +What is NOT covered without live infrastructure +----------------------------------------------- +- Temporal scheduling (the workflow.signal -> activity dispatch chain). +- Temporal durability guarantees and replay behaviour. +- Redis streaming (requires a running Redis instance). +- Multi-turn history (pydantic-ai message_history round-tripping via Temporal + workflow state). +- Real LLM calls or production model behaviour. +- The full temporal_agent.run(...) path, which schedules activities and cannot + run without a connected Temporal client. + +To test with live infrastructure: spin up Temporal + Redis + the ACP server + +the Temporal worker, then use the AsyncAgentex client to create a task, send a +message, and poll for messages — exactly as the existing examples/tutorials/ +10_async/10_temporal/110_pydantic_ai/tests/test_agent.py does. 
+""" + +from __future__ import annotations + +from typing import Any + +import pytest +from pydantic import BaseModel +from pydantic_ai import Agent +from pydantic_ai.models.test import TestModel +from pydantic_ai.durable_exec.temporal import TemporalAgent + +from agentex.types.task_message import TaskMessage +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._pydantic_ai_turn import PydanticAITurn + +# --------------------------------------------------------------------------- +# Agent under test (mirrors examples/tutorials/10_async/10_temporal/110_pydantic_ai) +# --------------------------------------------------------------------------- + + +class TaskDeps(BaseModel): + """Per-run dependencies injected via RunContext.deps.""" + + task_id: str + parent_span_id: str | None = None + + +def _make_temporal_agent() -> TemporalAgent[TaskDeps, str]: + """Build a TemporalAgent with TestModel and one weather tool. + + The underlying pydantic-ai Agent is constructed with TaskDeps as the + deps_type, mirroring the real temporal tutorial agent. TestModel makes + the run deterministic and offline. 
+ """ + model = TestModel( + call_tools=["get_weather"], + custom_output_text="The weather in Paris is sunny and 72F.", + ) + base: Agent[TaskDeps, str] = Agent(model, deps_type=TaskDeps) + + @base.tool_plain + def get_weather(city: str) -> str: + """Get the current weather for a city.""" + return f"The weather in {city} is sunny and 72F" + + return TemporalAgent(base, name="test_temporal_agent") + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +class _FakeCtx: + def __init__(self, sink: list[Any], ctype: str, initial_content: Any) -> None: + self.sink = sink + self.ctype = ctype + self.task_message = TaskMessage(id="msg-1", task_id="task1", content=initial_content) + + async def __aenter__(self) -> "_FakeCtx": + self.sink.append(("open", self.ctype, self.task_message.content)) + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.sink.append(("close", self.ctype)) + + async def stream_update(self, update: Any) -> Any: + self.sink.append(("delta", self.ctype, update)) + return update + + +class _FakeStreaming: + def __init__(self) -> None: + self.sink: list[Any] = [] + self.messages_opened: list[Any] = [] + + def streaming_task_message_context( + self, + task_id: str, + initial_content: Any, + streaming_mode: str = "coalesced", + created_at: Any = None, + ) -> _FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + self.messages_opened.append(initial_content) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Helpers: the event_stream_handler pattern tested offline +# --------------------------------------------------------------------------- + + +async def _run_event_stream_handler( + temporal_agent: TemporalAgent[TaskDeps, str], + 
user_msg: str = "What is the weather in Paris?", + task_id: str = "task1", +) -> _FakeStreaming: + """Simulate the event_stream_handler activity offline. + + In production the event_stream_handler receives the event stream from + pydantic-ai's model activity and calls stream_pydantic_ai_events. + Here we obtain the stream directly from run_stream_events (which works + offline with TestModel) and forward it to stream_pydantic_ai_events backed + by a fake streaming backend. + + This is equivalent to: + async def event_handler(ctx: RunContext[TaskDeps], events: AsyncIterable[AgentStreamEvent]) -> None: + await stream_pydantic_ai_events(events, ctx.deps.task_id) + but without requiring a running Temporal server. + """ + fake_streaming = _FakeStreaming() + + async with temporal_agent.run_stream_events(user_msg) as stream: + await _fake_stream_pydantic_ai_events(stream, task_id, fake_streaming) + + return fake_streaming + + +async def _fake_stream_pydantic_ai_events( + stream: Any, + task_id: str, + fake_streaming: _FakeStreaming, +) -> str: + """Like stream_pydantic_ai_events but uses an injected fake streaming backend. + + Mirrors the exact chain that stream_pydantic_ai_events uses internally: + PydanticAITurn(stream) + + UnifiedEmitter.auto_send_turn(turn) + but with the fake backend injected so no Redis is needed. 
+ """ + turn = PydanticAITurn(stream, model=None) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + return result.final_text + + +# --------------------------------------------------------------------------- +# Tests: TemporalAgent + event_stream_handler pattern +# --------------------------------------------------------------------------- + + +class TestTemporalEventStreamHandlerMessageOrder: + """The event_stream_handler pushes messages in canonical order.""" + + async def test_tool_request_before_tool_response(self) -> None: + """tool_request is pushed before tool_response.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert "tool_request" in types + assert "tool_response" in types + assert types.index("tool_request") < types.index("tool_response") + + async def test_text_is_last(self) -> None: + """Text content is pushed last (after the tool round-trip).""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert types[-1] == "text" + + async def test_exactly_three_messages(self) -> None: + """Exactly tool_request + tool_response + text are pushed.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + assert len(fake_streaming.messages_opened) == 3, ( + f"Expected 3 messages, got {len(fake_streaming.messages_opened)}: " + f"{[getattr(m, 'type', None) for m in fake_streaming.messages_opened]}" + ) + + +class TestTemporalEventStreamHandlerContent: + """Content verification for the messages pushed by the event_stream_handler.""" + + async def test_tool_request_is_get_weather(self) -> None: 
+ """The pushed tool_request is for the get_weather function.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + tool_reqs = [m for m in fake_streaming.messages_opened if isinstance(m, ToolRequestContent)] + assert len(tool_reqs) == 1 + assert tool_reqs[0].name == "get_weather" + + async def test_tool_response_contains_weather_result(self) -> None: + """The pushed tool_response contains the get_weather return value.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + tool_resps = [m for m in fake_streaming.messages_opened if isinstance(m, ToolResponseContent)] + assert len(tool_resps) == 1 + assert isinstance(tool_resps[0].content, str) + assert "72F" in tool_resps[0].content + assert tool_resps[0].name == "get_weather" + + async def test_tool_call_ids_match(self) -> None: + """tool_request and tool_response share the same tool_call_id.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + tool_req = next(m for m in fake_streaming.messages_opened if isinstance(m, ToolRequestContent)) + tool_resp = next(m for m in fake_streaming.messages_opened if isinstance(m, ToolResponseContent)) + assert tool_req.tool_call_id == tool_resp.tool_call_id + + +class TestTemporalFinalText: + """stream_pydantic_ai_events returns the correct final text.""" + + async def test_final_text_matches_model_output(self) -> None: + """The returned final text equals the TestModel custom_output_text.""" + temporal_agent = _make_temporal_agent() + fake_streaming = _FakeStreaming() + + async with temporal_agent.run_stream_events("What is the weather in Paris?") as stream: + final = await _fake_stream_pydantic_ai_events(stream, "task1", fake_streaming) + + assert final == "The weather in Paris is sunny and 72F." 
+ + async def test_context_lifecycle_complete(self) -> None: + """Every opened streaming context is also closed.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent) + + opens = [e for e in fake_streaming.sink if e[0] == "open"] + closes = [e for e in fake_streaming.sink if e[0] == "close"] + assert len(opens) == len(closes), "Every opened context must be closed" + + +class TestTemporalAgentStreamEventsOffline: + """TemporalAgent.run_stream_events produces the expected raw pydantic-ai events. + + This verifies that the TemporalAgent wrapper does not suppress event stream + delivery when used with TestModel, so the event_stream_handler pattern is + meaningful offline. + """ + + async def test_run_stream_events_yields_tool_call_and_text(self) -> None: + """TemporalAgent.run_stream_events with TestModel yields tool + text events.""" + + temporal_agent = _make_temporal_agent() + collected: list[Any] = [] + + async with temporal_agent.run_stream_events("What is the weather in Paris?") as stream: + async for ev in stream: + collected.append(ev) + + event_types = {type(ev).__name__ for ev in collected} + assert "FunctionToolResultEvent" in event_types, "Expected FunctionToolResultEvent proving tool call ran" + assert "PartDeltaEvent" in event_types or "PartEndEvent" in event_types, ( + "Expected text part events in the stream" + ) + + async def test_run_stream_events_contains_tool_result(self) -> None: + """The raw event stream contains a FunctionToolResultEvent with the tool output.""" + from pydantic_ai.messages import FunctionToolResultEvent + + temporal_agent = _make_temporal_agent() + + async with temporal_agent.run_stream_events("What is the weather in Paris?") as stream: + events = [ev async for ev in stream] + + tool_results = [ev for ev in events if isinstance(ev, FunctionToolResultEvent)] + assert len(tool_results) >= 1 + assert isinstance(tool_results[0].part.content, str) + assert "72F" in 
tool_results[0].part.content + + +class TestTemporalLiveInfraNote: + """Placeholder tests documenting what requires live Temporal infrastructure. + + These tests are skipped by design. They document the gap between what the + offline tests cover and what a full integration test would exercise. + """ + + @pytest.mark.skip( + reason=( + "Requires live Temporal server + Redis + ACP server + worker. " + "See examples/tutorials/10_async/10_temporal/110_pydantic_ai/tests/test_agent.py " + "for the live integration test that exercises this path end-to-end." + ) + ) + async def test_temporal_workflow_full_round_trip(self) -> None: + """Full Temporal workflow: create_task -> send_event -> poll_messages.""" + pass # Covered by the live tutorial test + + +@pytest.mark.parametrize( + "user_msg", + [ + "What is the weather in Paris?", + "Tell me the weather in London.", + ], +) +async def test_temporal_handler_pushes_messages_for_various_inputs(user_msg: str) -> None: + """event_stream_handler pushes tool_request + tool_response + text for any input.""" + temporal_agent = _make_temporal_agent() + fake_streaming = await _run_event_stream_handler(temporal_agent, user_msg=user_msg) + + types = [getattr(m, "type", None) for m in fake_streaming.messages_opened] + assert "tool_request" in types + assert "tool_response" in types + assert "text" in types From d10e1510bd5da44ad5acc5cac638750122083fce Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Mon, 22 Jun 2026 18:21:25 -0400 Subject: [PATCH 05/10] feat(openai-agents): migrate onto the unified harness surface (#416) --- .../00_sync/060_harness_openai/.dockerignore | 43 +++ .../00_sync/060_harness_openai/Dockerfile | 50 ++++ .../00_sync/060_harness_openai/README.md | 35 +++ .../00_sync/060_harness_openai/manifest.yaml | 58 ++++ .../060_harness_openai/project/__init__.py | 0 .../00_sync/060_harness_openai/project/acp.py | 87 ++++++ .../060_harness_openai/project/agent.py | 47 +++ .../060_harness_openai/project/tools.py | 19 ++ 
.../00_sync/060_harness_openai/pyproject.toml | 36 +++ .../060_harness_openai/tests/test_agent.py | 48 +++ .../00_base/130_harness_openai/.dockerignore | 43 +++ .../00_base/130_harness_openai/Dockerfile | 50 ++++ .../00_base/130_harness_openai/README.md | 33 +++ .../00_base/130_harness_openai/manifest.yaml | 58 ++++ .../130_harness_openai/project/__init__.py | 0 .../00_base/130_harness_openai/project/acp.py | 98 ++++++ .../130_harness_openai/project/agent.py | 43 +++ .../130_harness_openai/project/tools.py | 15 + .../00_base/130_harness_openai/pyproject.toml | 36 +++ .../130_harness_openai/tests/test_agent.py | 77 +++++ .../140_harness_openai/.dockerignore | 43 +++ .../10_temporal/140_harness_openai/Dockerfile | 43 +++ .../10_temporal/140_harness_openai/README.md | 41 +++ .../140_harness_openai/environments.yaml | 64 ++++ .../140_harness_openai/manifest.yaml | 62 ++++ .../140_harness_openai/project/__init__.py | 0 .../140_harness_openai/project/acp.py | 33 +++ .../140_harness_openai/project/activities.py | 75 +++++ .../140_harness_openai/project/agent.py | 44 +++ .../140_harness_openai/project/run_worker.py | 44 +++ .../140_harness_openai/project/tools.py | 15 + .../140_harness_openai/project/workflow.py | 121 ++++++++ .../140_harness_openai/pyproject.toml | 38 +++ .../140_harness_openai/tests/test_agent.py | 77 +++++ .../lib/adk/providers/_modules/openai_turn.py | 134 +++++++++ .../adk/providers/_modules/sync_provider.py | 103 ++++--- .../lib/core/services/adk/providers/openai.py | 280 +++--------------- .../adk/providers/test_openai_activities.py | 170 ++++++++++- tests/lib/adk/providers/test_openai_turn.py | 246 +++++++++++++++ .../conformance/test_openai_conformance.py | 206 +++++++++++++ 40 files changed, 2430 insertions(+), 285 deletions(-) create mode 100644 examples/tutorials/00_sync/060_harness_openai/.dockerignore create mode 100644 examples/tutorials/00_sync/060_harness_openai/Dockerfile create mode 100644 
examples/tutorials/00_sync/060_harness_openai/README.md create mode 100644 examples/tutorials/00_sync/060_harness_openai/manifest.yaml create mode 100644 examples/tutorials/00_sync/060_harness_openai/project/__init__.py create mode 100644 examples/tutorials/00_sync/060_harness_openai/project/acp.py create mode 100644 examples/tutorials/00_sync/060_harness_openai/project/agent.py create mode 100644 examples/tutorials/00_sync/060_harness_openai/project/tools.py create mode 100644 examples/tutorials/00_sync/060_harness_openai/pyproject.toml create mode 100644 examples/tutorials/00_sync/060_harness_openai/tests/test_agent.py create mode 100644 examples/tutorials/10_async/00_base/130_harness_openai/.dockerignore create mode 100644 examples/tutorials/10_async/00_base/130_harness_openai/Dockerfile create mode 100644 examples/tutorials/10_async/00_base/130_harness_openai/README.md create mode 100644 examples/tutorials/10_async/00_base/130_harness_openai/manifest.yaml create mode 100644 examples/tutorials/10_async/00_base/130_harness_openai/project/__init__.py create mode 100644 examples/tutorials/10_async/00_base/130_harness_openai/project/acp.py create mode 100644 examples/tutorials/10_async/00_base/130_harness_openai/project/agent.py create mode 100644 examples/tutorials/10_async/00_base/130_harness_openai/project/tools.py create mode 100644 examples/tutorials/10_async/00_base/130_harness_openai/pyproject.toml create mode 100644 examples/tutorials/10_async/00_base/130_harness_openai/tests/test_agent.py create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/.dockerignore create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/Dockerfile create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/README.md create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/environments.yaml create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/manifest.yaml create mode 100644 
examples/tutorials/10_async/10_temporal/140_harness_openai/project/__init__.py create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/project/acp.py create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/project/activities.py create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/project/agent.py create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/project/run_worker.py create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/project/tools.py create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/project/workflow.py create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/pyproject.toml create mode 100644 examples/tutorials/10_async/10_temporal/140_harness_openai/tests/test_agent.py create mode 100644 src/agentex/lib/adk/providers/_modules/openai_turn.py create mode 100644 tests/lib/adk/providers/test_openai_turn.py create mode 100644 tests/lib/core/harness/conformance/test_openai_conformance.py diff --git a/examples/tutorials/00_sync/060_harness_openai/.dockerignore b/examples/tutorials/00_sync/060_harness_openai/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/00_sync/060_harness_openai/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/00_sync/060_harness_openai/Dockerfile b/examples/tutorials/00_sync/060_harness_openai/Dockerfile new file mode 100644 index 000000000..1bd4f4860 --- /dev/null +++ b/examples/tutorials/00_sync/060_harness_openai/Dockerfile @@ -0,0 +1,50 @@ +# 
syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +# Copy pyproject.toml and README.md to install dependencies +COPY 00_sync/060_harness_openai/pyproject.toml /app/060_harness_openai/pyproject.toml +COPY 00_sync/060_harness_openai/README.md /app/060_harness_openai/README.md + +WORKDIR /app/060_harness_openai + +# Copy the project code +COPY 00_sync/060_harness_openai/project /app/060_harness_openai/project + +# Copy the test files +COPY 00_sync/060_harness_openai/tests /app/060_harness_openai/tests + +# Copy shared test utilities +COPY test_utils /app/test_utils + +# Install the required Python packages with dev dependencies +RUN uv pip install --system .[dev] + +# Set environment variables +ENV PYTHONPATH=/app + +# Set test environment variables +ENV AGENT_NAME=s060-harness-openai + +# Run the agent using uvicorn +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/00_sync/060_harness_openai/README.md b/examples/tutorials/00_sync/060_harness_openai/README.md new file mode 100644 index 000000000..e22e9aa8b --- /dev/null +++ b/examples/tutorials/00_sync/060_harness_openai/README.md @@ -0,0 +1,35 @@ +# Sync OpenAI Agents on the unified harness surface + +A sync (HTTP) Agentex agent that runs the OpenAI Agents SDK and delivers its +output through the **unified harness surface**. + +## What this demonstrates + +The OpenAI Agents SDK produces native streaming events. 
This tutorial wraps a +`Runner.run_streamed` result in an `OpenAITurn` — the provider -> canonical +`StreamTaskMessage*` adapter — and forwards the canonical stream to the frontend +via `UnifiedEmitter.yield_turn`. The same `OpenAITurn` flows unchanged through +`auto_send_turn` in the async (`130_harness_openai`) and temporal +(`140_harness_openai`) variants; only the delivery method differs. + +```python +result = Runner.run_streamed(starting_agent=agent, input=user_message) +turn = OpenAITurn(result=result, model="gpt-4o") +emitter = UnifiedEmitter(task_id=task_id, trace_id=task_id, parent_span_id=parent_span_id) +async for event in emitter.yield_turn(turn): + yield event +``` + +## Run it + +```bash +agentex agents run --manifest manifest.yaml +``` + +## Test it + +The offline test exercises the harness wiring without a server or API key: + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/00_sync/060_harness_openai/manifest.yaml b/examples/tutorials/00_sync/060_harness_openai/manifest.yaml new file mode 100644 index 000000000..4967c1f8d --- /dev/null +++ b/examples/tutorials/00_sync/060_harness_openai/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../ + include_paths: + - 00_sync/060_harness_openai + - test_utils + dockerfile: 00_sync/060_harness_openai/Dockerfile + dockerignore: 00_sync/060_harness_openai/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: sync + name: s060-harness-openai + description: A sync OpenAI Agents SDK agent on the unified harness surface + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: 
account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "s060-harness-openai" + description: "A sync OpenAI Agents SDK agent on the unified harness surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/00_sync/060_harness_openai/project/__init__.py b/examples/tutorials/00_sync/060_harness_openai/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/00_sync/060_harness_openai/project/acp.py b/examples/tutorials/00_sync/060_harness_openai/project/acp.py new file mode 100644 index 000000000..caaa0b132 --- /dev/null +++ b/examples/tutorials/00_sync/060_harness_openai/project/acp.py @@ -0,0 +1,87 @@ +"""ACP handler for the sync OpenAI Agents harness tutorial. + +This is the API layer. It runs the OpenAI Agents SDK via ``Runner.run_streamed``, +wraps the streamed run in an ``OpenAITurn`` (the provider -> canonical +``StreamTaskMessage*`` adapter), and forwards the canonical stream to the +Agentex frontend via ``UnifiedEmitter.yield_turn`` — the same harness surface +used by the async and temporal variants of this tutorial. 
+""" + +from __future__ import annotations + +import os +from typing import AsyncGenerator + +from dotenv import load_dotenv + +load_dotenv() + +from agents import Runner + +from agentex.lib import adk +from project.agent import MODEL_NAME, create_agent +from agentex.lib.types.acp import SendMessageParams +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_update import TaskMessageUpdate +from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +# LiteLLM proxy auth: copy LITELLM_API_KEY to OPENAI_API_KEY for OpenAI client +# compatibility, so the same example works behind the Scale LiteLLM gateway. 
+_litellm_key = os.environ.get("LITELLM_API_KEY") +if _litellm_key and not os.environ.get("OPENAI_API_KEY"): + os.environ["OPENAI_API_KEY"] = _litellm_key + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create(acp_type="sync") + +_agent = None + + +def get_agent(): + """Get or create the OpenAI Agents SDK agent instance.""" + global _agent + if _agent is None: + _agent = create_agent() + return _agent + + +@acp.on_message_send +async def handle_message_send( + params: SendMessageParams, +) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]: + """Handle incoming messages, streaming tokens and tool calls via the harness.""" + agent = get_agent() + task_id = params.task.id + user_message = params.content.content + logger.info(f"Processing message for task {task_id}") + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + result = Runner.run_streamed(starting_agent=agent, input=user_message) + turn = OpenAITurn(result=result, model=MODEL_NAME) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + async for event in emitter.yield_turn(turn): + yield event diff --git a/examples/tutorials/00_sync/060_harness_openai/project/agent.py b/examples/tutorials/00_sync/060_harness_openai/project/agent.py new file mode 100644 index 000000000..3611012fe --- /dev/null +++ b/examples/tutorials/00_sync/060_harness_openai/project/agent.py @@ -0,0 +1,47 @@ +"""OpenAI Agents SDK agent definition for the harness tutorial. + +The agent is the boundary between this module and the API layer (acp.py). 
+The OpenAI Agents SDK runs its own tool-call loop internally; acp.py wraps a +``Runner.run_streamed`` result with ``OpenAITurn`` so it flows through the +unified harness surface. +""" + +from __future__ import annotations + +from datetime import datetime + +from agents import Agent, function_tool, set_tracing_disabled + +from project.tools import get_weather + +# Disable the openai-agents SDK's native tracer so it doesn't ship traces to +# api.openai.com (the key may be a gateway/proxy key). Agentex tracing still +# runs via the harness + tracing manager configured in acp.py. +set_tracing_disabled(True) + +MODEL_NAME = "gpt-4o" +INSTRUCTIONS = """You are a helpful AI assistant with access to tools. + +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use the weather tool when the user asks about the weather +- Always report the real tool output back to the user +""" + + +@function_tool +def weather(city: str) -> str: + """Get the current weather for a city.""" + return get_weather(city) + + +def create_agent() -> Agent: + """Build and return the OpenAI Agents SDK agent with the weather tool.""" + return Agent( + name="Harness OpenAI Assistant", + model=MODEL_NAME, + instructions=INSTRUCTIONS.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")), + tools=[weather], + ) diff --git a/examples/tutorials/00_sync/060_harness_openai/project/tools.py b/examples/tutorials/00_sync/060_harness_openai/project/tools.py new file mode 100644 index 000000000..b03aa7c31 --- /dev/null +++ b/examples/tutorials/00_sync/060_harness_openai/project/tools.py @@ -0,0 +1,19 @@ +"""Tool definitions for the OpenAI Agents harness tutorial. + +The bare function lives here so it's easy to unit-test; it's wrapped as an +OpenAI Agents SDK ``function_tool`` in ``project.agent``. +""" + +from __future__ import annotations + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. 
+ + Returns: + A string describing the weather conditions. + """ + return f"The weather in {city} is sunny and 72°F" diff --git a/examples/tutorials/00_sync/060_harness_openai/pyproject.toml b/examples/tutorials/00_sync/060_harness_openai/pyproject.toml new file mode 100644 index 000000000..39cceb8f2 --- /dev/null +++ b/examples/tutorials/00_sync/060_harness_openai/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "s060-harness-openai" +version = "0.1.0" +description = "A sync OpenAI Agents SDK agent on the unified harness surface" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "openai-agents", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git a/examples/tutorials/00_sync/060_harness_openai/tests/test_agent.py b/examples/tutorials/00_sync/060_harness_openai/tests/test_agent.py new file mode 100644 index 000000000..960b232b7 --- /dev/null +++ b/examples/tutorials/00_sync/060_harness_openai/tests/test_agent.py @@ -0,0 +1,48 @@ +"""Offline test for the sync OpenAI Agents harness tutorial. + +This test does NOT require a running Agentex server or an OpenAI API key. It +verifies the harness wiring this tutorial demonstrates: an ``OpenAITurn`` built +from an injected canonical ``StreamTaskMessage*`` stream, forwarded through +``UnifiedEmitter.yield_turn`` (the sync HTTP ACP delivery path), passes the +events through unchanged. 
+ +To run: ``pytest tests/test_agent.py -v`` +""" + +from __future__ import annotations + +import pytest + +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn + + +async def _canonical_stream(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_yield_turn_forwards_canonical_stream(): + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = OpenAITurn(stream=_canonical_stream(events), model="gpt-4o") + # trace_id=None disables tracing, so no Agentex server is needed. 
+ emitter = UnifiedEmitter(task_id="task-1", trace_id=None, parent_span_id=None) + + out = [e async for e in emitter.yield_turn(turn)] + assert out == events + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/00_base/130_harness_openai/.dockerignore b/examples/tutorials/10_async/00_base/130_harness_openai/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_harness_openai/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/00_base/130_harness_openai/Dockerfile b/examples/tutorials/10_async/00_base/130_harness_openai/Dockerfile new file mode 100644 index 000000000..a31c89a31 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_harness_openai/Dockerfile @@ -0,0 +1,50 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +# Copy pyproject.toml and README.md to install dependencies +COPY 10_async/00_base/130_harness_openai/pyproject.toml /app/130_harness_openai/pyproject.toml +COPY 10_async/00_base/130_harness_openai/README.md /app/130_harness_openai/README.md + +WORKDIR /app/130_harness_openai + +# Copy the project code +COPY 
10_async/00_base/130_harness_openai/project /app/130_harness_openai/project + +# Copy the test files +COPY 10_async/00_base/130_harness_openai/tests /app/130_harness_openai/tests + +# Copy shared test utilities +COPY test_utils /app/test_utils + +# Install the required Python packages with dev dependencies +RUN uv pip install --system .[dev] pytest-asyncio httpx + +# Set environment variables +ENV PYTHONPATH=/app + +# Set test environment variables +ENV AGENT_NAME=ab130-harness-openai + +# Run the agent using uvicorn +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/10_async/00_base/130_harness_openai/README.md b/examples/tutorials/10_async/00_base/130_harness_openai/README.md new file mode 100644 index 000000000..ac439e4ed --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_harness_openai/README.md @@ -0,0 +1,33 @@ +# Async OpenAI Agents on the unified harness surface + +An async (Redis-streaming) Agentex agent that runs the OpenAI Agents SDK and +delivers its output through the **unified harness surface**. + +## What this demonstrates + +Same `OpenAITurn` adapter as the sync tutorial (`060_harness_openai`), but the +async ACP pushes the turn to the task stream via +`UnifiedEmitter.auto_send_turn` instead of yielding over HTTP. `auto_send_turn` +returns a `TurnResult` with the accumulated final text and normalized usage. 
+ +```python +result = Runner.run_streamed(starting_agent=agent, input=user_message) +turn = OpenAITurn(result=result, model="gpt-4o") +emitter = UnifiedEmitter(task_id=task_id, trace_id=task_id, parent_span_id=parent_span_id) +turn_result = await emitter.auto_send_turn(turn) +``` + +## Run it + +```bash +agentex agents run --manifest manifest.yaml +``` + +## Test it + +The offline test exercises the auto-send delivery path with an injected fake +streaming backend (no server, Redis, or API key required): + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/00_base/130_harness_openai/manifest.yaml b/examples/tutorials/10_async/00_base/130_harness_openai/manifest.yaml new file mode 100644 index 000000000..7e67675fa --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_harness_openai/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/00_base/130_harness_openai + - test_utils + dockerfile: 10_async/00_base/130_harness_openai/Dockerfile + dockerignore: 10_async/00_base/130_harness_openai/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: async + name: ab130-harness-openai + description: An async OpenAI Agents SDK agent on the unified harness surface + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "ab130-harness-openai" + description: "An async OpenAI Agents SDK agent on the unified harness 
surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/00_base/130_harness_openai/project/__init__.py b/examples/tutorials/10_async/00_base/130_harness_openai/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/00_base/130_harness_openai/project/acp.py b/examples/tutorials/10_async/00_base/130_harness_openai/project/acp.py new file mode 100644 index 000000000..fcd10cc62 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_harness_openai/project/acp.py @@ -0,0 +1,98 @@ +"""ACP handler for the async OpenAI Agents harness tutorial. + +Uses the async ACP model with Redis streaming instead of HTTP yields. The +OpenAI Agents SDK run is wrapped in an ``OpenAITurn`` and pushed to the task +stream via ``UnifiedEmitter.auto_send_turn`` — the async/temporal delivery path +of the unified harness surface. ``auto_send_turn`` returns a ``TurnResult`` +carrying the accumulated final text and normalized usage. 
+""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from agents import Runner + +from agentex.lib import adk +from project.agent import MODEL_NAME, create_agent +from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams +from agentex.lib.types.fastacp import AsyncACPConfig +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +_litellm_key = os.environ.get("LITELLM_API_KEY") +if _litellm_key and not os.environ.get("OPENAI_API_KEY"): + os.environ["OPENAI_API_KEY"] = _litellm_key + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create( + acp_type="async", + config=AsyncACPConfig(type="base"), +) + +_agent = None + + +def get_agent(): + global _agent + if _agent is None: + _agent = create_agent() + return _agent + + +@acp.on_task_create +async def handle_task_create(params: CreateTaskParams): + logger.info(f"Task created: {params.task.id}") + + +@acp.on_task_event_send +async def handle_task_event_send(params: SendEventParams): + """Handle each user message: run the agent and auto-send its turn.""" + agent = get_agent() + task_id = params.task.id + user_message = params.event.content.content + + logger.info(f"Processing message for task {task_id}") + + # Echo the user's message into the task history. 
+ await adk.messages.create(task_id=task_id, content=params.event.content) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + result = Runner.run_streamed(starting_agent=agent, input=user_message) + turn = OpenAITurn(result=result, model=MODEL_NAME) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + turn_result = await emitter.auto_send_turn(turn) + if turn_span: + turn_span.output = {"final_output": turn_result.final_text} + + +@acp.on_task_cancel +async def handle_task_canceled(params: CancelTaskParams): + logger.info(f"Task canceled: {params.task.id}") diff --git a/examples/tutorials/10_async/00_base/130_harness_openai/project/agent.py b/examples/tutorials/10_async/00_base/130_harness_openai/project/agent.py new file mode 100644 index 000000000..5b83c5aab --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_harness_openai/project/agent.py @@ -0,0 +1,43 @@ +"""OpenAI Agents SDK agent definition for the async harness tutorial. + +Identical agent shape to the sync tutorial (060). The only difference is the +delivery path in acp.py: the async ACP uses ``UnifiedEmitter.auto_send_turn`` +(Redis streaming) instead of yielding events over an HTTP response. +""" + +from __future__ import annotations + +from datetime import datetime + +from agents import Agent, function_tool, set_tracing_disabled + +from project.tools import get_weather + +set_tracing_disabled(True) + +MODEL_NAME = "gpt-4o" +INSTRUCTIONS = """You are a helpful AI assistant with access to tools. 
+ +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use the weather tool when the user asks about the weather +- Always report the real tool output back to the user +""" + + +@function_tool +def weather(city: str) -> str: + """Get the current weather for a city.""" + return get_weather(city) + + +def create_agent() -> Agent: + """Build and return the OpenAI Agents SDK agent with the weather tool.""" + return Agent( + name="Harness OpenAI Assistant", + model=MODEL_NAME, + instructions=INSTRUCTIONS.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")), + tools=[weather], + ) diff --git a/examples/tutorials/10_async/00_base/130_harness_openai/project/tools.py b/examples/tutorials/10_async/00_base/130_harness_openai/project/tools.py new file mode 100644 index 000000000..d2e5468c9 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_harness_openai/project/tools.py @@ -0,0 +1,15 @@ +"""Tool definitions for the async OpenAI Agents harness tutorial.""" + +from __future__ import annotations + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. + + Returns: + A string describing the weather conditions. 
+ """ + return f"The weather in {city} is sunny and 72°F" diff --git a/examples/tutorials/10_async/00_base/130_harness_openai/pyproject.toml b/examples/tutorials/10_async/00_base/130_harness_openai/pyproject.toml new file mode 100644 index 000000000..c05e8c1c6 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_harness_openai/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "ab130-harness-openai" +version = "0.1.0" +description = "An async OpenAI Agents SDK agent on the unified harness surface" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "openai-agents", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git a/examples/tutorials/10_async/00_base/130_harness_openai/tests/test_agent.py b/examples/tutorials/10_async/00_base/130_harness_openai/tests/test_agent.py new file mode 100644 index 000000000..ceb95dbab --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_harness_openai/tests/test_agent.py @@ -0,0 +1,77 @@ +"""Offline test for the async OpenAI Agents harness tutorial. + +This test does NOT require a running Agentex server, Redis, or an OpenAI API +key. It verifies the async delivery path this tutorial demonstrates: an +``OpenAITurn`` built from an injected canonical stream, pushed through +``UnifiedEmitter.auto_send_turn`` with an injected fake streaming backend, +returns the accumulated final text. 
+ +To run: ``pytest tests/test_agent.py -v`` +""" + +from __future__ import annotations + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn + + +class _FakeCtx: + def __init__(self, initial_content): + self.task_message = TaskMessage(id="m-1", task_id="task-1", content=initial_content) + + async def __aenter__(self): + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + pass + + async def stream_update(self, update): + return update + + +class _FakeStreaming: + def streaming_task_message_context(self, task_id, initial_content, **_kwargs): # noqa: ARG002 + return _FakeCtx(initial_content) + + +async def _canonical_stream(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_auto_send_turn_returns_final_text(): + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hel")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="lo")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = OpenAITurn(stream=_canonical_stream(events), model="gpt-4o") + emitter = UnifiedEmitter( + task_id="task-1", + trace_id=None, + parent_span_id=None, + streaming=_FakeStreaming(), + ) + + result = await emitter.auto_send_turn(turn) + assert result.final_text == "Hello" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/10_temporal/140_harness_openai/.dockerignore 
b/examples/tutorials/10_async/10_temporal/140_harness_openai/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_harness_openai/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/10_temporal/140_harness_openai/Dockerfile b/examples/tutorials/10_async/10_temporal/140_harness_openai/Dockerfile new file mode 100644 index 000000000..c107e3269 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_harness_openai/Dockerfile @@ -0,0 +1,43 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/10_temporal/140_harness_openai/pyproject.toml /app/140_harness_openai/pyproject.toml +COPY 10_async/10_temporal/140_harness_openai/README.md /app/140_harness_openai/README.md + +WORKDIR /app/140_harness_openai + +COPY 10_async/10_temporal/140_harness_openai/project /app/140_harness_openai/project +COPY 10_async/10_temporal/140_harness_openai/tests /app/140_harness_openai/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=at140-harness-openai + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] + +# When we 
deploy the worker, we will replace the CMD with the following +# CMD ["python", "-m", "run_worker"] diff --git a/examples/tutorials/10_async/10_temporal/140_harness_openai/README.md b/examples/tutorials/10_async/10_temporal/140_harness_openai/README.md new file mode 100644 index 000000000..0415ae225 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_harness_openai/README.md @@ -0,0 +1,41 @@ +# Temporal OpenAI Agents on the unified harness surface + +A Temporal-backed Agentex agent that runs the OpenAI Agents SDK and delivers its +output through the **unified harness surface**. + +## What this demonstrates + +LLM calls are non-deterministic, so they can't run directly in a Temporal +workflow. This tutorial keeps the workflow (`project/workflow.py`) +deterministic and delegates each turn to a custom activity +(`project/activities.py`). The activity uses the SAME `OpenAITurn` adapter as +the sync (`060_harness_openai`) and async (`130_harness_openai`) variants, and +delivers via `UnifiedEmitter.auto_send_turn` — which is designed to run inside +an activity (it writes streaming side effects to Redis and returns the final +text + usage). + +```python +# inside the activity: +result = Runner.run_streamed(starting_agent=agent, input=user_message) +turn = OpenAITurn(result=result, model="gpt-4o") +emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=parent_span_id) +turn_result = await emitter.auto_send_turn(turn) +return turn_result.final_text +``` + +## Run it + +```bash +agentex agents run --manifest manifest.yaml +``` + +This starts both the ACP HTTP server and the Temporal worker. 
+ +## Test it + +The offline test exercises the activity's delivery path with an injected fake +streaming backend (no server, Temporal, Redis, or API key required): + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/10_temporal/140_harness_openai/environments.yaml b/examples/tutorials/10_async/10_temporal/140_harness_openai/environments.yaml new file mode 100644 index 000000000..f90511911 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_harness_openai/environments.yaml @@ -0,0 +1,64 @@ +# Agent Environment Configuration +# ------------------------------ +# This file defines environment-specific settings for your agent. +# This DIFFERS from the manifest.yaml file in that it is used to program things that are ONLY per environment. + +# ********** EXAMPLE ********** +# schema_version: "v1" # This is used to validate the file structure and is not used by the agentex CLI +# environments: +# dev: +# auth: +# principal: +# user_id: "1234567890" +# user_name: "John Doe" +# user_email: "john.doe@example.com" +# user_role: "admin" +# user_permissions: "read, write, delete" +# helm_overrides: # This is used to override the global helm values.yaml file in the agentex-agent helm charts +# replicas: 3 +# resources: +# requests: +# cpu: "1000m" +# memory: "2Gi" +# limits: +# cpu: "2000m" +# memory: "4Gi" +# env: +# - name: LOG_LEVEL +# value: "DEBUG" +# - name: ENVIRONMENT +# value: "staging" +# +# kubernetes: +# # OPTIONAL - Otherwise it will be derived separately. However, this can be used to override the derived +# # namespace and deploy it within the same namespace that already exists for a separate agent.
+# namespace: "team-example-tutorial" +# ********** END EXAMPLE ********** + +schema_version: "v1" # This is used to validate the file structure and is not used by the agentex CLI +environments: + dev: + auth: + principal: + user_id: # TODO: Fill in + account_id: # TODO: Fill in + helm_overrides: + # This is used to override the global helm values.yaml file in the agentex-agent helm charts + replicaCount: 2 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + temporal-worker: + enabled: true + replicaCount: 2 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" \ No newline at end of file diff --git a/examples/tutorials/10_async/10_temporal/140_harness_openai/manifest.yaml b/examples/tutorials/10_async/10_temporal/140_harness_openai/manifest.yaml new file mode 100644 index 000000000..64a943438 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_harness_openai/manifest.yaml @@ -0,0 +1,62 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/10_temporal/140_harness_openai + - test_utils + dockerfile: 10_async/10_temporal/140_harness_openai/Dockerfile + dockerignore: 10_async/10_temporal/140_harness_openai/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + worker: project/run_worker.py + +agent: + acp_type: async + name: at140-harness-openai + description: A Temporal-backed OpenAI Agents SDK agent on the unified harness surface + + temporal: + enabled: true + workflows: + - name: at140-harness-openai + queue_name: at140_harness_openai_queue + + credentials: + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: 
account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "at140-harness-openai" + description: "A Temporal-backed OpenAI Agents SDK agent on the unified harness surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/140_harness_openai/project/__init__.py b/examples/tutorials/10_async/10_temporal/140_harness_openai/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/10_temporal/140_harness_openai/project/acp.py b/examples/tutorials/10_async/10_temporal/140_harness_openai/project/acp.py new file mode 100644 index 000000000..6076835ba --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_harness_openai/project/acp.py @@ -0,0 +1,33 @@ +"""ACP server for the Temporal OpenAI Agents harness tutorial. + +Thin by design: with ``acp_type="async"`` + ``TemporalACPConfig``, FastACP +auto-wires task/create, task/event/send, and task/cancel onto the workflow. +The agent logic lives in ``project/workflow.py`` (deterministic) and +``project/activities.py`` (the harness-backed LLM run), executed by the worker +in ``project/run_worker.py``. +""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from agentex.lib.types.fastacp import TemporalACPConfig +from agentex.lib.sdk.fastacp.fastacp import FastACP + +# LiteLLM proxy auth: copy LITELLM_API_KEY to OPENAI_API_KEY for OpenAI client +# compatibility, so the same example works behind the Scale LiteLLM gateway. 
+_litellm_key = os.environ.get("LITELLM_API_KEY") +if _litellm_key and not os.environ.get("OPENAI_API_KEY"): + os.environ["OPENAI_API_KEY"] = _litellm_key + +acp = FastACP.create( + acp_type="async", + config=TemporalACPConfig( + type="temporal", + temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), + ), +) diff --git a/examples/tutorials/10_async/10_temporal/140_harness_openai/project/activities.py b/examples/tutorials/10_async/10_temporal/140_harness_openai/project/activities.py new file mode 100644 index 000000000..a70ee0c5d --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_harness_openai/project/activities.py @@ -0,0 +1,75 @@ +"""Custom Temporal activity that runs the OpenAI agent on the harness surface. + +LLM calls are non-deterministic, so they must run inside a Temporal activity +rather than directly in the workflow. This activity runs the OpenAI Agents SDK +via ``Runner.run_streamed``, wraps the result in an ``OpenAITurn``, and pushes +the canonical stream to the task stream via ``UnifiedEmitter.auto_send_turn``. + +``auto_send`` (which backs ``auto_send_turn``) is explicitly designed to be +called from inside an activity: it writes streaming side effects to Redis and +returns the accumulated final text + normalized usage. 
class RunHarnessAgentParams(BaseModel):
    """Input payload for the ``run_harness_openai_agent`` activity."""

    # Task whose stream receives this turn; passed to the emitter as task_id.
    task_id: str
    # The new user message, appended to the history as a ``user`` item.
    user_message: str
    # Prior conversation as OpenAI Agents SDK input items, so the agent sees the
    # whole history on every turn. Empty by default (first turn).
    input_list: list[Any] = []
    # Optional tracing context, forwarded unchanged to the emitter.
    trace_id: str | None = None
    parent_span_id: str | None = None


class RunHarnessAgentResult(BaseModel):
    """Output payload of one harness turn."""

    # Final text reported by the emitter for this turn.
    final_text: str
    # Conversation including this turn, to be fed into the next turn.
    input_list: list[Any]


class HarnessActivities:
    """Container for the Temporal activity that runs the OpenAI agent."""

    @activity.defn(name=RUN_HARNESS_AGENT_ACTIVITY)
    async def run_harness_openai_agent(self, params: RunHarnessAgentParams) -> RunHarnessAgentResult:
        """Execute one agent turn and deliver it through ``UnifiedEmitter``.

        The prior history in ``params.input_list`` plus the new user message is
        handed to ``Runner.run_streamed``; the streamed run is wrapped in an
        ``OpenAITurn`` and sent with ``auto_send_turn``.

        Args:
            params: Task id, user message, prior conversation and optional
                tracing identifiers for this turn.

        Returns:
            The turn's final text together with the updated conversation taken
            from ``to_input_list()`` on the streamed run.
        """
        logger.info(f"Running harness OpenAI agent for task {params.task_id}")

        agent = create_agent()

        # Copy the history so the incoming params object is never mutated.
        conversation: list[Any] = list(params.input_list)
        conversation.append({"role": "user", "content": params.user_message})

        streamed_run = Runner.run_streamed(starting_agent=agent, input=conversation)
        turn = OpenAITurn(result=streamed_run, model=MODEL_NAME)
        emitter = UnifiedEmitter(
            task_id=params.task_id,
            trace_id=params.trace_id,
            parent_span_id=params.parent_span_id,
        )
        delivered = await emitter.auto_send_turn(turn)

        # to_input_list() is read only after auto_send_turn returns, i.e. once
        # the stream has been consumed.
        return RunHarnessAgentResult(
            final_text=delivered.final_text,
            input_list=streamed_run.to_input_list(),
        )
# NOTE(review): the docstring below is kept verbatim on purpose; function_tool
# presumably uses it as the tool description shown to the model - confirm
# before rewording.
@function_tool
def weather(city: str) -> str:
    """Get the current weather for a city."""
    report = get_weather(city)
    return report


def create_agent() -> Agent:
    """Construct the tutorial's OpenAI Agents SDK agent.

    The instructions template is filled with the current local time formatted
    as ``YYYY-MM-DD HH:MM:SS`` at call time, so each call yields an agent with
    a fresh timestamp.

    Returns:
        An ``Agent`` named "Harness OpenAI Assistant" that uses ``MODEL_NAME``
        and exposes the ``weather`` tool.
    """
    now_text = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    prompt = INSTRUCTIONS.format(timestamp=now_text)
    return Agent(
        name="Harness OpenAI Assistant",
        model=MODEL_NAME,
        instructions=prompt,
        tools=[weather],
    )
async def main():
    """Start the Temporal worker for this tutorial.

    Calls ``setup_debug_if_enabled()``, resolves the task queue from the
    ``WORKFLOW_TASK_QUEUE`` environment setting, and runs an ``AgentexWorker``
    with the custom harness activity followed by everything returned from
    ``get_all_activities()``, serving ``At140HarnessOpenaiWorkflow``.

    Raises:
        ValueError: If ``WORKFLOW_TASK_QUEUE`` is not set.
    """
    setup_debug_if_enabled()

    queue = environment_variables.WORKFLOW_TASK_QUEUE
    if queue is None:
        raise ValueError("WORKFLOW_TASK_QUEUE is not set")

    # Custom harness activity first, then the built-in Agentex activities.
    registered = [HarnessActivities().run_harness_openai_agent]
    registered.extend(get_all_activities())

    worker = AgentexWorker(task_queue=queue)
    await worker.run(
        activities=registered,
        workflow=At140HarnessOpenaiWorkflow,
    )
def get_weather(city: str) -> str:
    """Return a canned weather report for ``city``.

    No lookup is performed: every city gets the same fixed "sunny and 72°F"
    report with the city name interpolated.

    Args:
        city: The name of the city to report on.

    Returns:
        A sentence describing the weather conditions in ``city``.
    """
    report = f"The weather in {city} is sunny and 72°F"
    return report
@workflow.defn(name=environment_variables.WORKFLOW_NAME)
class At140HarnessOpenaiWorkflow(BaseWorkflow):
    """Conversation workflow that hands every user turn to the harness activity.

    State kept on the instance: a completion flag, a turn counter used for span
    names, and the running conversation passed to (and replaced by the result
    of) each activity execution.
    """

    def __init__(self):
        super().__init__(display_name=environment_variables.AGENT_NAME)
        # Flipped by complete_task_signal; on_task_create waits on it.
        self._complete_task = False
        # Incremented per received event; used in the "Turn N" span name.
        self._turn_number = 0
        # Running conversation (OpenAI Agents SDK input items) so every turn
        # sees the full history rather than only the latest message.
        self._messages: list = []

    @workflow.signal(name=SignalName.RECEIVE_EVENT)
    async def on_task_event_send(self, params: SendEventParams) -> None:
        """Echo the incoming user message, then run one harness turn.

        The turn runs inside a tracing span named after the turn number. The
        activity is executed with a 5-minute start-to-close timeout and a retry
        policy of at most 3 attempts; its returned conversation replaces the
        stored history.
        """
        task_id = params.task.id
        logger.info(f"Received task event: {task_id}")
        self._turn_number += 1

        # Echo the user's message so it appears in the UI as a chat bubble.
        await adk.messages.create(task_id=task_id, content=params.event.content)

        user_text = params.event.content.content
        async with adk.tracing.span(
            trace_id=task_id,
            task_id=task_id,
            name=f"Turn {self._turn_number}",
            input={"message": user_text},
        ) as span:
            activity_params = RunHarnessAgentParams(
                task_id=task_id,
                user_message=user_text,
                input_list=self._messages,
                trace_id=task_id,
                parent_span_id=span.id if span else None,
            )
            turn_result = await workflow.execute_activity(
                RUN_HARNESS_AGENT_ACTIVITY,
                activity_params,
                start_to_close_timeout=timedelta(minutes=5),
                retry_policy=RetryPolicy(maximum_attempts=3),
                result_type=RunHarnessAgentResult,
            )
            # Carry the updated conversation into the next turn.
            self._messages = turn_result.input_list
            if span:
                span.output = {"final_output": turn_result.final_text}

    @workflow.run
    async def on_task_create(self, params: CreateTaskParams) -> str:
        """Workflow entry point: greet, then wait until completion is signalled.

        Returns:
            The literal string "Task completed" once ``_complete_task`` is set.
        """
        logger.info(f"Task created: {params.task.id}")

        greeting = (
            f"Task initialized with params:\n{json.dumps(params.params, indent=2)}\n"
            f"Send me a message and I'll respond using an OpenAI Agents SDK agent "
            f"delivered through the unified harness surface."
        )
        await adk.messages.create(
            task_id=params.task.id,
            content=TextContent(author="agent", content=greeting),
        )

        # No timeout: the workflow stays open for signals until told to finish.
        await workflow.wait_condition(lambda: self._complete_task, timeout=None)
        return "Task completed"

    @workflow.signal
    async def complete_task_signal(self) -> None:
        """Mark the task complete so ``on_task_create`` can return."""
        logger.info("Received complete_task signal")
        self._complete_task = True
class _FakeCtx:
    """Stand-in for a streaming message context; performs no I/O."""

    def __init__(self, initial_content):
        # Fixed ids: the test only needs a TaskMessage carrying the content.
        self.task_message = TaskMessage(id="m-1", task_id="task-1", content=initial_content)

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc_info):
        # Close on exit and return False so exceptions are not suppressed.
        await self.close()
        return False

    async def close(self):
        return None

    async def stream_update(self, update):
        # Hand the update straight back; nothing is sent anywhere.
        return update


class _FakeStreaming:
    """Fake streaming backend that hands out ``_FakeCtx`` instances."""

    def streaming_task_message_context(self, task_id, initial_content, **_kwargs):  # noqa: ARG002
        ctx = _FakeCtx(initial_content)
        return ctx


async def _canonical_stream(events):
    """Yield the given events one by one as an async iterator."""
    for item in events:
        yield item


@pytest.mark.asyncio
async def test_activity_delivery_returns_final_text():
    """Text deltas pushed through ``auto_send_turn`` come back concatenated."""
    chunks = ["72", "F"]
    events = [
        StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")),
        *(
            StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta=chunk))
            for chunk in chunks
        ),
        StreamTaskMessageDone(type="done", index=0),
    ]
    turn = OpenAITurn(stream=_canonical_stream(events), model="gpt-4o")
    emitter = UnifiedEmitter(
        task_id="task-1",
        trace_id=None,
        parent_span_id=None,
        streaming=_FakeStreaming(),
    )

    result = await emitter.auto_send_turn(turn)
    assert result.final_text == "72F"
def openai_usage_to_turn_usage(usage: Usage | None, model: str | None) -> TurnUsage:
    """Translate an ``agents.Usage`` into a provider-independent ``TurnUsage``.

    Every field is read with ``getattr(..., None)`` so a usage object that lacks
    a field (or a details sub-object) yields ``None`` for it instead of raising.
    Token counts are passed through untouched, so a real ``0`` stays ``0``.
    Only ``num_llm_calls`` is normalized: a missing or falsy ``requests`` value
    becomes ``0``.

    Args:
        usage: Aggregated usage for the run, or ``None`` when none is available.
        model: Model name recorded on the result.

    Returns:
        A ``TurnUsage`` carrying only ``model`` when ``usage`` is ``None``,
        otherwise one populated from ``usage``.
    """
    if usage is None:
        return TurnUsage(model=model)

    def _read(source: Any, field: str) -> Any:
        # getattr on a None source simply falls back to the None default.
        return getattr(source, field, None)

    in_details = _read(usage, "input_tokens_details")
    out_details = _read(usage, "output_tokens_details")
    request_count = _read(usage, "requests")

    return TurnUsage(
        model=model,
        num_llm_calls=request_count if request_count else 0,
        input_tokens=_read(usage, "input_tokens"),
        cached_input_tokens=_read(in_details, "cached_tokens"),
        output_tokens=_read(usage, "output_tokens"),
        reasoning_tokens=_read(out_details, "reasoning_tokens"),
        total_tokens=_read(usage, "total_tokens"),
    )


def _aggregate_usage(raw_responses: list[ModelResponse]) -> Usage | None:
    """Combine the ``usage`` of every response into a single ``Usage``.

    Responses without a ``usage`` attribute, or with it set to ``None``, are
    skipped.

    Returns:
        ``None`` when no response carries usage; otherwise a fresh ``Usage``
        into which each response's usage was merged via ``Usage.add``.
    """
    present = [
        resp_usage
        for resp_usage in (getattr(response, "usage", None) for response in raw_responses)
        if resp_usage is not None
    ]
    if not present:
        return None

    combined = Usage()
    for resp_usage in present:
        combined.add(resp_usage)
    return combined
class OpenAITurn:
    """One OpenAI Agents SDK turn exposed as a canonical event stream plus usage.

    Supply at least one of:
    - ``result``: a ``RunResultStreaming`` from ``Runner.run_streamed``. Its
      ``stream_events()`` is converted with ``convert_openai_to_agentex_events``
      and, once that stream is exhausted, ``raw_responses`` is read to compute
      usage.
    - ``stream``: an async iterator of already-canonical ``StreamTaskMessage``
      events, yielded as-is. Usage then stays at ``TurnUsage(model=...)``
      because there is no run to read usage from.

    Omitting both raises ``ValueError``. If both are passed, ``stream`` is the
    one iterated and ``result`` is never read.

    ``coalesce_tool_requests`` exists for API parity with other provider turns
    and is ignored here.
    """

    def __init__(
        self,
        result: RunResultStreaming | None = None,
        model: str | None = None,
        stream: AsyncIterator[StreamTaskMessage] | None = None,
        coalesce_tool_requests: bool = False,  # noqa: ARG002 - API parity, no-op for OpenAI
    ) -> None:
        if result is None and stream is None:
            raise ValueError("OpenAITurn requires either `result` or `stream`")
        self._result = result
        self._model = model
        self._stream = stream
        # Placeholder until the result-backed stream has been fully consumed.
        self._usage: TurnUsage = TurnUsage(model=model)

    @property
    def events(self) -> AsyncIterator[StreamTaskMessage]:
        """A new async iterator over the turn's canonical events."""
        return self._iter_events()

    async def _iter_events(self) -> AsyncIterator[StreamTaskMessage]:
        """Yield canonical events; refresh usage after a result-backed run."""
        if self._stream is not None:
            # Pre-built canonical stream: pass through, no usage to compute.
            async for event in self._stream:
                yield event
            return

        run = self._result
        assert run is not None  # guaranteed by __init__
        async for event in convert_openai_to_agentex_events(run.stream_events()):
            yield event

        # Reached only when the stream was consumed to the end: raw_responses
        # is read now to aggregate and normalize usage.
        self._usage = self._collect_usage(run)

    def _collect_usage(self, run: Any) -> TurnUsage:
        """Aggregate usage from ``run.raw_responses``; never raise on failure."""
        try:
            responses: list[Any] = list(getattr(run, "raw_responses", None) or [])
            return openai_usage_to_turn_usage(_aggregate_usage(responses), self._model)
        except Exception as exc:  # pragma: no cover - defensive: never break delivery on usage
            logger.warning(f"Failed to aggregate OpenAI usage: {exc}")
            return TurnUsage(model=self._model)

    def usage(self) -> TurnUsage:
        """Normalized turn usage. Valid only after ``events`` is exhausted."""
        return self._usage
+ + .. deprecated:: + Prefer the unified harness surface for new OpenAI Agents integrations: + wrap a ``Runner.run_streamed`` result in + ``agentex.lib.adk.providers._modules.openai_turn.OpenAITurn`` and drive + delivery + tracing through ``UnifiedEmitter`` (see the + ``060_harness_openai`` / ``130_harness_openai`` / ``140_harness_openai`` + tutorials). This per-model tracing wrapper predates the harness and is + retained only for backwards compatibility; it will be removed in a + future release. No runtime warning is emitted. + """ - def __init__(self, original_model: Model, trace_id: str | None = None, parent_span_id: str | None = None, tracer: AsyncTracer | None = None): + def __init__( + self, + original_model: Model, + trace_id: str | None = None, + parent_span_id: str | None = None, + tracer: AsyncTracer | None = None, + ): """Initialize with the original OpenAI model to wrap. Args: original_model: The OpenAI model instance to wrap @@ -147,7 +168,7 @@ async def get_response( } # Only add conversation_id if the model supports it - if hasattr(self.original_model, 'supports_conversation_id'): + if hasattr(self.original_model, "supports_conversation_id"): kwargs["conversation_id"] = conversation_id response = await self.original_model.get_response(**kwargs) @@ -158,12 +179,12 @@ async def get_response( final_output = None # Extract final output text from response - response_final_output = getattr(response, 'final_output', None) + response_final_output = getattr(response, "final_output", None) if response_final_output: final_output = response_final_output # Extract items from the response output - response_output = getattr(response, 'output', None) + response_output = getattr(response, "output", None) if response_output: output_items = response_output if isinstance(response_output, list) else [response_output] @@ -174,12 +195,12 @@ async def get_response( new_items.append(item_dict) # Extract final_output from message type if available - if item_dict.get('type') 
== 'message' and not final_output: - content = item_dict.get('content', []) + if item_dict.get("type") == "message" and not final_output: + content = item_dict.get("content", []) if content and isinstance(content, list): for content_part in content: - if isinstance(content_part, dict) and 'text' in content_part: - final_output = content_part['text'] + if isinstance(content_part, dict) and "text" in content_part: + final_output = content_part["text"] break except Exception as e: logger.warning(f"Failed to serialize item in get_response: {e}") @@ -207,7 +228,7 @@ async def get_response( } # Only add conversation_id if the model supports it - if hasattr(self.original_model, 'supports_conversation_id'): + if hasattr(self.original_model, "supports_conversation_id"): kwargs["conversation_id"] = conversation_id return await self.original_model.get_response(**kwargs) @@ -266,7 +287,7 @@ async def stream_response( } # Only add conversation_id if the model supports it - if hasattr(self.original_model, 'supports_conversation_id'): + if hasattr(self.original_model, "supports_conversation_id"): stream_kwargs["conversation_id"] = conversation_id # Get the stream response from the original model and yield each event @@ -277,11 +298,11 @@ async def stream_response( final_response_text = "" async for event in stream_response: - event_type = getattr(event, 'type', 'no-type') + event_type = getattr(event, "type", "no-type") # Handle response.output_item.done events which contain completed items - if event_type == 'response.output_item.done': - item = getattr(event, 'item', None) + if event_type == "response.output_item.done": + item = getattr(event, "item", None) if item is not None: try: item_dict = _serialize_item(item) @@ -289,12 +310,12 @@ async def stream_response( new_items.append(item_dict) # Update final_response_text from message type if available - if item_dict.get('type') == 'message': - content = item_dict.get('content', []) + if item_dict.get("type") == "message": + 
content = item_dict.get("content", []) if content and isinstance(content, list): for content_part in content: - if isinstance(content_part, dict) and 'text' in content_part: - final_response_text = content_part['text'] + if isinstance(content_part, dict) and "text" in content_part: + final_response_text = content_part["text"] break except Exception as e: logger.warning(f"Failed to serialize item in stream_response: {e}") @@ -326,7 +347,7 @@ async def stream_response( } # Only add conversation_id if the model supports it - if hasattr(self.original_model, 'supports_conversation_id'): + if hasattr(self.original_model, "supports_conversation_id"): stream_kwargs["conversation_id"] = conversation_id # Get the stream response from the original model and yield each event @@ -336,8 +357,17 @@ async def stream_response( async for event in stream_response: yield event + class SyncStreamingProvider(OpenAIProvider): - """Simple OpenAI provider wrapper that adds logging to streaming and supports tracing.""" + """Simple OpenAI provider wrapper that adds logging to streaming and supports tracing. + + .. deprecated:: + Prefer the unified harness surface for new OpenAI Agents integrations + (see :class:`SyncStreamingModel` and the ``OpenAITurn`` + + ``UnifiedEmitter`` pattern). This provider wrapper predates the harness + and is retained only for backwards compatibility; it will be removed in + a future release. No runtime warning is emitted. + """ def __init__(self, trace_id: str | None = None, parent_span_id: str | None = None, *args, **kwargs): """Initialize the provider with tracing support. 
@@ -405,6 +435,7 @@ def _extract_tool_call_info(tool_call_item: Any) -> tuple[str, str, dict[str, An if tool_call_item.arguments: if isinstance(tool_call_item.arguments, str): import json + tool_arguments = json.loads(tool_call_item.arguments) if tool_call_item.arguments else {} else: tool_arguments = tool_call_item.arguments @@ -418,6 +449,7 @@ def _extract_tool_call_info(tool_call_item: Any) -> tuple[str, str, dict[str, An arguments = tool_call_item.arguments if isinstance(arguments, str): import json + tool_arguments = json.loads(arguments) if arguments else {} elif arguments is None: tool_arguments = {} @@ -466,11 +498,11 @@ def _extract_tool_response_info(tool_map: dict[str, Any], tool_output_item: Any) async def convert_openai_to_agentex_events(stream_response): """Convert OpenAI streaming events to AgentEx TaskMessageUpdate events with reasoning support. - + This is an enhanced version of the base converter that includes support for: - Reasoning content deltas (for o1 models) - Reasoning summary deltas (for o1 models) - + Args: stream_response: An async iterator of OpenAI streaming events Yields: @@ -488,8 +520,8 @@ async def convert_openai_to_agentex_events(stream_response): event_count += 1 # Check for raw response events which contain the actual OpenAI streaming events - if hasattr(event, 'type') and event.type == 'raw_response_event': - if hasattr(event, 'data'): + if hasattr(event, "type") and event.type == "raw_response_event": + if hasattr(event, "data"): raw_event = event.data # Check for ResponseOutputItemAddedEvent which signals a new message starting @@ -504,7 +536,7 @@ async def convert_openai_to_agentex_events(stream_response): if item_id in item_id_to_index: # Get the message type to decide whether to send done event message_type = item_id_to_type.get(item_id, "text") - + # Don't send done events for reasoning content/summary # They just end with their last delta if message_type not in ("reasoning_content", "reasoning_summary"): @@ -608,7 
+640,7 @@ async def convert_openai_to_agentex_events(stream_response): # Check if this is a text delta event from OpenAI elif isinstance(raw_event, ResponseTextDeltaEvent): # Check if this event has an item_id - item_id = getattr(raw_event, 'item_id', None) + item_id = getattr(raw_event, "item_id", None) # If this is a new item_id we haven't seen, it's a new message if item_id and item_id not in item_id_to_index: @@ -647,13 +679,13 @@ async def convert_openai_to_agentex_events(stream_response): ) yield delta_message - elif hasattr(event, 'type') and event.type == 'run_item_stream_event': + elif hasattr(event, "type") and event.type == "run_item_stream_event": # Skip reasoning_item events - they're handled via raw_response_event above - if hasattr(event, 'item') and event.item.type == 'reasoning_item': + if hasattr(event, "item") and event.item.type == "reasoning_item": continue # Check for tool_call_item type (this is when a tool is being called) - elif hasattr(event, 'item') and event.item.type == 'tool_call_item': + elif hasattr(event, "item") and event.item.type == "tool_call_item": # Extract tool call information using the helper method call_id, tool_name, tool_arguments = _extract_tool_call_info(event.item.raw_item) tool_map[call_id] = tool_name @@ -671,7 +703,7 @@ async def convert_openai_to_agentex_events(stream_response): ) # Check for tool_call_output_item type (this is when a tool returns output) - elif hasattr(event, 'item') and event.item.type == 'tool_call_output_item': + elif hasattr(event, "item") and event.item.type == "tool_call_output_item": # Extract tool response information using the helper method call_id, tool_name, content = _extract_tool_response_info(tool_map, event.item.raw_item) tool_response_content = ToolResponseContent( @@ -687,4 +719,3 @@ async def convert_openai_to_agentex_events(stream_response): index=message_index, content=tool_response_content, ) - diff --git a/src/agentex/lib/core/services/adk/providers/openai.py 
b/src/agentex/lib/core/services/adk/providers/openai.py index 75e507d8a..1ae29589d 100644 --- a/src/agentex/lib/core/services/adk/providers/openai.py +++ b/src/agentex/lib/core/services/adk/providers/openai.py @@ -14,15 +14,8 @@ from agents.guardrail import InputGuardrail, OutputGuardrail from agents.exceptions import InputGuardrailTripwireTriggered, OutputGuardrailTripwireTriggered from openai.types.responses import ( - ResponseCompletedEvent, - ResponseTextDeltaEvent, - ResponseFunctionToolCall, ResponseFunctionWebSearch, - ResponseOutputItemDoneEvent, ResponseCodeInterpreterToolCall, - ResponseReasoningSummaryPartDoneEvent, - ResponseReasoningSummaryPartAddedEvent, - ResponseReasoningSummaryTextDeltaEvent, ) # Local imports @@ -31,24 +24,14 @@ from agentex.lib.utils.mcp import redact_mcp_server_params from agentex.lib.utils.temporal import heartbeat_if_in_workflow from agentex.lib.core.tracing.tracer import AsyncTracer -from agentex.types.task_message_delta import ( - TextDelta, - ReasoningSummaryDelta, -) -from agentex.types.task_message_update import ( - StreamTaskMessageFull, - StreamTaskMessageDelta, -) +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_update import StreamTaskMessageFull from agentex.types.task_message_content import ( TextContent, - ReasoningContent, ToolRequestContent, ToolResponseContent, ) -from agentex.lib.core.services.adk.streaming import ( - StreamingService, - StreamingTaskMessageContext, -) +from agentex.lib.core.services.adk.streaming import StreamingService logger = logging.make_logger(__name__) @@ -695,7 +678,7 @@ async def run_agent_streamed_auto_send( input_guardrails: list[InputGuardrail] | None = None, output_guardrails: list[OutputGuardrail] | None = None, max_turns: int | None = None, - previous_response_id: str | None = None, # noqa: ARG002 + previous_response_id: str | None = None, created_at: datetime | None = None, ) -> RunResultStreaming: """ @@ -733,8 +716,6 @@ async def 
run_agent_streamed_auto_send( if self.agentex_client is None: raise ValueError("Agentex client must be provided for auto_send methods") - tool_call_map: dict[str, ResponseFunctionToolCall] = {} - if self.tracer is None: raise RuntimeError("Tracer not initialized - ensure tracer is provided to OpenAIService") trace = self.tracer.trace(trace_id) @@ -761,12 +742,13 @@ async def run_agent_streamed_auto_send( ) as span: heartbeat_if_in_workflow("run agent streamed auto send") - # Consume the workflow-supplied created_at on the FIRST message - # opened by this activity (whichever streaming context opens first - # for this turn). That's the message that races the workflow's - # user-echo at the server. Subsequent messages in the same turn are - # separated by network/processing latency and rely on the server's - # wall clock. + # AGX1-378 restored: created_at is now threaded through + # UnifiedEmitter.auto_send_turn -> auto_send -> every + # streaming_task_message_context call, so the first agent message of + # the turn is stamped with the workflow-supplied timestamp (e.g. + # workflow.now()) just as the original inline loop did. + # The dispenser is still used below for guardrail-rejection messages, + # which open their own streaming contexts directly. _take_created_at = _make_created_at_dispenser(created_at) async with mcp_server_context(mcp_server_params, mcp_timeout_seconds) as servers: @@ -803,204 +785,48 @@ async def run_agent_streamed_auto_send( agent = Agent(**agent_kwargs) - # Run with streaming - if max_turns is not None: + # Run with streaming. Forward previous_response_id so callers that + # continue a Responses-API conversation resume the prior response + # instead of silently starting a fresh one (mirrors the non-auto-send + # run_agent_streamed path). 
+ if max_turns is not None and previous_response_id is not None: + result = Runner.run_streamed( + starting_agent=agent, + input=input_list, + max_turns=max_turns, + previous_response_id=previous_response_id, + ) + elif max_turns is not None: result = Runner.run_streamed(starting_agent=agent, input=input_list, max_turns=max_turns) + elif previous_response_id is not None: + result = Runner.run_streamed( + starting_agent=agent, input=input_list, previous_response_id=previous_response_id + ) else: result = Runner.run_streamed(starting_agent=agent, input=input_list) - item_id_to_streaming_context: dict[str, StreamingTaskMessageContext] = {} - unclosed_item_ids: set[str] = set() - # Simple string to accumulate reasoning summary - current_reasoning_summary: str = "" + # Migrate onto the unified harness surface: wrap the streamed run + # as an OpenAITurn (provider -> canonical StreamTaskMessage* + # adapter) and let UnifiedEmitter.auto_send_turn drive delivery + + # tracing + usage. The previous ~270-line inline loop that hand- + # rolled per-item streaming contexts, reasoning handling, and + # span derivation now lives in the shared harness modules. + # Imported lazily: openai_turn pulls in agentex.lib.adk, which + # imports this service module, so an eager import would create a + # circular import at package init. 
+ from agentex.lib.adk.providers._modules.openai_turn import OpenAITurn + + turn = OpenAITurn(result=result, model=model) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=trace_id, + parent_span_id=parent_span_id, + tracer=self.tracer, + streaming=self.streaming_service, + ) try: - # Process streaming events with TaskMessage creation - async for event in result.stream_events(): - heartbeat_if_in_workflow("processing stream event with auto send") - - if event.type == "run_item_stream_event": - if event.item.type == "tool_call_item": - tool_call_item = event.item.raw_item - - # Extract tool call information using the helper method - call_id, tool_name, tool_arguments = self._extract_tool_call_info(tool_call_item) - tool_call_map[call_id] = tool_call_item - - tool_request_content = ToolRequestContent( - author="agent", - tool_call_id=call_id, - name=tool_name, - arguments=tool_arguments, - ) - - # Create tool request using streaming context (immediate completion) - async with self.streaming_service.streaming_task_message_context( - task_id=task_id, - initial_content=tool_request_content, - created_at=_take_created_at(), - ) as streaming_context: - # The message has already been persisted, but we still need to send an upda - await streaming_context.stream_update( - update=StreamTaskMessageFull( - parent_task_message=streaming_context.task_message, - content=tool_request_content, - type="full", - ), - ) - - elif event.item.type == "tool_call_output_item": - tool_output_item = event.item.raw_item - - # Extract tool response information using the helper method - call_id, tool_name, content = self._extract_tool_response_info( - tool_call_map, tool_output_item - ) - - tool_response_content = ToolResponseContent( - author="agent", - tool_call_id=call_id, - name=tool_name, - content=content, - ) - - # Create tool response using streaming context (immediate completion) - async with self.streaming_service.streaming_task_message_context( - task_id=task_id, - 
initial_content=tool_response_content, - created_at=_take_created_at(), - ) as streaming_context: - # The message has already been persisted, but we still need to send an update - await streaming_context.stream_update( - update=StreamTaskMessageFull( - parent_task_message=streaming_context.task_message, - content=tool_response_content, - type="full", - ), - ) - - elif event.type == "raw_response_event": - if isinstance(event.data, ResponseTextDeltaEvent): - # Handle text delta - item_id = event.data.item_id - - # Check if we already have a streaming context for this item - if item_id not in item_id_to_streaming_context: - # Create a new streaming context for this item - streaming_context = self.streaming_service.streaming_task_message_context( - task_id=task_id, - initial_content=TextContent( - author="agent", - content="", - ), - created_at=_take_created_at(), - ) - # Open the streaming context - item_id_to_streaming_context[item_id] = await streaming_context.open() - unclosed_item_ids.add(item_id) - else: - streaming_context = item_id_to_streaming_context[item_id] - - # Stream the delta through the streaming service - await streaming_context.stream_update( - update=StreamTaskMessageDelta( - parent_task_message=streaming_context.task_message, - delta=TextDelta(text_delta=event.data.delta, type="text"), - type="delta", - ), - ) - # Reasoning step one: new summary part added - elif isinstance(event.data, ResponseReasoningSummaryPartAddedEvent): - # We need to create a new streaming context for this reasoning item - item_id = event.data.item_id - - # Reset the reasoning summary string - current_reasoning_summary = "" - - streaming_context = self.streaming_service.streaming_task_message_context( - task_id=task_id, - initial_content=ReasoningContent( - author="agent", - summary=[], - content=[], - type="reasoning", - style="active", - ), - created_at=_take_created_at(), - ) - - # Replace the existing streaming context (if it exists) - # Why do we replace? 
Cause all the reasoning parts use the same item_id! - item_id_to_streaming_context[item_id] = await streaming_context.open() - unclosed_item_ids.add(item_id) - - # Reasoning step two: handling summary text delta - elif isinstance(event.data, ResponseReasoningSummaryTextDeltaEvent): - # Accumulate the delta into the string - current_reasoning_summary += event.data.delta - streaming_context = item_id_to_streaming_context[item_id] - - # Stream the summary delta through the streaming service - await streaming_context.stream_update( - update=StreamTaskMessageDelta( - parent_task_message=streaming_context.task_message, - delta=ReasoningSummaryDelta( - summary_index=event.data.summary_index, - summary_delta=event.data.delta, - type="reasoning_summary", - ), - type="delta", - ), - ) - - # Reasoning step three: handling summary text done, closing the streaming context - elif isinstance(event.data, ResponseReasoningSummaryPartDoneEvent): - # Handle reasoning summary text completion - streaming_context = item_id_to_streaming_context[item_id] - - # Create the complete reasoning content with the accumulated summary - complete_reasoning_content = ReasoningContent( - author="agent", - summary=[current_reasoning_summary], - content=[], - type="reasoning", - style="static", - ) - - # Send a full message update with the complete reasoning content - await streaming_context.stream_update( - update=StreamTaskMessageFull( - parent_task_message=streaming_context.task_message, - content=complete_reasoning_content, - type="full", - ), - ) - - await streaming_context.close() - unclosed_item_ids.discard(item_id) - - elif isinstance(event.data, ResponseOutputItemDoneEvent): - # Handle item completion - item_id = event.data.item.id - - # Finish the streaming context (sends DONE event and updates message) - if item_id in item_id_to_streaming_context: - streaming_context = item_id_to_streaming_context[item_id] - await streaming_context.close() - if item_id in unclosed_item_ids: - 
unclosed_item_ids.remove(item_id) - - elif isinstance(event.data, ResponseCompletedEvent): - # All items complete, finish all remaining streaming contexts for this session - # Create a copy to avoid modifying set during iteration - remaining_items = list(unclosed_item_ids) - for item_id in remaining_items: - if ( - item_id in unclosed_item_ids and item_id in item_id_to_streaming_context - ): # Check if still unclosed - streaming_context = item_id_to_streaming_context[item_id] - await streaming_context.close() - unclosed_item_ids.discard(item_id) + await emitter.auto_send_turn(turn, created_at=created_at) except InputGuardrailTripwireTriggered as e: # Handle guardrail trigger by sending a rejection message @@ -1080,18 +906,6 @@ async def run_agent_streamed_auto_send( # Re-raise to let the activity handle it raise - finally: - # Cleanup: ensure all streaming contexts for this session are properly finished - # Create a copy to avoid modifying set during iteration - remaining_items = list(unclosed_item_ids) - for item_id in remaining_items: - if ( - item_id in unclosed_item_ids and item_id in item_id_to_streaming_context - ): # Check if still unclosed - streaming_context = item_id_to_streaming_context[item_id] - await streaming_context.close() - unclosed_item_ids.discard(item_id) - if span: span.output = { "new_items": [ diff --git a/tests/lib/adk/providers/test_openai_activities.py b/tests/lib/adk/providers/test_openai_activities.py index c933b6ce4..2f89308a9 100644 --- a/tests/lib/adk/providers/test_openai_activities.py +++ b/tests/lib/adk/providers/test_openai_activities.py @@ -335,23 +335,61 @@ async def mock_stream_events(): expected_params.tools = [CodeInterpreterTool(tool_config={"type": "code_interpreter"})] self._assert_starting_agent_params(starting_agent, expected_params) - # Verify streaming context received tool request and response updates - # Should have been called twice - once for tool request, once for response - assert 
mock_streaming_context.stream_update.call_count == 2 + # Under the unified harness, the OpenAI events are converted to canonical + # StreamTaskMessageFull events and auto_send posts each full tool message + # by opening a streaming context with the content as initial_content and + # closing it (no stream_update). So assert on the opened contents. + opened = mock_streaming_context.opened_contents + tool_contents = [c for c in opened if getattr(c, "type", None) in ("tool_request", "tool_response")] + assert len(tool_contents) == 2 + + # First opened context is the tool request. + first = tool_contents[0] + assert first.type == "tool_request" + assert first.name == "code_interpreter" + assert first.tool_call_id == "code_interpreter_call_123" + + # Second opened context is the tool response. + second = tool_contents[1] + assert second.type == "tool_response" + assert second.tool_call_id == "code_interpreter_call_123" - # First call should be tool request - first_call = mock_streaming_context.stream_update.call_args_list[0] - first_update = first_call[1]["update"] # keyword argument - assert hasattr(first_update, "content") - assert first_update.content.name == "code_interpreter" - assert first_update.content.tool_call_id == "code_interpreter_call_123" + @patch("agents.Runner.run_streamed") + async def test_run_agent_streamed_auto_send_forwards_previous_response_id(self, mock_runner_run_streamed): + """previous_response_id must reach Runner.run_streamed so a Responses-API + conversation continues instead of silently starting fresh.""" + from agentex.lib.core.temporal.activities.adk.providers.openai_activities import ( + RunAgentStreamedAutoSendParams, + ) - # Second call should be tool response - second_call = mock_streaming_context.stream_update.call_args_list[1] - second_update = second_call[1]["update"] # keyword argument - assert hasattr(second_update, "content") - assert second_update.content.name == "code_interpreter_call" - assert 
second_update.content.tool_call_id == "code_interpreter_call_123" + mock_streaming_result = self._create_streaming_result_mock() + + async def _no_events(): + return + yield + + mock_streaming_result.stream_events = _no_events + mock_runner_run_streamed.return_value = mock_streaming_result + + mock_tracer = self._create_mock_tracer() + openai_service, openai_activities, env = self._create_test_setup(mock_tracer) + self._setup_streaming_service_mocks(openai_service) + + params = RunAgentStreamedAutoSendParams( + input_list=[{"role": "user", "content": "continue"}], + mcp_server_params=[], + agent_name="test_agent", + agent_instructions="You are a helpful assistant", + trace_id="test-trace-id", + parent_span_id="test-span-id", + task_id="test-task-id", + previous_response_id="response_123", + ) + + await env.run(openai_activities.run_agent_streamed_auto_send, params) + + mock_runner_run_streamed.assert_called_once() + assert mock_runner_run_streamed.call_args.kwargs.get("previous_response_id") == "response_123" def _create_mock_tracer(self): """Helper method to create a properly mocked tracer with async context manager support.""" @@ -613,6 +651,60 @@ def _assert_tools_conversion(self, starting_agent, tools_case, _original_tools): else: raise ValueError(f"Unknown tools_case: {tools_case}") + @patch("agents.Runner.run_streamed") + async def test_run_agent_streamed_auto_send_forwards_created_at(self, mock_runner_run_streamed): + """created_at is forwarded to every streaming context opened by auto_send_turn (AGX1-378).""" + from datetime import datetime, timezone + + from agentex.lib.core.temporal.activities.adk.providers.openai_activities import ( + RunAgentStreamedAutoSendParams, + ) + + deterministic_ts = datetime(2025, 1, 15, 12, 0, 0, tzinfo=timezone.utc) + + mock_streaming_result = self._create_streaming_result_mock() + + # Emit a tool call + tool response so auto_send actually opens streaming + # contexts; an empty stream opens none, making the assertion below + 
# vacuously true and unable to catch a created_at regression. + async def mock_stream_events(): + tool_call_event = Mock() + tool_call_event.type = "run_item_stream_event" + tool_call_event.item = self._create_tool_call_item_mock(self._create_code_interpreter_tool_call_mock()) + yield tool_call_event + + tool_response_event = Mock() + tool_response_event.type = "run_item_stream_event" + tool_response_event.item = self._create_tool_output_item_mock() + yield tool_response_event + + mock_streaming_result.stream_events = mock_stream_events + mock_runner_run_streamed.return_value = mock_streaming_result + + mock_tracer = self._create_mock_tracer() + openai_service, openai_activities, env = self._create_test_setup(mock_tracer) + mock_ctx, recorded_created_ats = self._setup_streaming_service_mocks_with_created_at(openai_service) + + params = RunAgentStreamedAutoSendParams( + input_list=[{"role": "user", "content": "hello"}], + mcp_server_params=[], + agent_name="test_agent", + agent_instructions="You are a helpful assistant", + trace_id="test-trace-id", + parent_span_id="test-span-id", + task_id="test-task-id", + created_at=deterministic_ts, + ) + + await env.run(openai_activities.run_agent_streamed_auto_send, params) + + # Guard against a vacuous pass: at least one streaming context must have + # been opened so the per-context created_at assertion is meaningful. 
+ assert recorded_created_ats, "expected at least one streaming context to be opened" + assert all(ts == deterministic_ts for ts in recorded_created_ats), ( + f"Expected all streaming contexts to receive created_at={deterministic_ts!r}, got: {recorded_created_ats!r}" + ) + def _setup_streaming_service_mocks(self, openai_service): """Helper method to setup streaming service mocks for run_agent_auto_send.""" from unittest.mock import AsyncMock @@ -635,21 +727,64 @@ def _setup_streaming_service_mocks(self, openai_service): mock_streaming_context.task_message = mock_task_message mock_streaming_context.stream_update = AsyncMock() + # Record the initial_content passed to each opened streaming context. + # The unified harness auto_send path posts full tool messages by opening + # a context with initial_content and closing it (no stream_update), so + # assertions inspect the opened contents rather than stream_update calls. + opened_contents: list = [] + # Create a proper async context manager mock from contextlib import asynccontextmanager from unittest.mock import AsyncMock @asynccontextmanager - async def mock_streaming_context_manager(*_args, **_kwargs): + async def mock_streaming_context_manager(*_args, **kwargs): + if "initial_content" in kwargs: + opened_contents.append(kwargs["initial_content"]) yield mock_streaming_context mock_streaming_service.streaming_task_message_context = mock_streaming_context_manager + # Expose the recorded contents on the returned context mock for assertions. 
+ mock_streaming_context.opened_contents = opened_contents openai_service.streaming_service = mock_streaming_service openai_service.agentex_client = mock_agentex_client return mock_streaming_context + def _setup_streaming_service_mocks_with_created_at(self, openai_service): + """Like _setup_streaming_service_mocks but also records every created_at kwarg.""" + from contextlib import asynccontextmanager + from unittest.mock import AsyncMock + + from agentex.types.task_message import TaskMessage + + mock_streaming_service = AsyncMock() + mock_agentex_client = AsyncMock() + + mock_streaming_context = AsyncMock() + mock_task_message = Mock(spec=TaskMessage) + mock_task_message.id = "test-task-message-id" + mock_task_message.task_id = "test-task-id" + mock_task_message.content = {"type": "text", "content": "test"} + mock_streaming_context.task_message = mock_task_message + mock_streaming_context.stream_update = AsyncMock() + + recorded_created_ats: list = [] + + @asynccontextmanager + async def mock_ctx_manager(*_args, **kwargs): + recorded_created_ats.append(kwargs.get("created_at")) + yield mock_streaming_context + + mock_streaming_service.streaming_task_message_context = mock_ctx_manager + mock_streaming_context.opened_contents = [] + + openai_service.streaming_service = mock_streaming_service + openai_service.agentex_client = mock_agentex_client + + return mock_streaming_context, recorded_created_ats + def _create_code_interpreter_tool_call_mock(self, call_id="code_interpreter_call_123"): """Helper to create ResponseCodeInterpreterToolCall mock objects.""" return ResponseCodeInterpreterToolCall( @@ -680,6 +815,9 @@ def _create_streaming_result_mock(self, final_output="Code executed successfully mock_streaming_result = Mock(spec=RunResultStreaming) mock_streaming_result.final_output = final_output mock_streaming_result.new_items = [] + # OpenAITurn reads raw_responses after stream exhaustion to aggregate + # usage; provide an empty list so usage normalizes to 
model-only. + mock_streaming_result.raw_responses = [] mock_streaming_result.final_input_list = [ {"role": "user", "content": "Run some Python code"}, {"role": "assistant", "content": final_output}, diff --git a/tests/lib/adk/providers/test_openai_turn.py b/tests/lib/adk/providers/test_openai_turn.py new file mode 100644 index 000000000..023b0ed4e --- /dev/null +++ b/tests/lib/adk/providers/test_openai_turn.py @@ -0,0 +1,246 @@ +"""Tests for OpenAITurn and its usage mapping. + +OpenAITurn adapts an OpenAI Agents SDK streamed run onto the harness +``HarnessTurn`` protocol. These tests cover: +- ``openai_usage_to_turn_usage`` (full usage, None, real zeros) +- ``_aggregate_usage`` (empty, single, multiple ModelResponses) +- ``OpenAITurn.events`` driven by an injected canonical stream (bypassing the + OpenAI->canonical converter), plus ``usage()`` before/after exhaustion +- the ``ValueError`` guard when neither ``result`` nor ``stream`` is supplied +""" + +import types as _types + +import pytest +from agents.usage import Usage +from openai.types.responses.response_usage import InputTokensDetails, OutputTokensDetails + +from agentex.types.text_content import TextContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) + + +def _import_target(): + from agentex.lib.adk.providers._modules.openai_turn import ( + OpenAITurn, + _aggregate_usage, + openai_usage_to_turn_usage, + ) + + return OpenAITurn, _aggregate_usage, openai_usage_to_turn_usage + + +# --------------------------------------------------------------------------- +# openai_usage_to_turn_usage +# --------------------------------------------------------------------------- + + +def test_usage_mapping_full(): + _, _, openai_usage_to_turn_usage = _import_target() + usage = Usage( + requests=3, + input_tokens=100, + input_tokens_details=InputTokensDetails(cached_tokens=20), + 
output_tokens=50, + output_tokens_details=OutputTokensDetails(reasoning_tokens=10), + total_tokens=150, + ) + turn_usage = openai_usage_to_turn_usage(usage, model="gpt-4o") + + assert turn_usage.model == "gpt-4o" + assert turn_usage.num_llm_calls == 3 + assert turn_usage.input_tokens == 100 + assert turn_usage.cached_input_tokens == 20 + assert turn_usage.output_tokens == 50 + assert turn_usage.reasoning_tokens == 10 + assert turn_usage.total_tokens == 150 + + +def test_usage_mapping_none_usage(): + _, _, openai_usage_to_turn_usage = _import_target() + turn_usage = openai_usage_to_turn_usage(None, model="gpt-4o") + + assert turn_usage.model == "gpt-4o" + assert turn_usage.num_llm_calls == 0 + assert turn_usage.input_tokens is None + assert turn_usage.output_tokens is None + assert turn_usage.total_tokens is None + + +def test_usage_mapping_real_zeros_are_preserved(): + # A cache hit can legitimately produce 0 output tokens; a present-but-zero + # value must survive as 0, not be coerced to None. 
+ _, _, openai_usage_to_turn_usage = _import_target() + usage = Usage( + requests=1, + input_tokens=0, + input_tokens_details=InputTokensDetails(cached_tokens=0), + output_tokens=0, + output_tokens_details=OutputTokensDetails(reasoning_tokens=0), + total_tokens=0, + ) + turn_usage = openai_usage_to_turn_usage(usage, model="m") + + assert turn_usage.input_tokens == 0 + assert turn_usage.cached_input_tokens == 0 + assert turn_usage.output_tokens == 0 + assert turn_usage.reasoning_tokens == 0 + assert turn_usage.total_tokens == 0 + assert turn_usage.num_llm_calls == 1 + + +# --------------------------------------------------------------------------- +# _aggregate_usage +# --------------------------------------------------------------------------- + + +def _resp(usage): + return _types.SimpleNamespace(usage=usage) + + +def test_aggregate_usage_empty(): + _, _aggregate_usage, _ = _import_target() + assert _aggregate_usage([]) is None + + +def test_aggregate_usage_single(): + _, _aggregate_usage, _ = _import_target() + usage = Usage(requests=1, input_tokens=10, output_tokens=5, total_tokens=15) + total = _aggregate_usage([_resp(usage)]) + + assert total is not None + assert total.requests == 1 + assert total.input_tokens == 10 + assert total.output_tokens == 5 + assert total.total_tokens == 15 + + +def test_aggregate_usage_multiple(): + _, _aggregate_usage, _ = _import_target() + u1 = Usage( + requests=1, + input_tokens=10, + input_tokens_details=InputTokensDetails(cached_tokens=2), + output_tokens=5, + output_tokens_details=OutputTokensDetails(reasoning_tokens=1), + total_tokens=15, + ) + u2 = Usage( + requests=2, + input_tokens=20, + input_tokens_details=InputTokensDetails(cached_tokens=3), + output_tokens=7, + output_tokens_details=OutputTokensDetails(reasoning_tokens=4), + total_tokens=27, + ) + # A response without usage must be skipped, not crash the aggregation. 
+ total = _aggregate_usage([_resp(u1), _resp(None), _resp(u2)]) + + assert total is not None + assert total.requests == 3 + assert total.input_tokens == 30 + assert total.output_tokens == 12 + assert total.total_tokens == 42 + assert total.input_tokens_details.cached_tokens == 5 + assert total.output_tokens_details.reasoning_tokens == 5 + + +# --------------------------------------------------------------------------- +# OpenAITurn.events / usage / construction +# --------------------------------------------------------------------------- + + +async def _canonical_stream(events): + for e in events: + yield e + + +@pytest.mark.asyncio +async def test_turn_events_forwards_injected_stream(): + OpenAITurn, _, _ = _import_target() + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hi")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = OpenAITurn(stream=_canonical_stream(events), model="gpt-4o") + + out = [e async for e in turn.events] + assert out == events + + +@pytest.mark.asyncio +async def test_turn_usage_before_and_after_exhaustion_with_injected_stream(): + OpenAITurn, _, _ = _import_target() + events = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDone(type="done", index=0), + ] + turn = OpenAITurn(stream=_canonical_stream(events), model="gpt-4o") + + # Before exhaustion: usage carries only the model name. + before = turn.usage() + assert before.model == "gpt-4o" + assert before.input_tokens is None + + async for _ in turn.events: + pass + + # With an injected stream there is no run to read usage from, so usage + # stays model-only after exhaustion. 
+ after = turn.usage() + assert after.model == "gpt-4o" + assert after.input_tokens is None + + +@pytest.mark.asyncio +async def test_turn_usage_populated_from_result_after_exhaustion(): + OpenAITurn, _, _ = _import_target() + + canonical = [ + StreamTaskMessageStart(type="start", index=0, content=TextContent(type="text", author="agent", content="")), + StreamTaskMessageDone(type="done", index=0), + ] + + class _FakeResult: + def __init__(self): + self.raw_responses = [ + _resp(Usage(requests=1, input_tokens=8, output_tokens=4, total_tokens=12)), + ] + + def stream_events(self): + # OpenAITurn passes this to convert_openai_to_agentex_events; we + # monkeypatch that converter below so this can yield canonical events. + return _canonical_stream(canonical) + + import agentex.lib.adk.providers._modules.openai_turn as mod + + async def _passthrough(stream): + async for e in stream: + yield e + + original = mod.convert_openai_to_agentex_events + mod.convert_openai_to_agentex_events = _passthrough + try: + turn = OpenAITurn(result=_FakeResult(), model="gpt-4o") + out = [e async for e in turn.events] + finally: + mod.convert_openai_to_agentex_events = original + + assert out == canonical + usage = turn.usage() + assert usage.model == "gpt-4o" + assert usage.num_llm_calls == 1 + assert usage.input_tokens == 8 + assert usage.output_tokens == 4 + assert usage.total_tokens == 12 + + +def test_turn_requires_result_or_stream(): + OpenAITurn, _, _ = _import_target() + with pytest.raises(ValueError, match="either"): + OpenAITurn() diff --git a/tests/lib/core/harness/conformance/test_openai_conformance.py b/tests/lib/core/harness/conformance/test_openai_conformance.py new file mode 100644 index 000000000..e8630ca7f --- /dev/null +++ b/tests/lib/core/harness/conformance/test_openai_conformance.py @@ -0,0 +1,206 @@ +"""OpenAI conformance fixtures for the shared harness span-derivation engine. 
+ +The cross-channel guarantee is that yield-delivery and auto_send observe the +SAME canonical StreamTaskMessage* stream, so span derivation and logical +delivery over that stream must be equivalent regardless of channel. These +fixtures express the canonical sequences an OpenAI turn produces (text, +tool-call, reasoning, and a combined multi-step turn) and assert that property +via run_cross_channel_conformance. + +Registry hazard (see conformance/runner.py): _REGISTRY is process-global and +collection order across modules is not guaranteed. To stay deterministic this +module keeps its OWN fixture list and parametrizes over THAT list, rather than +over all_fixtures(). It still calls register() so the cross-module conformance +suite can see these fixtures too. +""" + +from __future__ import annotations + +import pytest + +from agentex.types.text_delta import TextDelta +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta + +from .runner import Fixture, register, run_cross_channel_conformance + +_OPENAI_FIXTURES: list[Fixture] = [] + + +def _add(fixture: Fixture) -> None: + """Register both module-locally (for parametrization) and globally.""" + _OPENAI_FIXTURES.append(fixture) + register(fixture) + + +# Text-only turn: start -> deltas -> done. +# Uses non-empty initial_content so payload comparison catches a channel that +# drops StreamTaskMessageStart.content. 
+_add( + Fixture( + name="openai-text-only", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content="Init"), + ), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="Hel")), + StreamTaskMessageDelta(type="delta", index=0, delta=TextDelta(type="text", text_delta="lo")), + StreamTaskMessageDone(type="done", index=0), + ], + ) +) + +# Tool-call turn: Full(ToolRequestContent) for the call + Full(ToolResponseContent) +# for the result, matched by tool_call_id. Mirrors the OpenAI converter's tool path. +_add( + Fixture( + name="openai-tool-call", + events=[ + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_1", + name="get_weather", + arguments={"city": "SF"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_1", + name="get_weather", + content="72F", + ), + ), + ], + ) +) + +# Reasoning turn: start(ReasoningContent) -> content deltas -> done. +# ReasoningContent.summary is seeded in the payload so a channel that drops the +# summary fails the cross-channel comparison. +_add( + Fixture( + name="openai-reasoning", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=["Thinking..."], + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta="step 1", + ), + ), + StreamTaskMessageDone(type="done", index=0), + ], + ) +) + +# Multi-step turn: reasoning, then a tool round, then the final answer text. 
+_add( + Fixture( + name="openai-multi-step", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=["plan"], + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta="elaboration", + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_2", + name="search", + arguments={"q": "x"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_2", + name="search", + content="result", + ), + ), + StreamTaskMessageStart( + type="start", + index=3, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta(type="delta", index=3, delta=TextDelta(type="text", text_delta="done")), + StreamTaskMessageDone(type="done", index=3), + ], + ) +) + + +@pytest.mark.parametrize("fixture", _OPENAI_FIXTURES, ids=lambda f: f.name) +@pytest.mark.asyncio +async def test_openai_cross_channel_equivalence(fixture: Fixture) -> None: + """Assert that yield_events and auto_send produce equivalent logical + deliveries and identical span signals for every OpenAI fixture. + + This is the cross-channel guarantee: the two delivery adapters agree on + WHAT was delivered (logical content) and HOW spans were derived, even + though their streaming-envelope shapes differ (Full vs Start+Done for tool + messages). + + The span signals are the ones each channel's tracer ACTUALLY recorded while + delivering, not a re-derivation, so a regression where one channel skips + deriver.observe() for some event type is caught here. 
+ """ + yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture) + + assert yield_deliveries == auto_deliveries, ( + f"[{fixture.name}] logical deliveries differ:\n yield: {yield_deliveries}\n auto_send: {auto_deliveries}" + ) + assert yield_spans == auto_spans, ( + f"[{fixture.name}] span signals differ:\n yield: {yield_spans}\n auto_send: {auto_spans}" + ) From 904339c21b8cd641a02d903c03d4a8730b4d7e84 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Mon, 22 Jun 2026 18:21:44 -0400 Subject: [PATCH 06/10] feat(claude-code): stream-json parser tap for the unified harness surface (#420) --- .../00_sync/060_claude_code/.dockerignore | 43 ++ .../00_sync/060_claude_code/Dockerfile | 46 ++ .../00_sync/060_claude_code/README.md | 76 +++ .../00_sync/060_claude_code/manifest.yaml | 55 ++ .../060_claude_code/project/__init__.py | 0 .../00_sync/060_claude_code/project/acp.py | 137 ++++ .../00_sync/060_claude_code/pyproject.toml | 25 + .../060_claude_code/tests/test_agent.py | 162 +++++ .../tests/test_agent_offline.py | 210 ++++++ .../00_base/130_claude_code/.dockerignore | 43 ++ .../00_base/130_claude_code/Dockerfile | 43 ++ .../00_base/130_claude_code/README.md | 76 +++ .../00_base/130_claude_code/manifest.yaml | 58 ++ .../130_claude_code/project/__init__.py | 0 .../00_base/130_claude_code/project/acp.py | 149 ++++ .../00_base/130_claude_code/pyproject.toml | 25 + .../130_claude_code/tests/test_agent.py | 250 +++++++ .../tests/test_agent_offline.py | 243 +++++++ .../10_temporal/140_claude_code/.dockerignore | 43 ++ .../10_temporal/140_claude_code/Dockerfile | 46 ++ .../10_temporal/140_claude_code/README.md | 76 +++ .../10_temporal/140_claude_code/manifest.yaml | 62 ++ .../140_claude_code/project/__init__.py | 0 .../140_claude_code/project/acp.py | 31 + .../140_claude_code/project/activities.py | 139 ++++ .../140_claude_code/project/run_worker.py | 41 ++ .../140_claude_code/project/workflow.py | 137 ++++ 
.../140_claude_code/pyproject.toml | 27 + .../140_claude_code/tests/test_agent.py | 249 +++++++ .../tests/test_agent_offline.py | 230 +++++++ src/agentex/lib/adk/__init__.py | 9 + .../lib/adk/_modules/_claude_code_sync.py | 378 +++++++++++ .../lib/adk/_modules/_claude_code_turn.py | 161 +++++ src/agentex/lib/core/harness/types.py | 5 +- tests/lib/adk/test_claude_code_sync.py | 637 ++++++++++++++++++ tests/lib/adk/test_claude_code_turn.py | 283 ++++++++ .../test_claude_code_conformance.py | 202 ++++++ 37 files changed, 4396 insertions(+), 1 deletion(-) create mode 100644 examples/tutorials/00_sync/060_claude_code/.dockerignore create mode 100644 examples/tutorials/00_sync/060_claude_code/Dockerfile create mode 100644 examples/tutorials/00_sync/060_claude_code/README.md create mode 100644 examples/tutorials/00_sync/060_claude_code/manifest.yaml create mode 100644 examples/tutorials/00_sync/060_claude_code/project/__init__.py create mode 100644 examples/tutorials/00_sync/060_claude_code/project/acp.py create mode 100644 examples/tutorials/00_sync/060_claude_code/pyproject.toml create mode 100644 examples/tutorials/00_sync/060_claude_code/tests/test_agent.py create mode 100644 examples/tutorials/00_sync/060_claude_code/tests/test_agent_offline.py create mode 100644 examples/tutorials/10_async/00_base/130_claude_code/.dockerignore create mode 100644 examples/tutorials/10_async/00_base/130_claude_code/Dockerfile create mode 100644 examples/tutorials/10_async/00_base/130_claude_code/README.md create mode 100644 examples/tutorials/10_async/00_base/130_claude_code/manifest.yaml create mode 100644 examples/tutorials/10_async/00_base/130_claude_code/project/__init__.py create mode 100644 examples/tutorials/10_async/00_base/130_claude_code/project/acp.py create mode 100644 examples/tutorials/10_async/00_base/130_claude_code/pyproject.toml create mode 100644 examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent.py create mode 100644 
examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent_offline.py create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/.dockerignore create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/Dockerfile create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/README.md create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/manifest.yaml create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/project/__init__.py create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/project/acp.py create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/project/activities.py create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/project/run_worker.py create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/project/workflow.py create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/pyproject.toml create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent.py create mode 100644 examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent_offline.py create mode 100644 src/agentex/lib/adk/_modules/_claude_code_sync.py create mode 100644 src/agentex/lib/adk/_modules/_claude_code_turn.py create mode 100644 tests/lib/adk/test_claude_code_sync.py create mode 100644 tests/lib/adk/test_claude_code_turn.py create mode 100644 tests/lib/core/harness/conformance/test_claude_code_conformance.py diff --git a/examples/tutorials/00_sync/060_claude_code/.dockerignore b/examples/tutorials/00_sync/060_claude_code/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg 
+*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/00_sync/060_claude_code/Dockerfile b/examples/tutorials/00_sync/060_claude_code/Dockerfile new file mode 100644 index 000000000..ec22d7e0b --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/Dockerfile @@ -0,0 +1,46 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies including Node.js (required by the claude CLI) +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + nodejs \ + npm \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +# Install the claude CLI (requires Node.js) +# NOTE: live runs require ANTHROPIC_API_KEY in the environment. 
+RUN npm install -g @anthropic-ai/claude-code || true + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 00_sync/060_claude_code/pyproject.toml /app/060_claude_code/pyproject.toml +COPY 00_sync/060_claude_code/README.md /app/060_claude_code/README.md + +WORKDIR /app/060_claude_code + +COPY 00_sync/060_claude_code/project /app/060_claude_code/project +COPY 00_sync/060_claude_code/tests /app/060_claude_code/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=s060-claude-code + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/00_sync/060_claude_code/README.md b/examples/tutorials/00_sync/060_claude_code/README.md new file mode 100644 index 000000000..e9c724732 --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/README.md @@ -0,0 +1,76 @@ +# Tutorial 060: Sync Claude Code Agent + +This tutorial demonstrates how to build a **synchronous** agent that spawns the +Claude Code CLI as a local subprocess and streams its output through the Agentex +unified harness surface via ``ClaudeCodeTurn`` and ``UnifiedEmitter``. + +## Key Concepts + +### ClaudeCodeTurn + UnifiedEmitter + +``ClaudeCodeTurn`` wraps ``convert_claude_code_to_agentex_events``, which +parses the newline-delimited JSON envelopes emitted by +``claude -p --output-format stream-json``. It implements the ``HarnessTurn`` +protocol: an ``events`` async iterator of canonical ``StreamTaskMessage*`` +objects and a ``usage()`` method (populated once the stream is exhausted). + +``UnifiedEmitter.yield_turn(turn)`` is the sync delivery path: it forwards +events as HTTP yield chunks while tracing as a side effect. + +### Local subprocess spawn + +The ``_spawn_claude`` function in ``project/acp.py`` uses +``asyncio.create_subprocess_exec`` to run: + +``` +claude -p --output-format stream-json --verbose +``` + +The prompt is written to stdin. Stdout is read line by line and fed into +``ClaudeCodeTurn``. 
This is purely local -- no Scale sandbox is involved. + +Production isolation (Scale sandbox, secret injection, MCP configuration) +is the golden agent's concern at +``teams/sgp/agents/golden_agent/project/harness/providers/claude.py``. + +### Injectable spawn seam + +``_spawn_claude`` is a top-level async generator in ``project/acp.py``. +Tests monkeypatch it to inject pre-recorded stream-json lines instead of +spawning the real process, so offline unit tests run without the CLI. + +## Files + +| File | Description | +|------|-------------| +| ``project/acp.py`` | ACP server, ``_spawn_claude`` seam, and message handler | +| ``tests/test_agent.py`` | Live integration tests (need CLI + API key) plus always-run offline unit tests | +| ``tests/test_agent_offline.py`` | Offline unit tests with injected fake subprocess | +| ``manifest.yaml`` | Agent configuration | + +## Running Locally (live) + +Requires the ``claude`` CLI installed and ``ANTHROPIC_API_KEY`` set: + +```bash +npm install -g @anthropic-ai/claude-code +export ANTHROPIC_API_KEY=sk-ant-... +agentex agents run +``` + +## Running Offline Tests + +No CLI or API key needed: + +```bash +uv run pytest tests/test_agent_offline.py -v +``` + +## Notes + +- Production isolation (sandbox, secrets, MCP) is the golden agent's concern. + This tutorial runs the CLI directly to keep the code as simple as possible. +- Multi-turn session resumption (``claude -r SESSION_ID``) is out of scope + for this tutorial. See the golden agent for that pattern. +- The ``--verbose`` flag is included to match the golden agent's invocation; + it causes the CLI to emit ``stream_event`` triples for incremental streaming.
diff --git a/examples/tutorials/00_sync/060_claude_code/manifest.yaml b/examples/tutorials/00_sync/060_claude_code/manifest.yaml new file mode 100644 index 000000000..56b9fd9e4 --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/manifest.yaml @@ -0,0 +1,55 @@ +build: + context: + root: ../../ + include_paths: + - 00_sync/060_claude_code + - test_utils + dockerfile: 00_sync/060_claude_code/Dockerfile + dockerignore: 00_sync/060_claude_code/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: sync + name: s060-claude-code + description: A sync Claude Code agent streaming the unified harness surface via a local CLI subprocess + + temporal: + enabled: false + + credentials: + - env_var_name: ANTHROPIC_API_KEY + secret_name: anthropic-api-key + secret_key: api-key + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "s060-claude-code" + description: "A sync Claude Code agent streaming via local CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/00_sync/060_claude_code/project/__init__.py b/examples/tutorials/00_sync/060_claude_code/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/00_sync/060_claude_code/project/acp.py b/examples/tutorials/00_sync/060_claude_code/project/acp.py new file mode 100644 index 000000000..aad53801a --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/project/acp.py @@ -0,0 +1,137 @@ +"""ACP handler for the sync Claude Code tutorial. 
+ +Spawns ``claude -p --output-format stream-json --verbose`` as a LOCAL +asyncio subprocess (no Scale sandbox -- that is the golden agent's +production concern). Stdout lines are fed into ``ClaudeCodeTurn``, which +wraps ``convert_claude_code_to_agentex_events``. Events are delivered via +``UnifiedEmitter.yield_turn``, the sync HTTP yield path. + +Live runs require the ``claude`` CLI to be installed and an +ANTHROPIC_API_KEY (or equivalent credential) to be in the environment. +For offline testing, see ``tests/test_agent_offline.py``, which injects a +fake subprocess. +""" + +from __future__ import annotations + +import os +import asyncio +from typing import AsyncIterator, AsyncGenerator + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.types.acp import SendMessageParams +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.types.task_message_update import TaskMessageUpdate +from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create(acp_type="sync") + + +async def _spawn_claude(prompt: str) -> AsyncIterator[str]: + """Spawn ``claude -p --output-format stream-json`` locally and yield stdout lines. + + This is a seam: tests replace it with a fake async iterator of + pre-recorded lines so no real CLI invocation is needed offline. 
+ """ + proc = await asyncio.create_subprocess_exec( + "claude", + "-p", + "--output-format", + "stream-json", + "--verbose", + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + assert proc.stdout is not None + assert proc.stdin is not None + + proc.stdin.write(prompt.encode()) + proc.stdin.close() + + # Drain stderr concurrently. With --verbose, Claude Code can write enough to + # stderr to fill the OS pipe buffer; if we only read stdout, the CLI blocks + # on its stderr write while we block reading stdout — a deadlock. A + # background task keeps stderr flowing so stdout never stalls. + async def _drain_stderr() -> None: + assert proc.stderr is not None + async for _ in proc.stderr: + pass + + stderr_task = asyncio.create_task(_drain_stderr()) + + try: + buffer = "" + async for chunk in proc.stdout: + buffer += chunk.decode("utf-8", errors="replace") + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + + if buffer.strip(): + yield buffer.strip() + + await proc.wait() + finally: + # Release the subprocess and stderr drain task even if the consumer + # abandons the generator early (task cancellation / client disconnect): + # cancel the drain task and terminate+reap the process if it is still + # running, so neither is leaked. 
+ stderr_task.cancel() + try: + await stderr_task + except asyncio.CancelledError: + pass + if proc.returncode is None: + try: + proc.terminate() + except ProcessLookupError: + pass + await proc.wait() + + +@acp.on_message_send +async def handle_message_send( + params: SendMessageParams, +) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]: + """Handle an incoming message: run Claude Code locally and stream events.""" + task_id = params.task.id + prompt = params.content.content + logger.info("Processing message for task %s", task_id) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": prompt}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + turn = ClaudeCodeTurn(_spawn_claude(prompt)) + async for event in emitter.yield_turn(turn): + yield event diff --git a/examples/tutorials/00_sync/060_claude_code/pyproject.toml b/examples/tutorials/00_sync/060_claude_code/pyproject.toml new file mode 100644 index 000000000..e5c1c4ea6 --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "s060-claude-code" +version = "0.1.0" +description = "A sync Claude Code agent streaming the unified harness surface via a local CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "python-dotenv>=1.0,<2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] diff --git a/examples/tutorials/00_sync/060_claude_code/tests/test_agent.py b/examples/tutorials/00_sync/060_claude_code/tests/test_agent.py new file mode 100644 index 000000000..954a520f3 --- /dev/null +++ 
b/examples/tutorials/00_sync/060_claude_code/tests/test_agent.py @@ -0,0 +1,162 @@ +"""Tests for the sync Claude Code tutorial agent. + +LIVE tests (``TestClaudeCodeLive``): + - Require the ``claude`` CLI on PATH and ``ANTHROPIC_API_KEY`` set. + - Run the full agent end-to-end against a live Agentex server. + - Skipped automatically when ``CLAUDE_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestClaudeCodeOffline``): + - Inject a fake async iterator of pre-recorded stream-json lines. + - Assert the ``ClaudeCodeTurn`` + ``UnifiedEmitter`` pipeline yields events, + populates usage, and satisfies the ``HarnessTurn`` protocol. + - Always run -- no CLI or API key needed. +""" + +from __future__ import annotations + +import os +import json +from typing import AsyncIterator + +import pytest + +# --------------------------------------------------------------------------- +# Recorded stream-json fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-offline-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 10, "output_tokens": 5}, + "cost_usd": 0.0001, + "duration_ms": 250, + "num_turns": 1, + } + ), +] + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + """Async iterator of pre-recorded stream-json lines (no subprocess).""" + for line in lines: + yield line + + +# --------------------------------------------------------------------------- +# Offline tests (always run -- no CLI or API key needed) +# --------------------------------------------------------------------------- + + +class TestClaudeCodeOffline: + """Unit tests that run without a real claude CLI or network.""" + + @pytest.mark.asyncio + async def test_yields_stream_events(self): + 
"""ClaudeCodeTurn drives UnifiedEmitter and yields StreamTaskMessage* events.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message_update import StreamTaskMessageStart + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + + events = [e async for e in emitter.yield_turn(turn)] + assert len(events) > 0, "No events yielded" + assert any(isinstance(e, StreamTaskMessageStart) for e in events) + + @pytest.mark.asyncio + async def test_stream_task_message_done_present(self): + """StreamTaskMessageDone must appear after stream exhaustion.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message_update import StreamTaskMessageDone + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + + events = [e async for e in emitter.yield_turn(turn)] + assert any(isinstance(e, StreamTaskMessageDone) for e in events), ( + "Expected at least one StreamTaskMessageDone event" + ) + + @pytest.mark.asyncio + async def test_usage_populated_after_stream_exhausted(self): + """ClaudeCodeTurn.usage() returns correct tokens after stream is exhausted.""" + from agentex.lib.adk import ClaudeCodeTurn + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + _ = [e async for e in turn.events] + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.num_llm_calls == 1 + + @pytest.mark.asyncio + async def test_protocol_compliance(self): + """ClaudeCodeTurn satisfies the HarnessTurn protocol.""" + from agentex.lib.adk import ClaudeCodeTurn + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + assert hasattr(turn, "events"), "ClaudeCodeTurn missing .events" + assert hasattr(turn, "usage"), "ClaudeCodeTurn missing .usage()" + + +# 
--------------------------------------------------------------------------- +# Live tests (skipped unless CLAUDE_LIVE_TESTS=1) +# --------------------------------------------------------------------------- + +pytestmark_live = pytest.mark.skipif( + not os.environ.get("CLAUDE_LIVE_TESTS"), + reason="Set CLAUDE_LIVE_TESTS=1 and ensure the `claude` CLI + ANTHROPIC_API_KEY are available", +) + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "s060-claude-code") + + +@pytestmark_live +class TestClaudeCodeLive: + """Live streaming tests -- needs the claude CLI + ANTHROPIC_API_KEY.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + @pytest.fixture + def agent_name(self): + return AGENT_NAME + + def test_stream_simple_message(self, client, agent_name: str): + """Stream a simple prompt through the local Claude Code subprocess.""" + from test_utils.sync import collect_streaming_response + + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendMessageRequest + + stream = client.agents.send_message_stream( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="Reply with exactly three words: hello from claude", + type="text", + ) + ), + ) + aggregated_content, chunks = collect_streaming_response(stream) + assert aggregated_content is not None + assert len(chunks) >= 1 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/00_sync/060_claude_code/tests/test_agent_offline.py b/examples/tutorials/00_sync/060_claude_code/tests/test_agent_offline.py new file mode 100644 index 000000000..23ac52a57 --- /dev/null +++ b/examples/tutorials/00_sync/060_claude_code/tests/test_agent_offline.py @@ -0,0 +1,210 @@ +"""Offline unit tests for the sync Claude Code tutorial agent. 
+ +These tests do NOT require the ``claude`` CLI or an ANTHROPIC_API_KEY. +They inject a fake async iterator of pre-recorded stream-json lines in +place of the real subprocess spawn, and a fake streaming backend in place +of the real Redis/AGP layer, then assert that the handler correctly drives +the unified surface (``UnifiedEmitter.yield_turn``). + +The injection seam is the ``_spawn_claude`` function in ``project/acp.py``. +Tests monkeypatch it with an async generator that yields pre-recorded +lines, so the handler code runs in full without any subprocess. +""" + +from __future__ import annotations + +import json +from typing import AsyncIterator + +import pytest + +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.core.harness import UnifiedEmitter +from agentex.types.task_message_update import ( + StreamTaskMessageStart, +) + +# --------------------------------------------------------------------------- +# Recorded stream-json fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 10, "output_tokens": 5}, + "cost_usd": 0.0001, + "duration_ms": 250, + "num_turns": 1, + } + ), +] + +_TOOL_CALL_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-2"}), + json.dumps( + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "tool_abc", + "name": "Bash", + "input": {"command": "echo hello"}, + } + ] + }, + } + ), + json.dumps( + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tool_abc", + "content": "hello\n", + "is_error": False, + } + ] + }, + } + ), + json.dumps( + { + "type": "assistant", +
"message": {"content": [{"type": "text", "text": "Done."}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 20, "output_tokens": 8}, + "cost_usd": 0.0002, + "duration_ms": 400, + "num_turns": 1, + } + ), +] + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + for line in lines: + yield line + + +async def _collect_yield_turn(lines: list[str]) -> list: + """Run a ClaudeCodeTurn through UnifiedEmitter.yield_turn and collect events.""" + turn = ClaudeCodeTurn(_fake_lines(lines)) + emitter = UnifiedEmitter(task_id="t1", trace_id=None, parent_span_id=None) + return [e async for e in emitter.yield_turn(turn)] + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_text_only_produces_start_and_done(): + events = await _collect_yield_turn(_TEXT_ONLY_LINES) + types = [type(e).__name__ for e in events] + assert "StreamTaskMessageStart" in types + assert "StreamTaskMessageDone" in types + + +@pytest.mark.asyncio +async def test_text_only_content(): + events = await _collect_yield_turn(_TEXT_ONLY_LINES) + starts = [e for e in events if isinstance(e, StreamTaskMessageStart)] + assert len(starts) == 1 + assert starts[0].content.type == "text" + + +@pytest.mark.asyncio +async def test_usage_is_populated_after_stream(): + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + _ = [e async for e in turn.events] + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.cost_usd == pytest.approx(0.0001, rel=1e-4) + assert usage.num_llm_calls == 1 + + +@pytest.mark.asyncio +async def test_tool_call_produces_tool_request_and_response(): + events = await 
_collect_yield_turn(_TOOL_CALL_LINES) + content_types = { + getattr(e, "content", None) and getattr(e.content, "type", None) for e in events if hasattr(e, "content") + } + assert "tool_request" in content_types + assert "tool_response" in content_types + + +@pytest.mark.asyncio +async def test_tool_call_has_one_text_block(): + """The tool_use block is not text; only 'Done.' is the text block.""" + events = await _collect_yield_turn(_TOOL_CALL_LINES) + text_starts = [ + e for e in events if isinstance(e, StreamTaskMessageStart) and getattr(e.content, "type", None) == "text" + ] + assert len(text_starts) == 1 + + +@pytest.mark.asyncio +async def test_empty_lines_are_skipped(): + """Inserting blank lines in the stream must not crash the parser.""" + lines_with_blanks = ["", " "] + _TEXT_ONLY_LINES + [""] + events = await _collect_yield_turn(lines_with_blanks) + assert any(isinstance(e, StreamTaskMessageStart) for e in events) + + +@pytest.mark.asyncio +async def test_spawn_seam_concept(): + """Demonstrate the injectable spawn seam pattern used in project/acp.py. + + The ``_spawn_claude`` function in ``project/acp.py`` is a top-level async + generator. Production code calls it like:: + + turn = ClaudeCodeTurn(_spawn_claude(prompt)) + + In tests, a replacement function is injected (e.g. via monkeypatch) to + return pre-recorded lines. This test proves the pattern works end-to-end + without importing the full ACP module (which has module-level env-var + checks that only pass in a running agent environment). 
+ """ + recorded_lines = _TEXT_ONLY_LINES + + async def _fake_spawn(prompt: str) -> AsyncIterator[str]: # noqa: ARG001 + """Drop-in replacement for _spawn_claude.""" + for line in recorded_lines: + yield line + + called_with: list[str] = [] + + async def _wrapped_spawn(prompt: str) -> AsyncIterator[str]: + called_with.append(prompt) + async for line in _fake_spawn(prompt): + yield line + + turn = ClaudeCodeTurn(_wrapped_spawn("test prompt")) + emitter = UnifiedEmitter(task_id="t2", trace_id=None, parent_span_id=None) + events = [e async for e in emitter.yield_turn(turn)] + + assert called_with == ["test prompt"] + assert any(isinstance(e, StreamTaskMessageStart) for e in events) diff --git a/examples/tutorials/10_async/00_base/130_claude_code/.dockerignore b/examples/tutorials/10_async/00_base/130_claude_code/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/00_base/130_claude_code/Dockerfile b/examples/tutorials/10_async/00_base/130_claude_code/Dockerfile new file mode 100644 index 000000000..e36b9e56d --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/Dockerfile @@ -0,0 +1,43 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + nodejs \ + npm \ + && apt-get clean \ + && rm -rf 
/var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +RUN npm install -g @anthropic-ai/claude-code || true + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/00_base/130_claude_code/pyproject.toml /app/130_claude_code/pyproject.toml +COPY 10_async/00_base/130_claude_code/README.md /app/130_claude_code/README.md + +WORKDIR /app/130_claude_code + +COPY 10_async/00_base/130_claude_code/project /app/130_claude_code/project +COPY 10_async/00_base/130_claude_code/tests /app/130_claude_code/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=ab130-claude-code + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/10_async/00_base/130_claude_code/README.md b/examples/tutorials/10_async/00_base/130_claude_code/README.md new file mode 100644 index 000000000..695207c57 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/README.md @@ -0,0 +1,76 @@ +# Tutorial 130 (async/base): Async Claude Code Agent + +This tutorial demonstrates how to build an **async (non-Temporal)** agent that +spawns the Claude Code CLI as a local subprocess and delivers its output through +the Agentex unified harness surface via ``ClaudeCodeTurn`` and +``UnifiedEmitter.auto_send_turn``. + +## Key Concepts + +### Async delivery path + +Unlike the sync tutorial (060), this agent uses the async ACP model. The +``@acp.on_task_event_send`` handler does not return a generator -- instead, +``UnifiedEmitter.auto_send_turn(turn)`` pushes events to the task's Redis +stream in real time and returns a ``TurnResult`` when the turn is complete. +The UI polls or streams that Redis channel independently. + +### ClaudeCodeTurn + UnifiedEmitter + +Same tap as the sync tutorial: +- ``ClaudeCodeTurn`` wraps ``convert_claude_code_to_agentex_events``. +- ``UnifiedEmitter`` wires trace context + chosen delivery. +- ``auto_send_turn`` is the async push path. 
+ +### Local subprocess spawn + +``_spawn_claude`` in ``project/acp.py`` uses ``asyncio.create_subprocess_exec`` +to run: + +``` +claude -p --output-format stream-json --verbose +``` + +The prompt is written to stdin. Stdout is read line by line. + +Production isolation (Scale sandbox, secret injection, MCP configuration) +is the golden agent's concern at +``teams/sgp/agents/golden_agent/project/harness/providers/claude.py``. + +### Injectable spawn seam + +``_spawn_claude`` is a top-level async generator. Tests monkeypatch it to +inject pre-recorded stream-json lines so offline unit tests run without the CLI. + +## Files + +| File | Description | +|------|-------------| +| ``project/acp.py`` | ACP server, ``_spawn_claude`` seam, and event handler | +| ``tests/test_agent.py`` | Live integration tests (needs CLI + API key) | +| ``tests/test_agent_offline.py`` | Offline unit tests with injected fake subprocess | +| ``manifest.yaml`` | Agent configuration | + +## Running Locally (live) + +Requires the ``claude`` CLI installed and ``ANTHROPIC_API_KEY`` set: + +```bash +npm install -g @anthropic-ai/claude-code +export ANTHROPIC_API_KEY=sk-ant-... +agentex agents run +``` + +## Running Offline Tests + +No CLI or API key needed: + +```bash +uv run pytest tests/test_agent_offline.py -v +``` + +## Notes + +- Production isolation (sandbox, secrets, MCP) is the golden agent's concern. +- For multi-turn memory, persist the Claude Code session_id from the + ``result`` envelope and pass it to ``claude -r <session_id>`` on the next turn. 
diff --git a/examples/tutorials/10_async/00_base/130_claude_code/manifest.yaml b/examples/tutorials/10_async/00_base/130_claude_code/manifest.yaml new file mode 100644 index 000000000..7d74de7c6 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/00_base/130_claude_code + - test_utils + dockerfile: 10_async/00_base/130_claude_code/Dockerfile + dockerignore: 10_async/00_base/130_claude_code/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: async + name: ab130-claude-code + description: An async Claude Code agent streaming the unified harness surface via a local CLI subprocess + + temporal: + enabled: false + + credentials: + - env_var_name: ANTHROPIC_API_KEY + secret_name: anthropic-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "ab130-claude-code" + description: "An async Claude Code agent streaming via local CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/00_base/130_claude_code/project/__init__.py b/examples/tutorials/10_async/00_base/130_claude_code/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/00_base/130_claude_code/project/acp.py b/examples/tutorials/10_async/00_base/130_claude_code/project/acp.py new file mode 100644 index 000000000..b6681f6a8 --- /dev/null +++ 
b/examples/tutorials/10_async/00_base/130_claude_code/project/acp.py @@ -0,0 +1,149 @@ +"""ACP handler for the async Claude Code tutorial. + +Spawns ``claude -p --output-format stream-json --verbose`` as a LOCAL +asyncio subprocess (no Scale sandbox -- that is the golden agent's +production concern). Stdout lines are fed into ``ClaudeCodeTurn``. Events +are delivered via ``UnifiedEmitter.auto_send_turn``, the async Redis push +path. + +Live runs require the ``claude`` CLI to be installed and an +ANTHROPIC_API_KEY (or equivalent credential) in the environment. +For offline testing, see ``tests/test_agent_offline.py``. +""" + +from __future__ import annotations + +import os +import asyncio +from typing import AsyncIterator + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.types.fastacp import AsyncACPConfig +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create( + acp_type="async", + config=AsyncACPConfig(type="base"), +) + + +async def _spawn_claude(prompt: str) -> AsyncIterator[str]: + """Spawn ``claude -p --output-format stream-json`` locally and yield stdout lines. + + Injectable seam: tests monkeypatch this with a fake async iterator of + pre-recorded lines so no real CLI invocation is needed offline. 
+ """ + proc = await asyncio.create_subprocess_exec( + "claude", + "-p", + "--output-format", + "stream-json", + "--verbose", + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + assert proc.stdout is not None + assert proc.stdin is not None + + proc.stdin.write(prompt.encode()) + proc.stdin.close() + + # Drain stderr concurrently. With --verbose, Claude Code can write enough to + # stderr to fill the OS pipe buffer; if we only read stdout, the CLI blocks + # on its stderr write while we block reading stdout — a deadlock. A + # background task keeps stderr flowing so stdout never stalls. + async def _drain_stderr() -> None: + assert proc.stderr is not None + async for _ in proc.stderr: + pass + + stderr_task = asyncio.create_task(_drain_stderr()) + + try: + buffer = "" + async for chunk in proc.stdout: + buffer += chunk.decode("utf-8", errors="replace") + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + + if buffer.strip(): + yield buffer.strip() + + await proc.wait() + finally: + # Release the subprocess and stderr drain task even if the consumer + # abandons the generator early (task cancellation / client disconnect): + # cancel the drain task and terminate+reap the process if it is still + # running, so neither is leaked. 
+ stderr_task.cancel() + try: + await stderr_task + except asyncio.CancelledError: + pass + if proc.returncode is None: + try: + proc.terminate() + except ProcessLookupError: + pass + await proc.wait() + + +@acp.on_task_create +async def handle_task_create(params: CreateTaskParams): + logger.info("Task created: %s", params.task.id) + + +@acp.on_task_event_send +async def handle_task_event_send(params: SendEventParams): + """Handle a user message: spawn Claude Code locally and push events to the task stream.""" + task_id = params.task.id + prompt = params.event.content.content + logger.info("Processing message for task %s", task_id) + + await adk.messages.create(task_id=task_id, content=params.event.content) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": prompt}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + turn = ClaudeCodeTurn(_spawn_claude(prompt)) + result = await emitter.auto_send_turn(turn) + if turn_span: + turn_span.output = {"final_text": result.final_text} + + +@acp.on_task_cancel +async def handle_task_canceled(params: CancelTaskParams): + logger.info("Task canceled: %s", params.task.id) diff --git a/examples/tutorials/10_async/00_base/130_claude_code/pyproject.toml b/examples/tutorials/10_async/00_base/130_claude_code/pyproject.toml new file mode 100644 index 000000000..66c3cdaf3 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "ab130-claude-code" +version = "0.1.0" +description = "An async Claude Code agent streaming the unified harness surface via a local CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "python-dotenv>=1.0,<2", +] + 
+[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] diff --git a/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent.py b/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent.py new file mode 100644 index 000000000..ee254da23 --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent.py @@ -0,0 +1,250 @@ +"""Tests for the async Claude Code tutorial agent. + +LIVE tests (``TestClaudeCodeLive``): + - Require the ``claude`` CLI on PATH and ``ANTHROPIC_API_KEY`` set. + - Run the full agent end-to-end against a live Agentex server. + - Skipped automatically when ``CLAUDE_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestClaudeCodeOffline``): + - Inject a fake async iterator of pre-recorded stream-json lines. + - Assert the ``ClaudeCodeTurn`` + ``UnifiedEmitter`` pipeline drives + ``auto_send_turn``, populates usage, and satisfies the ``HarnessTurn`` + protocol. + - Always run -- no CLI or API key needed. 
+""" + +from __future__ import annotations + +import os +import json +from typing import AsyncIterator + +import pytest + +from agentex.types.task_message import TaskMessage + +# --------------------------------------------------------------------------- +# Recorded stream-json fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-offline-async-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from async Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 12, "output_tokens": 6}, + "cost_usd": 0.0001, + "duration_ms": 300, + "num_turns": 1, + } + ), +] + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + """Async iterator of pre-recorded stream-json lines (no subprocess).""" + for line in lines: + yield line + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +class _FakeCtx: + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage(id="msg-1", task_id="task-offline", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink: list = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): # noqa: ARG002 + ctype = getattr(initial_content, "type", None) + 
self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Offline tests (always run -- no CLI or API key needed) +# --------------------------------------------------------------------------- + + +class TestClaudeCodeOffline: + """Unit tests that run without a real claude CLI or network.""" + + @pytest.mark.asyncio + async def test_auto_send_text_only_opens_and_closes_context(self): + """auto_send_turn opens and closes exactly one streaming context.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="offline-task", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + + opened = [s for s in fake_streaming.sink if s[0] == "open"] + closed = [s for s in fake_streaming.sink if s[0] == "close"] + assert len(opened) == 1 + assert len(closed) == 1 + assert opened[0][1] == "text" + + @pytest.mark.asyncio + async def test_auto_send_populates_final_text(self): + """auto_send_turn result carries the agent's reply text.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="offline-task", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + assert "Hello from async Claude Code" in result.final_text + + @pytest.mark.asyncio + async def test_usage_populated_after_stream_exhausted(self): + """Usage is populated after the events stream is exhausted.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + 
+ fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + usage = turn.usage() + assert usage.input_tokens == 12 + assert usage.output_tokens == 6 + assert usage.num_llm_calls == 1 + + @pytest.mark.asyncio + async def test_stream_task_message_done_present(self): + """StreamTaskMessageDone must appear via yield_turn on a ClaudeCodeTurn.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message_update import StreamTaskMessageDone + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + events = [e async for e in emitter.yield_turn(turn)] + assert any(isinstance(e, StreamTaskMessageDone) for e in events), ( + "Expected at least one StreamTaskMessageDone event" + ) + + +# --------------------------------------------------------------------------- +# Live tests (skipped unless CLAUDE_LIVE_TESTS=1) +# --------------------------------------------------------------------------- + +pytestmark_live = pytest.mark.skipif( + not os.environ.get("CLAUDE_LIVE_TESTS"), + reason="Set CLAUDE_LIVE_TESTS=1 and ensure the `claude` CLI + ANTHROPIC_API_KEY are available", +) + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "ab130-claude-code") + + +@pytestmark_live +class TestClaudeCodeLive: + """Live async tests -- needs the claude CLI + ANTHROPIC_API_KEY.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + @pytest.fixture + def agent_name(self): + return AGENT_NAME + + @pytest.fixture + def agent_id(self, client, agent_name): + agents = client.agents.list() + for agent in 
agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent {agent_name!r} not found.") + + def test_send_simple_message(self, client, agent_id: str): + """Create a task, send a message, and poll until a response appears.""" + import time + import uuid + + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendEventRequest, ParamsCreateTaskRequest + + task = client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)).result + assert task is not None + task_id = task.id + + client.agents.send_event( + agent_id=agent_id, + params=ParamsSendEventRequest( + task_id=task_id, + content=TextContentParam( + author="user", + content="Reply with exactly three words: hello from claude", + type="text", + ), + ), + ) + + deadline = time.monotonic() + 60 + while time.monotonic() < deadline: + msgs = client.messages.list(task_id=task_id) + agent_msgs = [m for m in msgs if getattr(m.content, "author", None) == "agent"] + if agent_msgs: + assert len(agent_msgs) >= 1 + return + time.sleep(2) + + raise AssertionError("No agent response received within 60 s") + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent_offline.py b/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent_offline.py new file mode 100644 index 000000000..ac48474ee --- /dev/null +++ b/examples/tutorials/10_async/00_base/130_claude_code/tests/test_agent_offline.py @@ -0,0 +1,243 @@ +"""Offline unit tests for the async Claude Code tutorial agent. + +These tests do NOT require the ``claude`` CLI or an ANTHROPIC_API_KEY. +They inject a fake async iterator of pre-recorded stream-json lines in +place of the real subprocess spawn and a fake streaming backend, then +assert that the handler drives ``UnifiedEmitter.auto_send_turn`` correctly. + +The injection seam is the ``_spawn_claude`` function in ``project/acp.py``. 
+""" + +from __future__ import annotations + +import json +from typing import AsyncIterator + +import pytest + +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.core.harness import UnifiedEmitter +from agentex.types.task_message import TaskMessage + +# --------------------------------------------------------------------------- +# Recorded fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from async Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 12, "output_tokens": 6}, + "cost_usd": 0.0001, + "duration_ms": 300, + "num_turns": 1, + } + ), +] + +_TOOL_CALL_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-2"}), + json.dumps( + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "tool_xyz", + "name": "Read", + "input": {"file_path": "/tmp/foo.txt"}, + } + ] + }, + } + ), + json.dumps( + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tool_xyz", + "content": "file contents", + "is_error": False, + } + ] + }, + } + ), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Read the file."}]}, + } + ), + json.dumps( + { + "type": "result", + "usage": {"input_tokens": 25, "output_tokens": 10}, + "cost_usd": 0.0003, + "duration_ms": 500, + "num_turns": 1, + } + ), +] + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +class _FakeCtx: + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = 
TaskMessage(id="msg-1", task_id="task-offline", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink: list = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): # noqa: ARG002 + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + for line in lines: + yield line + + +async def _run_auto_send(lines: list[str]): + """Drive ClaudeCodeTurn through auto_send_turn with a fake streaming backend.""" + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(lines)) + emitter = UnifiedEmitter( + task_id="offline-task", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + return result, fake_streaming.sink + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_auto_send_text_only_opens_and_closes_context(): + result, sink = await _run_auto_send(_TEXT_ONLY_LINES) + opened = [s for s in sink if s[0] == "open"] + closed = [s for s in sink if s[0] == "close"] + assert len(opened) == 1 + assert len(closed) == 1 + assert opened[0][1] == "text" + + +@pytest.mark.asyncio +async def 
test_auto_send_populates_final_text(): + result, _ = await _run_auto_send(_TEXT_ONLY_LINES) + assert "Hello from async Claude Code" in result.final_text + + +@pytest.mark.asyncio +async def test_auto_send_usage_is_populated(): + """Usage is populated after the events stream is exhausted. + + UnifiedEmitter.auto_send_turn evaluates turn.usage() eagerly (before + the events are consumed) so the TurnResult.usage reflects a pre-exhaust + snapshot. Test usage directly from the turn after auto_send_turn completes + instead -- the result envelope is populated by the generator being consumed + inside auto_send. + """ + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + fake_streaming = _FakeStreaming() + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + # After auto_send_turn, the events generator is exhausted and + # ClaudeCodeTurn._on_result has been called with the result envelope. + usage = turn.usage() + assert usage.input_tokens == 12 + assert usage.output_tokens == 6 + assert usage.num_llm_calls == 1 + + +@pytest.mark.asyncio +async def test_auto_send_tool_call_opens_two_contexts(): + result, sink = await _run_auto_send(_TOOL_CALL_LINES) + opened = [s for s in sink if s[0] == "open"] + content_types = [s[1] for s in opened] + assert "tool_request" in content_types + assert "text" in content_types + + +@pytest.mark.asyncio +async def test_spawn_seam_concept(): + """Demonstrate the injectable spawn seam pattern used in project/acp.py. + + The ``_spawn_claude`` function is a top-level async generator. A drop-in + replacement can be injected (e.g. via monkeypatch) to supply pre-recorded + lines without spawning the real CLI. This test proves the pattern works + end-to-end without importing the full ACP module. 
+ """ + called: list[str] = [] + + async def _fake_spawn(prompt: str) -> AsyncIterator[str]: + called.append(prompt) + for line in _TEXT_ONLY_LINES: + yield line + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_spawn("ping")) + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + + assert called == ["ping"] + assert "Hello from async Claude Code" in result.final_text diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/.dockerignore b/examples/tutorials/10_async/10_temporal/140_claude_code/.dockerignore new file mode 100644 index 000000000..c49489471 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/.dockerignore @@ -0,0 +1,43 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Environments +.env** +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Git +.git +.gitignore + +# Misc +.DS_Store diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/Dockerfile b/examples/tutorials/10_async/10_temporal/140_claude_code/Dockerfile new file mode 100644 index 000000000..c909ee6c7 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/Dockerfile @@ -0,0 +1,46 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + nodejs \ + npm \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +RUN npm install -g @anthropic-ai/claude-code || true + +ENV 
UV_HTTP_TIMEOUT=1000 + +COPY 10_async/10_temporal/140_claude_code/pyproject.toml /app/140_claude_code/pyproject.toml +COPY 10_async/10_temporal/140_claude_code/README.md /app/140_claude_code/README.md + +WORKDIR /app/140_claude_code + +COPY 10_async/10_temporal/140_claude_code/project /app/140_claude_code/project +COPY 10_async/10_temporal/140_claude_code/tests /app/140_claude_code/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=at140-claude-code + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] + +# When deploying the worker, replace the CMD with: +# CMD ["python", "project/run_worker.py"] diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/README.md b/examples/tutorials/10_async/10_temporal/140_claude_code/README.md new file mode 100644 index 000000000..61cc94183 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/README.md @@ -0,0 +1,76 @@ +# Tutorial 140 (async/temporal): Temporal Claude Code Agent + +This tutorial demonstrates how to build a **Temporal-backed** agent that +spawns the Claude Code CLI as a local subprocess and delivers its output +through the Agentex unified harness surface via ``ClaudeCodeTurn`` and +``UnifiedEmitter.auto_send_turn``, with Temporal providing durable execution +and crash recovery. + +## Key Concepts + +### Temporal + ClaudeCodeTurn + +The Temporal workflow (``project/workflow.py``) holds state durably. Each user +message arrives as a signal (``on_task_event_send``), spawns the Claude Code +CLI locally, wraps the stdout line stream in ``ClaudeCodeTurn``, and pushes +events to the task's Redis stream via ``UnifiedEmitter.auto_send_turn``. + +``workflow.now()`` is passed as ``created_at`` so message timestamps are +deterministic under Temporal replay. + +### Multi-turn session resume + +The workflow persists the Claude Code ``session_id`` from the ``result`` +envelope. 
On the next turn, ``-r <session_id>`` is passed to the CLI to +resume the conversation. Temporal's durable state ensures the session_id +survives worker crashes. + +### Note on subprocess in workflow code + +The subprocess is not spawned inside the workflow signal handler. The +handler delegates each turn to the ``run_claude_code_turn`` activity in +``project/activities.py``, so each subprocess invocation gets Temporal's retry and +timeout guarantees. See +``examples/tutorials/10_async/10_temporal/030_custom_activities/`` for +the activity pattern. + +### Injectable spawn seam + +``_spawn_claude`` in ``project/activities.py`` is a top-level async generator. +Tests monkeypatch it to inject pre-recorded stream-json lines so offline +unit tests run without the CLI. + +## Files + +| File | Description | +|------|-------------| +| ``project/acp.py`` | Thin ACP server; wires Temporal (no handlers) | +| ``project/workflow.py`` | Temporal workflow (the ``_spawn_claude`` seam lives in ``project/activities.py``) | +| ``project/run_worker.py`` | Temporal worker entry point | +| ``tests/test_agent.py`` | Live integration tests (needs CLI + Temporal + API key) | +| ``tests/test_agent_offline.py`` | Offline unit tests with injected fake subprocess | +| ``manifest.yaml`` | Agent configuration | + +## Running Locally (live) + +Requires Temporal server, the ``claude`` CLI, and ``ANTHROPIC_API_KEY``: + +```bash +npm install -g @anthropic-ai/claude-code +export ANTHROPIC_API_KEY=sk-ant-... +agentex agents run +``` + +## Running Offline Tests + +No CLI, Temporal, or API key needed: + +```bash +uv run pytest tests/test_agent_offline.py -v +``` + +## Notes + +- Production isolation (sandbox, secrets, MCP) is the golden agent's concern. +- The subprocess spawn runs in a custom activity (``project/activities.py``), not in workflow code. +- The ``--verbose`` flag is included to match the golden agent's invocation.
diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/manifest.yaml b/examples/tutorials/10_async/10_temporal/140_claude_code/manifest.yaml new file mode 100644 index 000000000..9328b1713 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/manifest.yaml @@ -0,0 +1,62 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/10_temporal/140_claude_code + - test_utils + dockerfile: 10_async/10_temporal/140_claude_code/Dockerfile + dockerignore: 10_async/10_temporal/140_claude_code/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + worker: project/run_worker.py + +agent: + acp_type: async + name: at140-claude-code + description: A Temporal-backed Claude Code agent streaming the unified harness surface via a local CLI subprocess + + temporal: + enabled: true + workflows: + - name: at140-claude-code + queue_name: at140_claude_code_queue + + credentials: + - env_var_name: ANTHROPIC_API_KEY + secret_name: anthropic-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "at140-claude-code" + description: "A Temporal-backed Claude Code agent streaming via local CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/project/__init__.py b/examples/tutorials/10_async/10_temporal/140_claude_code/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/examples/tutorials/10_async/10_temporal/140_claude_code/project/acp.py b/examples/tutorials/10_async/10_temporal/140_claude_code/project/acp.py new file mode 100644 index 000000000..07258f6d8 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/project/acp.py @@ -0,0 +1,31 @@ +"""ACP server for the Temporal Claude Code tutorial. + +This file is intentionally thin. When ``acp_type="async"`` is combined +with ``TemporalACPConfig``, FastACP auto-wires: + + HTTP task/create -> @workflow.run on the workflow class + HTTP task/event/send -> @workflow.signal(SignalName.RECEIVE_EVENT) + HTTP task/cancel -> workflow cancellation via the Temporal client + +The actual agent code lives in ``project/workflow.py`` and is executed by +the Temporal worker (``project/run_worker.py``), not by this HTTP process. +""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from agentex.lib.types.fastacp import TemporalACPConfig +from agentex.lib.sdk.fastacp.fastacp import FastACP + +acp = FastACP.create( + acp_type="async", + config=TemporalACPConfig( + type="temporal", + temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), + ), +) diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/project/activities.py b/examples/tutorials/10_async/10_temporal/140_claude_code/project/activities.py new file mode 100644 index 000000000..dcba0f9a7 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/project/activities.py @@ -0,0 +1,139 @@ +"""Temporal activity for the Claude Code tutorial. + +Subprocess spawning (and any other I/O) must run inside a Temporal *activity*, +not in workflow code. Temporal runs workflow + signal-handler bodies on a +deterministic sandbox event loop that does not implement ``subprocess_exec`` +(or threads / sockets), so spawning the CLI directly in the signal handler +raises ``NotImplementedError``. 
This activity runs the Claude Code CLI, drives +the ``ClaudeCodeTurn`` through ``UnifiedEmitter.auto_send_turn`` (the async +Redis push path), and returns the turn result to the workflow. + +The ``_spawn_claude`` async generator is an injectable seam: offline tests +provide a fake that yields pre-recorded stdout lines so no real CLI runs. +""" + +from __future__ import annotations + +import asyncio +from typing import Any, AsyncIterator +from datetime import datetime + +from temporalio import activity + +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.utils.logging import make_logger +from agentex.lib.utils.model_utils import BaseModel + +logger = make_logger(__name__) + +RUN_CLAUDE_CODE_TURN_ACTIVITY = "run_claude_code_turn" + + +class RunClaudeCodeTurnParams(BaseModel): + """Arguments for one Claude Code turn run inside an activity.""" + + task_id: str + prompt: str + trace_id: str | None = None + parent_span_id: str | None = None + session_id: str | None = None + created_at: datetime | None = None + + +class RunClaudeCodeTurnResult(BaseModel): + """Result returned from the activity to the workflow.""" + + final_text: str + session_id: str | None = None + + +async def _spawn_claude(prompt: str, session_id: str | None = None) -> AsyncIterator[str]: + """Spawn ``claude -p --output-format stream-json`` locally and yield stdout lines. + + Pass ``session_id`` to resume a previous Claude Code session (multi-turn + memory via ``-r ``). + + Injectable seam: tests monkeypatch this with a fake async iterator so no + real CLI invocation is needed offline. 
+ """ + cmd = [ + "claude", + "-p", + "--output-format", + "stream-json", + "--verbose", + ] + if session_id: + cmd.extend(["-r", session_id]) + + proc = await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + assert proc.stdout is not None + assert proc.stdin is not None + + proc.stdin.write(prompt.encode()) + proc.stdin.close() + + # Drain stderr concurrently. With --verbose, Claude Code can write enough to + # stderr to fill the OS pipe buffer; if we only read stdout, the CLI blocks + # on its stderr write while we block reading stdout — a deadlock. A + # background task keeps stderr flowing so stdout never stalls. + async def _drain_stderr() -> None: + assert proc.stderr is not None + async for _ in proc.stderr: + pass + + stderr_task = asyncio.create_task(_drain_stderr()) + + try: + buffer = "" + async for chunk in proc.stdout: + buffer += chunk.decode("utf-8", errors="replace") + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + + if buffer.strip(): + yield buffer.strip() + + await proc.wait() + finally: + # Release the subprocess and stderr drain task even if the consumer + # abandons the generator early (task cancellation / client disconnect): + # cancel the drain task and terminate+reap the process if it is still + # running, so neither is leaked. + stderr_task.cancel() + try: + await stderr_task + except asyncio.CancelledError: + pass + if proc.returncode is None: + try: + proc.terminate() + except ProcessLookupError: + pass + await proc.wait() + + +@activity.defn(name=RUN_CLAUDE_CODE_TURN_ACTIVITY) +async def run_claude_code_turn(params: RunClaudeCodeTurnParams) -> dict[str, Any]: + """Run one Claude Code turn end-to-end and stream events to the task. + + Runs in an activity (real asyncio loop) so subprocess I/O is permitted. 
+ """ + emitter = UnifiedEmitter( + task_id=params.task_id, + trace_id=params.trace_id, + parent_span_id=params.parent_span_id, + ) + turn = ClaudeCodeTurn(_spawn_claude(params.prompt, session_id=params.session_id)) + result = await emitter.auto_send_turn(turn, created_at=params.created_at) + + return RunClaudeCodeTurnResult(final_text=result.final_text, session_id=turn.session_id).model_dump() diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/project/run_worker.py b/examples/tutorials/10_async/10_temporal/140_claude_code/project/run_worker.py new file mode 100644 index 000000000..58802737e --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/project/run_worker.py @@ -0,0 +1,41 @@ +"""Temporal worker for the Claude Code tutorial. + +Run as a separate long-lived process alongside the ACP HTTP server. The +worker polls Temporal for workflow + activity tasks and executes them. + +The Claude Code CLI subprocess runs in the ``run_claude_code_turn`` activity +(registered below alongside the built-in Agentex activities), because +subprocess I/O is not permitted on the Temporal workflow event loop. 
+""" + +import asyncio + +from project.workflow import At140ClaudeCodeWorkflow +from project.activities import run_claude_code_turn +from agentex.lib.utils.debug import setup_debug_if_enabled +from agentex.lib.utils.logging import make_logger +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.activities import get_all_activities +from agentex.lib.core.temporal.workers.worker import AgentexWorker + +environment_variables = EnvironmentVariables.refresh() +logger = make_logger(__name__) + + +async def main(): + setup_debug_if_enabled() + + task_queue_name = environment_variables.WORKFLOW_TASK_QUEUE + if task_queue_name is None: + raise ValueError("WORKFLOW_TASK_QUEUE is not set") + + worker = AgentexWorker(task_queue=task_queue_name) + + await worker.run( + activities=[run_claude_code_turn, *get_all_activities()], + workflow=At140ClaudeCodeWorkflow, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/project/workflow.py b/examples/tutorials/10_async/10_temporal/140_claude_code/project/workflow.py new file mode 100644 index 000000000..7f50ba8d5 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/project/workflow.py @@ -0,0 +1,137 @@ +"""Temporal workflow for the Claude Code tutorial. + +Holds conversation state (session_id for multi-turn resume) durably across +crashes. Each user message triggers ``on_task_event_send``, which delegates the +turn to the ``run_claude_code_turn`` activity. The activity spawns the Claude +Code CLI, wraps its stdout in ``ClaudeCodeTurn``, and delivers the turn via +``UnifiedEmitter.auto_send_turn`` (the async Redis push path). + +Note on subprocess inside Temporal +------------------------------------ +Subprocess (and all other) I/O must run in a Temporal *activity*, never in +workflow code. 
Temporal runs workflow + signal-handler bodies on a +deterministic sandbox event loop that does not implement ``subprocess_exec`` +(spawning the CLI there raises ``NotImplementedError``). The activity also gets +Temporal's retry + timeout guarantees. See +``examples/tutorials/10_async/10_temporal/030_custom_activities/`` for the +activity pattern. +""" + +from __future__ import annotations + +import os +import json +from datetime import timedelta + +from temporalio import workflow + +from agentex.lib import adk +from agentex.lib.types.acp import SendEventParams, CreateTaskParams +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.types.text_content import TextContent +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.types.workflow import SignalName +from agentex.lib.core.temporal.workflows.workflow import BaseWorkflow +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +with workflow.unsafe.imports_passed_through(): + from project.activities import RunClaudeCodeTurnParams, run_claude_code_turn + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +environment_variables = EnvironmentVariables.refresh() + +if environment_variables.WORKFLOW_NAME is None: + raise ValueError("Environment variable WORKFLOW_NAME is not set") +if environment_variables.AGENT_NAME is None: + raise ValueError("Environment variable AGENT_NAME is not set") + +logger = make_logger(__name__) + + +@workflow.defn(name=environment_variables.WORKFLOW_NAME) +class At140ClaudeCodeWorkflow(BaseWorkflow): + """Temporal workflow that runs Claude Code locally for each user message. 
+ + Persists the Claude Code session_id across turns so the CLI can resume + the conversation (``-r ``). Temporal's durable state ensures + the session_id survives worker crashes. + """ + + def __init__(self): + super().__init__(display_name=environment_variables.AGENT_NAME) + self._complete_task = False + self._turn_number = 0 + # Claude Code session_id for multi-turn resume. + self._session_id: str | None = None + + @workflow.signal(name=SignalName.RECEIVE_EVENT) + async def on_task_event_send(self, params: SendEventParams) -> None: + """Handle a user message: spawn Claude Code and push events to the task stream.""" + self._turn_number += 1 + task_id = params.task.id + prompt = params.event.content.content + logger.info("Turn %d for task %s", self._turn_number, task_id) + + await adk.messages.create(task_id=task_id, content=params.event.content) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name=f"Turn {self._turn_number}", + input={"message": prompt}, + ) as span: + # Delegate the subprocess turn to an activity: subprocess I/O is not + # permitted on the Temporal workflow event loop. The activity streams + # events to the task and returns the final text + session_id. + # workflow.now() gives a deterministic timestamp under replay. + result = await workflow.execute_activity( + run_claude_code_turn, + RunClaudeCodeTurnParams( + task_id=task_id, + prompt=prompt, + trace_id=task_id, + parent_span_id=span.id if span else None, + session_id=self._session_id, + created_at=workflow.now(), + ), + start_to_close_timeout=timedelta(minutes=5), + ) + + # Capture session_id to enable Claude Code resume on the next turn. 
+ sid = result.get("session_id") + if sid: + self._session_id = sid + + if span: + span.output = {"final_text": result.get("final_text")} + + @workflow.run + async def on_task_create(self, params: CreateTaskParams) -> str: + logger.info("Task created: %s", params.task.id) + + await adk.messages.create( + task_id=params.task.id, + content=TextContent( + author="agent", + content=( + f"Task initialized with params:\n{json.dumps(params.params, indent=2)}\n" + "Send me a message and I'll run it through Claude Code locally." + ), + ), + ) + + await workflow.wait_condition(lambda: self._complete_task, timeout=None) + return "Task completed" + + @workflow.signal + async def complete_task_signal(self) -> None: + logger.info("Received complete_task signal") + self._complete_task = True diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/pyproject.toml b/examples/tutorials/10_async/10_temporal/140_claude_code/pyproject.toml new file mode 100644 index 000000000..b9d517267 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/pyproject.toml @@ -0,0 +1,27 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "at140-claude-code" +version = "0.1.0" +description = "A Temporal-backed Claude Code agent streaming the unified harness surface via a local CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "temporalio>=1.18.2", + "python-dotenv>=1.0,<2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "debugpy>=1.8.15", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent.py new file mode 100644 index 000000000..767c707b9 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent.py @@ -0,0 
+1,249 @@ +"""Tests for the Temporal Claude Code tutorial agent. + +LIVE tests (``TestClaudeCodeLive``): + - Require Temporal server, the ACP server, the Temporal worker, the ``claude`` + CLI on PATH, and ``ANTHROPIC_API_KEY`` set. + - Run the full agent end-to-end against a live Agentex server. + - Skipped automatically when ``CLAUDE_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestClaudeCodeOffline``): + - Inject a fake async iterator of pre-recorded stream-json lines. + - Assert the ``ClaudeCodeTurn`` + ``UnifiedEmitter`` pipeline drives + ``auto_send_turn``, populates usage, and satisfies the ``HarnessTurn`` + protocol. + - Always run -- no CLI or API key needed. +""" + +from __future__ import annotations + +import os +import json +from typing import AsyncIterator + +import pytest + +from agentex.types.task_message import TaskMessage + +# --------------------------------------------------------------------------- +# Recorded stream-json fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-temporal-offline-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from Temporal Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "session_id": "sess-temporal-offline-1", + "usage": {"input_tokens": 15, "output_tokens": 7}, + "cost_usd": 0.00015, + "duration_ms": 350, + "num_turns": 1, + } + ), +] + + +async def _fake_lines(lines: list[str]) -> AsyncIterator[str]: + """Async iterator of pre-recorded stream-json lines (no subprocess).""" + for line in lines: + yield line + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +class _FakeCtx: + def __init__(self, sink, content_type, initial_content): + self.sink = sink + 
self.content_type = content_type + self.task_message = TaskMessage(id="msg-t1", task_id="task-temporal-offline", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink: list = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): # noqa: ARG002 + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Offline tests (always run -- no CLI or API key needed) +# --------------------------------------------------------------------------- + + +class TestClaudeCodeOffline: + """Unit tests that run without a real claude CLI, Temporal, or network.""" + + @pytest.mark.asyncio + async def test_auto_send_text_only_produces_output(self): + """auto_send_turn result carries the agent's reply text.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="offline-temporal", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + assert "Hello from Temporal Claude Code" in result.final_text + + @pytest.mark.asyncio + async def test_usage_populated_after_stream_exhausted(self): + """Usage is populated after the events stream is exhausted.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + 
fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + usage = turn.usage() + assert usage.input_tokens == 15 + assert usage.output_tokens == 7 + assert usage.num_llm_calls == 1 + + @pytest.mark.asyncio + async def test_stream_task_message_done_present(self): + """StreamTaskMessageDone must appear via yield_turn on a ClaudeCodeTurn.""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message_update import StreamTaskMessageDone + + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + events = [e async for e in emitter.yield_turn(turn)] + assert any(isinstance(e, StreamTaskMessageDone) for e in events), ( + "Expected at least one StreamTaskMessageDone event" + ) + + @pytest.mark.asyncio + async def test_session_id_captured_in_result_envelope(self): + """The result envelope carries session_id (multi-turn resume support).""" + from agentex.lib.adk import ClaudeCodeTurn + from agentex.lib.core.harness import UnifiedEmitter + + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(_TEXT_ONLY_LINES)) + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + assert turn._result_envelope is not None + assert turn._result_envelope.get("session_id") == "sess-temporal-offline-1" + + +# --------------------------------------------------------------------------- +# Live tests (skipped unless CLAUDE_LIVE_TESTS=1) +# --------------------------------------------------------------------------- + +pytestmark_live = pytest.mark.skipif( + not os.environ.get("CLAUDE_LIVE_TESTS"), + reason="Set 
CLAUDE_LIVE_TESTS=1 and ensure the `claude` CLI + ANTHROPIC_API_KEY are available", +) + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "at140-claude-code") + + +@pytestmark_live +class TestClaudeCodeLive: + """Live Temporal tests -- needs Temporal server + the claude CLI + ANTHROPIC_API_KEY.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + @pytest.fixture + def agent_name(self): + return AGENT_NAME + + @pytest.fixture + def agent_id(self, client, agent_name): + agents = client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent {agent_name!r} not found.") + + def test_send_simple_message(self, client, agent_id: str): + """Create a task, send a message, and poll until a response appears.""" + import time + import uuid + + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendEventRequest, ParamsCreateTaskRequest + + task = client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)).result + assert task is not None + task_id = task.id + + client.agents.send_event( + agent_id=agent_id, + params=ParamsSendEventRequest( + task_id=task_id, + content=TextContentParam( + author="user", + content="Reply with exactly three words: hello from claude", + type="text", + ), + ), + ) + + deadline = time.monotonic() + 90 + while time.monotonic() < deadline: + msgs = client.messages.list(task_id=task_id) + agent_msgs = [m for m in msgs if getattr(m.content, "author", None) == "agent"] + response_msgs = [m for m in agent_msgs if "Task initialized" not in str(getattr(m.content, "content", ""))] + if response_msgs: + assert len(response_msgs) >= 1 + return + time.sleep(3) + + raise AssertionError("No agent response received within 90 s") + + +if __name__ == "__main__": + 
pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent_offline.py b/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent_offline.py new file mode 100644 index 000000000..1adc553f1 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/140_claude_code/tests/test_agent_offline.py @@ -0,0 +1,230 @@ +"""Offline unit tests for the Temporal Claude Code tutorial agent. + +These tests do NOT require the ``claude`` CLI, Temporal, or ANTHROPIC_API_KEY. +They inject a fake async iterator of pre-recorded stream-json lines in place of +the real subprocess spawn and a fake streaming backend, then assert that the +workflow's turn logic correctly drives ``UnifiedEmitter.auto_send_turn``. + +The injection seam is the ``_spawn_claude`` function in ``project/workflow.py``. +Tests monkeypatch it with a coroutine returning a pre-recorded async iterator. +""" + +from __future__ import annotations + +import json +from typing import AsyncIterator + +import pytest + +from agentex.lib.adk import ClaudeCodeTurn +from agentex.lib.core.harness import UnifiedEmitter +from agentex.types.task_message import TaskMessage + +# --------------------------------------------------------------------------- +# Recorded fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-temporal-1"}), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hello from Temporal Claude Code!"}]}, + } + ), + json.dumps( + { + "type": "result", + "session_id": "sess-temporal-1", + "usage": {"input_tokens": 15, "output_tokens": 7}, + "cost_usd": 0.00015, + "duration_ms": 350, + "num_turns": 1, + } + ), +] + +_TOOL_CALL_LINES: list[str] = [ + json.dumps({"type": "system", "subtype": "init", "session_id": "sess-temporal-2"}), + json.dumps( + { + "type": "assistant", + 
"message": { + "content": [ + { + "type": "tool_use", + "id": "tool_temporal", + "name": "Bash", + "input": {"command": "ls /tmp"}, + } + ] + }, + } + ), + json.dumps( + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tool_temporal", + "content": "file1\nfile2\n", + "is_error": False, + } + ] + }, + } + ), + json.dumps( + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Listed files."}]}, + } + ), + json.dumps( + { + "type": "result", + "session_id": "sess-temporal-2", + "usage": {"input_tokens": 30, "output_tokens": 12}, + "cost_usd": 0.0004, + "duration_ms": 600, + "num_turns": 1, + } + ), +] + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +class _FakeCtx: + def __init__(self, sink, content_type, initial_content): + self.sink = sink + self.content_type = content_type + self.task_message = TaskMessage(id="msg-t1", task_id="task-temporal-offline", content=initial_content) + + async def __aenter__(self): + self.sink.append(("open", self.content_type)) + return self + + async def __aexit__(self, *a): + await self.close() + return False + + async def close(self): + self.sink.append(("close", self.content_type)) + + async def stream_update(self, update): + self.sink.append(("update", update)) + return update + + +class _FakeStreaming: + def __init__(self): + self.sink: list = [] + + def streaming_task_message_context(self, task_id, initial_content, streaming_mode="coalesced", created_at=None): # noqa: ARG002 + ctype = getattr(initial_content, "type", None) + self.sink.append(("ctx", ctype)) + return _FakeCtx(self.sink, ctype, initial_content) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _fake_lines(lines: 
list[str]) -> AsyncIterator[str]: + for line in lines: + yield line + + +async def _run_turn(lines: list[str]): + fake_streaming = _FakeStreaming() + turn = ClaudeCodeTurn(_fake_lines(lines)) + emitter = UnifiedEmitter( + task_id="offline-temporal", + trace_id=None, + parent_span_id=None, + tracer=False, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + return result, fake_streaming.sink, turn + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_text_only_produces_agent_output(): + result, sink, _ = await _run_turn(_TEXT_ONLY_LINES) + assert "Hello from Temporal Claude Code" in result.final_text + + +@pytest.mark.asyncio +async def test_usage_from_result_envelope(): + """Usage is available from turn.usage() after the events are exhausted. + + UnifiedEmitter.auto_send_turn evaluates turn.usage() eagerly before the + async generator is consumed, so result.usage is a pre-exhaust snapshot. + Read usage directly from the turn after _run_turn completes instead. 
@pytest.mark.asyncio
async def test_session_id_captured_in_result_envelope():
    """The captured result envelope carries ``session_id`` (multi-turn resume support)."""
    _outcome, _sink, turn = await _run_turn(_TEXT_ONLY_LINES)
    envelope = turn._result_envelope
    assert envelope is not None
    assert envelope.get("session_id") == "sess-temporal-1"


@pytest.mark.asyncio
async def test_tool_call_context_types():
    """A tool-call transcript opens both a tool_request and a text context."""
    _outcome, sink, _turn = await _run_turn(_TOOL_CALL_LINES)
    # Sink entries are (tag, payload) tuples; keep the payload of each "open".
    opened_types = [entry[1] for entry in sink if entry[0] == "open"]
    assert "tool_request" in opened_types
    assert "text" in opened_types


@pytest.mark.asyncio
async def test_spawn_seam_concept():
    """Show the injectable spawn seam: an async generator replaces the real spawner.

    The stand-in takes the same ``(prompt, session_id=None)`` arguments, records
    them, and replays pre-recorded lines. ``session_id`` is the hook for resuming
    a multi-turn session (``claude -r``).
    """
    recorded_calls: list[tuple] = []

    async def _fake_spawn(prompt: str, session_id=None) -> AsyncIterator[str]:
        # The body of an async generator runs on first iteration, so the call is
        # only recorded once the turn starts consuming lines.
        recorded_calls.append((prompt, session_id))
        for entry in _TEXT_ONLY_LINES:
            yield entry

    backend = _FakeStreaming()
    turn = ClaudeCodeTurn(_fake_spawn("temporal prompt", session_id="old-sid"))
    emitter = UnifiedEmitter(
        task_id="t",
        trace_id=None,
        parent_span_id=None,
        tracer=False,
        streaming=backend,
    )
    outcome = await emitter.auto_send_turn(turn)

    assert recorded_calls == [("temporal prompt", "old-sid")]
    assert "Hello from Temporal Claude Code" in outcome.final_text
+"""Claude Code stream-json parser tap for the unified harness surface. + +Converts the newline-delimited JSON envelopes emitted by +``claude -p --output-format stream-json`` into the canonical +``StreamTaskMessage*`` stream consumed by the Agentex harness. + +Envelope → canonical mapping +---------------------------- +system/init + Ignored at this layer (session_id tracking is a provider concern). + +assistant / user (content blocks) + text block → Start(TextContent) + Delta(TextDelta) + Done + thinking block → Start(ReasoningContent) + Delta(ReasoningContentDelta) + Done + tool_use block → Start(ToolRequestContent) + Done (full args carried in the Start content) + tool_result block → Full(ToolResponseContent) + Empty text/thinking blocks, and blocks already delivered via stream_event, are skipped. + +stream_event / content_block_start + type=text → Start(TextContent, empty) + type=thinking → Start(ReasoningContent, empty) + +stream_event / content_block_delta + type=text_delta → Delta(TextDelta) + type=thinking_delta → Delta(ReasoningContentDelta) + +stream_event / content_block_stop + (thinking open) → Done (no Full event is emitted; the text was already delivered via deltas) + (text open) → Done + +result + Fires ``on_result`` with the raw envelope so the caller can capture + usage and cost. No StreamTaskMessage is emitted for the result itself. + +Out of scope +------------ +No deployable test agent is provided. claude-code requires the golden +agent's sandbox/subprocess/secret/MCP orchestration to produce the stream. +Live coverage is the golden agent, which will adopt this tap. Do NOT add an +examples/ agent or CI live-matrix row for claude-code.
_MAX_RESULT_LENGTH = 4000


def _truncate(text: str) -> str:
    """Coerce ``text`` to ``str`` and cap it at ``_MAX_RESULT_LENGTH`` characters."""
    return str(text)[:_MAX_RESULT_LENGTH]


def _extract_summary(text: str, max_len: int = 300) -> str:
    """Return the first line of the stripped ``text``, capped at ``max_len`` characters."""
    return text.strip().split("\n", 1)[0][:max_len]


def _text_block_events(
    msg_index: int, text: str
) -> list[StreamTaskMessageStart | StreamTaskMessageDelta | StreamTaskMessageDone]:
    """Build the Start/Delta/Done triple for a fully materialised text block."""
    return [
        StreamTaskMessageStart(
            type="start",
            index=msg_index,
            content=TextContent(type="text", author="agent", content=""),
        ),
        StreamTaskMessageDelta(
            type="delta",
            index=msg_index,
            delta=TextDelta(type="text", text_delta=text),
        ),
        StreamTaskMessageDone(type="done", index=msg_index),
    ]


def _thinking_block_events(
    msg_index: int, thinking_text: str
) -> list[StreamTaskMessageStart | StreamTaskMessageDelta | StreamTaskMessageDone]:
    """Build the Start/Delta/Done triple for a fully materialised thinking block."""
    return [
        StreamTaskMessageStart(
            type="start",
            index=msg_index,
            content=ReasoningContent(
                type="reasoning",
                author="agent",
                # The summary is the first line of the thinking text.
                summary=[_extract_summary(thinking_text)],
                content=[],
                style="active",
            ),
        ),
        StreamTaskMessageDelta(
            type="delta",
            index=msg_index,
            delta=ReasoningContentDelta(
                type="reasoning_content",
                content_index=0,
                content_delta=thinking_text,
            ),
        ),
        StreamTaskMessageDone(type="done", index=msg_index),
    ]


def _tool_use_events(
    msg_index: int, block: dict[str, Any], fallback_id: str
) -> list[StreamTaskMessageStart | StreamTaskMessageDone]:
    """Build the Start/Done pair for a ``tool_use`` block.

    ``fallback_id`` is used only when the block has no ``id`` key. Arguments that
    are not a dict are replaced with an empty dict.
    """
    arguments = block.get("input", {})
    if not isinstance(arguments, dict):
        arguments = {}
    return [
        StreamTaskMessageStart(
            type="start",
            index=msg_index,
            content=ToolRequestContent(
                type="tool_request",
                author="agent",
                tool_call_id=block.get("id", fallback_id),
                name=block.get("name", "unknown"),
                arguments=arguments,
            ),
        ),
        StreamTaskMessageDone(type="done", index=msg_index),
    ]


def _tool_result_event(msg_index: int, block: dict[str, Any]) -> StreamTaskMessageFull:
    """Build the single Full event for a ``tool_result`` block.

    List content is flattened to newline-joined text, and the result string is
    truncated to ``_MAX_RESULT_LENGTH``. ``is_error`` is included in the payload
    only when truthy.
    """
    content = block.get("content", "")
    if content is None:
        # An explicit JSON null means "no output", not the literal text "None".
        content = ""
    if isinstance(content, list):
        # str() guards against a non-string "text" value breaking the join.
        content = "\n".join(
            str(part.get("text", part)) if isinstance(part, dict) else str(part) for part in content
        )
    payload: dict[str, Any] = {"result": _truncate(str(content))}
    if block.get("is_error", False):
        payload["is_error"] = True
    return StreamTaskMessageFull(
        type="full",
        index=msg_index,
        content=ToolResponseContent(
            type="tool_response",
            author="agent",
            tool_call_id=block.get("tool_use_id", ""),
            name="",
            content=payload,
        ),
    )


async def convert_claude_code_to_agentex_events(
    lines: AsyncIterator[str | dict[str, Any]],
    on_result: Callable[[dict[str, Any]], Awaitable[None]] | None = None,
) -> AsyncIterator[StreamTaskMessageStart | StreamTaskMessageDelta | StreamTaskMessageFull | StreamTaskMessageDone]:
    """Convert a claude-code ``stream-json`` line stream into Agentex ``StreamTaskMessage*`` events.

    Args:
        lines: Async iterator of raw JSON strings (as read from the CLI's
            stdout) or already-parsed dicts. Falsy items and blank lines are
            skipped. Lines that are not valid JSON, or that decode to something
            other than a JSON object, are logged at debug level and skipped.
        on_result: Optional coroutine function called with the ``result``
            envelope. It is awaited before the generator continues. When
            ``None``, the result envelope is silently dropped.

    Yields:
        Start/Delta/Done events for text and thinking blocks, Start/Done for
        ``tool_use`` blocks and Full for ``tool_result`` blocks. ``index`` is a
        counter that increases by one per emitted message.

    Blocks delivered incrementally through ``stream_event`` envelopes are not
    emitted again when the materialised ``assistant`` message arrives.
    """
    next_index = 0
    tool_call_count = 0

    # State for the content_block_start / _delta / _stop triples.
    _thinking_open = False
    _thinking_index: int | None = None
    _text_open = False
    _text_index: int | None = None
    # Block indices already delivered via stream_event. Cleared at every
    # materialised message so a later turn's indices don't collide with an
    # earlier turn's.
    _streamed_block_indexes: set[int] = set()
    # Once-guard: a thinking block's pending index is claimed on its first
    # thinking_delta only.
    _saw_thinking_stream = False
    # Index of a thinking block that has started but not yet produced a delta.
    # If no delta ever arrives, the materialised assistant block supplies the
    # reasoning text instead.
    _pending_thinking_block_index: int | None = None

    async for raw in lines:
        if not raw:
            continue

        if isinstance(raw, dict):
            evt = raw
        else:
            line = raw.strip()
            if not line:
                continue
            try:
                evt = json.loads(line)
            except json.JSONDecodeError:
                logger.debug("claude-code: skipping non-JSON line: %r", line[:120])
                continue
            if not isinstance(evt, dict):
                # Valid JSON that is not an object (e.g. ``42`` or ``[]``)
                # cannot be an envelope.
                logger.debug("claude-code: skipping non-object JSON line: %r", line[:120])
                continue

        evt_type = evt.get("type", "")

        # -------------------------------------------------------------------
        # assistant / user — materialised content blocks
        # -------------------------------------------------------------------
        if evt_type in ("assistant", "user"):
            # "message" may be missing or an explicit null; treat both as empty.
            msg = evt.get("message")
            blocks = msg.get("content", []) if isinstance(msg, dict) else []
            if not isinstance(blocks, list):
                blocks = [blocks]

            for idx, block in enumerate(blocks):
                if not isinstance(block, dict):
                    continue
                block_type = block.get("type", "")

                if block_type == "text":
                    # Skip only the specific blocks already streamed (per block,
                    # not a turn-wide latch).
                    if idx in _streamed_block_indexes:
                        continue
                    text = block.get("text", "")
                    if not text:
                        continue
                    msg_index = next_index
                    next_index += 1
                    for event in _text_block_events(msg_index, text):
                        yield event

                elif block_type == "thinking":
                    if idx in _streamed_block_indexes:
                        continue
                    thinking_text = block.get("thinking", "")
                    if not thinking_text:
                        continue
                    msg_index = next_index
                    next_index += 1
                    for event in _thinking_block_events(msg_index, thinking_text):
                        yield event

                elif block_type == "tool_use":
                    tool_call_count += 1
                    msg_index = next_index
                    next_index += 1
                    for event in _tool_use_events(msg_index, block, f"tool_{tool_call_count}"):
                        yield event

                elif block_type == "tool_result":
                    msg_index = next_index
                    next_index += 1
                    yield _tool_result_event(msg_index, block)

            # End of a materialised message: reset the dedup state so a block
            # index streamed in this turn cannot suppress a later turn's
            # non-streamed block at the same index.
            _streamed_block_indexes = set()
            _saw_thinking_stream = False

        # -------------------------------------------------------------------
        # stream_event — incremental streaming deltas
        # -------------------------------------------------------------------
        elif evt_type == "stream_event":
            se = evt.get("event") or {}
            se_type = se.get("type", "")
            block_index = se.get("index")

            if se_type == "content_block_start":
                btype = (se.get("content_block") or {}).get("type")

                if btype == "thinking":
                    _thinking_open = True
                    # Do not mark the block as streamed until a thinking_delta
                    # actually arrives; a start with no deltas must leave the
                    # materialised message free to supply the text.
                    _pending_thinking_block_index = block_index if isinstance(block_index, int) else None
                    _thinking_index = next_index
                    next_index += 1
                    yield StreamTaskMessageStart(
                        type="start",
                        index=_thinking_index,
                        content=ReasoningContent(
                            type="reasoning",
                            author="agent",
                            summary=[],
                            content=[],
                            style="active",
                        ),
                    )

                elif btype == "text":
                    _text_open = True
                    if isinstance(block_index, int):
                        _streamed_block_indexes.add(block_index)
                    _text_index = next_index
                    next_index += 1
                    yield StreamTaskMessageStart(
                        type="start",
                        index=_text_index,
                        content=TextContent(type="text", author="agent", content=""),
                    )

            elif se_type == "content_block_delta":
                delta = se.get("delta") or {}
                dtype = delta.get("type")

                if dtype == "thinking_delta":
                    chunk = delta.get("thinking", "")
                    if chunk and _thinking_open:
                        if not _saw_thinking_stream:
                            _saw_thinking_stream = True
                            # First real delta: claim the block so the
                            # materialised message won't re-emit it.
                            if _pending_thinking_block_index is not None:
                                _streamed_block_indexes.add(_pending_thinking_block_index)
                        if _thinking_index is not None:
                            yield StreamTaskMessageDelta(
                                type="delta",
                                index=_thinking_index,
                                delta=ReasoningContentDelta(
                                    type="reasoning_content",
                                    content_index=0,
                                    content_delta=chunk,
                                ),
                            )

                elif dtype == "text_delta":
                    chunk = delta.get("text", "")
                    if chunk and _text_open and _text_index is not None:
                        yield StreamTaskMessageDelta(
                            type="delta",
                            index=_text_index,
                            delta=TextDelta(type="text", text_delta=chunk),
                        )

            elif se_type == "content_block_stop":
                if _thinking_open:
                    _thinking_open = False
                    _pending_thinking_block_index = None
                    # Reset the once-guard per thinking block so a second
                    # streamed thinking block in the same turn is claimed too.
                    _saw_thinking_stream = False
                    if _thinking_index is not None:
                        yield StreamTaskMessageDone(type="done", index=_thinking_index)
                        _thinking_index = None
                elif _text_open:
                    _text_open = False
                    if _text_index is not None:
                        yield StreamTaskMessageDone(type="done", index=_text_index)
                        _text_index = None

        # -------------------------------------------------------------------
        # system / init — session metadata (ignored at this layer)
        # -------------------------------------------------------------------
        elif evt_type == "system":
            # This parser intentionally emits nothing for system events.
            pass

        # -------------------------------------------------------------------
        # result — carries usage + cost; passed to on_result, not emitted
        # -------------------------------------------------------------------
        elif evt_type == "result":
            if on_result is not None:
                await on_result(evt)

        else:
            logger.debug("claude-code: unhandled envelope type %r", evt_type)
def claude_code_usage_to_turn_usage(result_envelope: dict[str, Any]) -> TurnUsage:
    """Map a claude-code ``result`` envelope to a canonical ``TurnUsage``.

    Mapping:
        ``usage.input_tokens``                  -> ``input_tokens``
        ``usage.output_tokens``                 -> ``output_tokens``
        ``usage.cache_read_input_tokens``
        + ``usage.cache_creation_input_tokens`` -> ``cached_input_tokens``
        ``cost_usd`` / ``total_cost_usd``       -> ``cost_usd`` (first usable value)
        ``duration_ms``                         -> ``duration_ms``
        ``num_turns``                           -> ``num_llm_calls``

    Missing, null, or unparseable values become ``None`` so callers can tell
    "not reported" from a real zero. Real zeros are preserved. ``total_tokens``
    is ``input_tokens + output_tokens`` and is set only when both are present.

    ``cached_input_tokens`` sums both cache counters. NOTE(review): per the
    Anthropic usage schema, cache-read tokens are served from the prompt cache
    while cache-creation tokens are written to it. Confirm the combined figure
    is what downstream consumers expect.
    """
    usage_raw = result_envelope.get("usage")
    if not isinstance(usage_raw, dict):
        # Missing, null, or malformed ``usage`` all mean "nothing reported".
        usage_raw = {}

    def _int(d: dict[str, Any], key: str) -> int | None:
        """Return ``d[key]`` as an int, or ``None`` if absent or unconvertible."""
        v = d.get(key)
        if v is None:
            return None
        try:
            return int(v)
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers int(float("inf")).
            return None

    def _float(d: dict[str, Any], *keys: str) -> float | None:
        """Return the first of ``keys`` that holds a float-convertible value."""
        for key in keys:
            v = d.get(key)
            if v is not None:
                try:
                    return float(v)
                except (TypeError, ValueError):
                    continue
        return None

    input_tokens = _int(usage_raw, "input_tokens")
    output_tokens = _int(usage_raw, "output_tokens")

    cache_read = _int(usage_raw, "cache_read_input_tokens")
    cache_creation = _int(usage_raw, "cache_creation_input_tokens")
    cached_input_tokens: int | None = None
    if cache_read is not None or cache_creation is not None:
        # If only one counter was reported, the other counts as zero.
        cached_input_tokens = (cache_read or 0) + (cache_creation or 0)

    total_tokens: int | None = None
    if input_tokens is not None and output_tokens is not None:
        total_tokens = input_tokens + output_tokens

    return TurnUsage(
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        cached_input_tokens=cached_input_tokens,
        total_tokens=total_tokens,
        # The CLI has used both cost keys across versions.
        cost_usd=_float(result_envelope, "cost_usd", "total_cost_usd"),
        duration_ms=_int(result_envelope, "duration_ms"),
        # Provider-reported: None means "not reported", matching the token fields.
        num_llm_calls=_int(result_envelope, "num_turns"),
    )
class ClaudeCodeTurn:
    """One claude-code ``stream-json`` turn exposed through the harness-turn interface.

    - ``events`` is the canonical ``StreamTaskMessage*`` stream. It is created
      on first access and the same iterator is returned on later accesses.
    - ``usage()`` and ``session_id`` read the ``result`` envelope captured while
      ``events`` is consumed, so they are meaningful only once the stream has
      been exhausted.

    ``lines`` is an async iterator of raw JSON strings or pre-parsed dicts, such
    as the claude-code CLI's stdout read line by line.
    """

    def __init__(self, lines: AsyncIterator[str | dict[str, Any]]) -> None:
        self._lines = lines
        # Filled in by ``_on_result`` when the ``result`` envelope arrives.
        self._result_envelope: dict[str, Any] | None = None
        # Built on first access to ``events``.
        self._events_stream: AsyncIterator[StreamTaskMessage] | None = None

    async def _on_result(self, envelope: dict[str, Any]) -> None:
        """Remember the ``result`` envelope handed over by the converter."""
        self._result_envelope = envelope

    @property
    def events(self) -> AsyncIterator[StreamTaskMessage]:
        """The canonical event stream for this turn (single shared iterator)."""
        stream = self._events_stream
        if stream is not None:
            return stream
        stream = convert_claude_code_to_agentex_events(
            self._lines,
            on_result=self._on_result,
        )
        self._events_stream = stream
        return stream

    @property
    def session_id(self) -> str | None:
        """The Claude Code session id, for resuming a multi-turn session.

        Returns ``None`` when no ``result`` envelope has been captured (stream
        not yet consumed, or truncated), when the envelope is empty, or when it
        has no ``session_id`` key.
        """
        envelope = self._result_envelope
        return envelope.get("session_id") if envelope else None

    def usage(self) -> TurnUsage:
        """Return normalised usage for this turn.

        Call only after ``events`` is exhausted. Without a captured ``result``
        envelope this returns a default-constructed ``TurnUsage``.
        """
        envelope = self._result_envelope
        return TurnUsage() if envelope is None else claude_code_usage_to_turn_usage(envelope)
async def _aiter(events: list[Any]) -> AsyncIterator[Any]:
    """Expose an in-memory list of envelopes as an async iterator."""
    for envelope in events:
        yield envelope


async def _collect(stream: AsyncIterator[Any]) -> list[Any]:
    """Drain ``stream`` and return everything it yielded, in order."""
    gathered = []
    async for item in stream:
        gathered.append(item)
    return gathered


class TestTextContent:
    """Text blocks, both materialised (``assistant``) and streamed (``stream_event``)."""

    @staticmethod
    def _assistant_text(text: str) -> dict:
        """An ``assistant`` envelope holding a single text block."""
        return {"type": "assistant", "message": {"content": [{"type": "text", "text": text}]}}

    @staticmethod
    def _text_start(index: int) -> dict:
        """A ``content_block_start`` stream event for a text block."""
        return {
            "type": "stream_event",
            "event": {"type": "content_block_start", "index": index, "content_block": {"type": "text"}},
        }

    @staticmethod
    def _text_delta(index: int, text: str) -> dict:
        """A ``content_block_delta`` stream event carrying a ``text_delta``."""
        return {
            "type": "stream_event",
            "event": {
                "type": "content_block_delta",
                "index": index,
                "delta": {"type": "text_delta", "text": text},
            },
        }

    @staticmethod
    def _block_stop(index: int) -> dict:
        """A ``content_block_stop`` stream event."""
        return {"type": "stream_event", "event": {"type": "content_block_stop", "index": index}}

    @staticmethod
    async def _convert(envelopes: list[Any]) -> list[Any]:
        """Run the converter over ``envelopes`` and collect its output."""
        return await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes)))

    async def test_text_block_in_assistant_message_emits_start_delta_done(self):
        out = await self._convert([self._assistant_text("Hello world")])

        assert len(out) == 3
        start, delta, done = out
        assert isinstance(start, StreamTaskMessageStart)
        assert isinstance(start.content, TextContent)
        assert start.content.content == ""
        assert isinstance(delta, StreamTaskMessageDelta)
        assert isinstance(delta.delta, TextDelta)
        assert delta.delta.text_delta == "Hello world"
        assert isinstance(done, StreamTaskMessageDone)
        assert start.index == delta.index == done.index

    async def test_empty_text_block_is_skipped(self):
        out = await self._convert([self._assistant_text("")])
        assert out == []

    async def test_streamed_text_via_stream_event_emits_start_deltas_done(self):
        out = await self._convert(
            [
                self._text_start(0),
                self._text_delta(0, "Hello"),
                self._text_delta(0, " world"),
                self._block_stop(0),
            ]
        )

        starts = [e for e in out if isinstance(e, StreamTaskMessageStart)]
        deltas = [e for e in out if isinstance(e, StreamTaskMessageDelta)]
        dones = [e for e in out if isinstance(e, StreamTaskMessageDone)]

        assert len(starts) == 1
        assert isinstance(starts[0].content, TextContent)
        assert len(deltas) == 2
        first, second = deltas
        assert isinstance(first.delta, TextDelta)
        assert first.delta.text_delta == "Hello"
        assert isinstance(second.delta, TextDelta)
        assert second.delta.text_delta == " world"
        assert len(dones) == 1

    async def test_streamed_text_not_re_emitted_by_assistant_block(self):
        """After a stream_event triple, the final assistant block must not re-emit the text."""
        out = await self._convert(
            [
                self._text_start(0),
                self._text_delta(0, "streamed"),
                self._block_stop(0),
                # Final assistant message with the same text — must NOT be re-emitted.
                self._assistant_text("streamed"),
            ]
        )
        text_starts = [e for e in out if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, TextContent)]
        assert len(text_starts) == 1, "Text block must not be emitted twice"

    async def test_later_turn_non_streamed_text_not_dropped(self):
        """A non-streamed text block in a later turn must survive an earlier turn
        having streamed a block at the same index."""
        out = await self._convert(
            [
                # Turn 1: streamed text at index 0, deduplicated against the materialised message.
                self._text_start(0),
                self._text_delta(0, "first"),
                self._block_stop(0),
                self._assistant_text("first"),
                # Turn 2: a NON-streamed text block, also at index 0.
                self._assistant_text("second"),
            ]
        )
        deltas = [
            e.delta.text_delta for e in out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, TextDelta)
        ]
        assert deltas == ["first", "second"], "Later turn's non-streamed text must still be delivered"
class TestThinkingContent:
    """Thinking blocks, both materialised (``assistant``) and streamed (``stream_event``)."""

    @staticmethod
    def _assistant_thinking(*thoughts: str) -> dict:
        """An ``assistant`` envelope with one thinking block per argument."""
        return {
            "type": "assistant",
            "message": {"content": [{"type": "thinking", "thinking": thought} for thought in thoughts]},
        }

    @staticmethod
    def _thinking_start(index: int) -> dict:
        """A ``content_block_start`` stream event for a thinking block."""
        return {
            "type": "stream_event",
            "event": {"type": "content_block_start", "index": index, "content_block": {"type": "thinking"}},
        }

    @staticmethod
    def _thinking_delta(index: int, text: str) -> dict:
        """A ``content_block_delta`` stream event carrying a ``thinking_delta``."""
        return {
            "type": "stream_event",
            "event": {
                "type": "content_block_delta",
                "index": index,
                "delta": {"type": "thinking_delta", "thinking": text},
            },
        }

    @staticmethod
    def _block_stop(index: int) -> dict:
        """A ``content_block_stop`` stream event."""
        return {"type": "stream_event", "event": {"type": "content_block_stop", "index": index}}

    @staticmethod
    async def _convert(envelopes: list) -> list:
        """Run the converter over ``envelopes`` and collect its output."""
        return await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes)))

    async def test_thinking_block_emits_reasoning_start_delta_done(self):
        out = await self._convert([self._assistant_thinking("Let me reason...")])

        assert len(out) == 3
        start, delta, done = out
        assert isinstance(start, StreamTaskMessageStart)
        assert isinstance(start.content, ReasoningContent)
        # Summary must be populated from the thinking text.
        assert start.content.summary == ["Let me reason..."]
        assert isinstance(delta, StreamTaskMessageDelta)
        assert isinstance(delta.delta, ReasoningContentDelta)
        assert delta.delta.content_delta == "Let me reason..."
        assert delta.delta.content_index == 0
        assert isinstance(done, StreamTaskMessageDone)

    async def test_empty_thinking_block_is_skipped(self):
        out = await self._convert([self._assistant_thinking("")])
        assert out == []

    async def test_streamed_thinking_emits_reasoning_start_deltas_done(self):
        out = await self._convert(
            [
                self._thinking_start(0),
                self._thinking_delta(0, "step one"),
                self._thinking_delta(0, " step two"),
                self._block_stop(0),
            ]
        )

        starts = [e for e in out if isinstance(e, StreamTaskMessageStart)]
        deltas = [e for e in out if isinstance(e, StreamTaskMessageDelta)]
        dones = [e for e in out if isinstance(e, StreamTaskMessageDone)]

        assert len(starts) == 1
        assert isinstance(starts[0].content, ReasoningContent)
        assert len(deltas) == 2
        first, second = deltas
        assert isinstance(first.delta, ReasoningContentDelta)
        assert first.delta.content_delta == "step one"
        assert isinstance(second.delta, ReasoningContentDelta)
        assert second.delta.content_delta == " step two"
        assert len(dones) == 1

    async def test_two_streamed_thinking_blocks_not_re_emitted(self):
        """A turn that streams two thinking blocks must claim both indices, so the
        final assistant envelope does not re-emit the second one."""
        streamed = []
        for position, thought in enumerate(["first thought", "second thought"]):
            streamed += [
                self._thinking_start(position),
                self._thinking_delta(position, thought),
                self._block_stop(position),
            ]
        # The final assistant envelope repeats both thinking blocks — neither should re-emit.
        out = await self._convert([*streamed, self._assistant_thinking("first thought", "second thought")])
        reasoning_starts = [
            e for e in out if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, ReasoningContent)
        ]
        assert len(reasoning_starts) == 2, "each streamed thinking block emitted exactly once (no duplicate)"

    async def test_thinking_block_start_with_no_deltas_allows_assistant_to_fill(self):
        """A thinking block_start without any deltas leaves the final assistant block
        free to emit the thinking text (the block index is not claimed as streamed)."""
        out = await self._convert(
            [
                self._thinking_start(0),
                # No thinking_delta — the block is closed immediately.
                self._block_stop(0),
                # The final assistant message carries the thinking text.
                self._assistant_thinking("delayed thinking"),
            ]
        )
        # Only the deltas are asserted on; the number of reasoning Start events is
        # deliberately left unchecked, as in the original test.
        reasoning_deltas = [
            e for e in out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ReasoningContentDelta)
        ]
        assert len(reasoning_deltas) >= 1
        assert any(
            isinstance(d.delta, ReasoningContentDelta) and d.delta.content_delta == "delayed thinking"
            for d in reasoning_deltas
        )
ReasoningContent) + ] + # There will be the empty start from stream_event, plus the one from assistant block + reasoning_deltas = [ + e for e in out if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ReasoningContentDelta) + ] + assert len(reasoning_deltas) >= 1 + assert any( + isinstance(d.delta, ReasoningContentDelta) and d.delta.content_delta == "delayed thinking" + for d in reasoning_deltas + ) + + +# --------------------------------------------------------------------------- +# Tool calls and results +# --------------------------------------------------------------------------- + + +class TestToolCallsAndResults: + async def test_tool_use_block_emits_start_done(self): + envelopes = [ + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "call_abc", + "name": "Bash", + "input": {"command": "ls /"}, + } + ] + }, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + + assert len(out) == 2 + assert isinstance(out[0], StreamTaskMessageStart) + assert isinstance(out[0].content, ToolRequestContent) + assert out[0].content.tool_call_id == "call_abc" + assert out[0].content.name == "Bash" + assert out[0].content.arguments == {"command": "ls /"} + assert isinstance(out[1], StreamTaskMessageDone) + + async def test_tool_result_block_emits_full(self): + envelopes = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_abc", + "content": "file1.txt\nfile2.txt", + } + ] + }, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + + assert len(out) == 1 + assert isinstance(out[0], StreamTaskMessageFull) + assert isinstance(out[0].content, ToolResponseContent) + assert out[0].content.tool_call_id == "call_abc" + assert "file1.txt" in str(out[0].content.content) + + async def test_tool_result_list_content_joined(self): + envelopes = [ + { + "type": "user", + "message": { + "content": [ + { + "type": 
"tool_result", + "tool_use_id": "tid", + "content": [ + {"type": "text", "text": "line1"}, + {"type": "text", "text": "line2"}, + ], + } + ] + }, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + assert isinstance(out[0], StreamTaskMessageFull) + assert isinstance(out[0].content, ToolResponseContent) + payload = str(out[0].content.content) + assert "line1" in payload + assert "line2" in payload + + async def test_tool_result_error_flag_passed_through(self): + envelopes = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "err_call", + "content": "Permission denied", + "is_error": True, + } + ] + }, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + assert isinstance(out[0], StreamTaskMessageFull) + assert isinstance(out[0].content, ToolResponseContent) + assert isinstance(out[0].content.content, dict) + assert out[0].content.content.get("is_error") is True + + async def test_tool_result_truncation(self): + long_result = "x" * 5000 + envelopes = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "t", + "content": long_result, + } + ] + }, + } + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + result_str = out[0].content.content.get("result", "") + assert len(result_str) <= 4000 + + +# --------------------------------------------------------------------------- +# on_result callback +# --------------------------------------------------------------------------- + + +class TestOnResult: + async def test_on_result_called_with_result_envelope(self): + captured: list[dict] = [] + + async def capture(envelope): + captured.append(envelope) + + envelopes = [ + { + "type": "result", + "session_id": "sess123", + "cost_usd": 0.012, + "usage": {"input_tokens": 100, "output_tokens": 50}, + } + ] + out = await 
_collect(convert_claude_code_to_agentex_events(_aiter(envelopes), on_result=capture)) + + # result envelope does not emit any StreamTaskMessage + assert out == [] + assert len(captured) == 1 + assert captured[0]["session_id"] == "sess123" + assert captured[0]["cost_usd"] == pytest.approx(0.012) + + async def test_on_result_not_called_when_no_result_envelope(self): + captured: list[dict] = [] + + async def capture(envelope): + captured.append(envelope) + + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hi"}]}, + } + ] + await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes), on_result=capture)) + assert captured == [] + + async def test_no_on_result_does_not_raise(self): + envelopes = [ + { + "type": "result", + "cost_usd": 0.001, + "usage": {"input_tokens": 10, "output_tokens": 5}, + } + ] + # Should not raise even without a callback + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + assert out == [] + + +# --------------------------------------------------------------------------- +# Message indexing +# --------------------------------------------------------------------------- + + +class TestMessageIndexing: + async def test_multiple_blocks_get_distinct_indices(self): + envelopes = [ + { + "type": "assistant", + "message": { + "content": [ + {"type": "text", "text": "First"}, + { + "type": "tool_use", + "id": "c1", + "name": "Read", + "input": {"path": "/tmp"}, + }, + ] + }, + }, + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "c1", + "content": "some content", + } + ] + }, + }, + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Done"}]}, + }, + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + + # Gather all Start/Full events and check indices are monotonically increasing + anchors = [e for e in out if isinstance(e, (StreamTaskMessageStart, 
StreamTaskMessageFull))] + indices = [e.index for e in anchors] + assert indices == sorted(indices), "Indices must be monotonically increasing" + assert len(set(indices)) == len(indices), "All indices must be distinct" + + async def test_system_init_and_unknown_envelopes_produce_no_output(self): + envelopes = [ + {"type": "system", "subtype": "init", "session_id": "sess"}, + {"type": "unknown_future_type", "data": "whatever"}, + ] + out = await _collect(convert_claude_code_to_agentex_events(_aiter(envelopes))) + assert out == [] + + async def test_non_json_string_lines_are_skipped(self): + lines = [ + "not json at all", + '{"type": "assistant", "message": {"content": [{"type": "text", "text": "hi"}]}}', + ] + + async def _str_iter(): + for line in lines: + yield line + + out = await _collect(convert_claude_code_to_agentex_events(_str_iter())) + assert len(out) == 3 # Start + Delta + Done for the text block + + async def test_empty_lines_are_skipped(self): + lines = ["", " ", '{"type": "system", "subtype": "init"}'] + + async def _str_iter(): + for line in lines: + yield line + + out = await _collect(convert_claude_code_to_agentex_events(_str_iter())) + assert out == [] + + +# --------------------------------------------------------------------------- +# Author +# --------------------------------------------------------------------------- + + +class TestContentAuthors: + @pytest.mark.parametrize( + "envelope", + [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "hi"}]}, + }, + { + "type": "assistant", + "message": {"content": [{"type": "thinking", "thinking": "thoughts"}]}, + }, + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "c", + "name": "t", + "input": {}, + } + ] + }, + }, + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "c", + "content": "ok", + } + ] + }, + }, + ], + ) + async def test_all_content_authored_by_agent(self, envelope: dict): + 
out = await _collect(convert_claude_code_to_agentex_events(_aiter([envelope]))) + for e in out: + content = getattr(e, "content", None) + if content is not None and hasattr(content, "author"): + assert content.author == "agent" diff --git a/tests/lib/adk/test_claude_code_turn.py b/tests/lib/adk/test_claude_code_turn.py new file mode 100644 index 000000000..4fbb2f913 --- /dev/null +++ b/tests/lib/adk/test_claude_code_turn.py @@ -0,0 +1,283 @@ +"""Tests for ClaudeCodeTurn and claude_code_usage_to_turn_usage.""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +import pytest + +from agentex.lib.core.harness.types import TurnUsage, HarnessTurn +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._claude_code_turn import ( + ClaudeCodeTurn, + claude_code_usage_to_turn_usage, +) + + +async def _aiter(events: list[Any]) -> AsyncIterator[Any]: + for e in events: + yield e + + +async def _drain(turn: ClaudeCodeTurn) -> list[Any]: + return [e async for e in turn.events] + + +# --------------------------------------------------------------------------- +# Usage normalization +# --------------------------------------------------------------------------- + + +class TestClaudeCodeUsageToTurnUsage: + def test_full_usage_fields(self): + result = { + "usage": { + "input_tokens": 100, + "output_tokens": 50, + "cache_read_input_tokens": 20, + "cache_creation_input_tokens": 5, + }, + "cost_usd": 0.025, + "duration_ms": 3200, + "num_turns": 3, + } + usage = claude_code_usage_to_turn_usage(result) + + assert usage.input_tokens == 100 + assert usage.output_tokens == 50 + assert usage.cached_input_tokens == 25 # 20 + 5 + assert usage.total_tokens == 150 + assert usage.cost_usd == pytest.approx(0.025) + assert usage.duration_ms == 3200 + assert 
usage.num_llm_calls == 3 + + def test_total_cost_usd_fallback(self): + """total_cost_usd should be used when cost_usd is absent.""" + result = { + "usage": {"input_tokens": 10, "output_tokens": 5}, + "total_cost_usd": 0.001, + } + usage = claude_code_usage_to_turn_usage(result) + assert usage.cost_usd == pytest.approx(0.001) + + def test_cost_usd_takes_precedence_over_total_cost_usd(self): + result = { + "usage": {"input_tokens": 10, "output_tokens": 5}, + "cost_usd": 0.002, + "total_cost_usd": 0.999, + } + usage = claude_code_usage_to_turn_usage(result) + assert usage.cost_usd == pytest.approx(0.002) + + def test_missing_usage_key_returns_nones(self): + result: dict[str, Any] = {} + usage = claude_code_usage_to_turn_usage(result) + assert usage.input_tokens is None + assert usage.output_tokens is None + assert usage.cached_input_tokens is None + assert usage.total_tokens is None + assert usage.cost_usd is None + assert usage.duration_ms is None + assert usage.num_llm_calls is None + + def test_real_zeros_preserved(self): + result = { + "usage": { + "input_tokens": 0, + "output_tokens": 0, + "cache_read_input_tokens": 0, + "cache_creation_input_tokens": 0, + }, + "cost_usd": 0.0, + "duration_ms": 0, + "num_turns": 0, + } + usage = claude_code_usage_to_turn_usage(result) + assert usage.input_tokens == 0 + assert usage.output_tokens == 0 + assert usage.cached_input_tokens == 0 + assert usage.total_tokens == 0 + assert usage.cost_usd == pytest.approx(0.0) + assert usage.duration_ms == 0 + assert usage.num_llm_calls == 0 + + def test_only_cache_read_no_creation(self): + result = { + "usage": { + "input_tokens": 50, + "output_tokens": 25, + "cache_read_input_tokens": 15, + } + } + usage = claude_code_usage_to_turn_usage(result) + assert usage.cached_input_tokens == 15 + + def test_only_cache_creation_no_read(self): + result = { + "usage": { + "input_tokens": 50, + "output_tokens": 25, + "cache_creation_input_tokens": 10, + } + } + usage = 
claude_code_usage_to_turn_usage(result) + assert usage.cached_input_tokens == 10 + + def test_no_cache_fields_gives_none(self): + result = {"usage": {"input_tokens": 10, "output_tokens": 5}} + usage = claude_code_usage_to_turn_usage(result) + assert usage.cached_input_tokens is None + + def test_total_tokens_computed_from_input_output(self): + result = {"usage": {"input_tokens": 70, "output_tokens": 30}} + usage = claude_code_usage_to_turn_usage(result) + assert usage.total_tokens == 100 + + def test_missing_output_tokens_leaves_total_none(self): + result = {"usage": {"input_tokens": 70}} + usage = claude_code_usage_to_turn_usage(result) + assert usage.total_tokens is None + + def test_returns_turn_usage_instance(self): + result = {"usage": {"input_tokens": 1, "output_tokens": 1}} + usage = claude_code_usage_to_turn_usage(result) + assert isinstance(usage, TurnUsage) + + +# --------------------------------------------------------------------------- +# ClaudeCodeTurn protocol +# --------------------------------------------------------------------------- + + +class TestClaudeCodeTurnProtocol: + def test_satisfies_harness_turn_protocol(self): + """ClaudeCodeTurn must satisfy the HarnessTurn structural protocol.""" + turn = ClaudeCodeTurn(_aiter([])) + assert isinstance(turn, HarnessTurn) + + async def test_events_yields_stream_task_messages(self): + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "Hi there"}]}, + } + ] + turn = ClaudeCodeTurn(_aiter(envelopes)) + out = await _drain(turn) + assert len(out) == 3 + assert isinstance(out[0], StreamTaskMessageStart) + assert isinstance(out[1], StreamTaskMessageDelta) + assert isinstance(out[2], StreamTaskMessageDone) + + async def test_usage_before_drain_returns_empty(self): + envelopes = [ + { + "type": "result", + "usage": {"input_tokens": 100, "output_tokens": 50}, + "cost_usd": 0.01, + } + ] + turn = ClaudeCodeTurn(_aiter(envelopes)) + # usage() called before events drained — 
no result envelope yet + usage = turn.usage() + assert isinstance(usage, TurnUsage) + assert usage.input_tokens is None + + async def test_usage_after_drain_reflects_result(self): + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "response"}]}, + }, + { + "type": "result", + "usage": {"input_tokens": 200, "output_tokens": 80}, + "cost_usd": 0.015, + "num_turns": 2, + }, + ] + turn = ClaudeCodeTurn(_aiter(envelopes)) + await _drain(turn) + usage = turn.usage() + + assert usage.input_tokens == 200 + assert usage.output_tokens == 80 + assert usage.cost_usd == pytest.approx(0.015) + assert usage.num_llm_calls == 2 + + async def test_usage_empty_when_no_result_envelope(self): + envelopes = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "no result"}]}, + } + ] + turn = ClaudeCodeTurn(_aiter(envelopes)) + await _drain(turn) + usage = turn.usage() + assert usage.input_tokens is None + assert usage.cost_usd is None + + async def test_tool_call_and_result_round_trip(self): + envelopes = [ + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "call_1", + "name": "Read", + "input": {"path": "/etc/hosts"}, + } + ] + }, + }, + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_1", + "content": "127.0.0.1 localhost", + } + ] + }, + }, + { + "type": "result", + "usage": {"input_tokens": 50, "output_tokens": 20}, + "cost_usd": 0.005, + }, + ] + turn = ClaudeCodeTurn(_aiter(envelopes)) + out = await _drain(turn) + usage = turn.usage() + + tool_starts = [ + e for e in out if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, ToolResponseContent) + ] + tool_fulls = [ + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolResponseContent) + ] + assert len(tool_fulls) == 1 + full_content = tool_fulls[0].content + assert isinstance(full_content, ToolResponseContent) + assert 
full_content.tool_call_id == "call_1" + + assert usage.input_tokens == 50 + assert usage.output_tokens == 20 + + async def test_events_property_returns_same_iterator(self): + """Accessing .events multiple times returns the same iterator (not a new one each call).""" + turn = ClaudeCodeTurn(_aiter([])) + it1 = turn.events + it2 = turn.events + assert it1 is it2 diff --git a/tests/lib/core/harness/conformance/test_claude_code_conformance.py b/tests/lib/core/harness/conformance/test_claude_code_conformance.py new file mode 100644 index 000000000..88643a4cd --- /dev/null +++ b/tests/lib/core/harness/conformance/test_claude_code_conformance.py @@ -0,0 +1,202 @@ +"""Cross-channel conformance tests for the claude-code parser tap. + +Each fixture is a representative sequence of claude-code stream-json +envelopes, converted into canonical ``StreamTaskMessage*`` events via +``ClaudeCodeTurn``, then registered into the shared conformance runner. + +The conformance runner asserts two guarantees per fixture: + +1. **Logical-delivery equivalence**: ``yield_events`` and ``auto_send`` + produce the same logically-delivered message contents. + +2. **Span signal equivalence**: both channels emit the same ``SpanSignal`` + sequence to their ``SpanTracer``. + +Fixtures +-------- +text-only: single ``assistant`` text block +tool-call-result: ``tool_use`` block followed by ``tool_result`` +thinking-block: ``thinking`` block with full text +multi-step: text + tool_use + tool_result + text (two model turns) + +Note +---- +Relative imports are used throughout (runner.py and these fixtures live in the +same package). The per-module ``_FIXTURES`` list is both registered globally +(via ``register()``) and parametrized locally so this module's tests are +self-contained regardless of global registry ordering (see runner.py docstring). 
+""" + +from __future__ import annotations + +from typing import Any + +import pytest + +from agentex.lib.adk._modules._claude_code_sync import convert_claude_code_to_agentex_events + +from .runner import ( + Fixture, + register, + run_cross_channel_conformance, +) + +# --------------------------------------------------------------------------- +# Convert claude-code envelopes to StreamTaskMessage* events +# --------------------------------------------------------------------------- + + +async def _envelopes_to_events(envelopes: list[dict]) -> list: + """Drive convert_claude_code_to_agentex_events and collect all events.""" + + async def _aiter(items): # type: ignore[return] + for item in items: + yield item + + return [e async for e in convert_claude_code_to_agentex_events(_aiter(envelopes))] + + +# --------------------------------------------------------------------------- +# Fixture definitions (raw claude-code envelope sequences) +# --------------------------------------------------------------------------- + +_TEXT_ENVELOPES = [ + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "The answer is 42."}]}, + } +] + +_TOOL_ENVELOPES = [ + { + "type": "assistant", + "message": { + "content": [ + { + "type": "tool_use", + "id": "call_read", + "name": "Read", + "input": {"path": "/workspace/README.md"}, + } + ] + }, + }, + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_read", + "content": "# My Project\n\nA great project.", + } + ] + }, + }, +] + +_THINKING_ENVELOPES = [ + { + "type": "assistant", + "message": { + "content": [ + {"type": "thinking", "thinking": "Let me think about this carefully.\nStep 1: check the facts."}, + {"type": "text", "text": "Here is my answer."}, + ] + }, + } +] + +_MULTI_STEP_ENVELOPES = [ + # Turn 1: text + tool call + { + "type": "assistant", + "message": { + "content": [ + {"type": "text", "text": "Let me look that up."}, + { + "type": "tool_use", + "id": 
"call_bash", + "name": "Bash", + "input": {"command": "cat /etc/hostname"}, + }, + ] + }, + }, + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_bash", + "content": "myhost", + } + ] + }, + }, + # Turn 2: final text after tool result + { + "type": "assistant", + "message": {"content": [{"type": "text", "text": "The hostname is myhost."}]}, + }, +] + + +# --------------------------------------------------------------------------- +# Build fixtures from envelopes at module load time +# --------------------------------------------------------------------------- + + +async def _build_fixture(name: str, envelopes: list[dict]) -> Fixture: + events = await _envelopes_to_events(envelopes) + return Fixture(name=name, events=events) + + +# Fixtures must exist before pytest collects (they parametrize the test below), +# so they are built at import time. The conversion only iterates in-memory +# envelopes — it never suspends on a real future — so we drive the coroutine to +# completion by hand instead of asyncio.run(). asyncio.run() at import raises +# RuntimeError when an event loop is already running (programmatic pytest, a +# Jupyter kernel, or session-scoped asyncio loops); the loop-free driver below +# is unaffected by the ambient loop state. 
+def _run_pure_async(coro: Any) -> Any: + try: + coro.send(None) + except StopIteration as stop: + return stop.value + coro.close() + raise RuntimeError("conformance fixture build unexpectedly suspended on real I/O") + + +_FIXTURES: list[Fixture] = [ + _run_pure_async(_build_fixture("claude-code-text-only", _TEXT_ENVELOPES)), + _run_pure_async(_build_fixture("claude-code-tool-call-result", _TOOL_ENVELOPES)), + _run_pure_async(_build_fixture("claude-code-thinking-block", _THINKING_ENVELOPES)), + _run_pure_async(_build_fixture("claude-code-multi-step", _MULTI_STEP_ENVELOPES)), +] + +# Register into the shared registry so all_fixtures() can enumerate them +for _f in _FIXTURES: + register(_f) + + +# --------------------------------------------------------------------------- +# Cross-channel conformance assertions +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _FIXTURES, ids=lambda f: f.name) +@pytest.mark.asyncio +async def test_cross_channel_equivalence(fixture: Fixture) -> None: + """yield_events and auto_send must produce equivalent logical deliveries + and identical span signals for every claude-code fixture. 
+ """ + yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture) + + assert yield_deliveries == auto_deliveries, ( + f"[{fixture.name}] logical deliveries differ:\n yield: {yield_deliveries}\n auto_send: {auto_deliveries}" + ) + assert yield_spans == auto_spans, ( + f"[{fixture.name}] span signals differ:\n yield: {yield_spans}\n auto_send: {auto_spans}" + ) From d34422845de4b80ed69d2dccfdb0c680ef2fbca3 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Mon, 22 Jun 2026 18:45:21 -0400 Subject: [PATCH 07/10] feat(langgraph): migrate LangGraph harness onto unified surface (#417) --- .github/workflows/harness-integration.yml | 16 +- .../00_sync/harness_langgraph/Dockerfile | 50 +++ .../00_sync/harness_langgraph/README.md | 55 ++++ .../00_sync/harness_langgraph/manifest.yaml | 58 ++++ .../harness_langgraph/project/__init__.py | 0 .../00_sync/harness_langgraph/project/acp.py | 107 +++++++ .../harness_langgraph/project/graph.py | 67 ++++ .../harness_langgraph/project/tools.py | 24 ++ .../00_sync/harness_langgraph/pyproject.toml | 37 +++ .../harness_langgraph/tests/test_agent.py | 144 +++++++++ .../00_base/harness_langgraph/Dockerfile | 50 +++ .../00_base/harness_langgraph/README.md | 57 ++++ .../00_base/harness_langgraph/manifest.yaml | 58 ++++ .../harness_langgraph/project/__init__.py | 0 .../00_base/harness_langgraph/project/acp.py | 109 +++++++ .../harness_langgraph/project/graph.py | 67 ++++ .../harness_langgraph/project/tools.py | 24 ++ .../00_base/harness_langgraph/pyproject.toml | 37 +++ .../harness_langgraph/tests/test_agent.py | 100 ++++++ .../10_temporal/harness_langgraph/Dockerfile | 43 +++ .../10_temporal/harness_langgraph/README.md | 53 ++++ .../harness_langgraph/manifest.yaml | 51 +++ .../harness_langgraph/project/__init__.py | 0 .../harness_langgraph/project/acp.py | 34 ++ .../harness_langgraph/project/graph.py | 85 +++++ .../harness_langgraph/project/run_worker.py | 46 +++ 
.../harness_langgraph/project/tools.py | 37 +++ .../harness_langgraph/project/workflow.py | 80 +++++ .../harness_langgraph/pyproject.toml | 40 +++ .../harness_langgraph/tests/test_agent.py | 106 +++++++ .../lib/adk/_modules/_langgraph_async.py | 213 +++---------- .../lib/adk/_modules/_langgraph_sync.py | 49 ++- .../lib/adk/_modules/_langgraph_tracing.py | 31 +- .../lib/adk/_modules/_langgraph_turn.py | 152 +++++++++ tests/lib/adk/providers/test_openai_turn.py | 4 +- tests/lib/adk/test_langgraph_async.py | 282 +++++++++++++++++ tests/lib/adk/test_langgraph_sync.py | 247 +++++++++++++++ tests/lib/adk/test_langgraph_sync_unified.py | 214 +++++++++++++ tests/lib/adk/test_langgraph_turn.py | 265 ++++++++++++++++ tests/lib/adk/test_pydantic_ai_turn.py | 4 +- .../conformance/test_langgraph_conformance.py | 229 ++++++++++++++ .../harness/test_harness_langgraph_async.py | 298 ++++++++++++++++++ .../harness/test_harness_langgraph_sync.py | 229 ++++++++++++++ .../test_harness_langgraph_temporal.py | 233 ++++++++++++++ 44 files changed, 3897 insertions(+), 188 deletions(-) create mode 100644 examples/tutorials/00_sync/harness_langgraph/Dockerfile create mode 100644 examples/tutorials/00_sync/harness_langgraph/README.md create mode 100644 examples/tutorials/00_sync/harness_langgraph/manifest.yaml create mode 100644 examples/tutorials/00_sync/harness_langgraph/project/__init__.py create mode 100644 examples/tutorials/00_sync/harness_langgraph/project/acp.py create mode 100644 examples/tutorials/00_sync/harness_langgraph/project/graph.py create mode 100644 examples/tutorials/00_sync/harness_langgraph/project/tools.py create mode 100644 examples/tutorials/00_sync/harness_langgraph/pyproject.toml create mode 100644 examples/tutorials/00_sync/harness_langgraph/tests/test_agent.py create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/Dockerfile create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/README.md create mode 100644 
examples/tutorials/10_async/00_base/harness_langgraph/manifest.yaml create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/project/__init__.py create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/project/acp.py create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/project/graph.py create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/project/tools.py create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/pyproject.toml create mode 100644 examples/tutorials/10_async/00_base/harness_langgraph/tests/test_agent.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/Dockerfile create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/README.md create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/manifest.yaml create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/__init__.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/acp.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/graph.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/run_worker.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/tools.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/project/workflow.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/pyproject.toml create mode 100644 examples/tutorials/10_async/10_temporal/harness_langgraph/tests/test_agent.py create mode 100644 src/agentex/lib/adk/_modules/_langgraph_turn.py create mode 100644 tests/lib/adk/test_langgraph_async.py create mode 100644 tests/lib/adk/test_langgraph_sync.py create mode 100644 tests/lib/adk/test_langgraph_sync_unified.py create mode 100644 tests/lib/adk/test_langgraph_turn.py create mode 100644 
tests/lib/core/harness/conformance/test_langgraph_conformance.py create mode 100644 tests/lib/core/harness/test_harness_langgraph_async.py create mode 100644 tests/lib/core/harness/test_harness_langgraph_sync.py create mode 100644 tests/lib/core/harness/test_harness_langgraph_temporal.py diff --git a/.github/workflows/harness-integration.yml b/.github/workflows/harness-integration.yml index 11b5239dc..075ee5cf3 100644 --- a/.github/workflows/harness-integration.yml +++ b/.github/workflows/harness-integration.yml @@ -8,6 +8,7 @@ on: - "src/agentex/lib/core/harness/**" - "src/agentex/lib/adk/_modules/**" - "tests/lib/core/harness/test_harness_pydantic_ai_*.py" + - "tests/lib/core/harness/test_harness_langgraph_*.py" - ".github/workflows/harness-integration.yml" jobs: @@ -32,17 +33,18 @@ jobs: - name: Conformance suite run: ./scripts/test tests/lib/core/harness/ -v - # Offline pydantic-ai integration tests (sync / async / temporal channels). - # These use pydantic-ai TestModel + fake streaming/tracing and require no live - # infrastructure. Enabled here for PR 4 (pydantic-ai migration). Future harness - # migration PRs (5-8) should add their integration-test paths to this matrix. + # Offline harness integration tests (sync / async / temporal channels) for each + # migrated harness. These use fake streams / TestModel + fake streaming/tracing + # and require no live infrastructure. Future harness migration PRs (6-8) add + # their harness to the matrix below and their test paths to the triggers above. 
live-matrix: runs-on: ubuntu-latest strategy: matrix: + harness: [pydantic_ai, langgraph] channel: [sync, async, temporal] fail-fast: false - name: pydantic-ai-${{ matrix.channel }} + name: ${{ matrix.harness }}-${{ matrix.channel }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -54,6 +56,6 @@ jobs: - name: Bootstrap run: ./scripts/bootstrap - - name: pydantic-ai ${{ matrix.channel }} integration tests (offline, TestModel) + - name: ${{ matrix.harness }} ${{ matrix.channel }} integration tests (offline) run: | - ./scripts/test tests/lib/core/harness/test_harness_pydantic_ai_${{ matrix.channel }}.py -v + ./scripts/test tests/lib/core/harness/test_harness_${{ matrix.harness }}_${{ matrix.channel }}.py -v diff --git a/examples/tutorials/00_sync/harness_langgraph/Dockerfile b/examples/tutorials/00_sync/harness_langgraph/Dockerfile new file mode 100644 index 000000000..9d492198f --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/Dockerfile @@ -0,0 +1,50 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +# Copy pyproject.toml and README.md to install dependencies +COPY 00_sync/harness_langgraph/pyproject.toml /app/harness_langgraph/pyproject.toml +COPY 00_sync/harness_langgraph/README.md /app/harness_langgraph/README.md + +WORKDIR /app/harness_langgraph + +# Copy the project code +COPY 00_sync/harness_langgraph/project /app/harness_langgraph/project + +# Copy the test files +COPY 00_sync/harness_langgraph/tests /app/harness_langgraph/tests + +# Copy shared test utilities +COPY test_utils 
/app/test_utils + +# Install the required Python packages with dev dependencies +RUN uv pip install --system .[dev] + +# Set environment variables +ENV PYTHONPATH=/app + +# Set test environment variables +ENV AGENT_NAME=s-harness-langgraph + +# Run the agent using uvicorn +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/00_sync/harness_langgraph/README.md b/examples/tutorials/00_sync/harness_langgraph/README.md new file mode 100644 index 000000000..86367f162 --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/README.md @@ -0,0 +1,55 @@ +# Tutorial: Sync Harness LangGraph Agent + +This tutorial demonstrates how to build a **synchronous** LangGraph agent on AgentEx +using the **unified harness surface**: + +```python +turn = LangGraphTurn(stream, model=None) +emitter = UnifiedEmitter(task_id=task_id, trace_id=task_id, ...) +async for event in emitter.yield_turn(turn): + yield event +``` + +Compare with ``030_langgraph``, which uses the bespoke +``convert_langgraph_to_agentex_events`` helper directly. + +## Key Concepts + +### Unified Harness + +`LangGraphTurn` implements the `HarnessTurn` protocol: it wraps the raw +LangGraph `astream()` generator and exposes `events` (an async generator of +`TaskMessageUpdate`) and `usage()` (token counts captured from the final +`AIMessage`). + +`UnifiedEmitter.yield_turn(turn)` iterates the turn's events and yields them +to the sync ACP handler unchanged. The same `LangGraphTurn` object can also be +passed to `UnifiedEmitter.auto_send_turn` in the async/temporal channels. + +### AGX1-377 Note + +LangGraph emits tool requests as `StreamTaskMessageFull` events (from "updates" +node outputs). The `SpanDeriver` does not open tool spans from Full events +today; that gap is tracked in AGX1-373. 
+ +## Files + +| File | Description | +|------|-------------| +| `project/acp.py` | ACP server using unified harness (LangGraphTurn + yield_turn) | +| `project/graph.py` | LangGraph state graph (identical to 030_langgraph) | +| `project/tools.py` | Tool definitions (weather example) | +| `tests/test_agent.py` | Integration tests | +| `manifest.yaml` | Agent configuration (name: s-harness-langgraph) | + +## Running Locally + +```bash +agentex agents run +``` + +## Running Tests + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/00_sync/harness_langgraph/manifest.yaml b/examples/tutorials/00_sync/harness_langgraph/manifest.yaml new file mode 100644 index 000000000..1f57678f2 --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../ + include_paths: + - 00_sync/harness_langgraph + - test_utils + dockerfile: 00_sync/harness_langgraph/Dockerfile + dockerignore: 00_sync/harness_langgraph/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: sync + name: s-harness-langgraph + description: A sync LangGraph agent using the unified harness surface (LangGraphTurn + UnifiedEmitter.yield_turn) + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "s-harness-langgraph" + description: "A sync LangGraph agent using the unified harness surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" 
+ memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/00_sync/harness_langgraph/project/__init__.py b/examples/tutorials/00_sync/harness_langgraph/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/00_sync/harness_langgraph/project/acp.py b/examples/tutorials/00_sync/harness_langgraph/project/acp.py new file mode 100644 index 000000000..f609f1682 --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/project/acp.py @@ -0,0 +1,107 @@ +"""ACP handler for sync harness LangGraph agent. + +Uses the unified harness surface: ``LangGraphTurn`` wraps the LangGraph +``astream()`` generator, and ``UnifiedEmitter.yield_turn`` converts it into +the AgentEx ``TaskMessageUpdate`` event stream expected by the sync ACP. + +Differences from ``030_langgraph`` (bespoke path): +- No ``create_langgraph_tracing_handler`` boilerplate. +- No manual text-delta accumulation for the span output. +- Tool calls are emitted as ``StreamTaskMessageFull`` (not Start+Delta+Done) + via the same code path as the async/temporal channels. +- Usage data (token counts) is captured on the ``LangGraphTurn`` object and + can be read after the turn completes. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` +events (from "updates"). The ``SpanDeriver`` does not open tool spans from +Full events today; that gap is tracked in AGX1-373. 
+""" + +from __future__ import annotations + +import os +from typing import AsyncGenerator + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from project.graph import create_graph +from agentex.lib.types.acp import SendMessageParams +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import TaskMessageUpdate +from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create(acp_type="sync") + +_graph = None + + +async def get_graph(): + """Get or create the compiled graph instance.""" + global _graph + if _graph is None: + _graph = await create_graph() + return _graph + + +@acp.on_message_send +async def handle_message_send( + params: SendMessageParams, +) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]: + """Handle incoming messages, streaming tokens and tool calls via unified harness.""" + graph = await get_graph() + + task_id = params.task.id + user_message = params.content.content + + logger.info(f"Processing message for task {task_id}") + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + stream = graph.astream( + {"messages": [{"role": 
"user", "content": user_message}]}, + config={"configurable": {"thread_id": task_id}}, + stream_mode=["messages", "updates"], + ) + + turn = LangGraphTurn(stream, model=None) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + + final_text = "" + async for event in emitter.yield_turn(turn): + # Accumulate text deltas so the span's final_output is the assistant + # text (matching the async tutorial), not the usage metrics. + delta = getattr(event, "delta", None) + if isinstance(delta, TextDelta) and delta.text_delta: + final_text += delta.text_delta + yield event + + if turn_span: + turn_span.output = {"final_output": final_text, "usage": turn.usage().model_dump()} diff --git a/examples/tutorials/00_sync/harness_langgraph/project/graph.py b/examples/tutorials/00_sync/harness_langgraph/project/graph.py new file mode 100644 index 000000000..4516087d2 --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/project/graph.py @@ -0,0 +1,67 @@ +"""LangGraph graph definition for the harness_langgraph sync agent. + +Identical to ``030_langgraph/project/graph.py`` — the graph definition is not +affected by the harness migration. Only ``acp.py`` changes. +""" + +from __future__ import annotations + +from typing import Any, Annotated +from datetime import datetime +from typing_extensions import TypedDict + +from langgraph.graph import START, StateGraph +from langchain_openai import ChatOpenAI +from langgraph.prebuilt import ToolNode, tools_condition +from langchain_core.messages import SystemMessage +from langgraph.graph.message import add_messages + +from project.tools import TOOLS +from agentex.lib.adk import create_checkpointer + +MODEL_NAME = "gpt-5" +SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. 
+ +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use tools when they would help answer the user's question +- If you're unsure, ask clarifying questions +- Always provide accurate information +""" + + +class AgentState(TypedDict): + """State schema for the agent graph.""" + + messages: Annotated[list[Any], add_messages] + + +async def create_graph(): + """Create and compile the agent graph with checkpointer.""" + llm = ChatOpenAI( + model=MODEL_NAME, + reasoning={"effort": "high", "summary": "auto"}, + ) + llm_with_tools = llm.bind_tools(TOOLS) + + checkpointer = await create_checkpointer() + + def agent_node(state: AgentState) -> dict[str, Any]: + """Process the current state and generate a response.""" + messages = state["messages"] + if not messages or not isinstance(messages[0], SystemMessage): + system_content = SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + messages = [SystemMessage(content=system_content)] + messages + response = llm_with_tools.invoke(messages) + return {"messages": [response]} + + builder = StateGraph(AgentState) + builder.add_node("agent", agent_node) + builder.add_node("tools", ToolNode(tools=TOOLS)) + builder.add_edge(START, "agent") + builder.add_conditional_edges("agent", tools_condition, "tools") + builder.add_edge("tools", "agent") + + return builder.compile(checkpointer=checkpointer) diff --git a/examples/tutorials/00_sync/harness_langgraph/project/tools.py b/examples/tutorials/00_sync/harness_langgraph/project/tools.py new file mode 100644 index 000000000..f02587430 --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/project/tools.py @@ -0,0 +1,24 @@ +"""Tool definitions for the harness_langgraph sync agent.""" + +from langchain_core.tools import Tool + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. + + Returns: + A string describing the weather conditions. 
+ """ + return f"The weather in {city} is sunny and 72°F" + + +weather_tool = Tool( + name="get_weather", + func=get_weather, + description="Get the current weather for a city. Input should be a city name.", +) + +TOOLS = [weather_tool] diff --git a/examples/tutorials/00_sync/harness_langgraph/pyproject.toml b/examples/tutorials/00_sync/harness_langgraph/pyproject.toml new file mode 100644 index 000000000..deecd08b3 --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/pyproject.toml @@ -0,0 +1,37 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "s-harness-langgraph" +version = "0.1.0" +description = "A sync LangGraph agent using the unified harness surface" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "langgraph", + "langchain-openai", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git a/examples/tutorials/00_sync/harness_langgraph/tests/test_agent.py b/examples/tutorials/00_sync/harness_langgraph/tests/test_agent.py new file mode 100644 index 000000000..2eb561cec --- /dev/null +++ b/examples/tutorials/00_sync/harness_langgraph/tests/test_agent.py @@ -0,0 +1,144 @@ +""" +Tests for the sync harness LangGraph agent. + +Validates the unified harness surface (LangGraphTurn + UnifiedEmitter.yield_turn) +end-to-end against a live AgentEx server. 
+ +Configuration: +- AGENTEX_API_BASE_URL: Base URL for the AgentEx server (default: http://localhost:5003) +- AGENT_NAME: Name of the agent to test (default: s-harness-langgraph) +""" + +import os + +import pytest +from test_utils.sync import validate_text_in_string, collect_streaming_response + +from agentex import Agentex +from agentex.types import TextContent, TextContentParam +from agentex.types.agent_rpc_params import ParamsCreateTaskRequest, ParamsSendMessageRequest +from agentex.lib.sdk.fastacp.base.base_acp_server import uuid + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "s-harness-langgraph") + + +@pytest.fixture +def client(): + return Agentex(base_url=AGENTEX_API_BASE_URL) + + +@pytest.fixture +def agent_name(): + return AGENT_NAME + + +@pytest.fixture +def agent_id(client, agent_name): + agents = client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent with name {agent_name} not found.") + + +class TestNonStreamingMessages: + def test_send_simple_message(self, client: Agentex, agent_name: str): + response = client.agents.send_message( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="Hello! 
What can you help me with?", + type="text", + ) + ), + ) + result = response.result + assert result is not None + assert len(result) >= 1 + + def test_tool_calling(self, client: Agentex, agent_name: str): + response = client.agents.send_message( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="What's the weather in San Francisco?", + type="text", + ) + ), + ) + result = response.result + assert result is not None + assert len(result) >= 1 + + def test_multiturn_conversation(self, client: Agentex, agent_name: str, agent_id: str): + task_response = client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + response1 = client.agents.send_message( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="My name is Alice. Remember that.", + type="text", + ), + task_id=task.id, + ), + ) + assert response1.result is not None + + response2 = client.agents.send_message( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="What is my name?", + type="text", + ), + task_id=task.id, + ), + ) + assert response2.result is not None + for message in response2.result: + if isinstance(message.content, TextContent): + validate_text_in_string("alice", message.content.content.lower()) + + +class TestStreamingMessages: + def test_stream_simple_message(self, client: Agentex, agent_name: str): + stream = client.agents.send_message_stream( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="Tell me a short joke.", + type="text", + ) + ), + ) + aggregated_content, chunks = collect_streaming_response(stream) + assert aggregated_content is not None + assert len(chunks) > 1, "No chunks received in streaming response." 
+ + def test_stream_tool_calling(self, client: Agentex, agent_name: str): + stream = client.agents.send_message_stream( + agent_name=agent_name, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="What's the weather in New York?", + type="text", + ) + ), + ) + aggregated_content, chunks = collect_streaming_response(stream) + assert aggregated_content is not None + assert len(chunks) > 0, "No chunks received in streaming response." + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/Dockerfile b/examples/tutorials/10_async/00_base/harness_langgraph/Dockerfile new file mode 100644 index 000000000..3e0bd696a --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/Dockerfile @@ -0,0 +1,50 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +# Copy pyproject.toml and README.md to install dependencies +COPY 10_async/00_base/harness_langgraph/pyproject.toml /app/harness_langgraph/pyproject.toml +COPY 10_async/00_base/harness_langgraph/README.md /app/harness_langgraph/README.md + +WORKDIR /app/harness_langgraph + +# Copy the project code +COPY 10_async/00_base/harness_langgraph/project /app/harness_langgraph/project + +# Copy the test files +COPY 10_async/00_base/harness_langgraph/tests /app/harness_langgraph/tests + +# Copy shared test utilities +COPY test_utils /app/test_utils + +# Install the required Python packages with dev dependencies +RUN uv pip install --system .[dev] pytest-asyncio httpx + +# Set environment 
variables +ENV PYTHONPATH=/app + +# Set test environment variables +ENV AGENT_NAME=a-harness-langgraph + +# Run the agent using uvicorn +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/README.md b/examples/tutorials/10_async/00_base/harness_langgraph/README.md new file mode 100644 index 000000000..7efe28207 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/README.md @@ -0,0 +1,57 @@ +# Tutorial: Async Harness LangGraph Agent + +This tutorial demonstrates how to build an **async** LangGraph agent on AgentEx +using the **unified harness surface**: + +```python +turn = LangGraphTurn(stream, model=None) +emitter = UnifiedEmitter(task_id=task_id, trace_id=task_id, ...) +result = await emitter.auto_send_turn(turn) +``` + +Compare with ``100_langgraph``, which uses the bespoke +``stream_langgraph_events`` helper directly. + +## Key Concepts + +### Unified Harness + +`LangGraphTurn` implements the `HarnessTurn` protocol: it wraps the raw +LangGraph `astream()` generator and exposes `events` (an async generator of +`TaskMessageUpdate`) and `usage()` (token counts captured from the final +`AIMessage`). + +`UnifiedEmitter.auto_send_turn(turn)` pushes each event to Redis via +`streaming_task_message_context`, accumulates the final text, and returns a +`TurnResult(final_text=..., usage=...)`. + +The same `LangGraphTurn` object can also be passed to +`UnifiedEmitter.yield_turn` in the sync channel. + +### AGX1-377 Note + +LangGraph emits tool requests as `StreamTaskMessageFull` events (from "updates" +node outputs). The `SpanDeriver` does not open tool spans from Full events +today; that gap is tracked in AGX1-373. 
+ +## Files + +| File | Description | +|------|-------------| +| `project/acp.py` | ACP server using unified harness (LangGraphTurn + auto_send_turn) | +| `project/graph.py` | LangGraph state graph (identical to 100_langgraph) | +| `project/tools.py` | Tool definitions (weather example) | +| `tests/test_agent.py` | Integration tests | +| `manifest.yaml` | Agent configuration (name: a-harness-langgraph) | + +## Running Locally + +```bash +agentex agents run +``` + +## Running Tests + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/manifest.yaml b/examples/tutorials/10_async/00_base/harness_langgraph/manifest.yaml new file mode 100644 index 000000000..bb19e25b3 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/00_base/harness_langgraph + - test_utils + dockerfile: 10_async/00_base/harness_langgraph/Dockerfile + dockerignore: 10_async/00_base/harness_langgraph/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: async + name: a-harness-langgraph + description: An async LangGraph agent using the unified harness surface (LangGraphTurn + UnifiedEmitter.auto_send_turn) + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "a-harness-langgraph" + description: "An async LangGraph agent using the unified 
harness surface" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/project/__init__.py b/examples/tutorials/10_async/00_base/harness_langgraph/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/project/acp.py b/examples/tutorials/10_async/00_base/harness_langgraph/project/acp.py new file mode 100644 index 000000000..a99395424 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/project/acp.py @@ -0,0 +1,109 @@ +"""ACP handler for async harness LangGraph agent. + +Uses the unified harness surface: ``LangGraphTurn`` wraps the LangGraph +``astream()`` generator, and ``UnifiedEmitter.auto_send_turn`` streams events +to Redis and returns a ``TurnResult`` with the accumulated final text. + +Differences from ``100_langgraph`` (bespoke path): +- No ``create_langgraph_tracing_handler`` boilerplate. +- ``stream_langgraph_events`` is replaced by + ``UnifiedEmitter.auto_send_turn(LangGraphTurn(stream))``. +- Tool calls/responses go through ``streaming_task_message_context`` + (same code path as text deltas), making the event stream channel-agnostic. +- Usage data (token counts) is captured on ``LangGraphTurn.usage()`` after + ``auto_send_turn`` returns. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` +events (from "updates"). The ``SpanDeriver`` does not open tool spans from +Full events today; that gap is tracked in AGX1-373. 
+""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from project.graph import create_graph +from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams +from agentex.lib.types.fastacp import AsyncACPConfig +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create( + acp_type="async", + config=AsyncACPConfig(type="base"), +) + +_graph = None + + +async def get_graph(): + global _graph + if _graph is None: + _graph = await create_graph() + return _graph + + +@acp.on_task_event_send +async def handle_task_event_send(params: SendEventParams): + """Handle incoming events, streaming tokens and tool calls via unified harness.""" + graph = await get_graph() + task_id = params.task.id + user_message = params.event.content.content + + logger.info(f"Processing message for thread {task_id}") + + await adk.messages.create(task_id=task_id, content=params.event.content) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + stream = graph.astream( + {"messages": [{"role": "user", "content": user_message}]}, + config={"configurable": {"thread_id": task_id}}, + stream_mode=["messages", "updates"], + ) + + turn = 
LangGraphTurn(stream, model=None) + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + + result = await emitter.auto_send_turn(turn) + + if turn_span: + turn_span.output = {"final_output": result.final_text} + + +@acp.on_task_create +async def handle_task_create(params: CreateTaskParams): + logger.info(f"Task created: {params.task.id}") + + +@acp.on_task_cancel +async def handle_task_canceled(params: CancelTaskParams): + logger.info(f"Task canceled: {params.task.id}") diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/project/graph.py b/examples/tutorials/10_async/00_base/harness_langgraph/project/graph.py new file mode 100644 index 000000000..4aeac3b3c --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/project/graph.py @@ -0,0 +1,67 @@ +"""LangGraph graph definition for the harness_langgraph async agent. + +Identical to ``100_langgraph/project/graph.py`` — the graph definition is not +affected by the harness migration. Only ``acp.py`` changes. +""" + +from __future__ import annotations + +from typing import Any, Annotated +from datetime import datetime +from typing_extensions import TypedDict + +from langgraph.graph import START, StateGraph +from langchain_openai import ChatOpenAI +from langgraph.prebuilt import ToolNode, tools_condition +from langchain_core.messages import SystemMessage +from langgraph.graph.message import add_messages + +from project.tools import TOOLS +from agentex.lib.adk import create_checkpointer + +MODEL_NAME = "gpt-5" +SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. 
+ +Current date and time: {timestamp} + +Guidelines: +- Be concise and helpful +- Use tools when they would help answer the user's question +- If you're unsure, ask clarifying questions +- Always provide accurate information +""" + + +class AgentState(TypedDict): + """State schema for the agent graph.""" + + messages: Annotated[list[Any], add_messages] + + +async def create_graph(): + """Create and compile the agent graph with checkpointer.""" + llm = ChatOpenAI( + model=MODEL_NAME, + reasoning={"effort": "high", "summary": "auto"}, + ) + llm_with_tools = llm.bind_tools(TOOLS) + + checkpointer = await create_checkpointer() + + def agent_node(state: AgentState) -> dict[str, Any]: + """Process the current state and generate a response.""" + messages = state["messages"] + if not messages or not isinstance(messages[0], SystemMessage): + system_content = SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + messages = [SystemMessage(content=system_content)] + messages + response = llm_with_tools.invoke(messages) + return {"messages": [response]} + + builder = StateGraph(AgentState) + builder.add_node("agent", agent_node) + builder.add_node("tools", ToolNode(tools=TOOLS)) + builder.add_edge(START, "agent") + builder.add_conditional_edges("agent", tools_condition, "tools") + builder.add_edge("tools", "agent") + + return builder.compile(checkpointer=checkpointer) diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/project/tools.py b/examples/tutorials/10_async/00_base/harness_langgraph/project/tools.py new file mode 100644 index 000000000..6e7614300 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/project/tools.py @@ -0,0 +1,24 @@ +"""Tool definitions for the harness_langgraph async agent.""" + +from langchain_core.tools import Tool + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. 
+ + Returns: + A string describing the weather conditions. + """ + return f"The weather in {city} is sunny and 72°F" + + +weather_tool = Tool( + name="get_weather", + func=get_weather, + description="Get the current weather for a city. Input should be a city name.", +) + +TOOLS = [weather_tool] diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/pyproject.toml b/examples/tutorials/10_async/00_base/harness_langgraph/pyproject.toml new file mode 100644 index 000000000..69856e6db --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/pyproject.toml @@ -0,0 +1,37 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "a-harness-langgraph" +version = "0.1.0" +description = "An async LangGraph agent using the unified harness surface" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "langgraph", + "langchain-openai", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git a/examples/tutorials/10_async/00_base/harness_langgraph/tests/test_agent.py b/examples/tutorials/10_async/00_base/harness_langgraph/tests/test_agent.py new file mode 100644 index 000000000..762b2b90c --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_langgraph/tests/test_agent.py @@ -0,0 +1,100 @@ +""" +Tests for the async harness LangGraph agent. + +Validates the unified harness surface (LangGraphTurn + UnifiedEmitter.auto_send_turn) +end-to-end against a live AgentEx server. 
+ +Configuration: +- AGENTEX_API_BASE_URL: Base URL for the AgentEx server (default: http://localhost:5003) +- AGENT_NAME: Name of the agent to test (default: a-harness-langgraph) +""" + +import os + +import pytest +import pytest_asyncio + +from agentex import AsyncAgentex +from agentex.types import TextContentParam +from agentex.types.agent_rpc_params import ParamsCreateTaskRequest +from agentex.lib.sdk.fastacp.base.base_acp_server import uuid + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "a-harness-langgraph") + + +@pytest_asyncio.fixture +async def client(): + client = AsyncAgentex(base_url=AGENTEX_API_BASE_URL) + yield client + await client.close() + + +@pytest.fixture +def agent_name(): + return AGENT_NAME + + +@pytest_asyncio.fixture +async def agent_id(client, agent_name): + agents = await client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent with name {agent_name} not found.") + + +class TestNonStreamingEvents: + @pytest.mark.asyncio + async def test_send_event(self, client: AsyncAgentex, agent_id: str): + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + event_content = TextContentParam( + type="text", + author="user", + content="Hello! 
What can you help me with?", + ) + await client.agents.send_event( + agent_id=agent_id, + params={"task_id": task.id, "content": event_content}, + ) + + @pytest.mark.asyncio + async def test_tool_calling(self, client: AsyncAgentex, agent_id: str): + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + event_content = TextContentParam( + type="text", + author="user", + content="What's the weather in San Francisco?", + ) + await client.agents.send_event( + agent_id=agent_id, + params={"task_id": task.id, "content": event_content}, + ) + + +class TestStreamingEvents: + @pytest.mark.asyncio + async def test_send_event_and_stream(self, client: AsyncAgentex, agent_id: str): + task_response = await client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + event_content = TextContentParam( + type="text", + author="user", + content="Tell me a short joke.", + ) + await client.agents.send_event( + agent_id=agent_id, + params={"task_id": task.id, "content": event_content}, + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/Dockerfile b/examples/tutorials/10_async/10_temporal/harness_langgraph/Dockerfile new file mode 100644 index 000000000..f6c9fb59b --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/Dockerfile @@ -0,0 +1,43 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip 
setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/10_temporal/harness_langgraph/pyproject.toml /app/harness_langgraph/pyproject.toml +COPY 10_async/10_temporal/harness_langgraph/README.md /app/harness_langgraph/README.md + +WORKDIR /app/harness_langgraph + +COPY 10_async/10_temporal/harness_langgraph/project /app/harness_langgraph/project +COPY 10_async/10_temporal/harness_langgraph/tests /app/harness_langgraph/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app + +ENV AGENT_NAME=at-harness-langgraph + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] + +# When we deploy the worker, we will replace the CMD with the following +# CMD ["python", "-m", "run_worker"] diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/README.md b/examples/tutorials/10_async/10_temporal/harness_langgraph/README.md new file mode 100644 index 000000000..4df6969f1 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/README.md @@ -0,0 +1,53 @@ +# Tutorial: Temporal Harness LangGraph Agent + +This tutorial demonstrates how to build a **Temporal-backed** LangGraph agent on +AgentEx, following the ``130_langgraph`` pattern. The agent's LLM node runs as a +durable Temporal activity; the tools node runs inline in the workflow. + +This agent is named ``at-harness-langgraph`` to distinguish it from +``at130-langgraph`` (the bespoke reference). The graph and workflow structure are +identical; only the agent name changes. 
+ +## Key Concepts + +### Temporal + LangGraph + +The ``LangGraphPlugin`` from ``temporalio.contrib.langgraph`` turns annotated graph +nodes into Temporal activities or inline workflow callables: + +- `agent` node: `execute_in="activity"` (durable, retryable LLM call) +- `tools` node: `execute_in="workflow"` (inline, fast tool execution) + +### Message surfacing + +After each turn, ``emit_langgraph_messages`` converts the new LangGraph messages +(tool requests, tool responses, final text) into AgentEx ``TaskMessage`` objects +and posts them to the task's message stream. + +This is the Temporal-specific path. The non-Temporal async/sync channels use +``UnifiedEmitter.auto_send_turn`` / ``UnifiedEmitter.yield_turn`` with +``LangGraphTurn`` instead. + +## Files + +| File | Description | +|------|-------------| +| `project/acp.py` | ACP server (Temporal config, LangGraphPlugin) | +| `project/graph.py` | LangGraph graph (agent + tools nodes) | +| `project/workflow.py` | Temporal workflow (signal handlers, emit_langgraph_messages) | +| `project/run_worker.py` | Temporal worker runner | +| `project/tools.py` | Tool definitions (weather example) | +| `tests/test_agent.py` | Integration tests | +| `manifest.yaml` | Agent configuration (name: at-harness-langgraph) | + +## Running Locally + +```bash +agentex agents run +``` + +## Running Tests + +```bash +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/manifest.yaml b/examples/tutorials/10_async/10_temporal/harness_langgraph/manifest.yaml new file mode 100644 index 000000000..596d38eb4 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/manifest.yaml @@ -0,0 +1,51 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/10_temporal/harness_langgraph + - test_utils + dockerfile: 10_async/10_temporal/harness_langgraph/Dockerfile + dockerignore: 10_async/10_temporal/harness_langgraph/.dockerignore + +local_development: + agent: + 
port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + worker: project/run_worker.py + +agent: + acp_type: async + name: at-harness-langgraph + description: "A Temporal-backed LangGraph agent (harness variant) whose nodes run as Temporal activities" + + temporal: + enabled: true + workflows: + - name: at-harness-langgraph + queue_name: at_harness_langgraph_queue + + credentials: + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + + env: {} + +deployment: + image: + repository: "" + tag: "latest" + + imagePullSecrets: [] + + global: + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/__init__.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/acp.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/acp.py new file mode 100644 index 000000000..7af9c5e68 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/acp.py @@ -0,0 +1,34 @@ +"""ACP server for the Temporal harness LangGraph agent. + +Follows the ``130_langgraph`` pattern: the Temporal ``LangGraphPlugin`` runs +graph nodes as Temporal activities. The agent logic lives in ``workflow.py`` +(the runtime) and ``graph.py`` (the LangGraph graph), executed by the Temporal +worker (``run_worker.py``), not by this HTTP process. + +The workflow uses ``emit_langgraph_messages`` to surface turn messages to +AgentEx. That helper is Temporal-specific and is not replaced by the unified +harness here (``UnifiedEmitter`` targets the non-Temporal async/sync channels). 
+""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from temporalio.contrib.langgraph import LangGraphPlugin + +from project.graph import GRAPH_NAME, build_graph +from agentex.lib.types.fastacp import TemporalACPConfig +from agentex.lib.sdk.fastacp.fastacp import FastACP + +acp = FastACP.create( + acp_type="async", + config=TemporalACPConfig( + type="temporal", + temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), + plugins=[LangGraphPlugin(graphs={GRAPH_NAME: build_graph()})], + ), +) diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/graph.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/graph.py new file mode 100644 index 000000000..ce9c2b520 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/graph.py @@ -0,0 +1,85 @@ +"""LangGraph graph for at-harness-langgraph — nodes run as Temporal activities. + +Identical in structure to ``130_langgraph/project/graph.py``. The graph +definition is not affected by the harness migration; only the agent naming +changes. The LLM ``agent`` node runs as a durable Temporal activity; +the ``tools`` node runs inline in the workflow. +""" + +from __future__ import annotations + +import os +from typing import Any, Annotated +from datetime import datetime, timedelta + +_litellm_key = os.environ.get("LITELLM_API_KEY") +if _litellm_key: + os.environ.setdefault("OPENAI_API_KEY", _litellm_key) + +from typing_extensions import TypedDict + +from langgraph.graph import END, START, StateGraph +from langchain_openai import ChatOpenAI +from langchain_core.messages import ToolMessage, SystemMessage +from langgraph.graph.message import add_messages + +from project.tools import TOOLS + +_TOOLS_BY_NAME = {tool.name: tool for tool in TOOLS} + +GRAPH_NAME = "at-harness-langgraph" +MODEL_NAME = "gpt-4o" +SYSTEM_PROMPT = """You are a helpful AI assistant with access to tools. 
+ +Current date and time: {timestamp} + +Be concise and use tools when they help answer the question.""" + + +class AgentState(TypedDict): + messages: Annotated[list[Any], add_messages] + + +async def agent_node(state: AgentState) -> dict[str, Any]: + """The 'agent' node — one LLM call. Runs as a durable Temporal activity.""" + llm = ChatOpenAI(model=MODEL_NAME).bind_tools(TOOLS) + messages = state["messages"] + if not messages or not isinstance(messages[0], SystemMessage): + system = SystemMessage(content=SYSTEM_PROMPT.format(timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) + messages = [system, *messages] + return {"messages": [await llm.ainvoke(messages)]} + + +async def tools_node(state: AgentState) -> dict[str, Any]: + """Run the tool calls the model requested. Runs inline in the workflow.""" + last = state["messages"][-1] + results: list[Any] = [] + for call in getattr(last, "tool_calls", None) or []: + tool = _TOOLS_BY_NAME.get(call["name"]) + if tool is None: + output = f"Error: unknown tool {call['name']!r}. 
Available: {list(_TOOLS_BY_NAME)}" + else: + output = await tool.ainvoke(call["args"]) + results.append(ToolMessage(content=str(output), tool_call_id=call["id"], name=call["name"])) + return {"messages": results} + + +async def route_after_agent(state: AgentState) -> str: + """Go to the tools node if the model requested tools, else finish.""" + last = state["messages"][-1] + return "tools" if getattr(last, "tool_calls", None) else END + + +def build_graph() -> StateGraph: + """Build the agent graph; the LLM node runs as an activity, tools in the workflow.""" + builder = StateGraph(AgentState) + builder.add_node( + "agent", + agent_node, + metadata={"execute_in": "activity", "start_to_close_timeout": timedelta(minutes=5)}, + ) + builder.add_node("tools", tools_node, metadata={"execute_in": "workflow"}) + builder.add_edge(START, "agent") + builder.add_conditional_edges("agent", route_after_agent, {"tools": "tools", END: END}) + builder.add_edge("tools", "agent") + return builder diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/run_worker.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/run_worker.py new file mode 100644 index 000000000..ca64464fc --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/run_worker.py @@ -0,0 +1,46 @@ +"""Temporal worker for at-harness-langgraph. + +Run as a separate long-lived process alongside the ACP HTTP server. The +worker polls Temporal for workflow + activity tasks and executes them. + +The ``LangGraphPlugin`` is given the graph registry (``{ GRAPH_NAME: graph }``). +At runtime it turns the graph's ``execute_in="activity"`` nodes into Temporal +activities and registers them on the worker automatically. 
+""" + +import asyncio + +from temporalio.contrib.langgraph import LangGraphPlugin + +from project.graph import GRAPH_NAME, build_graph +from project.workflow import AtHarnessLanggraphWorkflow +from agentex.lib.utils.debug import setup_debug_if_enabled +from agentex.lib.utils.logging import make_logger +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.activities import get_all_activities +from agentex.lib.core.temporal.workers.worker import AgentexWorker + +environment_variables = EnvironmentVariables.refresh() +logger = make_logger(__name__) + + +async def main(): + setup_debug_if_enabled() + + task_queue_name = environment_variables.WORKFLOW_TASK_QUEUE + if task_queue_name is None: + raise ValueError("WORKFLOW_TASK_QUEUE is not set") + + worker = AgentexWorker( + task_queue=task_queue_name, + plugins=[LangGraphPlugin(graphs={GRAPH_NAME: build_graph()})], + ) + + await worker.run( + activities=get_all_activities(), + workflow=AtHarnessLanggraphWorkflow, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/tools.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/tools.py new file mode 100644 index 000000000..10943c9d2 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/tools.py @@ -0,0 +1,37 @@ +"""Tool definitions for the harness_langgraph temporal agent.""" + +from langchain_core.tools import Tool + + +def get_weather(city: str) -> str: + """Get the current weather for a city. + + Args: + city: The name of the city to get weather for. + + Returns: + A string describing the weather conditions. + """ + return f"The weather in {city} is sunny and 72°F" + + +async def aget_weather(city: str) -> str: + """Native async tool entrypoint. + + ``tools_node`` runs inline in the Temporal workflow and invokes tools via + ``tool.ainvoke``. 
A sync-only tool forces LangChain to bridge through + ``run_in_executor`` (a thread pool), which the deterministic Temporal + workflow event loop forbids (``NotImplementedError``). Providing a real + coroutine keeps tool execution on the workflow loop. + """ + return get_weather(city) + + +weather_tool = Tool( + name="get_weather", + func=get_weather, + coroutine=aget_weather, + description="Get the current weather for a city. Input should be a city name.", +) + +TOOLS = [weather_tool] diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/project/workflow.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/workflow.py new file mode 100644 index 000000000..4125dca39 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/project/workflow.py @@ -0,0 +1,80 @@ +"""Temporal workflow for at-harness-langgraph. + +Each turn the workflow runs the LangGraph graph (``project/graph.py``) via the +``temporalio.contrib.langgraph`` plugin. The plugin runs the LLM ``agent`` node +as a durable Temporal activity and the ``tools`` node inline in the workflow. + +Multi-turn memory is kept on the workflow instance (``self._messages``) — it's +durable and replay-safe for free, so no checkpoint database is needed. 
+""" + +from __future__ import annotations + +import json +from typing import Any + +from temporalio import workflow +from temporalio.contrib.langgraph import graph as lg_graph + +from agentex.lib import adk +from project.graph import GRAPH_NAME +from agentex.lib.adk import emit_langgraph_messages +from agentex.protocol.acp import SendEventParams, CreateTaskParams +from agentex.lib.utils.logging import make_logger +from agentex.types.text_content import TextContent +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.types.workflow import SignalName +from agentex.lib.core.temporal.workflows.workflow import BaseWorkflow + +environment_variables = EnvironmentVariables.refresh() + +if environment_variables.WORKFLOW_NAME is None: + raise ValueError("Environment variable WORKFLOW_NAME is not set") +if environment_variables.AGENT_NAME is None: + raise ValueError("Environment variable AGENT_NAME is not set") + +logger = make_logger(__name__) + + +@workflow.defn(name=environment_variables.WORKFLOW_NAME) +class AtHarnessLanggraphWorkflow(BaseWorkflow): + """Runs the LangGraph agent each turn; its nodes run as Temporal activities.""" + + def __init__(self) -> None: + super().__init__(display_name=environment_variables.AGENT_NAME) + self._complete_task = False + self._messages: list[Any] = [] + self._emitted = 0 + + @workflow.signal(name=SignalName.RECEIVE_EVENT) + async def on_task_event_send(self, params: SendEventParams) -> None: + """Echo the user's message, run the graph, surface the new messages.""" + await adk.messages.create(task_id=params.task.id, content=params.event.content) + self._messages.append({"role": "user", "content": params.event.content.content}) + + compiled = lg_graph(GRAPH_NAME).compile() + result = await compiled.ainvoke({"messages": self._messages}) + self._messages = result["messages"] + + await emit_langgraph_messages(self._messages[self._emitted :], params.task.id) + self._emitted = 
len(self._messages) + + @workflow.signal + async def complete_task_signal(self) -> None: + self._complete_task = True + + @workflow.run + async def on_task_create(self, params: CreateTaskParams) -> str: + await adk.messages.create( + task_id=params.task.id, + content=TextContent( + author="agent", + content=( + f"Task initialized with params:\n{json.dumps(params.params, indent=2)}\n\n" + "Send me a message and I'll respond using a LangGraph agent whose nodes " + "run as durable Temporal activities." + ), + ), + ) + await workflow.wait_condition(lambda: self._complete_task, timeout=None) + return "Task completed" diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/pyproject.toml b/examples/tutorials/10_async/10_temporal/harness_langgraph/pyproject.toml new file mode 100644 index 000000000..897f54dd6 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_langgraph/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "at-harness-langgraph" +version = "0.1.0" +description = "A Temporal-backed LangGraph agent (harness variant) whose nodes run as Temporal activities" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "temporalio[langgraph]>=1.27.0", + "langchain-openai", + "langchain-core", + "grandalf", + "python-dotenv", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", + "debugpy>=1.8.15", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 diff --git a/examples/tutorials/10_async/10_temporal/harness_langgraph/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/harness_langgraph/tests/test_agent.py new file mode 100644 index 000000000..05d9ffa01 --- /dev/null +++ 
b/examples/tutorials/10_async/10_temporal/harness_langgraph/tests/test_agent.py @@ -0,0 +1,106 @@ +"""Integration tests for the Temporal harness LangGraph agent (live agent required). + +These drive a *running* agent over the AgentEx API and verify that: +- the agent sends a welcome message on task creation, +- a weather question triggers a tool_request / tool_response round-trip + (proving the LLM node ran as a Temporal activity and the tool node ran), +- the final answer reflects the tool output. + +To run: +1. Start the agent (worker + ACP server): ``agentex agents run --manifest manifest.yaml`` +2. Set AGENTEX_API_BASE_URL if not using the default +3. ``pytest tests/test_agent.py -v`` +""" + +import os +import uuid + +import pytest +import pytest_asyncio +from test_utils.async_utils import ( + poll_messages, + send_event_and_poll_yielding, +) + +from agentex import AsyncAgentex +from agentex.types.task_message import TaskMessage +from agentex.types.agent_rpc_params import ParamsCreateTaskRequest + +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "at-harness-langgraph") + + +@pytest_asyncio.fixture +async def client(): + client = AsyncAgentex(base_url=AGENTEX_API_BASE_URL) + yield client + await client.close() + + +@pytest.fixture +def agent_name(): + return AGENT_NAME + + +@pytest_asyncio.fixture +async def agent_id(client, agent_name): + agents = await client.agents.list() + for agent in agents: + if agent.name == agent_name: + return agent.id + raise ValueError(f"Agent with name {agent_name} not found.") + + +class TestNonStreamingEvents: + """The Temporal-backed LangGraph agent responds and uses tools.""" + + @pytest.mark.asyncio + async def test_send_event_and_poll(self, client: AsyncAgentex, agent_id: str): + """Create a task, ask about weather, verify the tool round-trip.""" + task_response = await client.agents.create_task(agent_id, 
params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)) + task = task_response.result + assert task is not None + + task_creation_found = False + async for message in poll_messages(client=client, task_id=task.id, timeout=30, sleep_interval=1.0): + assert isinstance(message, TaskMessage) + if message.content and message.content.type == "text" and message.content.author == "agent": + task_creation_found = True + break + assert task_creation_found, "Task creation welcome message not found" + + seen_tool_request = False + seen_tool_response = False + final_message = None + async for message in send_event_and_poll_yielding( + client=client, + agent_id=agent_id, + task_id=task.id, + user_message="What is the weather in San Francisco? Use your tool.", + timeout=60, + sleep_interval=1.0, + ): + assert isinstance(message, TaskMessage) + + if message.content and message.content.type == "tool_request": + seen_tool_request = True + if message.content and message.content.type == "tool_response": + seen_tool_response = True + + if message.content and message.content.type == "text" and message.content.author == "agent": + final_message = message + content_length = len(getattr(message.content, "content", "") or "") + if getattr(message, "streaming_status", None) in (None, "DONE") and content_length > 0: + if seen_tool_response: + break + + assert seen_tool_request, "Expected a tool_request (agent calling get_weather)" + assert seen_tool_response, "Expected a tool_response (get_weather result)" + assert final_message is not None, "Expected a final agent text message" + final_text = getattr(final_message.content, "content", None) if final_message.content else None + assert isinstance(final_text, str) and len(final_text) > 0 + assert "72" in final_text, "Expected weather response to mention 72°F" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/src/agentex/lib/adk/_modules/_langgraph_async.py b/src/agentex/lib/adk/_modules/_langgraph_async.py index 
3e61c42f9..02ef059eb 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_async.py +++ b/src/agentex/lib/adk/_modules/_langgraph_async.py @@ -3,8 +3,21 @@ Converts LangGraph graph.astream() events into Agentex streaming updates and pushes them to Redis via adk.streaming contexts. For use with async ACP agents that stream via Redis rather than HTTP yields. + +Unified surface +--------------- +This module is now implemented on top of ``LangGraphTurn`` and +``UnifiedEmitter.auto_send_turn``, the same surface used by every other +harness adapter (pydantic-ai, openai-agents, etc.). The public signature +and return type are preserved identically. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` events +(from "updates" events), NOT Start+Delta+Done like pydantic-ai. ``auto_send`` +handles Full events correctly; no coalescing wrapper is needed. """ +from agentex.lib.utils.temporal import workflow_now_if_in_workflow + async def stream_langgraph_events(stream, task_id: str) -> str: """Stream LangGraph events to Agentex via Redis. @@ -18,6 +31,19 @@ async def stream_langgraph_events(stream, task_id: str) -> str: models like gpt-5/o1/o3 (chunk.content is a list of typed content blocks in the Responses API responses/v1 format). + Reimplemented on ``UnifiedEmitter.auto_send_turn(LangGraphTurn(...))`` for + cross-harness consistency. Behavior is identical to the previous bespoke + implementation (verified by characterization tests in test_langgraph_async.py). + + AGX1-377 note: LangGraph emits tool requests as ``Full`` events (from "updates"), + NOT Start+Delta+Done like pydantic-ai. ``auto_send`` handles Full events + correctly; no coalescing wrapper is needed. + + AGX1-378 note: ``created_at`` is set from ``workflow.now()`` when called inside a + Temporal workflow, matching the pattern used by the openai/litellm providers. + Outside a workflow (plain async activities, sync agents) it is ``None`` and the + server's wall clock is used. 
+ Args: stream: Async iterator from graph.astream(..., stream_mode=["messages", "updates"]) task_id: The Agentex task ID to stream messages to. @@ -25,178 +51,15 @@ async def stream_langgraph_events(stream, task_id: str) -> str: Returns: The accumulated final text output from the agent. """ - # Lazy imports so langgraph/langchain aren't required at module load time - from langchain_core.messages import ToolMessage, AIMessageChunk - - from agentex.lib import adk - from agentex.types.text_content import TextContent - from agentex.types.reasoning_content import ReasoningContent - from agentex.types.task_message_delta import TextDelta - from agentex.types.task_message_update import StreamTaskMessageDelta - from agentex.types.tool_request_content import ToolRequestContent - from agentex.types.tool_response_content import ToolResponseContent - from agentex.types.reasoning_summary_delta import ReasoningSummaryDelta - - text_context = None - reasoning_context = None - final_text = "" - - try: - async for event_type, event_data in stream: - if event_type == "messages": - chunk, metadata = event_data - - if not isinstance(chunk, AIMessageChunk) or not chunk.content: - continue - - # ---------------------------------------------------------- - # Case 1: content is a plain string (regular models) - # ---------------------------------------------------------- - if isinstance(chunk.content, str): - if reasoning_context: - await reasoning_context.close() - reasoning_context = None - - if not text_context: - final_text = "" - text_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=TextContent( - author="agent", - content="", - format="markdown", - ), - ).__aenter__() - - final_text += chunk.content - await text_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=text_context.task_message, - delta=TextDelta(type="text", text_delta=chunk.content), - type="delta", - ) - ) - - # 
---------------------------------------------------------- - # Case 2: content is a list of typed blocks (reasoning models) - # Responses API (responses/v1) format: - # {"type": "reasoning", "summary": [{"type": "summary_text", "text": "..."}]} - # {"type": "text", "text": "..."} - # ---------------------------------------------------------- - elif isinstance(chunk.content, list): - for block in chunk.content: - if not isinstance(block, dict): - continue - - block_type = block.get("type") - - if block_type == "reasoning": - reasoning_text = "" - for s in block.get("summary", []): - if isinstance(s, dict) and s.get("type") == "summary_text": - reasoning_text += s.get("text", "") - if not reasoning_text: - continue - - if text_context: - await text_context.close() - text_context = None - - if not reasoning_context: - reasoning_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=ReasoningContent( - author="agent", - summary=[], - content=[], - type="reasoning", - style="active", - ), - ).__aenter__() - - await reasoning_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=reasoning_context.task_message, - delta=ReasoningSummaryDelta( - type="reasoning_summary", - summary_index=0, - summary_delta=reasoning_text, - ), - type="delta", - ) - ) - - elif block_type == "text": - text_delta = block.get("text", "") - if not text_delta: - continue - - if reasoning_context: - await reasoning_context.close() - reasoning_context = None - - if not text_context: - final_text = "" - text_context = await adk.streaming.streaming_task_message_context( - task_id=task_id, - initial_content=TextContent( - author="agent", - content="", - format="markdown", - ), - ).__aenter__() - - final_text += text_delta - await text_context.stream_update( - StreamTaskMessageDelta( - parent_task_message=text_context.task_message, - delta=TextDelta(type="text", text_delta=text_delta), - type="delta", - ) - ) - - elif event_type == "updates": 
- for node_name, state_update in event_data.items(): - if node_name == "agent": - messages = state_update.get("messages", []) - for msg in messages: - if text_context: - await text_context.close() - text_context = None - if reasoning_context: - await reasoning_context.close() - reasoning_context = None - - if hasattr(msg, "tool_calls") and msg.tool_calls: - for tc in msg.tool_calls: - await adk.messages.create( - task_id=task_id, - content=ToolRequestContent( - tool_call_id=tc["id"], - name=tc["name"], - arguments=tc["args"], - author="agent", - ), - ) - - elif node_name == "tools": - messages = state_update.get("messages", []) - for msg in messages: - if isinstance(msg, ToolMessage): - await adk.messages.create( - task_id=task_id, - content=ToolResponseContent( - tool_call_id=msg.tool_call_id, - name=msg.name or "unknown", - content=msg.content if isinstance(msg.content, str) else str(msg.content), - author="agent", - ), - ) - finally: - # Always close open contexts - if text_context: - await text_context.close() - if reasoning_context: - await reasoning_context.close() - - return final_text + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + + # AGX1-377 note: LangGraph emits tool requests as Full events (from "updates"), + # NOT Start+Delta+Done like pydantic-ai. auto_send handles Full events correctly; + # no coalescing wrapper is needed. + # AGX1-378: stamp messages with workflow.now() inside Temporal for deterministic + # created_at ordering; falls back to None (server wall clock) outside a workflow. 
+ turn = LangGraphTurn(stream, model=None) + emitter = UnifiedEmitter(task_id=task_id, trace_id=None, parent_span_id=None) + result = await emitter.auto_send_turn(turn, created_at=workflow_now_if_in_workflow()) + return result.final_text diff --git a/src/agentex/lib/adk/_modules/_langgraph_sync.py b/src/agentex/lib/adk/_modules/_langgraph_sync.py index 6d4ce715f..48231a87d 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_sync.py +++ b/src/agentex/lib/adk/_modules/_langgraph_sync.py @@ -3,10 +3,36 @@ Converts LangGraph graph.astream() events into Agentex TaskMessageUpdate events that are yielded back over the HTTP response. For use with sync ACP agents that stream via HTTP yields rather than Redis. + +Unified sync path +----------------- +Prefer using ``LangGraphTurn`` with ``UnifiedEmitter.yield_turn`` for new +agents, which adds usage capture and optional tracing via the shared harness +surface:: + + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + + turn = LangGraphTurn(stream) + emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=span_id) + async for event in emitter.yield_turn(turn): + yield event + +``convert_langgraph_to_agentex_events`` remains available as a lower-level +primitive (e.g. for callers that need the raw event stream without the +harness envelope). """ +from __future__ import annotations + +from typing import Any, Callable, Optional +from collections.abc import AsyncGenerator + -async def convert_langgraph_to_agentex_events(stream): +async def convert_langgraph_to_agentex_events( + stream: Any, + on_final_ai_message: Optional[Callable[..., None]] = None, +) -> AsyncGenerator[Any, None]: """Convert LangGraph streaming events to Agentex TaskMessageUpdate events. 
Expects the stream from graph.astream() called with @@ -22,8 +48,17 @@ async def convert_langgraph_to_agentex_events(stream): Supports both regular models (chunk.content is a str) and reasoning models like gpt-5/o1/o3 (chunk.content is a list of typed content blocks). + AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` (from + "updates" events), NOT Start+Delta+Done like pydantic-ai. No coalesce_tool_requests + option is needed for LangGraph. + Args: stream: Async iterator from graph.astream(..., stream_mode=["messages", "updates"]) + on_final_ai_message: Optional callback ``(msg: AIMessage) -> None`` called for + each ``AIMessage`` in an "agent" node update. Use this to capture + ``usage_metadata`` for token accounting without re-traversing the stream. + The callback fires *after* all events for that message are yielded. + No-op when ``None`` (default). Yields: TaskMessageUpdate events (Start, Delta, Done, Full) @@ -32,6 +67,7 @@ async def convert_langgraph_to_agentex_events(stream): from langchain_core.messages import ToolMessage, AIMessageChunk from agentex.types.text_content import TextContent + from agentex.types.reasoning_content import ReasoningContent from agentex.types.task_message_delta import TextDelta from agentex.types.task_message_update import ( StreamTaskMessageDone, @@ -113,7 +149,9 @@ async def convert_langgraph_to_agentex_events(stream): yield StreamTaskMessageStart( type="start", index=message_index, - content=TextContent(type="text", author="agent", content=""), + content=ReasoningContent( + type="reasoning", author="agent", summary=[], content=[], style="active" + ), ) reasoning_streaming = True reasoning_content_index = 0 @@ -205,6 +243,13 @@ async def convert_langgraph_to_agentex_events(stream): ) message_index += 1 + # Notify caller of the final AIMessage (e.g. 
for usage capture) + if on_final_ai_message is not None: + from langchain_core.messages import AIMessage as _AIMessage + + if isinstance(msg, _AIMessage): + on_final_ai_message(msg) + elif node_name == "tools": messages = state_update.get("messages", []) for msg in messages: diff --git a/src/agentex/lib/adk/_modules/_langgraph_tracing.py b/src/agentex/lib/adk/_modules/_langgraph_tracing.py index 74b8dcb57..2162201e1 100644 --- a/src/agentex/lib/adk/_modules/_langgraph_tracing.py +++ b/src/agentex/lib/adk/_modules/_langgraph_tracing.py @@ -1,4 +1,14 @@ -"""LangChain callback handler that creates Agentex spans for LLM calls and tool executions.""" +"""LangChain callback handler that creates Agentex spans for LLM calls and tool executions. + +.. deprecated:: + ``AgentexLangGraphTracingHandler`` and ``create_langgraph_tracing_handler`` are + superseded by the unified harness surface (``LangGraphTurn`` + + ``UnifiedEmitter``), which derives spans automatically from the canonical + event stream without requiring a LangChain callback handler. + + They remain importable and functional for backward compatibility, but new + agents should use the unified path instead. +""" # ruff: noqa: ARG002 # Callback methods must accept all arguments defined by LangChain's AsyncCallbackHandler interface. @@ -31,6 +41,11 @@ class AgentexLangGraphTracingHandler(AsyncCallbackHandler): ├── llm: (LLM call) ├── tool: (tool execution) └── llm: (LLM call) + + .. deprecated:: + Use ``LangGraphTurn`` with ``UnifiedEmitter`` instead. The unified + harness derives equivalent spans from the canonical event stream, + removing the need for a LangChain callback handler entirely. """ def __init__( @@ -237,6 +252,20 @@ def create_langgraph_tracing_handler( Returns: An ``AgentexLangGraphTracingHandler`` instance ready to use as a LangChain callback. + + .. deprecated:: + Use ``LangGraphTurn`` with ``UnifiedEmitter`` instead. 
The unified harness + derives equivalent spans from the canonical event stream automatically, with + no LangChain callback required:: + + from agentex.lib.core.harness.emitter import UnifiedEmitter + from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + + turn = LangGraphTurn(stream) + emitter = UnifiedEmitter(task_id=task_id, trace_id=trace_id, parent_span_id=span_id) + result = await emitter.auto_send_turn(turn) + + This function remains available for backward compatibility. """ return AgentexLangGraphTracingHandler( trace_id=trace_id, diff --git a/src/agentex/lib/adk/_modules/_langgraph_turn.py b/src/agentex/lib/adk/_modules/_langgraph_turn.py new file mode 100644 index 000000000..da8ff0e7c --- /dev/null +++ b/src/agentex/lib/adk/_modules/_langgraph_turn.py @@ -0,0 +1,152 @@ +"""HarnessTurn adapter for LangGraph astream() event streams. + +Provides ``LangGraphTurn`` (a ``HarnessTurn`` implementation) and the +``langgraph_usage_to_turn_usage`` helper that maps LangGraph's +``AIMessage.usage_metadata`` onto the framework-agnostic ``TurnUsage`` model. + +AGX1-377 note: LangGraph emits tool requests as ``StreamTaskMessageFull`` events +(from "updates" events), NOT Start+Delta+Done like pydantic-ai. ``auto_send`` +handles Full events correctly; no coalescing wrapper is needed. +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator +from collections.abc import AsyncGenerator + +from agentex.lib.core.harness.types import TurnUsage, StreamTaskMessage +from agentex.lib.adk._modules._langgraph_sync import convert_langgraph_to_agentex_events + + +def langgraph_usage_to_turn_usage(usage_metadata: Any, model: str | None) -> TurnUsage: + """Map LangGraph ``AIMessage.usage_metadata`` onto ``TurnUsage``. + + ``usage_metadata`` may be ``None`` (model doesn't report usage). + Real zero token counts (e.g. 0 output tokens) are preserved as 0, NOT + coerced to ``None``. 
+ + Mapping:: + + input_tokens -> input_tokens + output_tokens -> output_tokens + total_tokens -> total_tokens + input_token_details.cache_read -> cached_input_tokens + output_token_details.reasoning -> reasoning_tokens + + Args: + usage_metadata: The ``usage_metadata`` dict from an ``AIMessage``, + or ``None`` if the model did not report usage. + model: The model name string to attach to the ``TurnUsage``, or ``None``. + + Returns: + A populated ``TurnUsage`` instance. + """ + if usage_metadata is None: + return TurnUsage(model=model) + + raw_input = (usage_metadata or {}).get("input_tokens") + raw_output = (usage_metadata or {}).get("output_tokens") + raw_total = (usage_metadata or {}).get("total_tokens") + input_details = (usage_metadata or {}).get("input_token_details") or {} + output_details = (usage_metadata or {}).get("output_token_details") or {} + raw_cache_read = input_details.get("cache_read") + raw_reasoning = output_details.get("reasoning") + + return TurnUsage( + model=model, + input_tokens=raw_input, + output_tokens=raw_output, + total_tokens=raw_total, + cached_input_tokens=raw_cache_read, + reasoning_tokens=raw_reasoning, + ) + + +def _add_optional(a: int | None, b: int | None) -> int | None: + """Sum two optional token counts; ``None`` means 'not reported' on that side. + + ``None + None`` stays ``None`` (model never reported usage), while a real 0 + contributes 0 (preserving zero counts rather than coercing them away). + """ + if a is None and b is None: + return None + return (a or 0) + (b or 0) + + +def _accumulate_turn_usage(acc: TurnUsage, call: TurnUsage, model: str | None) -> TurnUsage: + """Add a single LLM call's usage into the running per-turn total. + + A LangGraph turn can make multiple LLM calls (e.g. text -> tool decision -> + final text); summing them avoids silently dropping all but the last call. 
+ """ + return TurnUsage( + model=model, + input_tokens=_add_optional(acc.input_tokens, call.input_tokens), + output_tokens=_add_optional(acc.output_tokens, call.output_tokens), + total_tokens=_add_optional(acc.total_tokens, call.total_tokens), + cached_input_tokens=_add_optional(acc.cached_input_tokens, call.cached_input_tokens), + reasoning_tokens=_add_optional(acc.reasoning_tokens, call.reasoning_tokens), + ) + + +class LangGraphTurn: + """HarnessTurn wrapping a LangGraph ``astream()`` event stream. + + Implements the ``HarnessTurn`` Protocol so it can be passed to either + ``UnifiedEmitter.yield_turn`` (sync HTTP ACP) or + ``UnifiedEmitter.auto_send_turn`` (async / temporal). + + Usage:: + + stream = graph.astream( + {"messages": [{"role": "user", "content": user_message}]}, + stream_mode=["messages", "updates"], + ) + turn = LangGraphTurn(stream, model=model_name) + + # Sync HTTP ACP + async for event in emitter.yield_turn(turn): + yield event + + # Async / temporal + result = await emitter.auto_send_turn(turn) + + AGX1-377 note: LangGraph tool requests are ``StreamTaskMessageFull`` (from + "updates"), NOT Start+Delta+Done like pydantic-ai. No ``coalesce_tool_requests`` + option is needed. + + Usage data is captured lazily via the ``on_final_ai_message`` callback and + is only valid after ``events`` has been fully consumed. Multi-step turns + (more than one LLM call) accumulate usage additively across calls. 
+ """ + + def __init__(self, stream: Any, model: str | None = None) -> None: + self._stream = stream + self._model = model + self._usage: TurnUsage = TurnUsage(model=model) + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + return self._generate_events() + + async def _generate_events(self) -> AsyncGenerator[StreamTaskMessage, None]: + def _capture(ai_msg: Any) -> None: + usage_metadata = getattr(ai_msg, "usage_metadata", None) + if usage_metadata is not None: + call_usage = langgraph_usage_to_turn_usage(usage_metadata, self._model) + # Accumulate across LLM calls — the callback fires once per agent + # node invocation, so a multi-step turn reports usage more than + # once; overwriting would drop all but the last call. + self._usage = _accumulate_turn_usage(self._usage, call_usage, self._model) + + async for ev in convert_langgraph_to_agentex_events(self._stream, on_final_ai_message=_capture): + yield ev + + def usage(self) -> TurnUsage: + """Return the usage accumulated across all AIMessages in the stream. + + Multi-step turns sum each LLM call's usage. Valid only after ``events`` + has been fully consumed. If no AIMessage carried ``usage_metadata``, this + is the initial ``TurnUsage(model=model)`` with nothing accumulated. + """ + return self._usage diff --git a/tests/lib/adk/providers/test_openai_turn.py b/tests/lib/adk/providers/test_openai_turn.py index 023b0ed4e..47a9ba9fe 100644 --- a/tests/lib/adk/providers/test_openai_turn.py +++ b/tests/lib/adk/providers/test_openai_turn.py @@ -65,7 +65,9 @@ def test_usage_mapping_none_usage(): turn_usage = openai_usage_to_turn_usage(None, model="gpt-4o") assert turn_usage.model == "gpt-4o" - assert turn_usage.num_llm_calls == 0 + # num_llm_calls is None ("not reported") when no usage is present, matching + # the token fields below; a real 0 is only reported when the provider says so.
+ assert turn_usage.num_llm_calls is None assert turn_usage.input_tokens is None assert turn_usage.output_tokens is None assert turn_usage.total_tokens is None diff --git a/tests/lib/adk/test_langgraph_async.py b/tests/lib/adk/test_langgraph_async.py new file mode 100644 index 000000000..682bd43bc --- /dev/null +++ b/tests/lib/adk/test_langgraph_async.py @@ -0,0 +1,282 @@ +"""Characterization tests for stream_langgraph_events (unified surface). + +These tests verify the behavior of ``stream_langgraph_events`` after it was +reimplemented on top of ``LangGraphTurn`` + ``UnifiedEmitter.auto_send_turn`` +(Task 4). They serve as a contract test for the public signature. + +Key behavioral notes (unified surface vs. old bespoke implementation): +- Tool calls/responses are posted via ``streaming_task_message_context`` (not + ``adk.messages.create``); they appear as contexts with no stream_update calls. +- ``final_text`` uses last-segment semantics: a multi-step turn (text -> tool -> + text) returns only the LAST text segment, not all text across the turn. + +NOTE: langchain_core imports are deferred to test scope because conftest.py +stubs ``langchain_core.messages`` with MagicMock.
+""" + +from __future__ import annotations + +import sys +from typing import Any +from dataclasses import field, dataclass + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import StreamTaskMessageDelta +from agentex.lib.adk._modules._langgraph_async import stream_langgraph_events + +TASK_ID = "task-test" + + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake streaming infrastructure (mirrors test_pydantic_ai_async.py pattern) +# --------------------------------------------------------------------------- + + +@dataclass +class FakeContext: + initial_content: Any + task_message: TaskMessage + closed: bool = False + updates: list[StreamTaskMessageDelta] = field(default_factory=list) + + async def __aenter__(self) -> "FakeContext": + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> bool: + await self.close() + return False + + async def stream_update(self, update: StreamTaskMessageDelta) -> None: + if self.closed: + raise AssertionError("stream_update called after close") + self.updates.append(update) + + async def close(self) -> None: + self.closed = True + + +class FakeStreamingModule: + def __init__(self) -> None: + self.contexts: list[FakeContext] = [] + + def streaming_task_message_context(self, 
*, task_id: str, initial_content: Any, **kw: Any) -> FakeContext: + tm = TaskMessage( + id=f"m{len(self.contexts) + 1}", + task_id=task_id, + content=initial_content, + streaming_status="IN_PROGRESS", + ) + ctx = FakeContext(initial_content=initial_content, task_message=tm) + self.contexts.append(ctx) + return ctx + + +class FakeMessagesModule: + def __init__(self) -> None: + self.created: list[dict[str, Any]] = [] + + async def create(self, *, task_id: str, content: Any) -> TaskMessage: + self.created.append({"task_id": task_id, "content": content}) + return TaskMessage( + id=f"created-{len(self.created)}", + task_id=task_id, + content=content, + streaming_status="DONE", + ) + + +@pytest.fixture +def fake_adk(monkeypatch): + from agentex.lib import adk as adk_module + + streaming = FakeStreamingModule() + messages = FakeMessagesModule() + monkeypatch.setattr(adk_module, "streaming", streaming) + monkeypatch.setattr(adk_module, "messages", messages) + return streaming, messages + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +def _text_deltas(ctx: FakeContext) -> list[str]: + out: list[str] = [] + for u in ctx.updates: + if isinstance(u.delta, TextDelta): + out.append(u.delta.text_delta or "") + return out + + +# --------------------------------------------------------------------------- +# Characterization tests (unified surface behavior) +# --------------------------------------------------------------------------- + + +class TestCharacterization: + async def test_plain_text_streams_and_returns_final_text( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + from langchain_core.messages import AIMessage, AIMessageChunk + + streaming, messages = fake_adk + chunk = AIMessageChunk(content="Hello, world!") + ai_msg = AIMessage(content="Hello, world!") + stream = _make_stream( + [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + ) 
+ + final = await stream_langgraph_events(stream, TASK_ID) + + assert final == "Hello, world!" + assert len(streaming.contexts) == 1 + ctx = streaming.contexts[0] + assert isinstance(ctx.initial_content, TextContent) + assert _text_deltas(ctx) == ["Hello, world!"] + assert ctx.closed is True + # Unified surface: no messages.create for text + assert messages.created == [] + + async def test_empty_stream_returns_empty_string( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + streaming, _ = fake_adk + final = await stream_langgraph_events(_make_stream([]), TASK_ID) + assert final == "" + assert streaming.contexts == [] + + async def test_tool_call_posted_via_streaming_context( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + """Unified surface: tool calls go through streaming_task_message_context, + not adk.messages.create. The context is opened and immediately closed + (no deltas) so the initial_content is the tool request.""" + from langchain_core.messages import AIMessage + + streaming, messages = fake_adk + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + + await stream_langgraph_events(stream, TASK_ID) + + # Unified surface: tool messages go via streaming_task_message_context + assert len(streaming.contexts) == 1 + assert messages.created == [], "Unified surface uses streaming_task_message_context, not messages.create" + + from agentex.types.tool_request_content import ToolRequestContent + + content = streaming.contexts[0].initial_content + assert isinstance(content, ToolRequestContent) + assert content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.arguments == {"city": "Paris"} + # Full messages close immediately (no delta updates) + assert streaming.contexts[0].closed is True + assert streaming.contexts[0].updates == [] 
+ + async def test_tool_response_posted_via_streaming_context( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + """Unified surface: tool responses go through streaming_task_message_context.""" + from langchain_core.messages import ToolMessage + + streaming, messages = fake_adk + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + stream = _make_stream([("updates", {"tools": {"messages": [tool_msg]}})]) + + await stream_langgraph_events(stream, TASK_ID) + + assert len(streaming.contexts) == 1 + assert messages.created == [] + + from agentex.types.tool_response_content import ToolResponseContent + + content = streaming.contexts[0].initial_content + assert isinstance(content, ToolResponseContent) + assert content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.content == "Sunny, 72F" + assert streaming.contexts[0].closed is True + + async def test_multi_step_text_then_tool_then_text_last_segment( + self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule] + ) -> None: + """Unified surface: final_text uses last-segment semantics. + + auto_send resets final_text_parts when a new Start(TextContent) is seen, + so multi-step turns (text -> tool -> text) return only the LAST text segment. + Both text contexts are still opened and streamed to Redis; only the + return value is last-segment. This matches stream_pydantic_ai_events. 
+ """ + from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk + + streaming, messages = fake_adk + chunk1 = AIMessageChunk(content="Looking up...") + ai_msg1 = AIMessage(content="Looking up...", tool_calls=[{"id": "c1", "name": "search", "args": {}}]) + tool_msg = ToolMessage(content="result", tool_call_id="c1", name="search") + chunk2 = AIMessageChunk(content="Found it!") + ai_msg2 = AIMessage(content="Found it!") + + stream = _make_stream( + [ + ("messages", (chunk1, {})), + ("updates", {"agent": {"messages": [ai_msg1]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("messages", (chunk2, {})), + ("updates", {"agent": {"messages": [ai_msg2]}}), + ] + ) + + final = await stream_langgraph_events(stream, TASK_ID) + + # Last segment only — first text segment is NOT in final_text + assert final == "Found it!" + # Two text streaming contexts (one per text segment) — both streamed to Redis + text_ctxs = [c for c in streaming.contexts if isinstance(c.initial_content, TextContent)] + assert len(text_ctxs) == 2 + assert all(ctx.closed for ctx in text_ctxs) + # Tool request + tool response via streaming_task_message_context (not messages.create) + assert messages.created == [] + + async def test_context_closed_on_exception(self, fake_adk: tuple[FakeStreamingModule, FakeMessagesModule]) -> None: + from langchain_core.messages import AIMessageChunk + + streaming, _ = fake_adk + + async def _boom(): + chunk = AIMessageChunk(content="partial") + yield ("messages", (chunk, {})) + raise RuntimeError("upstream exploded") + + with pytest.raises(RuntimeError, match="upstream exploded"): + await stream_langgraph_events(_boom(), TASK_ID) + + assert streaming.contexts[0].closed is True diff --git a/tests/lib/adk/test_langgraph_sync.py b/tests/lib/adk/test_langgraph_sync.py new file mode 100644 index 000000000..248d18f68 --- /dev/null +++ b/tests/lib/adk/test_langgraph_sync.py @@ -0,0 +1,247 @@ +"""Tests for the sync LangGraph -> Agentex stream event 
converter. + +Covers: +- Basic text, tool call, and tool response emission +- on_final_ai_message callback for usage capture +- create_langgraph_tracing_handler symbol is importable and functional + (runtime DeprecationWarning removed; deprecation is docstring-only) + +NOTE: langchain_core imports must be deferred to test-function scope because +conftest.py stubs out ``langchain_core.messages`` with MagicMock for ADK +package-level tests. The real classes are imported lazily inside each test. +""" + +from __future__ import annotations + +import sys +from typing import Any, AsyncIterator + +import pytest + +from agentex.types.task_message_update import ( + StreamTaskMessageFull, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_sync import convert_langgraph_to_agentex_events + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _collect(stream: AsyncIterator[Any]) -> list[Any]: + return [e async for e in stream] + + +def _make_stream(events: list[tuple[str, Any]]) -> AsyncIterator[tuple[str, Any]]: + async def _gen(): + for e in events: + yield e + + return _gen() + + +# --------------------------------------------------------------------------- +# Remove the conftest stubs for langchain_core so real classes are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + """Remove conftest MagicMock stubs so real langchain_core types are used.""" + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + # Re-import the real modules + import importlib + + importlib.import_module("langchain_core.messages") + yield + # 
Restore stubs after the test + sys.modules.update(saved) + + +class TestTextStreaming: + async def test_plain_text_emits_start_delta_done(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello, world!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [AIMessage(content="Hello, world!")]}}), + ] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + types = [type(e).__name__ for e in out] + assert "StreamTaskMessageStart" in types + assert "StreamTaskMessageDelta" in types + assert "StreamTaskMessageDone" in types + + async def test_empty_chunk_content_is_skipped(self): + from langchain_core.messages import AIMessageChunk + + chunk = AIMessageChunk(content="") + events = [("messages", (chunk, {}))] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert out == [] + + async def test_reasoning_block_start_wraps_reasoning_content(self): + """A Responses-API reasoning block opens a Start wrapping ReasoningContent, + not TextContent (the deltas are ReasoningContentDelta).""" + from langchain_core.messages import AIMessageChunk + + from agentex.types.reasoning_content import ReasoningContent + from agentex.types.task_message_update import StreamTaskMessageDelta, StreamTaskMessageStart + from agentex.types.reasoning_content_delta import ReasoningContentDelta + + chunk = AIMessageChunk( + content=[{"type": "reasoning", "summary": [{"type": "summary_text", "text": "thinking hard"}]}] + ) + events = [("messages", (chunk, {}))] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + assert len(starts) == 1 + assert isinstance(starts[0].content, ReasoningContent), "reasoning Start must wrap ReasoningContent" + # `style` must be a non-null MessageStyle: the AgentEx server's + # StreamTaskMessageStartEntity rejects 
`reasoning.style=None` (enum), which + # would kill the stream. Match the conformance fixture's canonical value. + assert starts[0].content.style == "active", "reasoning Start must set a non-null style ('active')" + # Pull content_delta inside the comprehension so the isinstance narrows the + # delta union (narrowing would not survive a later attribute access). + reasoning_delta_texts = [ + e.delta.content_delta + for e in out + if isinstance(e, StreamTaskMessageDelta) and isinstance(e.delta, ReasoningContentDelta) + ] + assert reasoning_delta_texts == ["thinking hard"] + + +class TestToolCallEmission: + async def test_tool_call_emits_full_message(self): + from langchain_core.messages import AIMessage + + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert len(out) == 1 + assert isinstance(out[0], StreamTaskMessageFull) + content = out[0].content + assert isinstance(content, ToolRequestContent) + assert content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.arguments == {"city": "Paris"} + assert content.author == "agent" + + async def test_tool_response_emits_full_message(self): + from langchain_core.messages import ToolMessage + + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + events = [("updates", {"tools": {"messages": [tool_msg]}})] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert len(out) == 1 + assert isinstance(out[0], StreamTaskMessageFull) + content = out[0].content + assert isinstance(content, ToolResponseContent) + assert content.tool_call_id == "call_1" + assert content.name == "get_weather" + assert content.content == "Sunny, 72F" + assert content.author == "agent" + + +class TestOnFinalAiMessageCallback: + async def 
test_callback_called_for_ai_message_in_agent_node(self): + from langchain_core.messages import AIMessage + + captured: list[Any] = [] + ai_msg = AIMessage(content="Hello!") + + events = [("updates", {"agent": {"messages": [ai_msg]}})] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert len(captured) == 1 + assert captured[0] is ai_msg + + async def test_callback_not_called_for_tool_messages(self): + from langchain_core.messages import ToolMessage + + captured: list[Any] = [] + tool_msg = ToolMessage(content="result", tool_call_id="c1", name="t") + + events = [("updates", {"tools": {"messages": [tool_msg]}})] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert captured == [] + + async def test_callback_receives_usage_metadata(self): + from langchain_core.messages import AIMessage + + captured: list[Any] = [] + usage = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="Answer.", usage_metadata=usage) + + events = [("updates", {"agent": {"messages": [ai_msg]}})] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert len(captured) == 1 + assert captured[0].usage_metadata == usage + + async def test_no_callback_is_noop(self): + from langchain_core.messages import AIMessage + + ai_msg = AIMessage(content="Hello!") + events = [("updates", {"agent": {"messages": [ai_msg]}})] + out = await _collect(convert_langgraph_to_agentex_events(_make_stream(events))) + assert isinstance(out, list) + + async def test_callback_called_multiple_times_for_multi_step(self): + from langchain_core.messages import AIMessage + + captured: list[Any] = [] + ai_msg_1 = AIMessage(content="Step 1") + ai_msg_2 = AIMessage(content="Step 2") + + events = [ + ("updates", {"agent": {"messages": [ai_msg_1]}}), + ("updates", {"agent": {"messages": [ai_msg_2]}}), + 
] + await _collect(convert_langgraph_to_agentex_events(_make_stream(events), on_final_ai_message=captured.append)) + assert len(captured) == 2 + assert captured[0] is ai_msg_1 + assert captured[1] is ai_msg_2 + + async def test_callback_called_after_tool_call_events_yielded(self): + """The callback fires after all events for that AIMessage are yielded.""" + from langchain_core.messages import AIMessage + + yield_order: list[str] = [] + + async def _gen(): + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + yield ("updates", {"agent": {"messages": [ai_msg]}}) + + def _cb(msg): + yield_order.append("callback") + + async for _ in convert_langgraph_to_agentex_events(_gen(), on_final_ai_message=_cb): + yield_order.append("event") + + # The tool call Full event is emitted before the callback fires + assert yield_order.index("event") < yield_order.index("callback") + + +class TestLangGraphTracingHandlerBackwardCompat: + def test_create_langgraph_tracing_handler_no_runtime_warning(self): + """Deprecated symbol remains importable and emits no runtime DeprecationWarning. + + The runtime warnings.warn was removed (docstring-only deprecation) to + align with PR 4/6 and avoid breaking callers under warnings-as-errors. + Using ``warnings.simplefilter("error", DeprecationWarning)`` verifies + that calling the function is safe under -W error conditions. 
+ """ + import warnings + + from agentex.lib.adk._modules._langgraph_tracing import create_langgraph_tracing_handler + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("error", DeprecationWarning) + create_langgraph_tracing_handler(trace_id="t1", parent_span_id="p1") + + assert w == [], "create_langgraph_tracing_handler must NOT emit a runtime DeprecationWarning" diff --git a/tests/lib/adk/test_langgraph_sync_unified.py b/tests/lib/adk/test_langgraph_sync_unified.py new file mode 100644 index 000000000..cfd522828 --- /dev/null +++ b/tests/lib/adk/test_langgraph_sync_unified.py @@ -0,0 +1,214 @@ +"""Unified sync path tests for LangGraphTurn + UnifiedEmitter. + +Verifies: +1. Passthrough: events from emitter.yield_turn(LangGraphTurn(stream)) equal + LangGraphTurn(stream).events collected directly. +2. Span derivation: with trace_id + fake tracer, tool spans are derived from + the event stream. + +NOTE: langchain_core imports are deferred to test scope because conftest.py +stubs ``langchain_core.messages`` with MagicMock. 
+""" + +from __future__ import annotations + +import sys +from typing import Any +from datetime import datetime, timezone +from dataclasses import field, dataclass + +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +# --------------------------------------------------------------------------- +# Fake SpanTracer +# --------------------------------------------------------------------------- + + +@dataclass +class _FakeTracingBackend: + spans_started: list[dict[str, Any]] = field(default_factory=list) + spans_ended: list[str] = field(default_factory=list) + + async def start_span(self, **kw) -> Any: + from agentex.types.span import Span + + sp = Span( + id=f"span-{len(self.spans_started) + 1}", + trace_id=kw.get("trace_id", "trace1"), + name=kw.get("name", ""), + start_time=datetime.now(tz=timezone.utc), + ) + self.spans_started.append(kw) + return sp + + async def end_span(self, *, trace_id: str, span: Any) -> None: + self.spans_ended.append(span.id if span else "") + + +# 
--------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestPassthrough: + async def test_yield_turn_events_equal_direct_events(self): + """Events from emitter.yield_turn(LangGraphTurn(stream)) must equal + LangGraphTurn(stream).events collected directly — the emitter must not + add, drop, or reorder events in yield mode.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello!") + ai_msg = AIMessage(content="Hello!") + + # Build two identical streams + events_raw = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + # Direct collection + direct = [e async for e in LangGraphTurn(_make_stream(events_raw)).events] + + # Via emitter.yield_turn + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + via_emitter = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + + assert len(direct) == len(via_emitter), "yield_turn must not add or drop events relative to direct iteration" + for a, b in zip(direct, via_emitter, strict=True): + assert type(a) == type(b), f"Event type mismatch: {type(a).__name__} vs {type(b).__name__}" + + async def test_yield_turn_passes_all_event_types(self): + """Start, Delta, Done, Full — each type is preserved.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="hi") + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="hi", tool_calls=[tc]) + + events_raw = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + types = {type(e).__name__ for e in out} + # text chunk emits Start + Delta + assert "StreamTaskMessageStart" in types + 
assert "StreamTaskMessageDelta" in types + # tool call emits Full + assert "StreamTaskMessageFull" in types + + async def test_empty_stream_yields_no_events(self): + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + out = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream([])))] + assert out == [] + + +class TestSpanDerivation: + @pytest.fixture + def fake_tracer(self): + backend = _FakeTracingBackend() + tracer = SpanTracer( + trace_id="trace1", + parent_span_id=None, + task_id="t", + tracing=backend, # type: ignore[arg-type] + ) + return tracer, backend + + async def test_tool_span_derived_from_full_events(self, fake_tracer): + """AGX1-377: SpanDeriver now handles Full tool events for LangGraph. + + Full(ToolRequestContent) opens a tool span keyed by tool_call_id; + Full(ToolResponseContent) closes it. This bridges the previous gap where + LangGraph's Full-event path produced no spans, aligning it with + Start+Done harnesses (pydantic-ai, openai-agents). 
+ """ + from langchain_core.messages import AIMessage, ToolMessage + + tracer, backend = fake_tracer + tc = {"id": "c1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="Sunny", tool_call_id="c1", name="get_weather") + + events_raw = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id=None, tracer=tracer) + _ = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + + assert len(backend.spans_started) == 1, "Full(ToolRequestContent) opens one tool span" + started = backend.spans_started[0] + assert started["name"] == "get_weather" + assert started["input"] == {"city": "Paris"} + + async def test_no_spans_when_no_tool_calls(self, fake_tracer): + """yield_turn with tracer but no tool calls emits no spans.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + tracer, backend = fake_tracer + chunk = AIMessageChunk(content="Hello!") + ai_msg = AIMessage(content="Hello!") + + events_raw = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + emitter = UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id=None, tracer=tracer) + _ = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + + assert backend.spans_started == [], "No tool spans when there are no tool calls" + + async def test_tracer_none_means_no_spans(self): + """With tracer=False, no spans should be emitted.""" + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="ok", tool_call_id="c1", name="t") + + events_raw = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + + emitter = 
UnifiedEmitter(task_id="t", trace_id="trace1", parent_span_id=None, tracer=False) + _ = [e async for e in emitter.yield_turn(LangGraphTurn(_make_stream(events_raw)))] + # No assertion on spans since tracer=False means emitter.tracer is None + assert emitter.tracer is None diff --git a/tests/lib/adk/test_langgraph_turn.py b/tests/lib/adk/test_langgraph_turn.py new file mode 100644 index 000000000..23aa34ba3 --- /dev/null +++ b/tests/lib/adk/test_langgraph_turn.py @@ -0,0 +1,265 @@ +"""Tests for LangGraphTurn and langgraph_usage_to_turn_usage.""" + +from __future__ import annotations + +import sys +from typing import Any + +import pytest + +from agentex.lib.core.harness.types import TurnUsage +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn, langgraph_usage_to_turn_usage + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +async def _drain(turn: LangGraphTurn) -> list[Any]: + return [e async for e in turn.events] + + +# --------------------------------------------------------------------------- +# langgraph_usage_to_turn_usage +# --------------------------------------------------------------------------- + + +class TestLangGraphUsageToTurnUsage: + def 
test_none_usage_returns_empty_turn_usage(self): + result = langgraph_usage_to_turn_usage(None, model="gpt-4") + assert result == TurnUsage(model="gpt-4") + + def test_basic_token_fields_mapped(self): + usage = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + result = langgraph_usage_to_turn_usage(usage, model="gpt-4") + assert result.input_tokens == 10 + assert result.output_tokens == 5 + assert result.total_tokens == 15 + assert result.model == "gpt-4" + + def test_zero_output_tokens_preserved_not_coerced_to_none(self): + """Real zero counts must be preserved as 0, not None.""" + usage = {"input_tokens": 10, "output_tokens": 0, "total_tokens": 10} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.output_tokens == 0 + + def test_cache_read_mapped_to_cached_input_tokens(self): + usage = { + "input_tokens": 20, + "output_tokens": 5, + "total_tokens": 25, + "input_token_details": {"cache_read": 8}, + } + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.cached_input_tokens == 8 + + def test_reasoning_mapped_to_reasoning_tokens(self): + usage = { + "input_tokens": 10, + "output_tokens": 15, + "total_tokens": 25, + "output_token_details": {"reasoning": 6}, + } + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.reasoning_tokens == 6 + + def test_missing_optional_fields_are_none(self): + usage = {"input_tokens": 5, "output_tokens": 3, "total_tokens": 8} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.cached_input_tokens is None + assert result.reasoning_tokens is None + + def test_full_usage_object(self): + usage = { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "input_token_details": {"cache_read": 30}, + "output_token_details": {"reasoning": 20}, + } + result = langgraph_usage_to_turn_usage(usage, model="claude-3-5-sonnet") + assert result == TurnUsage( + model="claude-3-5-sonnet", + input_tokens=100, + output_tokens=50, + 
total_tokens=150, + cached_input_tokens=30, + reasoning_tokens=20, + ) + + def test_model_none_is_preserved(self): + result = langgraph_usage_to_turn_usage({"input_tokens": 1}, model=None) + assert result.model is None + + def test_empty_input_token_details_does_not_crash(self): + usage = {"input_tokens": 5, "input_token_details": {}} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.cached_input_tokens is None + + def test_empty_output_token_details_does_not_crash(self): + usage = {"output_tokens": 5, "output_token_details": {}} + result = langgraph_usage_to_turn_usage(usage, model=None) + assert result.reasoning_tokens is None + + +# --------------------------------------------------------------------------- +# LangGraphTurn +# --------------------------------------------------------------------------- + + +class TestLangGraphTurn: + async def test_events_yields_from_sync_converter(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello!") + ai_msg = AIMessage(content="Hello!") + stream = _make_stream( + [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + ) + turn = LangGraphTurn(stream) + events = await _drain(turn) + assert len(events) > 0 + + async def test_usage_is_empty_before_stream_consumed(self): + turn = LangGraphTurn(_make_stream([])) + # usage() before events consumed should return a default TurnUsage + usage = turn.usage() + assert isinstance(usage, TurnUsage) + + async def test_usage_captured_from_ai_message(self): + from langchain_core.messages import AIMessage + + usage_meta = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="Hi!", usage_metadata=usage_meta) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="gpt-4") + await _drain(turn) + + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + 
assert usage.total_tokens == 15 + assert usage.model == "gpt-4" + + async def test_usage_accumulates_across_multiple_ai_messages(self): + """A multi-step turn (>1 LLM call) sums usage instead of keeping only the last.""" + from langchain_core.messages import AIMessage + + first = AIMessage( + content="thinking", + usage_metadata={ + "input_tokens": 10, + "output_tokens": 5, + "total_tokens": 15, + "input_token_details": {"cache_read": 2}, + "output_token_details": {"reasoning": 1}, + }, + ) + second = AIMessage( + content="answer", + usage_metadata={ + "input_tokens": 20, + "output_tokens": 7, + "total_tokens": 27, + "input_token_details": {"cache_read": 3}, + "output_token_details": {"reasoning": 4}, + }, + ) + stream = _make_stream( + [ + ("updates", {"agent": {"messages": [first]}}), + ("updates", {"agent": {"messages": [second]}}), + ] + ) + turn = LangGraphTurn(stream, model="gpt-4") + await _drain(turn) + + usage = turn.usage() + assert usage.input_tokens == 30 + assert usage.output_tokens == 12 + assert usage.total_tokens == 42 + assert usage.cached_input_tokens == 5 + assert usage.reasoning_tokens == 5 + assert usage.model == "gpt-4" + + async def test_usage_not_updated_when_no_usage_metadata(self): + from langchain_core.messages import AIMessage + + ai_msg = AIMessage(content="Hi!") + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="gpt-4") + await _drain(turn) + + usage = turn.usage() + assert usage == TurnUsage(model="gpt-4") + + async def test_usage_captures_cache_read_and_reasoning(self): + from langchain_core.messages import AIMessage + + usage_meta = { + "input_tokens": 100, + "output_tokens": 50, + "total_tokens": 150, + "input_token_details": {"cache_read": 30}, + "output_token_details": {"reasoning": 20}, + } + ai_msg = AIMessage(content="Result", usage_metadata=usage_meta) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, 
model="claude-3-5-sonnet") + await _drain(turn) + + usage = turn.usage() + assert usage.cached_input_tokens == 30 + assert usage.reasoning_tokens == 20 + + async def test_harness_turn_protocol_conformance(self): + """LangGraphTurn satisfies the HarnessTurn Protocol.""" + from agentex.lib.core.harness.types import HarnessTurn + + turn = LangGraphTurn(_make_stream([])) + assert isinstance(turn, HarnessTurn), "LangGraphTurn must satisfy HarnessTurn Protocol" + + async def test_empty_stream_yields_no_events(self): + turn = LangGraphTurn(_make_stream([])) + events = await _drain(turn) + assert events == [] + + async def test_model_none_default(self): + turn = LangGraphTurn(_make_stream([])) + assert turn.usage().model is None + + async def test_model_passed_through_to_usage(self): + from langchain_core.messages import AIMessage + + ai_msg = AIMessage(content="ok", usage_metadata={"input_tokens": 1, "output_tokens": 0, "total_tokens": 1}) + stream = _make_stream([("updates", {"agent": {"messages": [ai_msg]}})]) + turn = LangGraphTurn(stream, model="my-model") + await _drain(turn) + assert turn.usage().model == "my-model" diff --git a/tests/lib/adk/test_pydantic_ai_turn.py b/tests/lib/adk/test_pydantic_ai_turn.py index 0659895d3..46bf247a3 100644 --- a/tests/lib/adk/test_pydantic_ai_turn.py +++ b/tests/lib/adk/test_pydantic_ai_turn.py @@ -122,7 +122,7 @@ async def test_usage_before_exhaustion_returns_default(self): assert pre_usage.model == "openai:gpt-4o" assert pre_usage.input_tokens is None assert pre_usage.output_tokens is None - assert pre_usage.num_llm_calls == 0 + assert pre_usage.num_llm_calls is None async def test_turn_events_and_usage(self): """Driving events to exhaustion populates usage from the terminal event.""" @@ -227,7 +227,7 @@ async def test_no_usage_event_leaves_default_usage(self): usage = turn.usage() assert usage.model == "openai:gpt-4o" assert usage.input_tokens is None - assert usage.num_llm_calls == 0 + assert usage.num_llm_calls is None class 
TestToolRequestStreaming: diff --git a/tests/lib/core/harness/conformance/test_langgraph_conformance.py b/tests/lib/core/harness/conformance/test_langgraph_conformance.py new file mode 100644 index 000000000..721d6aac5 --- /dev/null +++ b/tests/lib/core/harness/conformance/test_langgraph_conformance.py @@ -0,0 +1,229 @@ +"""Cross-channel conformance fixtures for LangGraph harness tap. + +Each fixture is built as a canonical sequence of ``StreamTaskMessage*`` events +that matches what ``convert_langgraph_to_agentex_events`` (via ``LangGraphTurn``) +emits for the given scenario. The fixtures are registered with the shared +conformance runner and exercised by both the cross-channel equivalence test +(yield_events vs auto_send) and the backward-compatible span-derivation test. + +LangGraph-specific note +----------------------- +LangGraph emits tool *requests* as ``StreamTaskMessageFull`` events (from the +"updates" stream), NOT as Start+Delta+Done like pydantic-ai. ``auto_send`` +handles Full events by opening a streaming context with the full content and +closing it immediately, so both channels deliver the same logical payload. +No ``coalesce_tool_requests`` option is needed. 
+""" + +from __future__ import annotations + +import pytest + +from agentex.types.text_content import TextContent +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta + +from .runner import Fixture, register, derive_all, run_cross_channel_conformance + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +_TEXT_ONLY = Fixture( + name="langgraph-text-only", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Hello from LangGraph!"), + ), + StreamTaskMessageDone(type="done", index=0), + ], +) + +_SINGLE_TOOL = Fixture( + name="langgraph-single-tool", + events=[ + # LangGraph tool request is a Full event (from "updates" stream) + StreamTaskMessageFull( + type="full", + index=0, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_1", + name="get_weather", + arguments={"city": "Paris"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=1, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_1", + name="get_weather", + content="Sunny, 72F", + ), + ), + StreamTaskMessageStart( + type="start", + index=2, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=2, + delta=TextDelta(type="text", text_delta="The 
weather in Paris is sunny, 72F."), + ), + StreamTaskMessageDone(type="done", index=2), + ], +) + +_REASONING = Fixture( + name="langgraph-reasoning", + events=[ + StreamTaskMessageStart( + type="start", + index=0, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=[], + content=[], + style="active", + ), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta="Thinking about this...", + ), + ), + StreamTaskMessageDone(type="done", index=0), + StreamTaskMessageStart( + type="start", + index=1, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=1, + delta=TextDelta(type="text", text_delta="The answer is 42."), + ), + StreamTaskMessageDone(type="done", index=1), + ], +) + +_MULTI_STEP = Fixture( + name="langgraph-multi-step", + events=[ + # Turn 1: streaming text + StreamTaskMessageStart( + type="start", + index=0, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=0, + delta=TextDelta(type="text", text_delta="Let me search for that."), + ), + StreamTaskMessageDone(type="done", index=0), + # Tool request (Full — from "updates" stream) + StreamTaskMessageFull( + type="full", + index=1, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id="call_2", + name="search", + arguments={"query": "langgraph"}, + ), + ), + StreamTaskMessageFull( + type="full", + index=2, + content=ToolResponseContent( + type="tool_response", + author="agent", + tool_call_id="call_2", + name="search", + content="LangGraph is a framework for...", + ), + ), + # Turn 2: final streaming text + StreamTaskMessageStart( + type="start", + index=3, + content=TextContent(type="text", author="agent", content=""), + ), + StreamTaskMessageDelta( + type="delta", + index=3, + delta=TextDelta(type="text", text_delta="Based on my 
research, LangGraph is..."), + ), + StreamTaskMessageDone(type="done", index=3), + ], +) + +_LANGGRAPH_FIXTURES = [_TEXT_ONLY, _SINGLE_TOOL, _REASONING, _MULTI_STEP] + +for _fixture in _LANGGRAPH_FIXTURES: + register(_fixture) + + +# --------------------------------------------------------------------------- +# Cross-channel conformance: logical equivalence + span equivalence +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _LANGGRAPH_FIXTURES, ids=lambda f: f.name) +@pytest.mark.asyncio +async def test_cross_channel_equivalence(fixture: Fixture) -> None: + """Assert that yield_events and auto_send produce equivalent logical + deliveries and identical span signals for each LangGraph fixture. + + See runner.py for the full contract. The key LangGraph difference: tool + requests arrive as Full events rather than Start+Delta+Done, so auto_send + handles them by opening a streaming context with the full content and + closing it immediately — both channels produce the same LogicalDelivery. 
+ """ + yield_deliveries, auto_deliveries, yield_spans, auto_spans = await run_cross_channel_conformance(fixture) + + assert yield_deliveries == auto_deliveries, ( + f"[{fixture.name}] logical deliveries differ:\n yield: {yield_deliveries}\n auto_send: {auto_deliveries}" + ) + assert yield_spans == auto_spans, ( + f"[{fixture.name}] span signals differ:\n yield: {yield_spans}\n auto_send: {auto_spans}" + ) + + +# --------------------------------------------------------------------------- +# Backward-compatible determinism guard +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("fixture", _LANGGRAPH_FIXTURES, ids=lambda f: f.name) +def test_span_derivation_is_deterministic(fixture: Fixture) -> None: + """Span derivation over the same event list is idempotent.""" + assert derive_all(fixture.events) == derive_all(fixture.events) diff --git a/tests/lib/core/harness/test_harness_langgraph_async.py b/tests/lib/core/harness/test_harness_langgraph_async.py new file mode 100644 index 000000000..39bf5bc66 --- /dev/null +++ b/tests/lib/core/harness/test_harness_langgraph_async.py @@ -0,0 +1,298 @@ +"""Integration test: async (Redis-streaming) channel with a LangGraph agent. + +Exercises the unified harness surface (UnifiedEmitter.auto_send_turn + LangGraphTurn) +with a minimal fake LangGraph stream so the test runs fully offline (no API +keys, no Redis, no Agentex server). + +Agent description +----------------- +A simulated single-tool agent run using hand-crafted LangGraph event tuples: +one tool request + response, followed by a final text reply. + +What is tested +-------------- +- The async handler pushes the correct sequence of messages to the fake streaming + backend: Full(ToolRequest) + Full(ToolResponse) + text Start/Delta/Done. +- final_text uses last-segment semantics: a multi-step turn returns only its last text segment. +- Tool messages go through streaming_task_message_context (not messages.create). 
+- With a SpanTracer, Full(ToolRequest) opens a tool span and the matching + Full(ToolResponse) closes it (AGX1-377). + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual Redis streaming (requires a running Redis instance). +- The ACP on_task_event_send / on_task_create / on_task_cancel lifecycle. +- Real LLM calls or real LangGraph graph execution. +- The full FastACP async request lifecycle. + +See also: test_harness_langgraph_sync.py and test_harness_langgraph_temporal.py +for the other two channels. +""" + +from __future__ import annotations + +import sys +from typing import Any +from dataclasses import field, dataclass + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.types import TurnResult +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake streaming backend (replaces adk.streaming; no Redis required) +# --------------------------------------------------------------------------- + + +@dataclass +class _FakeCtx: 
+ ctype: str + initial_content: Any + task_message: TaskMessage + closed: bool = False + deltas: list[Any] = field(default_factory=list) + + async def __aenter__(self) -> "_FakeCtx": + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.closed = True + + async def stream_update(self, update: Any) -> Any: + self.deltas.append(update) + return update + + +class _FakeStreaming: + def __init__(self) -> None: + self.contexts: list[_FakeCtx] = [] + + def streaming_task_message_context(self, task_id: str, initial_content: Any, **kw: Any) -> _FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + tm = TaskMessage(id=f"m{len(self.contexts) + 1}", task_id=task_id, content=initial_content) + ctx = _FakeCtx(ctype=ctype, initial_content=initial_content, task_message=tm) + self.contexts.append(ctx) + return ctx + + +# --------------------------------------------------------------------------- +# Fake tracing backend +# --------------------------------------------------------------------------- + + +class _FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, Any]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span(self, *, trace_id: str, name: str, **kw: Any) -> _FakeSpan: + self.started.append((name, kw.get("parent_id"))) + return _FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: _FakeSpan) -> None: + self.ended.append((span.name, span.output)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +async def _run_auto_send_turn( + stream_events: list[tuple[str, Any]], + 
trace_id: str | None = None, +) -> tuple[TurnResult, _FakeStreaming, _FakeTracing | None]: + fake_streaming = _FakeStreaming() + fake_tracing = _FakeTracing() if trace_id else None + + tracer: SpanTracer | bool = False + if trace_id and fake_tracing is not None: + tracer = SpanTracer(trace_id=trace_id, parent_span_id=None, task_id="task1", tracing=fake_tracing) + + turn = LangGraphTurn(_make_stream(stream_events), model=None) + emitter = UnifiedEmitter( + task_id="task1", + trace_id=trace_id, + parent_span_id=None, + tracer=tracer, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + return result, fake_streaming, fake_tracing + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestAsyncAutoSendChannel: + async def test_text_only_streams_text_and_returns_final(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello from LangGraph!") + ai_msg = AIMessage(content="Hello from LangGraph!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + result, fake_streaming, _ = await _run_auto_send_turn(events) + + assert result.final_text == "Hello from LangGraph!" 
+ text_ctxs = [c for c in fake_streaming.contexts if c.ctype == "text"] + assert len(text_ctxs) == 1 + assert text_ctxs[0].closed is True + + async def test_tool_call_posted_via_streaming_context(self): + from langchain_core.messages import AIMessage + + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + result, fake_streaming, _ = await _run_auto_send_turn(events) + + # Tool request via streaming_task_message_context (Full event) + tool_req_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolRequestContent)] + assert len(tool_req_ctxs) == 1 + assert tool_req_ctxs[0].initial_content.tool_call_id == "call_1" + assert tool_req_ctxs[0].closed is True + assert tool_req_ctxs[0].deltas == [], "Full messages have no deltas" + + async def test_tool_response_posted_via_streaming_context(self): + from langchain_core.messages import ToolMessage + + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + events = [("updates", {"tools": {"messages": [tool_msg]}})] + + _, fake_streaming, _ = await _run_auto_send_turn(events) + + tool_resp_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolResponseContent)] + assert len(tool_resp_ctxs) == 1 + assert tool_resp_ctxs[0].initial_content.content == "Sunny, 72F" + assert tool_resp_ctxs[0].closed is True + + async def test_multi_step_final_text_is_last_segment(self): + """Unified surface: final_text uses last-segment semantics. + + auto_send resets final_text_parts when a new Start(TextContent) is seen, + so multi-step turns (text -> tool -> text) return only the LAST text segment. + This matches the behaviour documented in auto_send.py and mirrors + stream_pydantic_ai_events. 
+ """ + from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk + + chunk1 = AIMessageChunk(content="Searching...") + ai_msg1 = AIMessage(content="Searching...", tool_calls=[{"id": "c1", "name": "s", "args": {}}]) + tool_msg = ToolMessage(content="results", tool_call_id="c1", name="s") + chunk2 = AIMessageChunk(content="Found it!") + ai_msg2 = AIMessage(content="Found it!") + + events = [ + ("messages", (chunk1, {})), + ("updates", {"agent": {"messages": [ai_msg1]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("messages", (chunk2, {})), + ("updates", {"agent": {"messages": [ai_msg2]}}), + ] + result, fake_streaming, _ = await _run_auto_send_turn(events) + + # Last segment only — first text segment is NOT in final_text + assert result.final_text == "Found it!" + + # Two text streaming contexts still opened (both streamed to Redis) + text_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, TextContent)] + assert len(text_ctxs) == 2 + + async def test_empty_stream_returns_empty_final_text(self): + result, fake_streaming, _ = await _run_auto_send_turn([]) + assert result.final_text == "" + assert fake_streaming.contexts == [] + + async def test_turn_usage_populated_after_events_consumed(self): + """LangGraphTurn.usage() is populated via the on_final_ai_message callback + during event iteration. 
TurnResult.usage is a snapshot from before events run + (emitter.auto_send_turn evaluates turn.usage() eagerly); the authoritative + post-iteration usage is on turn.usage() directly.""" + from langchain_core.messages import AIMessage + + fake_streaming = _FakeStreaming() + usage_meta = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="hi", usage_metadata=usage_meta) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + turn = LangGraphTurn(_make_stream(events), model="gpt-4") + emitter = UnifiedEmitter( + task_id="task1", trace_id=None, parent_span_id=None, tracer=False, streaming=fake_streaming + ) + await emitter.auto_send_turn(turn) + + # After auto_send_turn, turn.usage() has the captured values + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.total_tokens == 15 + + async def test_tracer_produces_tool_spans_for_full_events(self): + """AGX1-377: SpanDeriver now handles Full tool events (request opens, response closes). + + Full(ToolRequestContent) opens a tool span; Full(ToolResponseContent) closes it. + This aligns LangGraph tracing with Start+Done harnesses (pydantic-ai, openai-agents). 
+ """ + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="ok", tool_call_id="c1", name="t") + + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + _, _, fake_tracing = await _run_auto_send_turn(events, trace_id="trace-1") + + assert fake_tracing is not None + assert len(fake_tracing.started) == 1, "Full(ToolRequestContent) opens one tool span" + assert fake_tracing.started[0][0] == "t", "span name matches the tool name" + assert len(fake_tracing.ended) == 1, "Full(ToolResponseContent) closes the span" diff --git a/tests/lib/core/harness/test_harness_langgraph_sync.py b/tests/lib/core/harness/test_harness_langgraph_sync.py new file mode 100644 index 000000000..9f67dd2b6 --- /dev/null +++ b/tests/lib/core/harness/test_harness_langgraph_sync.py @@ -0,0 +1,229 @@ +"""Integration test: sync (HTTP-yield) channel with a LangGraph agent. + +Exercises the unified harness surface (UnifiedEmitter.yield_turn + LangGraphTurn) +with a minimal fake LangGraph stream so the test runs fully offline (no API +keys, no Redis, no Agentex server). + +Agent description +----------------- +A simulated single-tool agent run using hand-crafted LangGraph event tuples: +one tool request + response, followed by a final text reply. + +What is tested +-------------- +- The sync handler correctly yields StreamTaskMessage* events in order: + Full(ToolRequest) then Full(ToolResponse) then text Start+Delta+Done. +- With trace_id + fake tracing, the SpanDeriver fires for text events. +- LangGraph emits tool calls as Full events (not Start+Done); the SpanDeriver + opens a tool span on Full(ToolRequestContent) and closes it on the matching + Full(ToolResponseContent) (see test_tracer_produces_tool_spans_for_full_events). +- Final text is accumulated via yield mode. 
+ +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual HTTP streaming over the ACP sync endpoint. +- Real LLM calls or real LangGraph graph execution. +- The full FastACP request/response lifecycle. + +See also: test_harness_langgraph_async.py and test_harness_langgraph_temporal.py +for the other two channels. +""" + +from __future__ import annotations + +import sys +from typing import Any + +import pytest + +from agentex.lib.core.harness.tracer import SpanTracer +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.task_message_update import ( + StreamTaskMessageFull, + StreamTaskMessageStart, +) +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake tracing backend +# --------------------------------------------------------------------------- + + +class _FakeSpan: + def __init__(self, name: str) -> None: + self.name = name + self.output: Any = None + + +class _FakeTracing: + def __init__(self) -> None: + self.started: list[tuple[str, Any]] = [] + self.ended: list[tuple[str, Any]] = [] + + async def start_span( + self, *, trace_id: str, name: str, input: Any = None, parent_id: Any = None, **kw: Any + ) -> 
_FakeSpan: + self.started.append((name, parent_id)) + return _FakeSpan(name) + + async def end_span(self, *, trace_id: str, span: _FakeSpan) -> None: + self.ended.append((span.name, span.output)) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +async def _run_yield_turn( + stream_events: list[tuple[str, Any]], trace_id: str | None = None +) -> tuple[list[Any], _FakeTracing | None]: + fake_tracing = _FakeTracing() if trace_id else None + tracer: SpanTracer | bool | None = None + if trace_id and fake_tracing is not None: + tracer = SpanTracer(trace_id=trace_id, parent_span_id=None, task_id="task1", tracing=fake_tracing) + + emitter = UnifiedEmitter( + task_id="task1", + trace_id=trace_id, + parent_span_id=None, + tracer=tracer if tracer is not None else False, + ) + turn = LangGraphTurn(_make_stream(stream_events), model=None) + out = [e async for e in emitter.yield_turn(turn)] + return out, fake_tracing + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestSyncYieldChannel: + async def test_text_only_stream_yields_start_delta_done(self): + from langchain_core.messages import AIMessage, AIMessageChunk + + chunk = AIMessageChunk(content="Hello from LangGraph!") + ai_msg = AIMessage(content="Hello from LangGraph!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + out, _ = await _run_yield_turn(events) + + types = [type(e).__name__ for e in out] + assert "StreamTaskMessageStart" in types + assert "StreamTaskMessageDelta" in types + assert "StreamTaskMessageDone" in types + + async def test_tool_call_yields_full_events(self): + from 
langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "call_1", "name": "get_weather", "args": {"city": "Paris"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="Sunny, 72F", tool_call_id="call_1", name="get_weather") + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + out, _ = await _run_yield_turn(events) + + full_events = [e for e in out if isinstance(e, StreamTaskMessageFull)] + assert len(full_events) == 2 + + contents = [e.content for e in full_events] + assert any(isinstance(c, ToolRequestContent) for c in contents) + assert any(isinstance(c, ToolResponseContent) for c in contents) + + async def test_multi_step_yields_events_in_order(self): + from langchain_core.messages import AIMessage, ToolMessage, AIMessageChunk + + chunk1 = AIMessageChunk(content="Searching...") + ai_msg1 = AIMessage(content="Searching...", tool_calls=[{"id": "c1", "name": "search", "args": {"q": "test"}}]) + tool_msg = ToolMessage(content="results", tool_call_id="c1", name="search") + chunk2 = AIMessageChunk(content="Found it!") + ai_msg2 = AIMessage(content="Found it!") + + events = [ + ("messages", (chunk1, {})), + ("updates", {"agent": {"messages": [ai_msg1]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("messages", (chunk2, {})), + ("updates", {"agent": {"messages": [ai_msg2]}}), + ] + out, _ = await _run_yield_turn(events) + + # Should have multiple start events (one per text segment) + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + assert len(starts) >= 2 + # And two Full events (tool req + tool resp) + fulls = [e for e in out if isinstance(e, StreamTaskMessageFull)] + assert len(fulls) == 2 + + async def test_empty_stream_yields_nothing(self): + out, _ = await _run_yield_turn([]) + assert out == [] + + async def test_tracer_produces_tool_spans_for_full_events(self): + """AGX1-377: SpanDeriver now handles Full tool events 
(request opens, response closes). + + Full(ToolRequestContent) opens a tool span; Full(ToolResponseContent) closes it. + This aligns LangGraph tracing with Start+Done harnesses (pydantic-ai, openai-agents). + """ + from langchain_core.messages import AIMessage, ToolMessage + + tc = {"id": "c1", "name": "t", "args": {}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="ok", tool_call_id="c1", name="t") + + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ] + _, fake_tracing = await _run_yield_turn(events, trace_id="trace-1") + + assert fake_tracing is not None + assert len(fake_tracing.started) == 1, "Full(ToolRequestContent) opens one tool span" + assert fake_tracing.started[0][0] == "t", "span name matches the tool name" + assert len(fake_tracing.ended) == 1, "Full(ToolResponseContent) closes the span" + + async def test_usage_captured_after_yield(self): + from langchain_core.messages import AIMessage + + usage_meta = {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + ai_msg = AIMessage(content="Hi!", usage_metadata=usage_meta) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + turn = LangGraphTurn(_make_stream(events), model="gpt-4") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + _ = [e async for e in emitter.yield_turn(turn)] + + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 diff --git a/tests/lib/core/harness/test_harness_langgraph_temporal.py b/tests/lib/core/harness/test_harness_langgraph_temporal.py new file mode 100644 index 000000000..1a094a33c --- /dev/null +++ b/tests/lib/core/harness/test_harness_langgraph_temporal.py @@ -0,0 +1,233 @@ +"""Integration test: Temporal channel with a LangGraph agent. + +The Temporal LangGraph agent pattern uses ``emit_langgraph_messages`` (from +``_langgraph_messages.py``) inside a Temporal activity. 
That module is not +yet unified onto the harness surface (it has its own Redis-streaming code). + +This test file verifies the LangGraph Temporal agent's streaming behavior using +the same fake streaming infrastructure as test_harness_langgraph_async.py. The +key difference from the non-temporal async path is that in Temporal, each agent +turn runs inside a Temporal activity that has already been handed the task_id +and a pre-wired streaming client — so the ``UnifiedEmitter.auto_send_turn`` +path is identical. The graph activities and workflow scaffolding are not tested +here; that requires a running Temporal cluster. + +What is tested +-------------- +- stream_langgraph_events (the public async API used by temporal agent acp.py via + the workflow activity) produces the same result via the unified surface. +- Usage from AIMessage.usage_metadata is readable from turn.usage() after iteration. +- The auto_send_turn path for a temporal-style call (same as async). + +What is NOT covered without live infrastructure +----------------------------------------------- +- Actual Temporal workflow execution (requires a running Temporal cluster). +- The Temporal activity retry/compensation logic. +- LangGraph checkpoint storage via TemporalCheckpointer. +- emit_langgraph_messages (the Temporal-specific streaming helper). +- Real LLM calls or real LangGraph graph execution. + +See also: test_harness_langgraph_sync.py and test_harness_langgraph_async.py. 
+""" + +from __future__ import annotations + +import sys +from typing import Any +from dataclasses import field, dataclass + +import pytest + +from agentex.types.task_message import TaskMessage +from agentex.types.text_content import TextContent +from agentex.lib.core.harness.emitter import UnifiedEmitter +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._langgraph_turn import LangGraphTurn +from agentex.lib.adk._modules._langgraph_async import stream_langgraph_events + +# --------------------------------------------------------------------------- +# Remove conftest stubs so real langchain_core types are used +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def _real_langchain_core(): + stub_keys = [k for k in sys.modules if k.startswith("langchain_core") or k.startswith("langgraph")] + saved = {k: sys.modules.pop(k) for k in stub_keys} + import importlib + + importlib.import_module("langchain_core.messages") + yield + sys.modules.update(saved) + + +# --------------------------------------------------------------------------- +# Fake streaming backend +# --------------------------------------------------------------------------- + + +@dataclass +class _FakeCtx: + ctype: str + initial_content: Any + task_message: TaskMessage + closed: bool = False + deltas: list[Any] = field(default_factory=list) + + async def __aenter__(self) -> "_FakeCtx": + return self + + async def __aexit__(self, *args: Any) -> bool: + await self.close() + return False + + async def close(self) -> None: + self.closed = True + + async def stream_update(self, update: Any) -> Any: + self.deltas.append(update) + return update + + +class _FakeStreaming: + def __init__(self) -> None: + self.contexts: list[_FakeCtx] = [] + + def streaming_task_message_context(self, task_id: str, initial_content: Any, **kw: Any) -> 
_FakeCtx: + ctype = getattr(initial_content, "type", None) or "" + tm = TaskMessage(id=f"m{len(self.contexts) + 1}", task_id=task_id, content=initial_content) + ctx = _FakeCtx(ctype=ctype, initial_content=initial_content, task_message=tm) + self.contexts.append(ctx) + return ctx + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stream(events: list[tuple[str, Any]]): + async def _gen(): + for e in events: + yield e + + return _gen() + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +class TestTemporalAutoSendChannel: + async def test_stream_langgraph_events_plain_text(self, monkeypatch): + """stream_langgraph_events (used by temporal agents via the acp.py activity) returns + the accumulated final text.""" + from langchain_core.messages import AIMessage, AIMessageChunk + + from agentex.lib import adk as adk_module + + fake_streaming = _FakeStreaming() + monkeypatch.setattr(adk_module, "streaming", fake_streaming) + + chunk = AIMessageChunk(content="Hello Temporal!") + ai_msg = AIMessage(content="Hello Temporal!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + final = await stream_langgraph_events(_make_stream(events), "task-1") + assert final == "Hello Temporal!" 
+ + async def test_stream_langgraph_events_tool_call(self, monkeypatch): + from langchain_core.messages import AIMessage, ToolMessage + + from agentex.lib import adk as adk_module + + fake_streaming = _FakeStreaming() + monkeypatch.setattr(adk_module, "streaming", fake_streaming) + + tc = {"id": "c1", "name": "search", "args": {"q": "test"}} + ai_msg = AIMessage(content="", tool_calls=[tc]) + tool_msg = ToolMessage(content="results", tool_call_id="c1", name="search") + chunk_final = AIMessage(content="Here are the results.") + + events = [ + ("updates", {"agent": {"messages": [ai_msg]}}), + ("updates", {"tools": {"messages": [tool_msg]}}), + ("updates", {"agent": {"messages": [chunk_final]}}), + ] + + final = await stream_langgraph_events(_make_stream(events), "task-1") + + # Check tool request and response posted to fake streaming + tool_req_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolRequestContent)] + tool_resp_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, ToolResponseContent)] + assert len(tool_req_ctxs) == 1 + assert len(tool_resp_ctxs) == 1 + assert tool_req_ctxs[0].initial_content.name == "search" + + async def test_langgraph_turn_auto_send_via_unified_emitter(self): + """Direct UnifiedEmitter.auto_send_turn path used by temporal agent workflow + activities. Uses a fake streaming backend (no Redis).""" + from langchain_core.messages import AIMessage, AIMessageChunk + + fake_streaming = _FakeStreaming() + chunk = AIMessageChunk(content="Temporal answer!") + ai_msg = AIMessage(content="Temporal answer!") + events = [ + ("messages", (chunk, {})), + ("updates", {"agent": {"messages": [ai_msg]}}), + ] + + turn = LangGraphTurn(_make_stream(events), model=None) + emitter = UnifiedEmitter( + task_id="task-1", + trace_id=None, + parent_span_id=None, + streaming=fake_streaming, + ) + result = await emitter.auto_send_turn(turn) + + assert result.final_text == "Temporal answer!" 
+ text_ctxs = [c for c in fake_streaming.contexts if isinstance(c.initial_content, TextContent)] + assert len(text_ctxs) == 1 + + async def test_usage_captured_via_turn_after_events_consumed(self): + """Usage from AIMessage.usage_metadata is captured via the on_final_ai_message + callback during event iteration. The authoritative usage is on turn.usage() + after events are consumed (emitter.auto_send_turn evaluates turn.usage() + eagerly before iteration, so TurnResult.usage is a pre-iteration snapshot).""" + from langchain_core.messages import AIMessage + + fake_streaming = _FakeStreaming() + usage_meta = {"input_tokens": 20, "output_tokens": 10, "total_tokens": 30} + ai_msg = AIMessage(content="answer", usage_metadata=usage_meta) + events = [("updates", {"agent": {"messages": [ai_msg]}})] + + turn = LangGraphTurn(_make_stream(events), model="gpt-4o") + emitter = UnifiedEmitter( + task_id="task-1", + trace_id=None, + parent_span_id=None, + streaming=fake_streaming, + ) + await emitter.auto_send_turn(turn) + + # After auto_send_turn, turn.usage() has the captured values + usage = turn.usage() + assert usage.input_tokens == 20 + assert usage.output_tokens == 10 + assert usage.total_tokens == 30 + + async def test_empty_stream_returns_empty_string(self, monkeypatch): + from agentex.lib import adk as adk_module + + fake_streaming = _FakeStreaming() + monkeypatch.setattr(adk_module, "streaming", fake_streaming) + + final = await stream_langgraph_events(_make_stream([]), "task-1") + assert final == "" + assert fake_streaming.contexts == [] From 9b2b03144cc67bb497e0a301686207aba2629758 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Mon, 22 Jun 2026 18:45:46 -0400 Subject: [PATCH 08/10] feat(codex): event-stream parser tap for the unified harness surface (#421) --- .../00_sync/harness_codex/Dockerfile | 50 ++ .../tutorials/00_sync/harness_codex/README.md | 40 ++ .../00_sync/harness_codex/conftest.py | 12 + .../00_sync/harness_codex/manifest.yaml | 58 ++ 
.../00_sync/harness_codex/project/__init__.py | 0 .../00_sync/harness_codex/project/acp.py | 175 +++++ .../00_sync/harness_codex/pyproject.toml | 38 + .../00_sync/harness_codex/tests/test_agent.py | 176 +++++ .../10_async/00_base/harness_codex/Dockerfile | 39 + .../10_async/00_base/harness_codex/README.md | 40 ++ .../00_base/harness_codex/conftest.py | 12 + .../00_base/harness_codex/manifest.yaml | 58 ++ .../00_base/harness_codex/project/__init__.py | 0 .../00_base/harness_codex/project/acp.py | 230 ++++++ .../00_base/harness_codex/pyproject.toml | 38 + .../00_base/harness_codex/tests/test_agent.py | 188 +++++ .../10_temporal/harness_codex/Dockerfile | 42 ++ .../10_temporal/harness_codex/README.md | 48 ++ .../10_temporal/harness_codex/conftest.py | 17 + .../10_temporal/harness_codex/manifest.yaml | 62 ++ .../harness_codex/project/__init__.py | 0 .../10_temporal/harness_codex/project/acp.py | 32 + .../harness_codex/project/activities.py | 145 ++++ .../harness_codex/project/run_worker.py | 41 ++ .../harness_codex/project/workflow.py | 145 ++++ .../10_temporal/harness_codex/pyproject.toml | 40 ++ .../harness_codex/tests/test_agent.py | 275 +++++++ src/agentex/lib/adk/__init__.py | 6 + src/agentex/lib/adk/_modules/_codex_sync.py | 587 +++++++++++++++ src/agentex/lib/adk/_modules/_codex_turn.py | 214 ++++++ tests/lib/adk/test_codex_sync.py | 671 ++++++++++++++++++ tests/lib/adk/test_codex_turn.py | 282 ++++++++ .../conformance/test_codex_conformance.py | 225 ++++++ 33 files changed, 3986 insertions(+) create mode 100644 examples/tutorials/00_sync/harness_codex/Dockerfile create mode 100644 examples/tutorials/00_sync/harness_codex/README.md create mode 100644 examples/tutorials/00_sync/harness_codex/conftest.py create mode 100644 examples/tutorials/00_sync/harness_codex/manifest.yaml create mode 100644 examples/tutorials/00_sync/harness_codex/project/__init__.py create mode 100644 examples/tutorials/00_sync/harness_codex/project/acp.py create mode 100644 
examples/tutorials/00_sync/harness_codex/pyproject.toml create mode 100644 examples/tutorials/00_sync/harness_codex/tests/test_agent.py create mode 100644 examples/tutorials/10_async/00_base/harness_codex/Dockerfile create mode 100644 examples/tutorials/10_async/00_base/harness_codex/README.md create mode 100644 examples/tutorials/10_async/00_base/harness_codex/conftest.py create mode 100644 examples/tutorials/10_async/00_base/harness_codex/manifest.yaml create mode 100644 examples/tutorials/10_async/00_base/harness_codex/project/__init__.py create mode 100644 examples/tutorials/10_async/00_base/harness_codex/project/acp.py create mode 100644 examples/tutorials/10_async/00_base/harness_codex/pyproject.toml create mode 100644 examples/tutorials/10_async/00_base/harness_codex/tests/test_agent.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/Dockerfile create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/README.md create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/conftest.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/manifest.yaml create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/project/__init__.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/project/acp.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/project/activities.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/project/run_worker.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/project/workflow.py create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/pyproject.toml create mode 100644 examples/tutorials/10_async/10_temporal/harness_codex/tests/test_agent.py create mode 100644 src/agentex/lib/adk/_modules/_codex_sync.py create mode 100644 src/agentex/lib/adk/_modules/_codex_turn.py create mode 100644 tests/lib/adk/test_codex_sync.py create mode 100644 
tests/lib/adk/test_codex_turn.py create mode 100644 tests/lib/core/harness/conformance/test_codex_conformance.py diff --git a/examples/tutorials/00_sync/harness_codex/Dockerfile b/examples/tutorials/00_sync/harness_codex/Dockerfile new file mode 100644 index 000000000..72713b95d --- /dev/null +++ b/examples/tutorials/00_sync/harness_codex/Dockerfile @@ -0,0 +1,50 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +# Copy pyproject.toml and README.md to install dependencies +COPY 00_sync/harness_codex/pyproject.toml /app/harness_codex/pyproject.toml +COPY 00_sync/harness_codex/README.md /app/harness_codex/README.md + +WORKDIR /app/harness_codex + +# Copy the project code +COPY 00_sync/harness_codex/project /app/harness_codex/project + +# Copy the test files +COPY 00_sync/harness_codex/tests /app/harness_codex/tests + +# Copy shared test utilities +COPY test_utils /app/test_utils + +# Install the required Python packages with dev dependencies +RUN uv pip install --system .[dev] + +# Set environment variables +ENV PYTHONPATH=/app + +# Set test environment variables +ENV AGENT_NAME=s-harness-codex + +# Run the agent using uvicorn +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/00_sync/harness_codex/README.md b/examples/tutorials/00_sync/harness_codex/README.md new file mode 100644 index 000000000..5f3396cfa --- /dev/null +++ b/examples/tutorials/00_sync/harness_codex/README.md @@ -0,0 +1,40 @@ +# harness_codex (sync) + +Tutorial agent demonstrating the `convert_codex_to_agentex_events` tap, 
+`CodexTurn`, and `UnifiedEmitter` for a **sync** (HTTP-yield) ACP agent. + +## What this tutorial shows + +- Spawning `codex exec --json` as a **local asyncio subprocess** (no Scale sandbox). +- Wrapping the stdout line stream in a `CodexTurn`. +- Delivering every canonical `StreamTaskMessage*` event to the HTTP caller via + `UnifiedEmitter.yield_turn` (tracing as a side-effect). + +> **Production isolation note:** A tutorial agent runs the Codex CLI locally. +> Production-grade isolation (Scale sandbox, secret injection, MCP configuration) +> is handled by the golden agent at +> `teams/sgp/agents/golden_agent/project/harness/providers/codex.py`. + +## Live runs + +Live runs require: +1. The `codex` CLI on PATH: `npm install -g @openai/codex` +2. `OPENAI_API_KEY` set in the environment. + +## Running offline unit tests + +The offline tests inject a fake subprocess and never invoke the real CLI: + +```bash +cd /path/to/scale-agentex-python +uv run --all-packages --all-extras pytest examples/tutorials/00_sync/harness_codex/tests/test_agent.py -q +``` + +## Running live integration tests + +```bash +export CODEX_LIVE_TESTS=1 +export OPENAI_API_KEY=sk-... +# Start the agent server first, then: +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/00_sync/harness_codex/conftest.py b/examples/tutorials/00_sync/harness_codex/conftest.py new file mode 100644 index 000000000..bdd78994b --- /dev/null +++ b/examples/tutorials/00_sync/harness_codex/conftest.py @@ -0,0 +1,12 @@ +"""Add the agent's project root to sys.path so ``import project`` works. + +Also sets minimal environment variables so the FastACP and tracing modules +can be imported without a running agent server. 
+""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(__file__)) + +os.environ.setdefault("ACP_URL", "http://localhost:8000") diff --git a/examples/tutorials/00_sync/harness_codex/manifest.yaml b/examples/tutorials/00_sync/harness_codex/manifest.yaml new file mode 100644 index 000000000..52943f8f2 --- /dev/null +++ b/examples/tutorials/00_sync/harness_codex/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../ + include_paths: + - 00_sync/harness_codex + - test_utils + dockerfile: 00_sync/harness_codex/Dockerfile + dockerignore: 00_sync/harness_codex/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: sync + name: s-harness-codex + description: Sync tutorial agent driving the unified harness surface via local codex CLI subprocess + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "s-harness-codex" + description: "Sync tutorial agent driving the unified harness surface via local codex CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/00_sync/harness_codex/project/__init__.py b/examples/tutorials/00_sync/harness_codex/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/00_sync/harness_codex/project/acp.py b/examples/tutorials/00_sync/harness_codex/project/acp.py new file mode 100644 index 
000000000..bcb5e10df --- /dev/null +++ b/examples/tutorials/00_sync/harness_codex/project/acp.py @@ -0,0 +1,175 @@ +"""Sync ACP handler for the Codex CLI harness tutorial. + +Demonstrates the ``convert_codex_to_agentex_events`` tap + ``CodexTurn`` + +``UnifiedEmitter`` for a sync (HTTP-yield) ACP agent. + +The handler: +1. Spawns ``codex exec --json`` as a LOCAL asyncio subprocess (no sandbox). + This is correct for tutorials and local development; production isolation + is handled by the golden agent's Scale sandbox at + ``teams/sgp/agents/golden_agent/project/harness/providers/codex.py``. +2. Wraps the stdout line stream in a ``CodexTurn``. +3. Delivers every canonical ``StreamTaskMessage*`` event via + ``UnifiedEmitter.yield_turn``, which traces + yields each event back to + the HTTP caller in one pass. + +Live runs require: +- ``codex`` CLI on PATH (``npm install -g @openai/codex``) +- ``OPENAI_API_KEY`` set in the environment +""" + +from __future__ import annotations + +import os +import time +import codecs +import asyncio +from typing import AsyncGenerator +from collections.abc import AsyncIterator + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from agentex.lib.adk import CodexTurn +from agentex.lib.types.acp import SendMessageParams +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.types.task_message_update import TaskMessageUpdate +from agentex.types.task_message_content import TaskMessageContent +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", 
""), + ) +) + +acp = FastACP.create(acp_type="sync") + +MODEL = os.environ.get("CODEX_MODEL", "o4-mini") + + +async def _spawn_codex(model: str) -> asyncio.subprocess.Process: + """Spawn ``codex exec --json`` locally and return the live process. + + Injection seam: tests replace this function with a fake that returns a + mock process whose stdout yields pre-recorded event lines. + + The flags mirror the golden agent (codex.py in the golden agent repo): + --json machine-readable newline-delimited events + --skip-git-repo-check safe to run outside a git repo + --dangerously-bypass-approvals-and-sandbox + skip interactive approval prompts in a + non-interactive (server) context + --model which OpenAI model to use + + The caller writes the prompt to stdin after the process starts, then + closes stdin so codex knows input is complete. + """ + cmd = [ + "codex", + "exec", + "--json", + "--skip-git-repo-check", + "--dangerously-bypass-approvals-and-sandbox", + "--model", + model, + "-", # read prompt from stdin + ] + return await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + # Discard stderr: codex --json writes events to stdout; its stderr is + # progress/debug noise. Capturing it with PIPE but never reading it + # would deadlock once codex fills the OS pipe buffer (~64 KB). + stderr=asyncio.subprocess.DEVNULL, + env={**os.environ}, + ) + + +async def _process_stdout(process: asyncio.subprocess.Process) -> AsyncIterator[str]: + """Yield newline-delimited JSON lines from the process stdout. + + Uses an incremental UTF-8 decoder so a multibyte character split across two + 4 KB reads is decoded correctly instead of being corrupted at the boundary. 
+ """ + assert process.stdout is not None + decoder = codecs.getincrementaldecoder("utf-8")(errors="replace") + buffer = "" + while True: + chunk = await process.stdout.read(4096) + if not chunk: + break + buffer += decoder.decode(chunk) + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + buffer += decoder.decode(b"", final=True) + if buffer.strip(): + yield buffer.strip() + + +@acp.on_message_send +async def handle_message_send( + params: SendMessageParams, +) -> TaskMessageContent | list[TaskMessageContent] | AsyncGenerator[TaskMessageUpdate, None]: + """Handle each message by running ``codex exec`` locally and streaming events.""" + task_id = params.task.id + user_message = params.content.content + logger.info("Processing message for task %s", task_id) + + start_ms = int(time.monotonic() * 1000) + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name="message", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + process = await _spawn_codex(MODEL) + + # Write prompt to stdin then close it so codex knows input is done. + assert process.stdin is not None + process.stdin.write(user_message.encode("utf-8")) + await process.stdin.drain() + process.stdin.close() + + turn = CodexTurn( + events=_process_stdout(process), + model=MODEL, + ) + + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + + async for event in emitter.yield_turn(turn): + yield event + + await process.wait() + + # Record the real wall-clock duration AFTER streaming completes; setting + # it before the stream ran would capture only subprocess spawn overhead. 
+ turn.duration_ms = int(time.monotonic() * 1000) - start_ms + + if turn_span: + usage = turn.usage() + turn_span.output = { + "model": usage.model, + "input_tokens": usage.input_tokens, + "output_tokens": usage.output_tokens, + } diff --git a/examples/tutorials/00_sync/harness_codex/pyproject.toml b/examples/tutorials/00_sync/harness_codex/pyproject.toml new file mode 100644 index 000000000..ca7d8ac18 --- /dev/null +++ b/examples/tutorials/00_sync/harness_codex/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "s-harness-codex" +version = "0.1.0" +description = "Sync tutorial agent driving the unified harness surface via local codex CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 + +[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/examples/tutorials/00_sync/harness_codex/tests/test_agent.py b/examples/tutorials/00_sync/harness_codex/tests/test_agent.py new file mode 100644 index 000000000..b2d5b6498 --- /dev/null +++ b/examples/tutorials/00_sync/harness_codex/tests/test_agent.py @@ -0,0 +1,176 @@ +"""Tests for the sync Codex harness tutorial agent. + +LIVE tests (``TestLiveCodexAgent``): + - Require the ``codex`` CLI on PATH and ``OPENAI_API_KEY`` set. + - Run the full agent end-to-end against a live Agentex server. + - Skipped automatically when ``CODEX_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestOfflineCodexHandler``): + - Inject a fake async iterator of pre-recorded codex event lines. 
+ - Assert the ``CodexTurn`` + ``UnifiedEmitter`` pipeline yields events, + populates usage, and satisfies the ``HarnessTurn`` protocol. + - Always run. +""" + +from __future__ import annotations + +import os +import json +from typing import Any + +import pytest + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +SAMPLE_EVENTS: list[dict[str, Any]] = [ + {"type": "thread.started", "thread_id": "thread-abc"}, + {"type": "turn.started"}, + { + "type": "item.started", + "item": {"id": "msg-1", "type": "agent_message", "text": "Hello"}, + }, + { + "type": "item.completed", + "item": {"id": "msg-1", "type": "agent_message", "text": "Hello, world!"}, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + }, +] + + +async def _fake_event_stream(): + """Async iterator of pre-recorded codex event JSON lines (no subprocess).""" + for evt in SAMPLE_EVENTS: + yield json.dumps(evt) + + +class TestOfflineCodexHandler: + """Unit tests that run without a real codex CLI or network.""" + + @pytest.mark.asyncio + async def test_codex_turn_yields_stream_events(self): + """CodexTurn drives the unified surface and yields StreamTaskMessage* events.""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness import UnifiedEmitter + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + + events = [e async for e in emitter.yield_turn(turn)] + assert len(events) > 0, "No events yielded" + + types_seen = {type(e).__name__ for e in events} + known_types = { + "StreamTaskMessageStart", + "StreamTaskMessageDelta", + "StreamTaskMessageFull", + "StreamTaskMessageDone", + } + assert bool(types_seen & known_types), f"Unexpected event types: {types_seen}" + + @pytest.mark.asyncio + async def 
test_usage_populated_after_stream_exhausted(self): + """CodexTurn.usage() returns correct tokens after stream is exhausted.""" + from agentex.lib.adk import CodexTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + + collected = [e async for e in turn.events] + + usage = turn.usage() + assert usage.input_tokens == 10 + assert usage.output_tokens == 5 + assert usage.total_tokens == 15 + assert usage.model == "o4-mini" + + @pytest.mark.asyncio + async def test_codex_turn_protocol_compliance(self): + """CodexTurn satisfies the HarnessTurn protocol.""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness.types import HarnessTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + assert isinstance(turn, HarnessTurn), "CodexTurn does not satisfy HarnessTurn protocol" + + @pytest.mark.asyncio + async def test_unified_emitter_yield_passes_through_events(self): + """UnifiedEmitter.yield_turn passes events through unchanged in sync mode.""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness import UnifiedEmitter + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + + events = [e async for e in emitter.yield_turn(turn)] + assert len(events) > 0 + + @pytest.mark.asyncio + async def test_convert_codex_to_agentex_events_direct(self): + """convert_codex_to_agentex_events tap produces text start/done events.""" + from agentex.lib.adk import convert_codex_to_agentex_events + from agentex.types.task_message_update import StreamTaskMessageDone + + events = [e async for e in convert_codex_to_agentex_events(_fake_event_stream())] + assert any(isinstance(e, StreamTaskMessageDone) for e in events), ( + "Expected at least one StreamTaskMessageDone event" + ) + + @pytest.mark.asyncio + async def test_on_result_callback_receives_session_id(self): + """on_result callback receives the session_id from thread.started.""" + from 
agentex.lib.adk import convert_codex_to_agentex_events + + captured: list[dict] = [] + + events = [ + e + async for e in convert_codex_to_agentex_events( + _fake_event_stream(), + on_result=captured.append, + ) + ] + + assert len(captured) == 1 + assert captured[0]["session_id"] == "thread-abc" + assert captured[0]["tool_call_count"] == 0 + + +# --------------------------------------------------------------------------- +# Live tests (skipped unless CODEX_LIVE_TESTS=1) +# --------------------------------------------------------------------------- + +LIVE = os.environ.get("CODEX_LIVE_TESTS", "") == "1" +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "s-harness-codex") + + +@pytest.mark.skipif(not LIVE, reason="Set CODEX_LIVE_TESTS=1 and ensure codex CLI + OPENAI_API_KEY are available") +class TestLiveCodexAgent: + """End-to-end tests that require the real codex CLI and a running Agentex server.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + def test_send_simple_message(self, client): + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendMessageRequest + + response = client.agents.send_message( + agent_name=AGENT_NAME, + params=ParamsSendMessageRequest( + content=TextContentParam( + author="user", + content="What is 2+2? 
Reply with just the number.", + type="text", + ) + ), + ) + assert response.result is not None + assert len(response.result) >= 1 diff --git a/examples/tutorials/10_async/00_base/harness_codex/Dockerfile b/examples/tutorials/10_async/00_base/harness_codex/Dockerfile new file mode 100644 index 000000000..06b76aae2 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_codex/Dockerfile @@ -0,0 +1,39 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/00_base/harness_codex/pyproject.toml /app/harness_codex/pyproject.toml +COPY 10_async/00_base/harness_codex/README.md /app/harness_codex/README.md + +WORKDIR /app/harness_codex + +COPY 10_async/00_base/harness_codex/project /app/harness_codex/project +COPY 10_async/00_base/harness_codex/tests /app/harness_codex/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app +ENV AGENT_NAME=ab-harness-codex + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] diff --git a/examples/tutorials/10_async/00_base/harness_codex/README.md b/examples/tutorials/10_async/00_base/harness_codex/README.md new file mode 100644 index 000000000..9bbcd927a --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_codex/README.md @@ -0,0 +1,40 @@ +# harness_codex (async base) + +Tutorial agent demonstrating the `convert_codex_to_agentex_events` tap, +`CodexTurn`, and `UnifiedEmitter` for an **async** (Redis-streaming, no Temporal) +ACP agent. 
+ +## What this tutorial shows + +- Spawning `codex exec --json` as a **local asyncio subprocess** (no Scale sandbox). +- Wrapping the stdout line stream in a `CodexTurn`. +- Delivering every canonical `StreamTaskMessage*` event to Redis via + `UnifiedEmitter.auto_send_turn`, so the UI receives tokens in real time. +- Persisting the codex thread ID in `adk.state` so subsequent turns resume the + same codex session via `codex exec resume <thread_id>`. + +> **Production isolation note:** A tutorial agent runs the Codex CLI locally. +> Production-grade isolation (Scale sandbox, secret injection, MCP configuration) +> is handled by the golden agent at +> `teams/sgp/agents/golden_agent/project/harness/providers/codex.py`. + +## Live runs + +Live runs require: +1. The `codex` CLI on PATH: `npm install -g @openai/codex` +2. `OPENAI_API_KEY` set in the environment. + +## Running offline unit tests + +```bash +cd /path/to/scale-agentex-python +uv run --all-packages --all-extras pytest examples/tutorials/10_async/00_base/harness_codex/tests/test_agent.py -q +``` + +## Running live integration tests + +```bash +export CODEX_LIVE_TESTS=1 +export OPENAI_API_KEY=sk-... +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/00_base/harness_codex/conftest.py b/examples/tutorials/10_async/00_base/harness_codex/conftest.py new file mode 100644 index 000000000..bdd78994b --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_codex/conftest.py @@ -0,0 +1,12 @@ +"""Add the agent's project root to sys.path so ``import project`` works. + +Also sets minimal environment variables so the FastACP and tracing modules +can be imported without a running agent server. 
+""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(__file__)) + +os.environ.setdefault("ACP_URL", "http://localhost:8000") diff --git a/examples/tutorials/10_async/00_base/harness_codex/manifest.yaml b/examples/tutorials/10_async/00_base/harness_codex/manifest.yaml new file mode 100644 index 000000000..e88e2029d --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_codex/manifest.yaml @@ -0,0 +1,58 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/00_base/harness_codex + - test_utils + dockerfile: 10_async/00_base/harness_codex/Dockerfile + dockerignore: 10_async/00_base/harness_codex/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + +agent: + acp_type: async + name: ab-harness-codex + description: Async (base) tutorial agent driving the unified harness surface via local codex CLI subprocess + + temporal: + enabled: false + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "ab-harness-codex" + description: "Async (base) tutorial agent driving the unified harness surface via local codex CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git a/examples/tutorials/10_async/00_base/harness_codex/project/__init__.py b/examples/tutorials/10_async/00_base/harness_codex/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/examples/tutorials/10_async/00_base/harness_codex/project/acp.py b/examples/tutorials/10_async/00_base/harness_codex/project/acp.py new file mode 100644 index 000000000..0233c49ab --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_codex/project/acp.py @@ -0,0 +1,230 @@ +"""Async (base) ACP handler for the Codex CLI harness tutorial. + +Demonstrates the ``convert_codex_to_agentex_events`` tap + ``CodexTurn`` + +``UnifiedEmitter`` for an async (Redis-streaming) ACP agent without Temporal. + +The handler: +1. Spawns ``codex exec --json`` as a LOCAL asyncio subprocess (no sandbox). + This is correct for tutorials and local development; production isolation + is handled by the golden agent's Scale sandbox at + ``teams/sgp/agents/golden_agent/project/harness/providers/codex.py``. +2. Wraps the stdout line stream in a ``CodexTurn``. +3. Delivers every canonical ``StreamTaskMessage*`` event to Redis via + ``UnifiedEmitter.auto_send_turn``, so the UI receives tokens in real time. +4. Multi-turn memory is persisted via ``adk.state``. 
+ +Live runs require: +- ``codex`` CLI on PATH (``npm install -g @openai/codex``) +- ``OPENAI_API_KEY`` set in the environment +""" + +from __future__ import annotations + +import os +import time +import codecs +import asyncio +from collections.abc import AsyncIterator + +from dotenv import load_dotenv + +load_dotenv() + +import agentex.lib.adk as adk +from agentex.lib.adk import CodexTurn +from agentex.lib.types.acp import SendEventParams, CancelTaskParams, CreateTaskParams +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.types.fastacp import AsyncACPConfig +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.lib.utils.model_utils import BaseModel +from agentex.lib.sdk.fastacp.fastacp import FastACP +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +logger = make_logger(__name__) + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +acp = FastACP.create( + acp_type="async", + config=AsyncACPConfig(type="base"), +) + +MODEL = os.environ.get("CODEX_MODEL", "o4-mini") + + +class ConversationState(BaseModel): + """Per-task conversation state persisted via ``adk.state``. + + We store the codex session/thread ID so subsequent turns can resume the + same codex session via ``codex exec resume <thread_id>``. + """ + + codex_thread_id: str | None = None + turn_number: int = 0 + + +async def _spawn_codex( + model: str, + thread_id: str | None = None, +) -> asyncio.subprocess.Process: + """Spawn ``codex exec --json`` locally and return the live process. + + Injection seam: tests replace this function with a fake that returns a + mock process whose stdout yields pre-recorded event lines. + + When ``thread_id`` is provided the subcommand becomes + ``codex exec ... 
resume <thread_id> -`` so codex continues the prior + conversation thread. + + The caller writes the prompt to stdin after the process starts, then + closes stdin so codex knows input is complete. + """ + base_flags = [ + "--json", + "--skip-git-repo-check", + "--dangerously-bypass-approvals-and-sandbox", + "--model", + model, + ] + + if thread_id: + cmd = ["codex", "exec", *base_flags, "resume", thread_id, "-"] + else: + cmd = ["codex", "exec", *base_flags, "-"] + + return await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + # Discard stderr: codex --json writes events to stdout; its stderr is + # progress/debug noise. Capturing it with PIPE but never reading it + # would deadlock once codex fills the OS pipe buffer (~64 KB). + stderr=asyncio.subprocess.DEVNULL, + env={**os.environ}, + ) + + +async def _process_stdout(process: asyncio.subprocess.Process) -> AsyncIterator[str]: + """Yield newline-delimited JSON lines from the process stdout. + + Uses an incremental UTF-8 decoder so a multibyte character split across two + 4 KB reads is decoded correctly instead of being corrupted at the boundary. 
+ """ + assert process.stdout is not None + decoder = codecs.getincrementaldecoder("utf-8")(errors="replace") + buffer = "" + while True: + chunk = await process.stdout.read(4096) + if not chunk: + break + buffer += decoder.decode(chunk) + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + buffer += decoder.decode(b"", final=True) + if buffer.strip(): + yield buffer.strip() + + +@acp.on_task_create +async def handle_task_create(params: CreateTaskParams): + """Initialize per-task state on task creation.""" + logger.info("Task created: %s", params.task.id) + await adk.state.create( + task_id=params.task.id, + agent_id=params.agent.id, + state=ConversationState(), + ) + + +@acp.on_task_event_send +async def handle_task_event_send(params: SendEventParams): + """Handle each user message: spawn codex, stream events, save thread ID.""" + task_id = params.task.id + agent_id = params.agent.id + user_message = params.event.content.content + + logger.info("Processing message for task %s", task_id) + + await adk.messages.create(task_id=task_id, content=params.event.content) + + task_state = await adk.state.get_by_task_and_agent(task_id=task_id, agent_id=agent_id) + if task_state is None: + state = ConversationState() + task_state = await adk.state.create(task_id=task_id, agent_id=agent_id, state=state) + else: + state = ConversationState.model_validate(task_state.state) + + state.turn_number += 1 + + async with adk.tracing.span( + trace_id=task_id, + task_id=task_id, + name=f"Turn {state.turn_number}", + input={"message": user_message}, + data={"__span_type__": "AGENT_WORKFLOW"}, + ) as turn_span: + start_ms = int(time.monotonic() * 1000) + + process = await _spawn_codex(MODEL, thread_id=state.codex_thread_id) + + assert process.stdin is not None + process.stdin.write(user_message.encode("utf-8")) + await process.stdin.drain() + process.stdin.close() + + turn = CodexTurn( + events=_process_stdout(process), + 
model=MODEL, + ) + + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + + result = await emitter.auto_send_turn(turn) + + await process.wait() + + # Record the real wall-clock duration AFTER streaming completes; setting + # it before the stream ran would capture only subprocess spawn overhead. + turn.duration_ms = int(time.monotonic() * 1000) - start_ms + + # Persist the new thread ID so subsequent turns resume the same session. + usage = turn.usage() + if usage.model: + # usage() is valid now that the stream is exhausted + pass + # Persist the codex session id (public accessor; valid post-stream) so the + # next turn resumes the same session. + if turn.session_id: + state.codex_thread_id = turn.session_id + + await adk.state.update( + state_id=task_state.id, + task_id=task_id, + agent_id=agent_id, + state=state, + ) + + if turn_span: + turn_span.output = { + "final_text": result.final_text, + "model": usage.model, + } + + +@acp.on_task_cancel +async def handle_task_canceled(params: CancelTaskParams): + logger.info("Task canceled: %s", params.task.id) diff --git a/examples/tutorials/10_async/00_base/harness_codex/pyproject.toml b/examples/tutorials/10_async/00_base/harness_codex/pyproject.toml new file mode 100644 index 000000000..c25a65c47 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_codex/pyproject.toml @@ -0,0 +1,38 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "ab-harness-codex" +version = "0.1.0" +description = "Async (base) tutorial agent driving the unified harness surface via local codex CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length 
= 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 + +[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/examples/tutorials/10_async/00_base/harness_codex/tests/test_agent.py b/examples/tutorials/10_async/00_base/harness_codex/tests/test_agent.py new file mode 100644 index 000000000..b50ee9116 --- /dev/null +++ b/examples/tutorials/10_async/00_base/harness_codex/tests/test_agent.py @@ -0,0 +1,188 @@ +"""Tests for the async (base) Codex harness tutorial agent. + +LIVE tests (``TestLiveCodexAgent``): + - Require the ``codex`` CLI on PATH and ``OPENAI_API_KEY`` set. + - Skipped automatically when ``CODEX_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestOfflineCodexHandler``): + - Inject a fake async iterator of pre-recorded codex event lines. + - Assert ``CodexTurn`` + ``UnifiedEmitter.auto_send_turn`` is driven correctly. + - Always run. +""" + +from __future__ import annotations + +import os +import json +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +SAMPLE_EVENTS: list[dict[str, Any]] = [ + {"type": "thread.started", "thread_id": "thread-xyz"}, + {"type": "turn.started"}, + { + "type": "item.started", + "item": {"id": "msg-1", "type": "agent_message", "text": "Hi"}, + }, + { + "type": "item.completed", + "item": {"id": "msg-1", "type": "agent_message", "text": "Hi there!"}, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 8, "output_tokens": 4, "total_tokens": 12}, + }, +] + + +async def _fake_event_stream(): + """Async iterator of pre-recorded codex event JSON lines (no subprocess).""" + for evt in SAMPLE_EVENTS: + yield json.dumps(evt) + + +class TestOfflineCodexHandler: + """Unit tests that run without a real codex CLI or network.""" + + @pytest.mark.asyncio + async 
def test_usage_populated_after_stream_exhausted(self): + """CodexTurn.usage() returns non-None tokens after stream is exhausted.""" + from agentex.lib.adk import CodexTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + + collected = [e async for e in turn.events] + + usage = turn.usage() + assert usage.input_tokens == 8 + assert usage.output_tokens == 4 + assert usage.model == "o4-mini" + + @pytest.mark.asyncio + async def test_auto_send_turn_drives_unified_surface(self): + """auto_send_turn returns a TurnResult with the final text.""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message import TaskMessage + from agentex.types.text_content import TextContent + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + + real_task_msg = TaskMessage( + id="msg-fake", + task_id="t", + content=TextContent(type="text", author="agent", content=""), + ) + + fake_streaming = MagicMock() + fake_ctx = AsyncMock() + fake_ctx.__aenter__ = AsyncMock(return_value=fake_ctx) + fake_ctx.__aexit__ = AsyncMock(return_value=False) + fake_ctx.stream_update = AsyncMock(return_value=MagicMock()) + fake_ctx.close = AsyncMock() + fake_ctx.task_message = real_task_msg + fake_streaming.streaming_task_message_context = MagicMock(return_value=fake_ctx) + + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + streaming=fake_streaming, + ) + + result = await emitter.auto_send_turn(turn) + assert result is not None + + @pytest.mark.asyncio + async def test_session_id_captured_after_stream(self): + """CodexTurn._result captures the session_id from thread.started.""" + from agentex.lib.adk import CodexTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + _ = [e async for e in turn.events] + + assert turn._result is not None + assert turn._result["session_id"] == "thread-xyz" + + @pytest.mark.asyncio + async def 
test_yield_turn_is_passthrough(self): + """yield_turn mode also works with CodexTurn (no streaming infra needed).""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness import UnifiedEmitter + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + emitter = UnifiedEmitter(task_id="t", trace_id=None, parent_span_id=None) + + events = [e async for e in emitter.yield_turn(turn)] + assert len(events) > 0 + + +# --------------------------------------------------------------------------- +# Live tests +# --------------------------------------------------------------------------- + +LIVE = os.environ.get("CODEX_LIVE_TESTS", "") == "1" +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "ab-harness-codex") + + +@pytest.mark.skipif( + not LIVE, + reason="Set CODEX_LIVE_TESTS=1 and ensure codex CLI + OPENAI_API_KEY are available", +) +class TestLiveCodexAgent: + """End-to-end tests that require the real codex CLI and a running Agentex server.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + @pytest.fixture + def agent_id(self, client): + for agent in client.agents.list(): + if agent.name == AGENT_NAME: + return agent.id + raise ValueError(f"Agent {AGENT_NAME!r} not found.") + + def test_send_simple_message(self, client, agent_id: str): + """Async agents process events out of band, so create a task, send an + event, and poll the task's messages for the agent's response.""" + import time + import uuid + + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendEventRequest, ParamsCreateTaskRequest + + task = client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)).result + assert task is not None + + client.agents.send_event( + agent_id=agent_id, + params=ParamsSendEventRequest( + task_id=task.id, + 
content=TextContentParam( + author="user", + content="What is 3+3? Reply with just the number.", + type="text", + ), + ), + ) + + deadline = time.monotonic() + 60 + while time.monotonic() < deadline: + msgs = client.messages.list(task_id=task.id) + agent_msgs = [m for m in msgs if getattr(m.content, "author", None) == "agent"] + if agent_msgs: + assert len(agent_msgs) >= 1 + return + time.sleep(2) + + raise AssertionError("No agent response received within 60 s") diff --git a/examples/tutorials/10_async/10_temporal/harness_codex/Dockerfile b/examples/tutorials/10_async/10_temporal/harness_codex/Dockerfile new file mode 100644 index 000000000..e2f8807fd --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_codex/Dockerfile @@ -0,0 +1,42 @@ +# syntax=docker/dockerfile:1.3 +FROM python:3.12-slim +COPY --from=ghcr.io/astral-sh/uv:0.6.4 /uv /uvx /bin/ + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + htop \ + vim \ + curl \ + tar \ + python3-dev \ + postgresql-client \ + build-essential \ + libpq-dev \ + gcc \ + cmake \ + netcat-openbsd \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system --upgrade pip setuptools wheel + +ENV UV_HTTP_TIMEOUT=1000 + +COPY 10_async/10_temporal/harness_codex/pyproject.toml /app/harness_codex/pyproject.toml +COPY 10_async/10_temporal/harness_codex/README.md /app/harness_codex/README.md + +WORKDIR /app/harness_codex + +COPY 10_async/10_temporal/harness_codex/project /app/harness_codex/project +COPY 10_async/10_temporal/harness_codex/tests /app/harness_codex/tests +COPY test_utils /app/test_utils + +RUN uv pip install --system .[dev] + +ENV PYTHONPATH=/app +ENV AGENT_NAME=at-harness-codex + +CMD ["uvicorn", "project.acp:acp", "--host", "0.0.0.0", "--port", "8000"] + +# When deploying the worker, replace CMD with: +# CMD ["python", "-m", "project.run_worker"] diff --git a/examples/tutorials/10_async/10_temporal/harness_codex/README.md 
b/examples/tutorials/10_async/10_temporal/harness_codex/README.md new file mode 100644 index 000000000..4f9b76955 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_codex/README.md @@ -0,0 +1,48 @@ +# harness_codex (Temporal) + +Tutorial agent demonstrating the `convert_codex_to_agentex_events` tap, +`CodexTurn`, and `UnifiedEmitter` for a **Temporal-durable** async ACP agent. + +## What this tutorial shows + +- Spawning `codex exec --json` as a **local asyncio subprocess** (no Scale sandbox) + inside a Temporal workflow signal handler. +- Wrapping the stdout line stream in a `CodexTurn`. +- Delivering every canonical `StreamTaskMessage*` event to Redis via + `UnifiedEmitter.auto_send_turn`, passing `created_at=workflow.now()` for + deterministic Temporal replay timestamps. +- Keeping the codex thread ID on the workflow instance (durable across crashes + without an external `adk.state` round-trip). + +> **Production isolation note:** A tutorial agent runs the Codex CLI locally. +> Production-grade isolation (Scale sandbox, secret injection, MCP configuration) +> is handled by the golden agent at +> `teams/sgp/agents/golden_agent/project/harness/providers/codex.py`. + +> **Temporal determinism note:** Subprocess spawning happens inside +> `@workflow.signal` handler bodies. Temporal does NOT replay signal handler +> bodies (only `@workflow.run` is subject to replay constraints), so this is +> safe. A production agent would wrap the subprocess in a Temporal activity for +> full durability and retry semantics. + +## Live runs + +Live runs require: +1. The `codex` CLI on PATH: `npm install -g @openai/codex` +2. `OPENAI_API_KEY` set in the environment. +3. A running Temporal server. 
+ +## Running offline unit tests + +```bash +cd /path/to/scale-agentex-python +uv run --all-packages --all-extras pytest examples/tutorials/10_async/10_temporal/harness_codex/tests/test_agent.py -q +``` + +## Running live integration tests + +```bash +export CODEX_LIVE_TESTS=1 +export OPENAI_API_KEY=sk-... +pytest tests/test_agent.py -v +``` diff --git a/examples/tutorials/10_async/10_temporal/harness_codex/conftest.py b/examples/tutorials/10_async/10_temporal/harness_codex/conftest.py new file mode 100644 index 000000000..4ae6ce61a --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_codex/conftest.py @@ -0,0 +1,17 @@ +"""Add the agent's project root to sys.path so ``import project`` works. + +Also sets minimal environment variables so FastACP, tracing, and the +Temporal workflow module can be imported without a running server. +""" + +import os +import sys + +sys.path.insert(0, os.path.dirname(__file__)) + +# AGENT_NAME must match the manifest's agent name: the live test queries the +# server by this name, and project.workflow reads it at import time. 
+os.environ.setdefault("AGENT_NAME", "at-harness-codex") +os.environ.setdefault("ACP_URL", "http://localhost:8000") +os.environ.setdefault("WORKFLOW_NAME", "at-harness-codex") +os.environ.setdefault("WORKFLOW_TASK_QUEUE", "at_harness_codex_queue") diff --git a/examples/tutorials/10_async/10_temporal/harness_codex/manifest.yaml b/examples/tutorials/10_async/10_temporal/harness_codex/manifest.yaml new file mode 100644 index 000000000..3bc21dccc --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_codex/manifest.yaml @@ -0,0 +1,62 @@ +build: + context: + root: ../../../ + include_paths: + - 10_async/10_temporal/harness_codex + - test_utils + dockerfile: 10_async/10_temporal/harness_codex/Dockerfile + dockerignore: 10_async/10_temporal/harness_codex/.dockerignore + +local_development: + agent: + port: 8000 + host_address: host.docker.internal + paths: + acp: project/acp.py + worker: project/run_worker.py + +agent: + acp_type: async + name: at-harness-codex + description: Temporal tutorial agent driving the unified harness surface via local codex CLI subprocess + + temporal: + enabled: true + workflows: + - name: at-harness-codex + queue_name: at_harness_codex_queue + + credentials: + - env_var_name: OPENAI_API_KEY + secret_name: openai-api-key + secret_key: api-key + - env_var_name: REDIS_URL + secret_name: redis-url-secret + secret_key: url + - env_var_name: SGP_API_KEY + secret_name: sgp-api-key + secret_key: api-key + - env_var_name: SGP_ACCOUNT_ID + secret_name: sgp-account-id + secret_key: account-id + - env_var_name: SGP_CLIENT_BASE_URL + secret_name: sgp-client-base-url + secret_key: url + +deployment: + image: + repository: "" + tag: "latest" + + global: + agent: + name: "at-harness-codex" + description: "Temporal tutorial agent driving the unified harness surface via local codex CLI subprocess" + replicaCount: 1 + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" diff --git 
a/examples/tutorials/10_async/10_temporal/harness_codex/project/__init__.py b/examples/tutorials/10_async/10_temporal/harness_codex/project/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/tutorials/10_async/10_temporal/harness_codex/project/acp.py b/examples/tutorials/10_async/10_temporal/harness_codex/project/acp.py new file mode 100644 index 000000000..39a81dde9 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_codex/project/acp.py @@ -0,0 +1,32 @@ +"""ACP server for the Temporal Codex harness tutorial. + +This file is intentionally thin. When ``acp_type="async"`` is combined with +``TemporalACPConfig(type="temporal", ...)``, FastACP auto-wires: + + HTTP task/create -> @workflow.run on the workflow class + HTTP task/event/send -> @workflow.signal(SignalName.RECEIVE_EVENT) + HTTP task/cancel -> workflow cancellation via the Temporal client + +so we don't define any handlers here. The actual agent code lives in +``project/workflow.py`` and is executed by the Temporal worker +(``project/run_worker.py``), not by this HTTP process. +""" + +from __future__ import annotations + +import os + +from dotenv import load_dotenv + +load_dotenv() + +from agentex.lib.types.fastacp import TemporalACPConfig +from agentex.lib.sdk.fastacp.fastacp import FastACP + +acp = FastACP.create( + acp_type="async", + config=TemporalACPConfig( + type="temporal", + temporal_address=os.getenv("TEMPORAL_ADDRESS", "localhost:7233"), + ), +) diff --git a/examples/tutorials/10_async/10_temporal/harness_codex/project/activities.py b/examples/tutorials/10_async/10_temporal/harness_codex/project/activities.py new file mode 100644 index 000000000..363347635 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_codex/project/activities.py @@ -0,0 +1,145 @@ +"""Temporal activity for the Codex harness tutorial. + +Subprocess spawning (and any other I/O) must run inside a Temporal *activity*, +not in workflow code. 
Temporal runs workflow + signal-handler bodies on a +deterministic sandbox event loop that does not implement ``subprocess_exec`` +(or threads / sockets), so spawning ``codex exec`` directly in the signal +handler raises ``NotImplementedError``. This activity runs codex, drives the +``CodexTurn`` through ``UnifiedEmitter.auto_send_turn`` (the async Redis push +path), and returns the turn result to the workflow. + +The ``_spawn_codex`` / ``_process_stdout`` seams are injectable: offline tests +replace them with fakes that yield pre-recorded event lines so no real CLI +runs. +""" + +from __future__ import annotations + +import os +import codecs +import asyncio +from typing import Any +from datetime import datetime +from collections.abc import AsyncIterator + +from temporalio import activity + +from agentex.lib.adk import CodexTurn +from agentex.lib.core.harness import UnifiedEmitter +from agentex.lib.utils.logging import make_logger +from agentex.lib.utils.model_utils import BaseModel + +logger = make_logger(__name__) + +RUN_CODEX_TURN_ACTIVITY = "run_codex_turn" + + +class RunCodexTurnParams(BaseModel): + """Arguments for one codex turn run inside an activity.""" + + task_id: str + prompt: str + model: str + trace_id: str | None = None + parent_span_id: str | None = None + thread_id: str | None = None + created_at: datetime | None = None + + +class RunCodexTurnResult(BaseModel): + """Result returned from the activity to the workflow.""" + + final_text: str + session_id: str | None = None + model: str | None = None + + +async def _spawn_codex( + model: str, + thread_id: str | None = None, +) -> asyncio.subprocess.Process: + """Spawn ``codex exec --json`` locally and return the live process. + + Injection seam: tests replace this function with a fake that returns a + mock process whose stdout yields pre-recorded event lines. + + The caller writes the prompt to stdin after the process starts, then + closes stdin so codex knows input is complete. 
+ """ + base_flags = [ + "--json", + "--skip-git-repo-check", + "--dangerously-bypass-approvals-and-sandbox", + "--model", + model, + ] + + if thread_id: + cmd = ["codex", "exec", *base_flags, "resume", thread_id, "-"] + else: + cmd = ["codex", "exec", *base_flags, "-"] + + return await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + # Discard stderr: codex --json writes events to stdout; its stderr is + # progress/debug noise. Capturing it with PIPE but never reading it + # would deadlock once codex fills the OS pipe buffer (~64 KB). + stderr=asyncio.subprocess.DEVNULL, + env={**os.environ}, + ) + + +async def _process_stdout(process: asyncio.subprocess.Process) -> AsyncIterator[str]: + """Yield newline-delimited JSON lines from the process stdout. + + Uses an incremental UTF-8 decoder so a multibyte character split across two + 4 KB reads is decoded correctly instead of being corrupted at the boundary. + """ + assert process.stdout is not None + decoder = codecs.getincrementaldecoder("utf-8")(errors="replace") + buffer = "" + while True: + chunk = await process.stdout.read(4096) + if not chunk: + break + buffer += decoder.decode(chunk) + while "\n" in buffer: + line, buffer = buffer.split("\n", 1) + line = line.strip() + if line: + yield line + buffer += decoder.decode(b"", final=True) + if buffer.strip(): + yield buffer.strip() + + +@activity.defn(name=RUN_CODEX_TURN_ACTIVITY) +async def run_codex_turn(params: RunCodexTurnParams) -> dict[str, Any]: + """Run one codex turn end-to-end and stream events to the task. + + Runs in an activity (real asyncio loop) so subprocess I/O is permitted. 
+ """ + process = await _spawn_codex(params.model, thread_id=params.thread_id) + + assert process.stdin is not None + process.stdin.write(params.prompt.encode("utf-8")) + await process.stdin.drain() + process.stdin.close() + + turn = CodexTurn(events=_process_stdout(process), model=params.model) + emitter = UnifiedEmitter( + task_id=params.task_id, + trace_id=params.trace_id, + parent_span_id=params.parent_span_id, + ) + result = await emitter.auto_send_turn(turn, created_at=params.created_at) + + await process.wait() + + return RunCodexTurnResult( + final_text=result.final_text, + session_id=turn.session_id, + model=turn.usage().model, + ).model_dump() diff --git a/examples/tutorials/10_async/10_temporal/harness_codex/project/run_worker.py b/examples/tutorials/10_async/10_temporal/harness_codex/project/run_worker.py new file mode 100644 index 000000000..b8972806b --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_codex/project/run_worker.py @@ -0,0 +1,41 @@ +"""Temporal worker for the Codex harness tutorial. + +Run as a separate long-lived process alongside the ACP HTTP server. The +worker polls Temporal for workflow + activity tasks and executes them. + +The codex CLI subprocess runs in the ``run_codex_turn`` activity (registered +below alongside the built-in Agentex activities), because subprocess I/O is not +permitted on the Temporal workflow event loop. 
+""" + +import asyncio + +from project.workflow import AtHarnessCodexWorkflow +from project.activities import run_codex_turn +from agentex.lib.utils.debug import setup_debug_if_enabled +from agentex.lib.utils.logging import make_logger +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.activities import get_all_activities +from agentex.lib.core.temporal.workers.worker import AgentexWorker + +environment_variables = EnvironmentVariables.refresh() +logger = make_logger(__name__) + + +async def main(): + setup_debug_if_enabled() + + task_queue_name = environment_variables.WORKFLOW_TASK_QUEUE + if task_queue_name is None: + raise ValueError("WORKFLOW_TASK_QUEUE is not set") + + worker = AgentexWorker(task_queue=task_queue_name) + + await worker.run( + activities=[run_codex_turn, *get_all_activities()], + workflow=AtHarnessCodexWorkflow, + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/tutorials/10_async/10_temporal/harness_codex/project/workflow.py b/examples/tutorials/10_async/10_temporal/harness_codex/project/workflow.py new file mode 100644 index 000000000..1970b478f --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_codex/project/workflow.py @@ -0,0 +1,145 @@ +"""Temporal workflow for the Codex harness tutorial. + +Demonstrates the ``convert_codex_to_agentex_events`` tap + ``CodexTurn`` + +``UnifiedEmitter`` for a Temporal-durable ACP agent. + +KEY CONCEPTS DEMONSTRATED: +- Running ``codex exec --json`` in the ``run_codex_turn`` activity. Subprocess + I/O is not permitted on the Temporal workflow event loop (the deterministic + sandbox loop does not implement ``subprocess_exec``), so the signal handler + delegates the turn to an activity, which also gets Temporal's retry + timeout + guarantees. +- Wrapping the stdout line stream in a ``CodexTurn`` (inside the activity). 
+- Delivering events via ``UnifiedEmitter.auto_send_turn``, which pushes + ``StreamTaskMessage*`` events to Redis so the UI sees tokens in real time. +- Passing ``created_at=workflow.now()`` for deterministic timestamps under + Temporal replay (required for Temporal-safe delivery). +- Persisting the codex thread ID on the workflow instance itself — Temporal's + workflow state is durable, so no external ``adk.state`` round-trip is needed. +""" + +from __future__ import annotations + +import os +from datetime import timedelta + +from temporalio import workflow + +from agentex.lib import adk +from agentex.lib.types.acp import SendEventParams, CreateTaskParams +from agentex.lib.types.tracing import SGPTracingProcessorConfig +from agentex.lib.utils.logging import make_logger +from agentex.types.text_content import TextContent +from agentex.lib.environment_variables import EnvironmentVariables +from agentex.lib.core.temporal.types.workflow import SignalName +from agentex.lib.core.temporal.workflows.workflow import BaseWorkflow +from agentex.lib.core.tracing.tracing_processor_manager import add_tracing_processor_config + +with workflow.unsafe.imports_passed_through(): + from project.activities import RunCodexTurnParams, run_codex_turn + +add_tracing_processor_config( + SGPTracingProcessorConfig( + sgp_api_key=os.environ.get("SGP_API_KEY", ""), + sgp_account_id=os.environ.get("SGP_ACCOUNT_ID", ""), + sgp_base_url=os.environ.get("SGP_CLIENT_BASE_URL", ""), + ) +) + +environment_variables = EnvironmentVariables.refresh() + +if environment_variables.WORKFLOW_NAME is None: + raise ValueError("Environment variable WORKFLOW_NAME is not set") +if environment_variables.AGENT_NAME is None: + raise ValueError("Environment variable AGENT_NAME is not set") + +logger = make_logger(__name__) + +MODEL = os.environ.get("CODEX_MODEL", "o4-mini") + + +@workflow.defn(name=environment_variables.WORKFLOW_NAME) +class AtHarnessCodexWorkflow(BaseWorkflow): + """Long-running Temporal workflow 
that runs codex exec for each turn. + + Conversation state (codex thread ID + turn counter) is kept on the + workflow instance. Temporal's durable replay reconstructs this state if + the worker crashes, so no external ``adk.state`` round-trip is needed. + """ + + def __init__(self): + super().__init__(display_name=environment_variables.AGENT_NAME) + self._complete_task = False + self._turn_number = 0 + self._codex_thread_id: str | None = None + + @workflow.signal(name=SignalName.RECEIVE_EVENT) + async def on_task_event_send(self, params: SendEventParams) -> None: + """Handle a new user message: spawn codex, stream events via UnifiedEmitter.""" + logger.info("Received task event: %s", params.task.id) + self._turn_number += 1 + + await adk.messages.create(task_id=params.task.id, content=params.event.content) + + user_message = params.event.content.content + + async with adk.tracing.span( + trace_id=params.task.id, + task_id=params.task.id, + name=f"Turn {self._turn_number}", + input={"message": user_message}, + ) as span: + # Delegate the subprocess turn to an activity: subprocess I/O is not + # permitted on the Temporal workflow event loop. The activity streams + # events to the task and returns the final text + codex thread id. + # workflow.now() gives a deterministic timestamp under replay. + result = await workflow.execute_activity( + run_codex_turn, + RunCodexTurnParams( + task_id=params.task.id, + prompt=user_message, + model=MODEL, + trace_id=params.task.id, + parent_span_id=span.id if span else None, + thread_id=self._codex_thread_id, + created_at=workflow.now(), + ), + start_to_close_timeout=timedelta(minutes=5), + ) + + # Persist the codex thread id so the next turn resumes the session. 
+ session_id = result.get("session_id") + if session_id: + self._codex_thread_id = session_id + + if span: + span.output = { + "final_text": result.get("final_text"), + "model": result.get("model"), + } + + @workflow.run + async def on_task_create(self, params: CreateTaskParams) -> str: + """Workflow entry point — keep the conversation alive for incoming signals.""" + logger.info("Task created: %s", params.task.id) + + await adk.messages.create( + task_id=params.task.id, + content=TextContent( + author="agent", + content=( + f"Task initialized.\n" + f"Send me a message and I'll run codex (local subprocess) " + f"to answer, streaming events via the unified harness surface." + ), + ), + ) + + await workflow.wait_condition(lambda: self._complete_task, timeout=None) + return "Task completed" + + @workflow.signal + async def complete_task_signal(self) -> None: + """Graceful workflow shutdown signal.""" + logger.info("Received complete_task signal") + self._complete_task = True diff --git a/examples/tutorials/10_async/10_temporal/harness_codex/pyproject.toml b/examples/tutorials/10_async/10_temporal/harness_codex/pyproject.toml new file mode 100644 index 000000000..c4d67d285 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_codex/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "at-harness-codex" +version = "0.1.0" +description = "Temporal tutorial agent driving the unified harness surface via local codex CLI subprocess" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "agentex-sdk", + "scale-gp", + "temporalio>=1.18.2", +] + +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-asyncio", + "httpx", + "black", + "isort", + "flake8", + "debugpy>=1.8.15", +] + +[tool.hatch.build.targets.wheel] +packages = ["project"] + +[tool.black] +line-length = 88 +target-version = ['py312'] + +[tool.isort] +profile = "black" +line_length = 88 + 
+[tool.pytest.ini_options] +asyncio_mode = "auto" diff --git a/examples/tutorials/10_async/10_temporal/harness_codex/tests/test_agent.py b/examples/tutorials/10_async/10_temporal/harness_codex/tests/test_agent.py new file mode 100644 index 000000000..2066b35b1 --- /dev/null +++ b/examples/tutorials/10_async/10_temporal/harness_codex/tests/test_agent.py @@ -0,0 +1,275 @@ +"""Tests for the Temporal Codex harness tutorial agent. + +LIVE tests (``TestLiveCodexAgent``): + - Require the ``codex`` CLI on PATH, ``OPENAI_API_KEY``, and a running + Temporal + Agentex server. + - Skipped automatically when ``CODEX_LIVE_TESTS`` is not set to ``1``. + +OFFLINE unit tests (``TestOfflineCodexWorkflow``): + - Inject a fake async iterator of pre-recorded codex event lines. + - Assert the signal handler delegates the turn to the ``run_codex_turn`` + activity and captures the codex thread ID on the workflow instance. + - Always run. +""" + +from __future__ import annotations + +import os +import json +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +SAMPLE_EVENTS: list[dict[str, Any]] = [ + {"type": "thread.started", "thread_id": "thread-temporal-1"}, + {"type": "turn.started"}, + { + "type": "item.started", + "item": {"id": "msg-t1", "type": "agent_message", "text": "Hello"}, + }, + { + "type": "item.completed", + "item": {"id": "msg-t1", "type": "agent_message", "text": "Hello from Temporal!"}, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 6, "output_tokens": 3, "total_tokens": 9}, + }, +] + + +async def _fake_event_stream(): + """Async iterator of pre-recorded codex event JSON lines (no subprocess).""" + for evt in SAMPLE_EVENTS: + yield json.dumps(evt) + + +class _FakeSpan: + id = "span-temporal-1" + output: Any = None + + async def __aenter__(self):
+ return self + + async def __aexit__(self, *a): + pass + + +class TestOfflineCodexWorkflow: + """Unit tests that run without a real codex CLI, Temporal, or network.""" + + @pytest.mark.asyncio + async def test_codex_turn_usage_with_temporal_events(self): + """CodexTurn.usage() is correct after exhausting the temporal sample events.""" + from agentex.lib.adk import CodexTurn + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + + _ = [e async for e in turn.events] + + usage = turn.usage() + assert usage.input_tokens == 6 + assert usage.output_tokens == 3 + assert usage.model == "o4-mini" + + @pytest.mark.asyncio + async def test_unified_emitter_auto_send_with_created_at(self): + """UnifiedEmitter.auto_send_turn accepts created_at=None without error.""" + from agentex.lib.adk import CodexTurn + from agentex.lib.core.harness import UnifiedEmitter + from agentex.types.task_message import TaskMessage + from agentex.types.text_content import TextContent + + turn = CodexTurn(events=_fake_event_stream(), model="o4-mini") + + real_task_msg = TaskMessage( + id="msg-fake", + task_id="t", + content=TextContent(type="text", author="agent", content=""), + ) + + fake_streaming = MagicMock() + fake_ctx = AsyncMock() + fake_ctx.__aenter__ = AsyncMock(return_value=fake_ctx) + fake_ctx.__aexit__ = AsyncMock(return_value=False) + fake_ctx.stream_update = AsyncMock(return_value=MagicMock()) + fake_ctx.close = AsyncMock() + fake_ctx.task_message = real_task_msg + fake_streaming.streaming_task_message_context = MagicMock(return_value=fake_ctx) + + emitter = UnifiedEmitter( + task_id="t", + trace_id=None, + parent_span_id=None, + streaming=fake_streaming, + ) + + result = await emitter.auto_send_turn(turn, created_at=None) + assert result is not None + + @pytest.mark.asyncio + async def test_thread_id_captured_after_exhausted_stream(self): + """CodexTurn._result captures the thread_id from thread.started.""" + from agentex.lib.adk import CodexTurn + + turn = 
CodexTurn(events=_fake_event_stream(), model="o4-mini") + _ = [e async for e in turn.events] + + assert turn._result is not None + assert turn._result["session_id"] == "thread-temporal-1" + + @pytest.mark.asyncio + async def test_signal_handler_delegates_to_activity_and_captures_thread_id(self): + """Signal handler runs the turn via execute_activity, increments the turn + counter, and captures the codex thread ID returned by the activity.""" + captured: dict[str, Any] = {} + + async def _fake_execute_activity(_activity, params, **_kw): + captured["params"] = params + return { + "session_id": "thread-temporal-1", + "final_text": "Hello from Temporal!", + "model": "o4-mini", + } + + with patch("project.workflow.adk.messages.create", new=AsyncMock()), patch( + "project.workflow.adk.tracing.span" + ) as mock_span, patch( + "project.workflow.workflow.execute_activity", new=_fake_execute_activity + ), patch("project.workflow.workflow.now", return_value=None): + mock_span.return_value = _FakeSpan() + + from project.workflow import AtHarnessCodexWorkflow + + wf = AtHarnessCodexWorkflow.__new__(AtHarnessCodexWorkflow) + wf._turn_number = 0 + wf._codex_thread_id = None + wf._complete_task = False + wf._display_name = "test" + + params = MagicMock() + params.task.id = "task-temporal-offline-1" + params.event.content.content = "say hello temporal" + + await wf.on_task_event_send(params) + + assert wf._turn_number == 1 + assert wf._codex_thread_id == "thread-temporal-1" + assert captured["params"].prompt == "say hello temporal" + assert captured["params"].thread_id is None + + @pytest.mark.asyncio + async def test_run_codex_turn_activity_streams_and_returns_thread_id(self): + """The run_codex_turn activity drives the turn and returns the thread id.""" + from agentex.lib.core.harness import UnifiedEmitter + + async def _fake_spawn(model, thread_id=None): # noqa: ARG001 + fake_stdin = MagicMock() + fake_stdin.write = MagicMock() + fake_stdin.drain = AsyncMock() + fake_stdin.close 
= MagicMock() + proc = MagicMock() + proc.stdin = fake_stdin + proc.wait = AsyncMock(return_value=0) + return proc + + async def _fake_process_stdout(_process): # noqa: ARG001 + for evt in SAMPLE_EVENTS: + yield json.dumps(evt) + + class _FakeTurnResult: + final_text = "Hello from Temporal!" + + async def _auto_send(_self, turn, *_a, **_kw): + async for _ in turn.events: + pass + return _FakeTurnResult() + + with patch("project.activities._spawn_codex", new=_fake_spawn), patch( + "project.activities._process_stdout", new=_fake_process_stdout + ), patch.object(UnifiedEmitter, "auto_send_turn", new=_auto_send): + from project.activities import RunCodexTurnParams, run_codex_turn + + result = await run_codex_turn( + RunCodexTurnParams( + task_id="task-temporal-offline-1", + prompt="say hello temporal", + model="o4-mini", + ) + ) + + assert result["session_id"] == "thread-temporal-1" + assert result["final_text"] == "Hello from Temporal!" + + +# --------------------------------------------------------------------------- +# Live tests +# --------------------------------------------------------------------------- + +LIVE = os.environ.get("CODEX_LIVE_TESTS", "") == "1" +AGENTEX_API_BASE_URL = os.environ.get("AGENTEX_API_BASE_URL", "http://localhost:5003") +AGENT_NAME = os.environ.get("AGENT_NAME", "at-harness-codex") + + +@pytest.mark.skipif( + not LIVE, + reason="Set CODEX_LIVE_TESTS=1 and ensure codex CLI + OPENAI_API_KEY + Temporal are available", +) +class TestLiveCodexAgent: + """End-to-end tests that require the real codex CLI, Temporal, and Agentex server.""" + + @pytest.fixture + def client(self): + from agentex import Agentex + + return Agentex(base_url=AGENTEX_API_BASE_URL) + + @pytest.fixture + def agent_id(self, client): + for agent in client.agents.list(): + if agent.name == AGENT_NAME: + return agent.id + raise ValueError(f"Agent {AGENT_NAME!r} not found.") + + def test_send_simple_message(self, client, agent_id: str): + """Temporal agents process events out 
of band, so create a task, send an + event, and poll the task's messages for the agent's response.""" + import time + import uuid + + from agentex.types import TextContentParam + from agentex.types.agent_rpc_params import ParamsSendEventRequest, ParamsCreateTaskRequest + + task = client.agents.create_task(agent_id, params=ParamsCreateTaskRequest(name=uuid.uuid1().hex)).result + assert task is not None + + client.agents.send_event( + agent_id=agent_id, + params=ParamsSendEventRequest( + task_id=task.id, + content=TextContentParam( + author="user", + content="What is 5+5? Reply with just the number.", + type="text", + ), + ), + ) + + deadline = time.monotonic() + 90 + while time.monotonic() < deadline: + msgs = client.messages.list(task_id=task.id) + agent_msgs = [m for m in msgs if getattr(m.content, "author", None) == "agent"] + response_msgs = [ + m for m in agent_msgs if "Task initialized" not in str(getattr(m.content, "content", "")) + ] + if response_msgs: + assert len(response_msgs) >= 1 + return + time.sleep(3) + + raise AssertionError("No agent response received within 90 s") diff --git a/src/agentex/lib/adk/__init__.py b/src/agentex/lib/adk/__init__.py index c2b343b72..f6713be7c 100644 --- a/src/agentex/lib/adk/__init__.py +++ b/src/agentex/lib/adk/__init__.py @@ -18,6 +18,8 @@ ClaudeCodeTurn, claude_code_usage_to_turn_usage, ) +from agentex.lib.adk._modules._codex_sync import convert_codex_to_agentex_events +from agentex.lib.adk._modules._codex_turn import CodexTurn, codex_usage_to_turn_usage from agentex.lib.adk._modules.events import EventsModule from agentex.lib.adk._modules.messages import MessagesModule from agentex.lib.adk._modules.state import StateModule @@ -63,6 +65,10 @@ "convert_claude_code_to_agentex_events", "ClaudeCodeTurn", "claude_code_usage_to_turn_usage", + # Codex + "convert_codex_to_agentex_events", + "CodexTurn", + "codex_usage_to_turn_usage", # Providers "providers", # Utils diff --git a/src/agentex/lib/adk/_modules/_codex_sync.py 
b/src/agentex/lib/adk/_modules/_codex_sync.py new file mode 100644 index 000000000..b2b162a24 --- /dev/null +++ b/src/agentex/lib/adk/_modules/_codex_sync.py @@ -0,0 +1,587 @@ +"""Codex event-stream parser tap for the unified harness surface. + +Converts a ``codex exec --json`` newline-delimited event stream (already +produced by the golden agent's sandbox/subprocess orchestration) into the +Agentex canonical ``StreamTaskMessage*`` events. + +SCOPE +----- +This module is a **pure parser**. It receives pre-produced codex events +(``str`` lines or already-decoded ``dict`` objects) and yields canonical +``StreamTaskMessage*`` events. All subprocess management, sandbox +provisioning, secret injection, and MCP orchestration remain in the golden +agent at +``teams/sgp/agents/golden_agent/project/harness/providers/codex.py``. + +No deployable test agent is included here: running codex requires the +golden agent's sandbox environment and is out of scope for this library tap. + +OUT OF SCOPE (document here so future callers are not surprised): +- Subprocess / sandbox management +- OPENAI_API_KEY / secret injection +- MCP server configuration (--config /tmp/codex_config.toml) +- ``codex exec resume`` session tracking +- ``scale_sandbox`` imports + +CANONICAL MAPPING +----------------- +The table below lists every ``type`` field the codex exec JSON stream can +emit (from ``codex-rs/exec/src/exec_events.rs``) and its mapping. 
+ +Top-level event types +~~~~~~~~~~~~~~~~~~~~~ + thread.started -> (no StreamTaskMessage; session_id captured + internally; surfaced via ``on_result`` callback) + turn.started -> (no StreamTaskMessage; turn was started before + codex launched; nothing to emit here) + turn.completed -> on_result(usage_dict, tool_count, reasoning_count) + yields no StreamTaskMessage (turn lifecycle is + managed by the activity layer) + turn.failed -> StreamTaskMessageFull(TextContent, error text) + error -> StreamTaskMessageFull(TextContent, error text) + +Item sub-types (item.started / item.updated / item.completed) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + agent_message -> text deltas: + item.started / item.updated -> StreamTaskMessageDelta(TextDelta) + item.completed -> StreamTaskMessageDone + reasoning -> reasoning: + item.started -> StreamTaskMessageStart(ReasoningContent) + item.updated -> (no-op; final text arrives on completed) + item.completed -> StreamTaskMessageFull(ReasoningContent) + command_execution -> tool request + response: + item.started -> StreamTaskMessageStart(ToolRequestContent) + + StreamTaskMessageDone + item.completed -> StreamTaskMessageFull(ToolResponseContent) + file_change -> same as command_execution + NOTE: file_change may only emit item.completed (no started); + a synthetic ToolRequestContent Full is emitted before the response. + mcp_tool_call -> same as command_execution + web_search -> same as command_execution + todo_list -> same as command_execution + collab_tool_call -> same as command_execution + error (item type) -> StreamTaskMessageFull(TextContent, error text) on completed only + +UNMAPPED / PARTIALLY MAPPED EVENTS +----------------------------------- + thread.started: session_id is extracted but not forwarded as a + StreamTaskMessage (no canonical content type for + session-lifecycle signals; captured in on_result). + turn.started: no-op; intentional (the caller owns turn lifecycle). 
+ turn.completed: no StreamTaskMessage; usage is forwarded via + on_result so the caller can record it in a span + without this module needing to know about spans. + item.updated (reasoning): the intermediate cumulative text is discarded; + only item.completed carries the final text. + item.updated (tool): tool item types other than agent_message do not + emit updates; item.started opens the request and + item.completed closes it. +""" + +from __future__ import annotations + +import json +from typing import Any, Callable, AsyncIterator + +from agentex.lib.utils.logging import make_logger +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.task_message_content import TextContent +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.types.reasoning_content_delta import ReasoningContentDelta + +logger = make_logger(__name__) + +# Canonical type alias matching the unified harness surface. 
def _tool_name_for(item_type: str, payload: dict[str, Any]) -> str:
    """Derive a canonical tool name from a codex item type.

    Args:
        item_type: The codex item ``type`` (e.g. ``"command_execution"``).
        payload: The codex item dict. Only consulted for ``mcp_tool_call``,
            where the ``server`` and ``tool`` keys are read.

    Returns:
        ``"bash"`` for ``command_execution``. For ``mcp_tool_call``, the
        non-empty ``server`` / ``tool`` values joined with ``"."``, or
        ``"mcp_tool_call"`` when neither is present. For every other type the
        item type itself, or ``"unknown"`` when the item type is empty.
    """
    if item_type == "command_execution":
        return "bash"
    if item_type == "mcp_tool_call":
        # Keep only the parts that are actually present. Formatting both
        # unconditionally produced names such as "fs." or ".read", and
        # "None.read" when a key was present with an explicit null value.
        parts = [str(part) for part in (payload.get("server"), payload.get("tool")) if part]
        return ".".join(parts) if parts else "mcp_tool_call"
    # file_change, web_search, todo_list and collab_tool_call are named after
    # their item type, as is any item type this helper does not know about.
    return item_type or "unknown"
def _tool_output_for(item_type: str, payload: dict[str, Any]) -> tuple[str, bool]:
    """Extract ``(result_text, is_error)`` from a completed codex tool item.

    Args:
        item_type: The codex item ``type`` the payload belongs to.
        payload: The ``item`` dict of the ``item.completed`` event.

    Returns:
        A tuple of the text to surface as the tool result and a flag that is
        ``True`` when the payload reports a failure:

        - ``command_execution``: ``aggregated_output``; an error when
          ``exit_code`` is present and non-zero.
        - ``mcp_tool_call`` / ``collab_tool_call``: ``"Error: <message>"`` when
          ``error`` is set; otherwise ``result`` (strings verbatim, anything
          else JSON-encoded, ``""`` when absent).
        - ``file_change``: a ``status=..., N changes`` summary; an error when
          ``status`` is ``"failed"``.
        - anything else: the whole payload JSON-encoded, never an error.

        Text that goes through ``_truncate`` is length-capped; the
        ``file_change`` summary is returned as built.
    """
    if item_type == "command_execution":
        exit_code = payload.get("exit_code")
        # A missing exit code is not treated as a failure.
        failed = exit_code is not None and exit_code != 0
        return _truncate(payload.get("aggregated_output") or ""), failed

    if item_type in ("mcp_tool_call", "collab_tool_call"):
        # collab_tool_call follows the same error/result convention as
        # mcp_tool_call; handling it here keeps a failed collab call from
        # reaching the generic branch below, which never reports an error.
        err = payload.get("error")
        if err:
            # ``or ""`` also covers an explicit null message, which a plain
            # ``.get("message", "")`` would render as "Error: None".
            msg = (err.get("message") or "") if isinstance(err, dict) else str(err)
            return _truncate(f"Error: {msg}"), True
        result = payload.get("result")
        if result is None:
            return "", False
        if isinstance(result, str):
            # Pass plain text through untouched: JSON-encoding a string wraps
            # it in quotes and escapes its newlines.
            return _truncate(result), False
        try:
            return _truncate(json.dumps(result)), False
        except (TypeError, ValueError):
            # Not JSON-serialisable; fall back to the repr-style text.
            return _truncate(str(result)), False

    if item_type == "file_change":
        changes = payload.get("changes") or []
        status = payload.get("status", "")
        return f"status={status}, {len(changes)} changes", status == "failed"

    # Unknown tool shape: surface the raw payload so nothing is lost.
    try:
        return _truncate(json.dumps(payload, default=str)), False
    except (TypeError, ValueError):
        return _truncate(str(payload)), False
+ + Ported from the golden agent's ``_CodexEventProcessor`` in + ``project/harness/providers/codex.py``, adapted to yield + ``StreamTaskMessage*`` directly instead of ``HarnessEvent`` objects. + + State tracked: + - ``_next_index``: monotonically increasing message index. + - ``_text_index``: message index of the current open agent_message block. + - ``_text_accumulated``: cumulative text per agent_message item_id. + - ``_reasoning_index``: message index of the current open reasoning block. + - ``_reasoning_text``: latest cumulative reasoning text per item_id. + - ``_tool_open``: item_ids for which a ToolRequestContent Start was emitted + but no ToolResponseContent Full yet. + - ``_tool_item_types``: item_id -> item_type for open tool calls. + """ + + def __init__(self) -> None: + self._next_index: int = 0 + + # agent_message tracking + self._text_index: dict[str, int] = {} + self._text_accumulated: dict[str, str] = {} + + # reasoning tracking + self._reasoning_index: dict[str, int] = {} + self._reasoning_text: dict[str, str] = {} + + # tool tracking + self._tool_open: set[str] = set() + self._tool_item_types: dict[str, str] = {} + # Remember the tool_call_id assigned per item so the request and response + # halves agree even when item_id is empty (a recomputed fallback would + # drift as tool_call_count advances between started and completed). + self._tool_call_ids: dict[str, str] = {} + + # counters for on_result callback + self.tool_call_count: int = 0 + self.reasoning_count: int = 0 + self.session_id: str | None = None + + def _alloc(self) -> int: + idx = self._next_index + self._next_index += 1 + return idx + + def process(self, evt: dict[str, Any]) -> list[StreamTaskMessage]: + evt_type = evt.get("type", "") + + if evt_type == "thread.started": + sid = evt.get("thread_id") or "" + if sid: + self.session_id = sid + return [] + + if evt_type == "turn.started": + # The activity layer owns turn lifecycle; nothing to emit. 
+ return [] + + if evt_type == "turn.completed": + # Usage forwarded via on_result callback (not a StreamTaskMessage). + return [] + + if evt_type == "turn.failed": + err = evt.get("error") or {} + msg = err.get("message", "codex turn failed") if isinstance(err, dict) else str(err) + return [_error_full(f"Codex turn failed: {msg}", self._alloc())] + + if evt_type == "error": + return [_error_full(evt.get("message", "codex error"), self._alloc())] + + if evt_type in ("item.started", "item.updated", "item.completed"): + item = evt.get("item") or {} + return self._handle_item(evt_type, item) + + logger.debug("[codex] unhandled event type=%s", evt_type) + return [] + + def _handle_item(self, evt_type: str, item: dict[str, Any]) -> list[StreamTaskMessage]: + item_id = item.get("id") or "" + item_type = item.get("type") or "" + out: list[StreamTaskMessage] = [] + + if item_type == "agent_message": + current = item.get("text") or "" + previous = self._text_accumulated.get(item_id, "") + + if evt_type in ("item.started", "item.updated"): + if item_id not in self._text_index: + idx = self._alloc() + self._text_index[item_id] = idx + out.append( + StreamTaskMessageStart( + type="start", + index=idx, + content=TextContent( + type="text", + author="agent", + content="", + ), + ) + ) + idx = self._text_index[item_id] + delta = "" + if current.startswith(previous) and len(current) > len(previous): + delta = current[len(previous) :] + elif current and current != previous: + delta = current + if delta: + out.append( + StreamTaskMessageDelta( + type="delta", + index=idx, + delta=TextDelta(type="text", text_delta=delta), + ) + ) + self._text_accumulated[item_id] = current + + elif evt_type == "item.completed": + if item_id not in self._text_index: + idx = self._alloc() + self._text_index[item_id] = idx + out.append( + StreamTaskMessageStart( + type="start", + index=idx, + content=TextContent( + type="text", + author="agent", + content="", + ), + ) + ) + idx = 
self._text_index[item_id] + delta = "" + if current.startswith(previous) and len(current) > len(previous): + delta = current[len(previous) :] + elif current and current != previous: + delta = current + if delta: + out.append( + StreamTaskMessageDelta( + type="delta", + index=idx, + delta=TextDelta(type="text", text_delta=delta), + ) + ) + out.append(StreamTaskMessageDone(type="done", index=idx)) + self._text_accumulated[item_id] = current + + elif item_type == "reasoning": + current = item.get("text") or "" + + if evt_type == "item.started": + idx = self._alloc() + self._reasoning_index[item_id] = idx + self._reasoning_text[item_id] = current + out.append( + StreamTaskMessageStart( + type="start", + index=idx, + content=ReasoningContent( + type="reasoning", + author="agent", + summary=[], + content=[], + style="active", + ), + ) + ) + if current: + out.append( + StreamTaskMessageDelta( + type="delta", + index=idx, + delta=ReasoningContentDelta( + type="reasoning_content", + content_index=0, + content_delta=current, + ), + ) + ) + + elif evt_type == "item.updated": + # Accumulate silently; final text arrives on item.completed. + self._reasoning_text[item_id] = current + + elif evt_type == "item.completed": + text = current or self._reasoning_text.get(item_id, "") + idx = self._reasoning_index.get(item_id) + if text: + self.reasoning_count += 1 + summary = text.strip().split("\n", 1)[0][:300] + final_content = ReasoningContent( + type="reasoning", + author="agent", + summary=[summary], + content=[text], + style="static", + ) + if idx is not None: + out.append( + StreamTaskMessageFull( + type="full", + index=idx, + content=final_content, + ) + ) + else: + # No started event was seen; emit a standalone Full. + out.append( + StreamTaskMessageFull( + type="full", + index=self._alloc(), + content=final_content, + ) + ) + elif idx is not None: + # Empty reasoning block — still need to close with a Done. 
+ out.append(StreamTaskMessageDone(type="done", index=idx)) + + elif item_type in ( + "command_execution", + "file_change", + "mcp_tool_call", + "web_search", + "todo_list", + "collab_tool_call", + ): + # Resolve a stable id once per item; reuse it for both halves. + tool_call_id = self._tool_call_ids.get(item_id) + if tool_call_id is None: + tool_call_id = item_id or f"codex_tool_{self.tool_call_count + 1}" + self._tool_call_ids[item_id] = tool_call_id + + if evt_type == "item.started": + self.tool_call_count += 1 + self._tool_open.add(item_id) + self._tool_item_types[item_id] = item_type + name = _tool_name_for(item_type, item) + args = _tool_args_for(item_type, item) + req_idx = self._alloc() + out.append( + StreamTaskMessageStart( + type="start", + index=req_idx, + content=ToolRequestContent( + type="tool_request", + author="agent", + tool_call_id=tool_call_id, + name=name, + arguments=args, + ), + ) + ) + out.append(StreamTaskMessageDone(type="done", index=req_idx)) + + elif evt_type == "item.completed": + # file_change items may only emit item.completed (no started). 
async def convert_codex_to_agentex_events(
    events: AsyncIterator[str | dict[str, Any]],
    on_result: Callable[[dict[str, Any]], None] | None = None,
) -> AsyncIterator[StreamTaskMessage]:
    """Convert a ``codex exec --json`` event stream into Agentex stream events.

    This is a pure parser tap. The caller must supply ``events`` as an async
    iterator of either raw newline-delimited JSON strings or pre-decoded dicts.
    No subprocess or sandbox management is done here.

    Input items that cannot be interpreted as an event are skipped rather than
    raised: blank lines, lines that are not valid JSON, lines whose JSON value
    is not an object, and items that are neither ``str`` nor ``dict``.

    Args:
        events: Async iterator of ``str`` (newline-delimited JSON lines) or
            ``dict`` (pre-decoded event objects).
        on_result: Optional callback invoked exactly once, after ``events`` has
            been exhausted and every message has been yielded. It is invoked
            even when no ``turn.completed`` event was seen (``usage`` is then
            ``None``). It is not invoked when the consumer stops iterating
            early or when ``events`` raises. Receives a dict with keys:
              ``usage``            — ``usage`` dict of the last ``turn.completed``
                                     event, or None if absent / not a dict
              ``session_id``       — ``thread_id`` from ``thread.started``, or None
              ``tool_call_count``  — int
              ``reasoning_count``  — int

    Yields:
        Canonical ``StreamTaskMessage*`` events (Start/Delta/Full/Done) with
        ``TextContent``, ``ReasoningContent``, ``ToolRequestContent``, or
        ``ToolResponseContent`` payloads.

    MAPPING (abbreviated)
        thread.started         -> no event; session_id captured for on_result
        turn.started           -> no event
        turn.completed         -> no event; usage captured for on_result
        turn.failed / error    -> StreamTaskMessageFull(TextContent, error)
        agent_message          -> Start + Deltas + Done
        reasoning              -> Start + Full(ReasoningContent)
        command_execution      -> Start(ToolRequest)+Done + Full(ToolResponse)
        file_change            -> Full(ToolRequest) + Full(ToolResponse)
        mcp_tool_call          -> Start(ToolRequest)+Done + Full(ToolResponse)
        web_search / todo_list -> Start(ToolRequest)+Done + Full(ToolResponse)
        collab_tool_call       -> Start(ToolRequest)+Done + Full(ToolResponse)
    """
    processor = _CodexStreamProcessor()
    pending_usage: dict[str, Any] | None = None

    async for raw in events:
        if isinstance(raw, dict):
            evt = raw
        elif isinstance(raw, str):
            line = raw.strip()
            if not line:
                continue
            try:
                evt = json.loads(line)
            except json.JSONDecodeError:
                logger.debug("[codex] non-JSON line: %s", line[:100])
                continue
            if not isinstance(evt, dict):
                # Valid JSON that is not an object (a bare number, string,
                # list or null) cannot be an event; without this guard the
                # ``evt.get`` below raises AttributeError and ends the stream.
                logger.debug("[codex] non-object JSON line: %s", line[:100])
                continue
        else:
            # Neither a line nor a decoded event: ignore it.
            continue

        # Remember usage as soon as it is seen; it is reported after the loop.
        if evt.get("type") == "turn.completed":
            usage = evt.get("usage")
            pending_usage = usage if isinstance(usage, dict) else None

        for msg in processor.process(evt):
            yield msg

    if on_result is not None:
        on_result(
            {
                "usage": pending_usage,
                "session_id": processor.session_id,
                "tool_call_count": processor.tool_call_count,
                "reasoning_count": processor.reasoning_count,
            }
        )
def codex_usage_to_turn_usage(
    raw: dict[str, Any] | None,
    *,
    model: str | None = None,
    tool_call_count: int = 0,
    reasoning_count: int = 0,
    duration_ms: int | None = None,
    cost_usd: float | None = None,
) -> TurnUsage:
    """Map a raw codex ``turn.completed`` usage dict to a canonical ``TurnUsage``.

    The keys read from ``raw`` are ``input_tokens``, ``output_tokens``,
    ``cached_input_tokens``, ``reasoning_tokens``, ``total_tokens`` and
    ``cost_usd``, for example:

    .. code-block:: json

        {
          "input_tokens": 1234,
          "output_tokens": 456,
          "reasoning_tokens": 200,
          "total_tokens": 1690
        }

    Defensive rules:
      - A missing or non-dict ``raw``, a missing key, or a value that cannot be
        converted to a number maps to ``None`` (not zero), so callers can tell
        "not reported" from "reported as 0".
      - Real zeros explicitly present in ``raw`` are preserved as ``0``.
      - ``total_tokens`` is taken from the payload or left as ``None``; it is
        never recomputed from the other counts.
      - ``cost_usd`` prefers the explicit argument and falls back to the
        payload's ``cost_usd`` key.

    Args:
        raw: The raw codex usage dict from ``turn.completed``, or ``None``.
        model: Model string (e.g. "o4-mini") to attach to the usage record.
        tool_call_count: Number of tool calls in the turn.
        reasoning_count: Number of reasoning blocks in the turn.
        duration_ms: Wall-clock duration of the turn in milliseconds.
        cost_usd: Cost in USD if the caller can derive it; ``None`` otherwise.

    Returns:
        A populated ``TurnUsage`` instance. ``num_llm_calls`` is always ``1``.
    """
    usage: dict[str, Any] = raw if isinstance(raw, dict) else {}

    def _int_or_none(key: str) -> int | None:
        val = usage.get(key)
        if val is None:
            return None
        try:
            return int(val)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: int(float("inf")). json.loads accepts the
            # non-standard ``Infinity`` literal, so this can come from a
            # payload and must not abort usage reporting.
            return None

    def _float_or_none(key: str) -> float | None:
        val = usage.get(key)
        if val is None:
            return None
        try:
            return float(val)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: an int too large to represent as a float.
            return None

    # cost_usd: prefer the explicitly passed value, then the raw payload.
    effective_cost = cost_usd if cost_usd is not None else _float_or_none("cost_usd")

    return TurnUsage(
        model=model or None,
        input_tokens=_int_or_none("input_tokens"),
        output_tokens=_int_or_none("output_tokens"),
        cached_input_tokens=_int_or_none("cached_input_tokens"),
        reasoning_tokens=_int_or_none("reasoning_tokens"),
        total_tokens=_int_or_none("total_tokens"),
        cost_usd=effective_cost,
        duration_ms=duration_ms,
        num_llm_calls=1,
        num_tool_calls=tool_call_count,
        num_reasoning_blocks=reasoning_count,
    )
+ """ + + def __init__( + self, + events: AsyncIterator[str | dict[str, Any]], + *, + model: str | None = None, + duration_ms: int | None = None, + cost_usd: float | None = None, + ) -> None: + self._raw_events = events + self._model = model + # Public + mutable: the true wall-clock duration (and cost) is usually + # only known after the stream is consumed, so callers may set these + # after construction and before calling usage(). + self.duration_ms = duration_ms + self.cost_usd = cost_usd + + # Populated by the on_result callback once the stream is exhausted. + self._result: dict[str, Any] | None = None + # The events generator is created at most once: ``_raw_events`` is a + # single-consumption AsyncIterator, so re-wrapping it would yield an + # already-exhausted stream that fires on_result with zeros and clobbers + # ``_result``. Cache the generator and hand back the same instance. + self._events_gen: AsyncIterator[StreamTaskMessage] | None = None + + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: + """Async iterator of canonical ``StreamTaskMessage*`` events. + + The ``on_result`` callback populates ``_result`` when the underlying + codex stream ends, so ``usage()`` returns meaningful data after + exhaustion. Returns the same generator on every access so the underlying + stream is consumed (and ``on_result`` fires) exactly once. + """ + if self._events_gen is None: + self._events_gen = convert_codex_to_agentex_events( + self._raw_events, + on_result=self._on_result, + ) + return self._events_gen + + def _on_result(self, result: dict[str, Any]) -> None: + self._result = result + + @property + def session_id(self) -> str | None: + """The codex session id, for resuming a multi-turn session. + + Valid only after ``events`` has been fully consumed (populated by the + ``on_result`` callback). Returns ``None`` if the stream is not yet + exhausted or codex reported no session id. 
+ """ + return self._result.get("session_id") if self._result else None + + def usage(self) -> TurnUsage: + """Return normalized ``TurnUsage`` for this turn. + + Valid only after ``events`` has been fully consumed. Returns a + zero-value ``TurnUsage`` (model set, counts zero, tokens None) if + called before the stream ends. + """ + if self._result is None: + return TurnUsage(model=self._model) + return codex_usage_to_turn_usage( + self._result.get("usage"), + model=self._model, + tool_call_count=self._result.get("tool_call_count", 0), + reasoning_count=self._result.get("reasoning_count", 0), + duration_ms=self.duration_ms, + cost_usd=self.cost_usd, + ) diff --git a/tests/lib/adk/test_codex_sync.py b/tests/lib/adk/test_codex_sync.py new file mode 100644 index 000000000..d0093e5dd --- /dev/null +++ b/tests/lib/adk/test_codex_sync.py @@ -0,0 +1,671 @@ +"""Offline tests for the codex event-stream parser tap. + +Tests cover: +- Text streaming (agent_message items) +- Tool call streaming (command_execution, mcp_tool_call, file_change) +- Reasoning streaming (reasoning items) +- Multi-step turns +- Error events (top-level + item-level) +- Edge cases: empty events, non-JSON lines, unknown types +- on_result callback (session_id, usage, counters) +- file_change synthesized start (no item.started emitted by codex) +""" + +from __future__ import annotations + +import json +from typing import Any, AsyncIterator + +from agentex.types.reasoning_content import ReasoningContent +from agentex.types.task_message_delta import TextDelta +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.types.task_message_content import TextContent +from agentex.types.tool_request_content import ToolRequestContent +from agentex.types.tool_response_content import ToolResponseContent +from agentex.lib.adk._modules._codex_sync import ( + _truncate, + _tool_args_for, + _tool_name_for, + 
_tool_output_for, + convert_codex_to_agentex_events, +) +from agentex.types.reasoning_content_delta import ReasoningContentDelta + + +async def _aiter(items: list[Any]) -> AsyncIterator[Any]: + for item in items: + yield item + + +async def _collect(stream: AsyncIterator[Any]) -> list[Any]: + return [e async for e in stream] + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +class TestHelpers: + def test_truncate_short(self) -> None: + assert _truncate("hello", max_len=10) == "hello" + + def test_truncate_long(self) -> None: + assert _truncate("a" * 5000) == "a" * 4000 + + def test_tool_name_command_execution(self) -> None: + assert _tool_name_for("command_execution", {}) == "bash" + + def test_tool_name_file_change(self) -> None: + assert _tool_name_for("file_change", {}) == "file_change" + + def test_tool_name_mcp_with_server_and_tool(self) -> None: + assert _tool_name_for("mcp_tool_call", {"server": "fs", "tool": "read"}) == "fs.read" + + def test_tool_name_mcp_empty(self) -> None: + assert _tool_name_for("mcp_tool_call", {}) == "mcp_tool_call" + + def test_tool_name_unknown(self) -> None: + assert _tool_name_for("", {}) == "unknown" + + def test_tool_args_command(self) -> None: + assert _tool_args_for("command_execution", {"command": "ls"}) == {"command": "ls"} + + def test_tool_args_file_change(self) -> None: + assert _tool_args_for("file_change", {"changes": ["a"]}) == {"changes": ["a"]} + + def test_tool_args_mcp_dict(self) -> None: + assert _tool_args_for("mcp_tool_call", {"arguments": {"k": "v"}}) == {"k": "v"} + + def test_tool_args_mcp_non_dict(self) -> None: + assert _tool_args_for("mcp_tool_call", {"arguments": "str"}) == {"value": "str"} + + def test_tool_output_command_success(self) -> None: + text, is_err = _tool_output_for("command_execution", {"aggregated_output": "hello", "exit_code": 0}) + assert text == "hello" + assert 
is_err is False + + def test_tool_output_command_error(self) -> None: + _, is_err = _tool_output_for("command_execution", {"aggregated_output": "boom", "exit_code": 1}) + assert is_err is True + + def test_tool_output_mcp_error(self) -> None: + text, is_err = _tool_output_for("mcp_tool_call", {"error": {"message": "not found"}}) + assert "not found" in text + assert is_err is True + + def test_tool_output_mcp_result(self) -> None: + text, is_err = _tool_output_for("mcp_tool_call", {"result": {"data": 1}}) + assert json.loads(text) == {"data": 1} + assert is_err is False + + def test_tool_output_file_change_failed(self) -> None: + _, is_err = _tool_output_for("file_change", {"status": "failed", "changes": []}) + assert is_err is True + + def test_tool_output_file_change_ok(self) -> None: + text, is_err = _tool_output_for("file_change", {"status": "ok", "changes": [1, 2]}) + assert "2 changes" in text + assert is_err is False + + +# --------------------------------------------------------------------------- +# Text streaming +# --------------------------------------------------------------------------- + + +class TestTextStreaming: + async def test_text_start_delta_done(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": "Hi"}}, + {"type": "item.updated", "item": {"id": "m1", "type": "agent_message", "text": "Hi!"}}, + {"type": "item.completed", "item": {"id": "m1", "type": "agent_message", "text": "Hi! 
Done"}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + deltas = [e for e in out if isinstance(e, StreamTaskMessageDelta)] + dones = [e for e in out if isinstance(e, StreamTaskMessageDone)] + + assert len(starts) == 1 + assert isinstance(starts[0].content, TextContent) + assert len(deltas) >= 1 + all_delta_text = "".join( + d.delta.text_delta for d in deltas if isinstance(d.delta, TextDelta) and d.delta.text_delta is not None + ) + assert "Hi" in all_delta_text + assert len(dones) == 1 + + async def test_text_indices_are_monotonic(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": "A"}}, + {"type": "item.completed", "item": {"id": "m1", "type": "agent_message", "text": "A"}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + anchor = [e for e in out if isinstance(e, StreamTaskMessageStart)] + done = [e for e in out if isinstance(e, StreamTaskMessageDone)] + assert anchor[0].index == done[0].index + + async def test_empty_text_no_delta(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": ""}}, + {"type": "item.completed", "item": {"id": "m1", "type": "agent_message", "text": ""}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + deltas = [e for e in out if isinstance(e, StreamTaskMessageDelta)] + assert deltas == [] + + async def test_text_author_is_agent(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": "X"}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + for e in out: + content = getattr(e, "content", None) + if content and hasattr(content, "author"): + assert content.author == "agent" + + +# --------------------------------------------------------------------------- +# Tool call streaming +# 
--------------------------------------------------------------------------- + + +class TestToolCallStreaming: + async def test_command_execution_start_done_full(self) -> None: + events = [ + { + "type": "item.started", + "item": { + "id": "t1", + "type": "command_execution", + "command": "echo hello", + }, + }, + { + "type": "item.completed", + "item": { + "id": "t1", + "type": "command_execution", + "command": "echo hello", + "aggregated_output": "hello", + "exit_code": 0, + }, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + dones = [e for e in out if isinstance(e, StreamTaskMessageDone)] + fulls = [e for e in out if isinstance(e, StreamTaskMessageFull)] + + assert len(starts) == 1 + assert isinstance(starts[0].content, ToolRequestContent) + assert starts[0].content.name == "bash" + assert starts[0].content.arguments == {"command": "echo hello"} + assert starts[0].content.tool_call_id == "t1" + + assert len(dones) == 1 + + assert len(fulls) == 1 + assert isinstance(fulls[0].content, ToolResponseContent) + resp_content = fulls[0].content.content + assert isinstance(resp_content, dict) + assert resp_content["result"] == "hello" + assert fulls[0].content.tool_call_id == "t1" + + async def test_empty_item_id_request_response_ids_match(self) -> None: + """A tool with an empty item_id must use the SAME fallback tool_call_id + on the request (started) and response (completed) halves.""" + events = [ + {"type": "item.started", "item": {"id": "", "type": "command_execution", "command": "ls"}}, + { + "type": "item.completed", + "item": { + "id": "", + "type": "command_execution", + "command": "ls", + "aggregated_output": ".", + "exit_code": 0, + }, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + # Pull tool_call_id inside the comprehension so the isinstance narrows the + # content union (the narrowing would not survive a later 
attribute access). + req_ids = [ + e.content.tool_call_id + for e in out + if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, ToolRequestContent) + ] + resp_ids = [ + e.content.tool_call_id + for e in out + if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolResponseContent) + ] + assert len(req_ids) == 1 and len(resp_ids) == 1 + assert req_ids[0] == resp_ids[0] + + async def test_file_change_synthesizes_start(self) -> None: + """file_change items may only emit item.completed (no started).""" + events = [ + { + "type": "item.completed", + "item": { + "id": "fc1", + "type": "file_change", + "changes": ["a.py"], + "status": "ok", + }, + } + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + tool_req = [ + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolRequestContent) + ] + tool_resp = [ + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolResponseContent) + ] + assert len(tool_req) == 1 + assert isinstance(tool_req[0].content, ToolRequestContent) + assert tool_req[0].content.name == "file_change" + assert len(tool_resp) == 1 + + async def test_mcp_tool_call_name(self) -> None: + events = [ + { + "type": "item.started", + "item": { + "id": "mcp1", + "type": "mcp_tool_call", + "server": "fs", + "tool": "read", + "arguments": {"path": "/x"}, + }, + }, + { + "type": "item.completed", + "item": { + "id": "mcp1", + "type": "mcp_tool_call", + "server": "fs", + "tool": "read", + "arguments": {"path": "/x"}, + "result": "content", + }, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + req = next( + e for e in out if isinstance(e, StreamTaskMessageStart) and isinstance(e.content, ToolRequestContent) + ) + assert isinstance(req.content, ToolRequestContent) + assert req.content.name == "fs.read" + + async def test_tool_error_marks_is_error(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": 
"cmd1", "type": "command_execution", "command": "bad"}, + }, + { + "type": "item.completed", + "item": { + "id": "cmd1", + "type": "command_execution", + "command": "bad", + "aggregated_output": "error output", + "exit_code": 127, + }, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + resp = next( + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolResponseContent) + ) + assert isinstance(resp.content, ToolResponseContent) + resp_body = resp.content.content + assert isinstance(resp_body, dict) + assert resp_body.get("is_error") is True + + async def test_tool_indices_request_before_response(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": "cmd2", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "cmd2", + "type": "command_execution", + "command": "ls", + "aggregated_output": ".", + "exit_code": 0, + }, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + req = next(e for e in out if isinstance(e, StreamTaskMessageStart)) + resp = next( + e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ToolResponseContent) + ) + assert req.index is not None and resp.index is not None + assert req.index < resp.index + + +# --------------------------------------------------------------------------- +# Reasoning +# --------------------------------------------------------------------------- + + +class TestReasoningStreaming: + async def test_reasoning_start_full(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "r1", "type": "reasoning", "text": ""}}, + { + "type": "item.updated", + "item": {"id": "r1", "type": "reasoning", "text": "thinking..."}, + }, + { + "type": "item.completed", + "item": {"id": "r1", "type": "reasoning", "text": "thinking... 
done"}, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + fulls = [e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ReasoningContent)] + + assert len(starts) == 1 + assert isinstance(starts[0].content, ReasoningContent) + assert len(fulls) == 1 + assert isinstance(fulls[0].content, ReasoningContent) + reasoning_content = fulls[0].content.content + assert reasoning_content is not None + assert any("thinking... done" in s for s in reasoning_content) + + async def test_reasoning_initial_text_emits_delta(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": "r1", "type": "reasoning", "text": "seed"}, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + deltas = [e for e in out if isinstance(e, StreamTaskMessageDelta)] + assert len(deltas) == 1 + assert isinstance(deltas[0].delta, ReasoningContentDelta) + assert deltas[0].delta.content_delta == "seed" + + async def test_reasoning_no_started_emits_standalone_full(self) -> None: + """If item.completed arrives without item.started, emit a standalone Full.""" + events = [ + { + "type": "item.completed", + "item": {"id": "r_orphan", "type": "reasoning", "text": "orphan thought"}, + } + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + fulls = [e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ReasoningContent)] + assert len(fulls) == 1 + assert isinstance(fulls[0].content, ReasoningContent) + orphan_content = fulls[0].content.content + assert orphan_content is not None + assert any("orphan thought" in s for s in orphan_content) + + async def test_reasoning_summary_is_first_line(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "r2", "type": "reasoning", "text": ""}}, + { + "type": "item.completed", + "item": {"id": "r2", "type": "reasoning", "text": "line one\nline 
two"}, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + full = next(e for e in out if isinstance(e, StreamTaskMessageFull) and isinstance(e.content, ReasoningContent)) + assert isinstance(full.content, ReasoningContent) + assert full.content.summary == ["line one"] + + +# --------------------------------------------------------------------------- +# Error events +# --------------------------------------------------------------------------- + + +class TestErrorEvents: + async def test_turn_failed_emits_error_text(self) -> None: + events = [{"type": "turn.failed", "error": {"message": "context length exceeded"}}] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + assert len(out) == 1 + assert isinstance(out[0], StreamTaskMessageFull) + assert isinstance(out[0].content, TextContent) + assert "context length exceeded" in out[0].content.content + + async def test_top_level_error_emits_text(self) -> None: + events = [{"type": "error", "message": "unexpected EOF"}] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + assert len(out) == 1 + assert isinstance(out[0].content, TextContent) + assert "unexpected EOF" in out[0].content.content + + async def test_item_error_emits_on_completed_only(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "e1", "type": "error", "message": "bad"}}, + {"type": "item.completed", "item": {"id": "e1", "type": "error", "message": "bad"}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + # Only item.completed emits an event for error items + assert len(out) == 1 + assert isinstance(out[0].content, TextContent) + assert "bad" in out[0].content.content + + +# --------------------------------------------------------------------------- +# Edge cases +# --------------------------------------------------------------------------- + + +class TestEdgeCases: + async def test_empty_stream(self) -> None: + out = await 
_collect(convert_codex_to_agentex_events(_aiter([]))) + assert out == [] + + async def test_non_json_lines_skipped(self) -> None: + events: list[str] = ["not json", "also not json"] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + assert out == [] + + async def test_blank_lines_skipped(self) -> None: + out = await _collect(convert_codex_to_agentex_events(_aiter(["", " ", "\n"]))) + assert out == [] + + async def test_pre_decoded_dict_events(self) -> None: + """Events passed as dicts (pre-decoded) should work without JSON parsing.""" + events: list[dict[str, Any]] = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": "hi"}}, + { + "type": "item.completed", + "item": {"id": "m1", "type": "agent_message", "text": "hi"}, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + assert len(out) > 0 + + async def test_thread_started_no_message(self) -> None: + events = [{"type": "thread.started", "thread_id": "t1"}] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + assert out == [] + + async def test_turn_started_no_message(self) -> None: + out = await _collect(convert_codex_to_agentex_events(_aiter([{"type": "turn.started"}]))) + assert out == [] + + async def test_turn_completed_no_message(self) -> None: + out = await _collect( + convert_codex_to_agentex_events(_aiter([{"type": "turn.completed", "usage": {"input_tokens": 1}}])) + ) + assert out == [] + + async def test_unknown_event_type_no_message(self) -> None: + out = await _collect(convert_codex_to_agentex_events(_aiter([{"type": "some.future.event"}]))) + assert out == [] + + async def test_unknown_item_type_no_message(self) -> None: + out = await _collect( + convert_codex_to_agentex_events( + _aiter([{"type": "item.started", "item": {"id": "x", "type": "future_item"}}]) + ) + ) + assert out == [] + + +# --------------------------------------------------------------------------- +# on_result callback +# 
--------------------------------------------------------------------------- + + +class TestOnResult: + async def test_session_id_captured(self) -> None: + result: dict[str, Any] = {} + + def on_result(r: dict[str, Any]) -> None: + result.update(r) + + events = [ + {"type": "thread.started", "thread_id": "sess-xyz"}, + { + "type": "turn.completed", + "usage": {"input_tokens": 5, "output_tokens": 3, "total_tokens": 8}, + }, + ] + await _collect(convert_codex_to_agentex_events(_aiter(events), on_result=on_result)) + assert result["session_id"] == "sess-xyz" + + async def test_usage_forwarded(self) -> None: + result: dict[str, Any] = {} + + def on_result(r: dict[str, Any]) -> None: + result.update(r) + + events = [ + { + "type": "turn.completed", + "usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + } + ] + await _collect(convert_codex_to_agentex_events(_aiter(events), on_result=on_result)) + assert result["usage"] == {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15} + + async def test_tool_count(self) -> None: + result: dict[str, Any] = {} + + def on_result(r: dict[str, Any]) -> None: + result.update(r) + + events = [ + { + "type": "item.started", + "item": {"id": "t1", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "t1", + "type": "command_execution", + "command": "ls", + "aggregated_output": ".", + "exit_code": 0, + }, + }, + {"type": "turn.completed", "usage": None}, + ] + await _collect(convert_codex_to_agentex_events(_aiter(events), on_result=on_result)) + assert result["tool_call_count"] == 1 + + async def test_no_callback_when_none(self) -> None: + """Passing on_result=None should not raise.""" + events = [{"type": "turn.completed", "usage": None}] + out = await _collect(convert_codex_to_agentex_events(_aiter(events), on_result=None)) + assert out == [] + + async def test_on_result_called_even_without_turn_completed(self) -> None: + """on_result fires at end of stream even 
if turn.completed never arrived.""" + result: dict[str, Any] = {} + + def on_result(r: dict[str, Any]) -> None: + result.update(r) + + events: list[Any] = [] + await _collect(convert_codex_to_agentex_events(_aiter(events), on_result=on_result)) + assert result.get("usage") is None + assert result.get("session_id") is None + + +# --------------------------------------------------------------------------- +# Multi-step turn: tool → text +# --------------------------------------------------------------------------- + + +class TestMultiStepTurn: + async def test_tool_then_text_monotonic_indices(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": "cmd1", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "cmd1", + "type": "command_execution", + "command": "ls", + "aggregated_output": "file.txt", + "exit_code": 0, + }, + }, + { + "type": "item.started", + "item": {"id": "msg1", "type": "agent_message", "text": ""}, + }, + { + "type": "item.completed", + "item": {"id": "msg1", "type": "agent_message", "text": "Done"}, + }, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + indices = [e.index for e in out] + assert indices == sorted(indices), "indices must be monotonically non-decreasing" + + async def test_two_text_blocks_distinct_indices(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": "a", "type": "agent_message", "text": "first"}, + }, + {"type": "item.completed", "item": {"id": "a", "type": "agent_message", "text": "first"}}, + { + "type": "item.started", + "item": {"id": "b", "type": "agent_message", "text": "second"}, + }, + {"type": "item.completed", "item": {"id": "b", "type": "agent_message", "text": "second"}}, + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(events))) + starts = [e for e in out if isinstance(e, StreamTaskMessageStart)] + assert len(starts) == 2 + assert starts[0].index != starts[1].index + + async def 
test_json_string_events(self) -> None: + """Events may arrive as raw newline-delimited JSON strings.""" + raw_events = [ + json.dumps({"type": "item.started", "item": {"id": "s1", "type": "agent_message", "text": "hello"}}), + json.dumps({"type": "item.completed", "item": {"id": "s1", "type": "agent_message", "text": "hello"}}), + ] + out = await _collect(convert_codex_to_agentex_events(_aiter(raw_events))) + assert len(out) > 0 + assert any(isinstance(e, StreamTaskMessageStart) for e in out) diff --git a/tests/lib/adk/test_codex_turn.py b/tests/lib/adk/test_codex_turn.py new file mode 100644 index 000000000..f6a046478 --- /dev/null +++ b/tests/lib/adk/test_codex_turn.py @@ -0,0 +1,282 @@ +"""Offline tests for CodexTurn and codex_usage_to_turn_usage. + +Tests cover: +- TurnUsage normalization from raw codex usage dicts +- Defensive handling of missing/invalid usage fields +- CodexTurn: events property yields canonical StreamTaskMessage* +- CodexTurn: usage() before and after stream exhaustion +- CodexTurn: on_result wiring (session_id, counts propagate to usage()) +- CodexTurn satisfies HarnessTurn protocol +""" + +from __future__ import annotations + +from typing import Any, AsyncIterator + +import pytest + +from agentex.lib.core.harness.types import TurnUsage, HarnessTurn +from agentex.types.task_message_update import ( + StreamTaskMessageDone, + StreamTaskMessageFull, + StreamTaskMessageDelta, + StreamTaskMessageStart, +) +from agentex.lib.adk._modules._codex_turn import ( + CodexTurn, + codex_usage_to_turn_usage, +) + + +async def _aiter(items: list[Any]) -> AsyncIterator[Any]: + for item in items: + yield item + + +async def _collect(turn: CodexTurn) -> list[Any]: + return [msg async for msg in turn.events] + + +# --------------------------------------------------------------------------- +# codex_usage_to_turn_usage +# --------------------------------------------------------------------------- + + +class TestCodexUsageToTurnUsage: + def 
test_none_raw_all_none_tokens(self) -> None: + u = codex_usage_to_turn_usage(None) + assert u.input_tokens is None + assert u.output_tokens is None + assert u.total_tokens is None + assert u.cost_usd is None + + def test_empty_dict_all_none_tokens(self) -> None: + u = codex_usage_to_turn_usage({}) + assert u.input_tokens is None + assert u.output_tokens is None + + def test_standard_usage(self) -> None: + raw = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150} + u = codex_usage_to_turn_usage(raw, model="o4-mini") + assert u.input_tokens == 100 + assert u.output_tokens == 50 + assert u.total_tokens == 150 + assert u.model == "o4-mini" + + def test_reasoning_tokens(self) -> None: + raw = {"input_tokens": 200, "output_tokens": 80, "reasoning_tokens": 60, "total_tokens": 340} + u = codex_usage_to_turn_usage(raw) + assert u.reasoning_tokens == 60 + + def test_real_zero_preserved(self) -> None: + """Explicit zeros in the payload must survive (not be treated as missing).""" + raw = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0} + u = codex_usage_to_turn_usage(raw) + assert u.input_tokens == 0 + assert u.output_tokens == 0 + + def test_cached_input_tokens(self) -> None: + raw = {"input_tokens": 100, "cached_input_tokens": 20, "output_tokens": 40} + u = codex_usage_to_turn_usage(raw) + assert u.cached_input_tokens == 20 + + def test_invalid_token_values_become_none(self) -> None: + raw = {"input_tokens": "not_a_number", "output_tokens": None} + u = codex_usage_to_turn_usage(raw) + assert u.input_tokens is None + assert u.output_tokens is None + + def test_cost_explicit(self) -> None: + u = codex_usage_to_turn_usage(None, cost_usd=0.0042) + assert u.cost_usd == pytest.approx(0.0042) + + def test_cost_from_raw(self) -> None: + u = codex_usage_to_turn_usage({"cost_usd": 0.001}) + assert u.cost_usd == pytest.approx(0.001) + + def test_explicit_cost_overrides_raw(self) -> None: + """Explicit cost_usd kwarg takes precedence over raw dict value.""" + u = 
codex_usage_to_turn_usage({"cost_usd": 0.001}, cost_usd=0.002) + assert u.cost_usd == pytest.approx(0.002) + + def test_tool_and_reasoning_counts(self) -> None: + u = codex_usage_to_turn_usage(None, tool_call_count=3, reasoning_count=2) + assert u.num_tool_calls == 3 + assert u.num_reasoning_blocks == 2 + + def test_num_llm_calls_always_one(self) -> None: + u = codex_usage_to_turn_usage(None) + assert u.num_llm_calls == 1 + + def test_duration_ms(self) -> None: + u = codex_usage_to_turn_usage(None, duration_ms=1234) + assert u.duration_ms == 1234 + + def test_model_none_when_not_provided(self) -> None: + u = codex_usage_to_turn_usage(None) + assert u.model is None + + def test_non_dict_raw_treated_as_empty(self) -> None: + u = codex_usage_to_turn_usage("bad input") # type: ignore[arg-type] + assert u.input_tokens is None + + def test_returns_turn_usage_instance(self) -> None: + u = codex_usage_to_turn_usage({}) + assert isinstance(u, TurnUsage) + + +# --------------------------------------------------------------------------- +# CodexTurn protocol conformance +# --------------------------------------------------------------------------- + + +class TestCodexTurnProtocol: + def test_implements_harness_turn_protocol(self) -> None: + turn = CodexTurn(_aiter([]), model="o4-mini") + assert isinstance(turn, HarnessTurn) + + def test_usage_before_exhaustion_returns_zero_turn_usage(self) -> None: + turn = CodexTurn(_aiter([]), model="test-model") + u = turn.usage() + assert isinstance(u, TurnUsage) + assert u.model == "test-model" + assert u.input_tokens is None + assert u.num_tool_calls == 0 + + +# --------------------------------------------------------------------------- +# CodexTurn events +# --------------------------------------------------------------------------- + + +class TestCodexTurnEvents: + async def test_events_yield_stream_task_messages(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "m1", "type": "agent_message", "text": "hi"}}, + 
{"type": "item.completed", "item": {"id": "m1", "type": "agent_message", "text": "hi"}}, + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + out = await _collect(turn) + assert len(out) > 0 + for msg in out: + assert isinstance( + msg, + (StreamTaskMessageStart, StreamTaskMessageDelta, StreamTaskMessageFull, StreamTaskMessageDone), + ) + + async def test_usage_after_exhaustion_has_tokens(self) -> None: + events = [ + { + "type": "turn.completed", + "usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + } + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + await _collect(turn) + u = turn.usage() + assert u.input_tokens == 10 + assert u.output_tokens == 5 + assert u.total_tokens == 15 + + async def test_usage_model_propagated(self) -> None: + events = [{"type": "turn.completed", "usage": None}] + turn = CodexTurn(_aiter(events), model="codex-model-x") + await _collect(turn) + assert turn.usage().model == "codex-model-x" + + async def test_tool_count_in_usage(self) -> None: + events = [ + { + "type": "item.started", + "item": {"id": "t1", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "t1", + "type": "command_execution", + "command": "ls", + "aggregated_output": ".", + "exit_code": 0, + }, + }, + {"type": "turn.completed", "usage": None}, + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + await _collect(turn) + assert turn.usage().num_tool_calls == 1 + + async def test_events_property_stable_across_accesses(self) -> None: + """`.events` returns the same generator; usage survives a second access.""" + events = [ + { + "type": "item.started", + "item": {"id": "t1", "type": "command_execution", "command": "ls"}, + }, + { + "type": "item.completed", + "item": { + "id": "t1", + "type": "command_execution", + "command": "ls", + "aggregated_output": ".", + "exit_code": 0, + }, + }, + {"type": "turn.completed", "usage": {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}}, + 
] + turn = CodexTurn(_aiter(events), model="o4-mini") + assert turn.events is turn.events # same generator, not a fresh wrapper + await _collect(turn) + # A second access must NOT re-wrap the exhausted iterator and reset usage. + _ = turn.events + assert turn.usage().total_tokens == 15 + assert turn.usage().num_tool_calls == 1 + + async def test_reasoning_count_in_usage(self) -> None: + events = [ + {"type": "item.started", "item": {"id": "r1", "type": "reasoning", "text": ""}}, + { + "type": "item.completed", + "item": {"id": "r1", "type": "reasoning", "text": "thought"}, + }, + {"type": "turn.completed", "usage": None}, + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + await _collect(turn) + assert turn.usage().num_reasoning_blocks == 1 + + async def test_duration_ms_passed_through(self) -> None: + events = [{"type": "turn.completed", "usage": None}] + turn = CodexTurn(_aiter(events), model="o4-mini", duration_ms=999) + await _collect(turn) + assert turn.usage().duration_ms == 999 + + async def test_cost_usd_passed_through(self) -> None: + events = [{"type": "turn.completed", "usage": None}] + turn = CodexTurn(_aiter(events), model="o4-mini", cost_usd=0.007) + await _collect(turn) + assert turn.usage().cost_usd == pytest.approx(0.007) + + async def test_empty_stream_usage_still_valid(self) -> None: + turn = CodexTurn(_aiter([]), model="o4-mini") + await _collect(turn) + u = turn.usage() + assert isinstance(u, TurnUsage) + assert u.num_llm_calls == 1 + + async def test_reasoning_tokens_propagated(self) -> None: + events = [ + { + "type": "turn.completed", + "usage": { + "input_tokens": 100, + "output_tokens": 60, + "reasoning_tokens": 40, + "total_tokens": 200, + }, + } + ] + turn = CodexTurn(_aiter(events), model="o4-mini") + await _collect(turn) + assert turn.usage().reasoning_tokens == 40 diff --git a/tests/lib/core/harness/conformance/test_codex_conformance.py b/tests/lib/core/harness/conformance/test_codex_conformance.py new file mode 100644 index 
000000000..b00ed2970 --- /dev/null +++ b/tests/lib/core/harness/conformance/test_codex_conformance.py @@ -0,0 +1,225 @@ +"""Conformance fixtures for the codex harness tap. + +Each fixture is derived from a ``CodexTurn`` and registered into the +cross-channel conformance runner so that span derivation is validated +alongside all other harness taps. + +Following the per-module registry pattern from runner.py: this module keeps +its own local list of fixtures, both registers them AND parametrizes over +them, to guarantee determinism regardless of pytest collection order. +""" + +from __future__ import annotations + +import asyncio +from typing import Any, AsyncIterator + +import pytest + +from agentex.lib.core.harness.types import StreamTaskMessage +from agentex.lib.adk._modules._codex_sync import convert_codex_to_agentex_events + +from .runner import Fixture, register, derive_all + + +async def _aiter(items: list[Any]) -> AsyncIterator[Any]: + for item in items: + yield item + + +async def _collect(events: list[Any]) -> list[StreamTaskMessage]: + return [msg async for msg in convert_codex_to_agentex_events(_aiter(events))] + + +def _build(events: list[Any]) -> list[StreamTaskMessage]: + return asyncio.run(_collect(events)) + + +# --------------------------------------------------------------------------- +# Fixture 1: plain text response +# --------------------------------------------------------------------------- + +_CODEX_TEXT = Fixture( + name="codex-text", + events=_build( + [ + {"type": "thread.started", "thread_id": "thread-abc"}, + {"type": "turn.started"}, + { + "type": "item.started", + "item": {"id": "msg1", "type": "agent_message", "text": "Hello"}, + }, + { + "type": "item.updated", + "item": {"id": "msg1", "type": "agent_message", "text": "Hello, world"}, + }, + { + "type": "item.completed", + "item": {"id": "msg1", "type": "agent_message", "text": "Hello, world!"}, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 10, "output_tokens": 5, 
"total_tokens": 15}, + }, + ] + ), +) +register(_CODEX_TEXT) + +# --------------------------------------------------------------------------- +# Fixture 2: tool call (command_execution) +# --------------------------------------------------------------------------- + +_CODEX_TOOL = Fixture( + name="codex-tool-command", + events=_build( + [ + {"type": "thread.started", "thread_id": "thread-cmd"}, + { + "type": "item.started", + "item": { + "id": "tool1", + "type": "command_execution", + "command": "ls /workspace", + }, + }, + { + "type": "item.completed", + "item": { + "id": "tool1", + "type": "command_execution", + "command": "ls /workspace", + "aggregated_output": "file1.txt\nfile2.py", + "exit_code": 0, + }, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 20, "output_tokens": 8, "total_tokens": 28}, + }, + ] + ), +) +register(_CODEX_TOOL) + +# --------------------------------------------------------------------------- +# Fixture 3: reasoning block +# --------------------------------------------------------------------------- + +_CODEX_REASONING = Fixture( + name="codex-reasoning", + events=_build( + [ + {"type": "thread.started", "thread_id": "thread-reason"}, + { + "type": "item.started", + "item": {"id": "r1", "type": "reasoning", "text": ""}, + }, + { + "type": "item.updated", + "item": {"id": "r1", "type": "reasoning", "text": "Step 1: analyze the problem"}, + }, + { + "type": "item.completed", + "item": { + "id": "r1", + "type": "reasoning", + "text": "Step 1: analyze the problem\nStep 2: solve it", + }, + }, + { + "type": "item.started", + "item": {"id": "msg2", "type": "agent_message", "text": ""}, + }, + { + "type": "item.completed", + "item": {"id": "msg2", "type": "agent_message", "text": "The answer is 42."}, + }, + { + "type": "turn.completed", + "usage": { + "input_tokens": 30, + "output_tokens": 20, + "reasoning_tokens": 50, + "total_tokens": 100, + }, + }, + ] + ), +) +register(_CODEX_REASONING) + +# 
--------------------------------------------------------------------------- +# Fixture 4: multi-step (mcp_tool_call + follow-up text) +# --------------------------------------------------------------------------- + +_CODEX_MULTI = Fixture( + name="codex-multi-step", + events=_build( + [ + {"type": "thread.started", "thread_id": "thread-multi"}, + { + "type": "item.started", + "item": { + "id": "mcp1", + "type": "mcp_tool_call", + "server": "filesystem", + "tool": "read_file", + "arguments": {"path": "/workspace/README.md"}, + }, + }, + { + "type": "item.completed", + "item": { + "id": "mcp1", + "type": "mcp_tool_call", + "server": "filesystem", + "tool": "read_file", + "arguments": {"path": "/workspace/README.md"}, + "result": {"content": "# My Project"}, + }, + }, + { + "type": "item.started", + "item": {"id": "msg3", "type": "agent_message", "text": "The README says:"}, + }, + { + "type": "item.completed", + "item": { + "id": "msg3", + "type": "agent_message", + "text": "The README says: # My Project", + }, + }, + { + "type": "turn.completed", + "usage": {"input_tokens": 50, "output_tokens": 30, "total_tokens": 80}, + }, + ] + ), +) +register(_CODEX_MULTI) + + +# --------------------------------------------------------------------------- +# Local parametrized tests (cross-channel conformance) +# --------------------------------------------------------------------------- + +_LOCAL_FIXTURES = [_CODEX_TEXT, _CODEX_TOOL, _CODEX_REASONING, _CODEX_MULTI] + + +@pytest.mark.parametrize("fixture", _LOCAL_FIXTURES, ids=lambda f: f.name) +def test_codex_span_derivation_is_deterministic(fixture: Fixture) -> None: + """Span derivation over codex events is deterministic (cross-channel guarantee). + + Deriving twice over the same events yields identical signals. This is the + invariant that makes ``yield`` and ``auto_send`` delivery equivalent: both + observe the same event stream, so their tracing side effects are identical. 
+ """ + assert derive_all(fixture.events) == derive_all(fixture.events) + + +@pytest.mark.parametrize("fixture", _LOCAL_FIXTURES, ids=lambda f: f.name) +def test_codex_events_are_non_empty(fixture: Fixture) -> None: + """Every codex fixture yields at least one StreamTaskMessage*.""" + assert len(fixture.events) > 0 From fa60632f9be84315a3fdc627745ae5b605994bd8 Mon Sep 17 00:00:00 2001 From: Declan Brady Date: Mon, 22 Jun 2026 20:10:48 -0400 Subject: [PATCH 09/10] feat(harness): public adk facade + docs for the unified harness surface (PR 9) (#423) --- adk/docs/harness.md | 196 ++++++++++++++++++++++++++++++++ src/agentex/lib/adk/__init__.py | 23 ++++ 2 files changed, 219 insertions(+) create mode 100644 adk/docs/harness.md diff --git a/adk/docs/harness.md b/adk/docs/harness.md new file mode 100644 index 000000000..6a9d8947a --- /dev/null +++ b/adk/docs/harness.md @@ -0,0 +1,196 @@ +# Unified Harness Surface + +The unified harness surface gives every agent harness (pydantic-ai, LangGraph, OpenAI Agents, and future parsers) a single, shared path to streaming, message persistence, and tracing. The Agentex `StreamTaskMessage*` event stream is the canonical wire format. A harness tap produces that stream once; the shared machinery delivers it and derives spans from it. + +All public names are re-exported from `agentex.lib.adk`: + +```python +from agentex.lib.adk import ( + UnifiedEmitter, + SpanTracer, + TurnUsage, + TurnResult, + HarnessTurn, + StreamTaskMessage, + OpenSpan, + CloseSpan, + SpanSignal, +) +``` + +The implementation lives at `src/agentex/lib/core/harness/`. + +--- + +## The canonical stream: `StreamTaskMessage` + +`StreamTaskMessage` is a union of the four wire-protocol update types: + +``` +StreamTaskMessageStart - opens a content slot (text, reasoning, tool request, ...) +StreamTaskMessageDelta - appends a token/fragment to an open slot +StreamTaskMessageFull - posts a complete message in one shot (tool response, ...) 
+StreamTaskMessageDone - closes an open slot +``` + +Every harness tap produces a sequence of these. Everything downstream (delivery, tracing) reads the same sequence. + +--- + +## Per-harness taps: `convert_<harness>_to_agentex_events` + +A tap is an async generator that translates the harness's native event stream into `StreamTaskMessage*` events. The currently shipped taps are: + +| Harness | Tap function | Exported from | +|---|---|---| +| pydantic-ai | `convert_pydantic_ai_to_agentex_events` | `agentex.lib.adk` | +| LangGraph | `convert_langgraph_to_agentex_events` | `agentex.lib.adk` | + +Taps for claude-code and codex will be added in subsequent PRs (AGX1-420, AGX1-421) and exported from `agentex.lib.adk` in the same way. + +--- + +## `HarnessTurn` protocol + +`HarnessTurn` is the interface a harness turn object must satisfy to plug into `UnifiedEmitter`: + +```python +@runtime_checkable +class HarnessTurn(Protocol): + @property + def events(self) -> AsyncIterator[StreamTaskMessage]: ... + + def usage(self) -> TurnUsage: ... +``` + +`events` is the canonical stream for this turn. `usage()` is valid only after `events` is exhausted (async generators cannot cleanly return a value to the consumer, so usage travels out-of-band). + +--- + +## `TurnUsage` + +Token counts and cost for one turn, harness-independent: + +```python +class TurnUsage(BaseModel): + model: str | None = None + input_tokens: int | None = None + output_tokens: int | None = None + cached_input_tokens: int | None = None + reasoning_tokens: int | None = None + total_tokens: int | None = None + cost_usd: float | None = None + duration_ms: int | None = None + num_llm_calls: int = 0 + num_tool_calls: int = 0 + num_reasoning_blocks: int = 0 +``` + +Field names align with `agentex.lib.core.observability.llm_metrics` for easy conversion. + +--- + +## `UnifiedEmitter` + +`UnifiedEmitter` ties a turn's canonical stream, tracing context, and delivery mode together.
Construct one per turn with the task/trace context from the request: + +```python +emitter = UnifiedEmitter( + task_id=params.task.id, + trace_id=params.task.id, # or None to disable tracing + parent_span_id=turn_span.id if turn_span else None, +) +``` + +**Tracing is on by default** when `trace_id` is provided. To disable it explicitly, pass `tracer=False`. To inject a custom `SpanTracer` (e.g. in tests), pass it as `tracer=`. + +### Delivery mode 1: `yield_turn` (sync HTTP ACP) + +For sync ACP agents that return events directly over the HTTP response: + +```python +@acp.on_message_send +async def handle(params): + turn = MyHarnessTurn(params) # implements HarnessTurn + async for event in emitter.yield_turn(turn): + yield event +``` + +`yield_turn` forwards each event to the caller and traces spans as a side effect. It is a passthrough when `tracer` is `None`. + +### Delivery mode 2: `auto_send_turn` (async/Temporal) + +For async or Temporal agents that push to the task stream via Redis: + +```python +result: TurnResult = await emitter.auto_send_turn(turn, created_at=workflow.now()) +``` + +`auto_send_turn` drives `adk.streaming` contexts for every message in the stream, derives and records spans, and returns a `TurnResult` with the final text and usage. Pass `created_at` under Temporal to back-date message timestamps deterministically. + +--- + +## `TurnResult` + +```python +class TurnResult(BaseModel): + final_text: str = "" + usage: TurnUsage = TurnUsage() +``` + +Returned by `auto_send_turn`. `final_text` is the last text segment of the turn (multi-step runs return only the final segment, matching `stream_langgraph_events` / `stream_pydantic_ai_events` semantics). + +--- + +## Tracing: span derivation + +Spans are derived from the canonical stream by `SpanDeriver` (pure, no `adk` dependency) and dispatched to `adk.tracing` by `SpanTracer`. 
The mapping: + +- `StreamTaskMessageStart(ToolRequestContent)` + `StreamTaskMessageDone` on that index -> tool span open (keyed by `tool_call_id`) +- `StreamTaskMessageFull(ToolResponseContent)` whose `tool_call_id` was opened -> tool span close +- `StreamTaskMessageFull(ToolRequestContent)` (harnesses that emit tool calls as Full) -> opens a tool span; matching `Full(ToolResponseContent)` closes it +- `StreamTaskMessageStart(ReasoningContent)` + `StreamTaskMessageDone` -> reasoning span + +`SpanTracer` is `SpanDeriver`'s consumer. You can inject a custom `SpanTracer` via `UnifiedEmitter(tracer=...)` for advanced use or testing. + +--- + +## Usage examples by channel + +### Sync ACP (pydantic-ai tap) + +```python +import agentex.lib.adk as adk +from agentex.lib.adk import UnifiedEmitter, convert_pydantic_ai_to_agentex_events + +@acp.on_message_send +async def handle(params): + task_id = params.task.id + async with adk.tracing.span(trace_id=task_id, name="message", ...) as turn_span: + emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=turn_span.id if turn_span else None, + ) + tap = convert_pydantic_ai_to_agentex_events(pydantic_stream) + # wrap tap in a HarnessTurn then yield_turn, or yield directly: + async for event in tap: + yield event +``` + +For the pre-unified sync path the tap is still yielded directly; `UnifiedEmitter.yield_turn` is the forward-looking integration point when a `HarnessTurn` wrapper is available. + +### Async Temporal (auto-send) + +```python +from agentex.lib.adk import UnifiedEmitter + +emitter = UnifiedEmitter( + task_id=task_id, + trace_id=task_id, + parent_span_id=parent_span_id, +) +result = await emitter.auto_send_turn(turn, created_at=workflow.now()) +# result.final_text — last text segment +# result.usage — TurnUsage (tokens, cost, ...)
+``` diff --git a/src/agentex/lib/adk/__init__.py b/src/agentex/lib/adk/__init__.py index f6713be7c..fedd52f7a 100644 --- a/src/agentex/lib/adk/__init__.py +++ b/src/agentex/lib/adk/__init__.py @@ -27,6 +27,19 @@ from agentex.lib.adk._modules.tasks import TasksModule from agentex.lib.adk._modules.tracing import TracingModule +# Unified harness surface (AGX1-375) +from agentex.lib.core.harness import ( + UnifiedEmitter, + SpanTracer, + OpenSpan, + CloseSpan, + SpanSignal, + StreamTaskMessage, + TurnUsage, + TurnResult, + HarnessTurn, +) + from agentex.lib.adk import providers from agentex.lib.adk import utils @@ -69,6 +82,16 @@ "convert_codex_to_agentex_events", "CodexTurn", "codex_usage_to_turn_usage", + # Unified harness surface (AGX1-375) + "UnifiedEmitter", + "SpanTracer", + "OpenSpan", + "CloseSpan", + "SpanSignal", + "StreamTaskMessage", + "TurnUsage", + "TurnResult", + "HarnessTurn", # Providers "providers", # Utils From eea2ce6c944114255f2a3652c831cd9d96ff0020 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Tue, 23 Jun 2026 00:11:09 +0000 Subject: [PATCH 10/10] chore: release main --- .release-please-manifest.json | 4 ++-- CHANGELOG.md | 20 ++++++++++++++++++++ adk/CHANGELOG.md | 8 ++++++++ adk/pyproject.toml | 2 +- pyproject.toml | 2 +- src/agentex/_version.py | 2 +- 6 files changed, 33 insertions(+), 5 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index be44cf037..9a40fa434 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,4 +1,4 @@ { - ".": "0.14.0", - "adk": "0.13.2" + ".": "0.15.0", + "adk": "0.14.0" } diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f81295a9..fb03baa67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,26 @@ * **tracing:** emit OTel metrics for async span queue depth, batch drain, and SGP export success/failure (HTTP status labels). 
Disable SDK-side recording with ``AGENTEX_TRACING_METRICS=0``. +## 0.15.0 (2026-06-23) + +Full Changelog: [agentex-client-v0.14.0...agentex-client-v0.15.0](https://github.com/scaleapi/scale-agentex-python/compare/agentex-client-v0.14.0...agentex-client-v0.15.0) + +### Features + +* **claude-code:** stream-json parser tap for the unified harness surface ([#420](https://github.com/scaleapi/scale-agentex-python/issues/420)) ([904339c](https://github.com/scaleapi/scale-agentex-python/commit/904339c21b8cd641a02d903c03d4a8730b4d7e84)) +* **codex:** event-stream parser tap for the unified harness surface ([#421](https://github.com/scaleapi/scale-agentex-python/issues/421)) ([9b2b031](https://github.com/scaleapi/scale-agentex-python/commit/9b2b03144cc67bb497e0a301686207aba2629758)) +* **harness:** public adk facade + docs for the unified harness surface (PR 9) ([#423](https://github.com/scaleapi/scale-agentex-python/issues/423)) ([fa60632](https://github.com/scaleapi/scale-agentex-python/commit/fa60632f9be84315a3fdc627745ae5b605994bd8)) +* **harness:** unified harness surface — foundation (span derivation, delivery adapters, emitter) ([#412](https://github.com/scaleapi/scale-agentex-python/issues/412)) ([a9cacf4](https://github.com/scaleapi/scale-agentex-python/commit/a9cacf4eb71697351ee658a570636f04bbf31ad5)) +* **langgraph:** migrate LangGraph harness onto unified surface ([#417](https://github.com/scaleapi/scale-agentex-python/issues/417)) ([d344228](https://github.com/scaleapi/scale-agentex-python/commit/d34422845de4b80ed69d2dccfdb0c680ef2fbca3)) +* **openai-agents:** migrate onto the unified harness surface ([#416](https://github.com/scaleapi/scale-agentex-python/issues/416)) ([d10e151](https://github.com/scaleapi/scale-agentex-python/commit/d10e1510bd5da44ad5acc5cac638750122083fce)) +* **pydantic-ai:** migrate onto unified harness surface (PR4) ([#415](https://github.com/scaleapi/scale-agentex-python/issues/415)) 
([5ec62c2](https://github.com/scaleapi/scale-agentex-python/commit/5ec62c20781d24fc3e0b92734fcd444b1e791d70)) +* **streaming:** stream tool call argument deltas in TemporalStreamingModel ([#355](https://github.com/scaleapi/scale-agentex-python/issues/355)) ([c8de1d4](https://github.com/scaleapi/scale-agentex-python/commit/c8de1d4c9c3b5b3c16ad4aaf9644c1ba0d618757)) + + +### Bug Fixes + +* **harness:** assert cross-channel (yield vs auto-send) conformance equivalence [AGX1-373] ([#414](https://github.com/scaleapi/scale-agentex-python/issues/414)) ([694960f](https://github.com/scaleapi/scale-agentex-python/commit/694960f913b8ba521d9236e876e5e00f57a3a3ff)) + ## 0.14.0 (2026-06-22) Full Changelog: [agentex-client-v0.13.1...agentex-client-v0.14.0](https://github.com/scaleapi/scale-agentex-python/compare/agentex-client-v0.13.1...agentex-client-v0.14.0) diff --git a/adk/CHANGELOG.md b/adk/CHANGELOG.md index 8c15355d9..ac7404e6b 100644 --- a/adk/CHANGELOG.md +++ b/adk/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## 0.14.0 (2026-06-23) + +Full Changelog: [agentex-sdk-v0.13.2...agentex-sdk-v0.14.0](https://github.com/scaleapi/scale-agentex-python/compare/agentex-sdk-v0.13.2...agentex-sdk-v0.14.0) + +### Features + +* **harness:** public adk facade + docs for the unified harness surface (PR 9) ([#423](https://github.com/scaleapi/scale-agentex-python/issues/423)) ([fa60632](https://github.com/scaleapi/scale-agentex-python/commit/fa60632f9be84315a3fdc627745ae5b605994bd8)) + ## 0.13.2 (2026-06-22) Full Changelog: [agentex-sdk-v0.13.1...agentex-sdk-v0.13.2](https://github.com/scaleapi/scale-agentex-python/compare/agentex-sdk-v0.13.1...agentex-sdk-v0.13.2) diff --git a/adk/pyproject.toml b/adk/pyproject.toml index 946367d7f..1d8c00a40 100644 --- a/adk/pyproject.toml +++ b/adk/pyproject.toml @@ -4,7 +4,7 @@ # (agentex/{__init__.py, _*.py, types/, resources/}) ships from the slim # sibling package `agentex-client` which is pinned as a runtime dep. 
name = "agentex-sdk" -version = "0.13.2" +version = "0.14.0" description = "Agent Development Kit (ADK) overlay for the Agentex API — FastACP server, Temporal workflows, LLM provider integrations, observability" license = "Apache-2.0" authors = [ diff --git a/pyproject.toml b/pyproject.toml index 98134d993..7ee0cf56b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ # overlay (formerly `src/agentex/lib/*`) now lives in `adk/` and ships # as the sibling `agentex-sdk` package — see `adk/pyproject.toml`. name = "agentex-client" -version = "0.14.0" +version = "0.15.0" description = "The official Python REST client for the Agentex API" dynamic = ["readme"] license = "Apache-2.0" diff --git a/src/agentex/_version.py b/src/agentex/_version.py index 551c0dbac..c567e168b 100644 --- a/src/agentex/_version.py +++ b/src/agentex/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "agentex" -__version__ = "0.14.0" # x-release-please-version +__version__ = "0.15.0" # x-release-please-version