Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions packages/agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ Both classes mirror pi constructor shapes and behavior, with minimal additions:
- CUA model refs (`"provider:model"`) accepted where pi expects a concrete model
- `extraTools` to add your own pi tools alongside the built-in browser tools
- `computerUseExtra: true` to let the model use a small navigation helper
- `playwright: true` to let the model run Playwright/TypeScript against the
live browser session

If auth callbacks are omitted, both classes default to CUA env var conventions:
- OpenAI: `OPENAI_API_KEY`
Expand All @@ -124,6 +126,19 @@ URL or go back. `computerUseExtra: true` adds `computer_use_extra`, a
provider-neutral escape hatch exposing `goto`, `back`, `forward`, and `url`
so navigation works uniformly regardless of which model is driving.

Some steps are awkward as raw pointer/keyboard actions: precise DOM reads,
form fills, data extraction, or waiting on a specific selector.
`playwright: true` adds `playwright_execute`, which runs Playwright/TypeScript
directly against the live browser session. `page`, `context`, and `browser`
are in scope and the code may `return` a JSON-serializable value. Each call
runs in a fresh JS context (locals don't persist across calls) but the
browser session does carry over. No screenshot is returned automatically;
request one on a follow-up turn when the model needs to see the page.
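Playwright-level failures come back as tool content (so the model can adapt)
rather than as thrown errors; only a failure of the execute call itself is
rethrown, prefixed with `playwright_execute failed:`. Verified end-to-end
against Anthropic, Tzafon, and Yutori CUA models; OpenAI and Google are
covered by unit tests.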

### Model Switching

`CuaAgent` follows pi `Agent` semantics: assign `agent.state.model` to a
Expand Down
12 changes: 12 additions & 0 deletions packages/agent/src/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import {
import {
type Api,
CUA_NAVIGATION_TOOL_NAME,
CUA_PLAYWRIGHT_TOOL_NAME,
type CuaModelRef,
type CuaRuntimeSpec,
type CuaSimpleStreamOptions,
Expand Down Expand Up @@ -66,6 +67,8 @@ export type CuaAgentOptions = Omit<AgentOptions, "initialState"> & {
extraTools?: AgentTool[];
/** Expose a helper for browser navigation and URL reads. */
computerUseExtra?: boolean;
/** Expose a tool that runs Playwright code against the browser session. */
playwright?: boolean;
};

/**
Expand All @@ -89,6 +92,8 @@ export type CuaAgentHarnessOptions<
extraTools?: AgentTool[];
/** Expose a helper for browser navigation and URL reads. */
computerUseExtra?: boolean;
/** Expose a tool that runs Playwright code against the browser session. */
playwright?: boolean;
/** Optional payload hook composed after the provider-specific CUA payload hook. */
onPayload?: SimpleStreamOptions["onPayload"];
};
Expand All @@ -110,6 +115,7 @@ class CuaRuntimeController {
model: CuaRuntimeInput;
extraTools?: AgentTool[];
computerUseExtra?: boolean;
playwright?: boolean;
onPayload?: SimpleStreamOptions["onPayload"];
},
) {
Expand All @@ -136,6 +142,7 @@ class CuaRuntimeController {
{
toolExecutors: this.runtimeSpec.toolExecutors,
computerUseExtra: this.options.computerUseExtra,
playwright: this.options.playwright,
},
this.translator,
),
Expand All @@ -159,6 +166,7 @@ class CuaRuntimeController {
return [
...(this.options.extraTools ?? []).map((tool) => tool.name),
...(this.options.computerUseExtra ? [CUA_NAVIGATION_TOOL_NAME] : []),
...(this.options.playwright ? [CUA_PLAYWRIGHT_TOOL_NAME] : []),
];
}

Expand Down Expand Up @@ -203,6 +211,7 @@ export class CuaAgent extends Agent {
prepareNextTurn,
extraTools,
computerUseExtra,
playwright,
...agentOptions
} = options;
const runtime = new CuaRuntimeController({
Expand All @@ -211,6 +220,7 @@ export class CuaAgent extends Agent {
model: initialState.model,
extraTools,
computerUseExtra,
playwright,
onPayload,
});
const wrappedStreamFn: StreamFn = (model, context, streamOptions) => {
Expand Down Expand Up @@ -326,6 +336,7 @@ export class CuaAgentHarness<
model,
extraTools,
computerUseExtra,
playwright,
systemPrompt,
getApiKeyAndHeaders,
onPayload,
Expand All @@ -338,6 +349,7 @@ export class CuaAgentHarness<
model,
extraTools,
computerUseExtra,
playwright,
onPayload,
});
const resolvedTools = runtime.tools();
Expand Down
1 change: 1 addition & 0 deletions packages/agent/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export type {
ComputerToolOptions,
CuaExecutorTool,
NavigationDetails,
PlaywrightDetails,
} from "./tools";
export { CuaAgent, CuaAgentHarness } from "./agent";
export type { CuaAgentHarnessOptions, CuaAgentOptions, CuaAgentState } from "./agent";
106 changes: 99 additions & 7 deletions packages/agent/src/tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@ import type Kernel from "@onkernel/sdk";
import type { ImageContent, TextContent, Tool } from "@earendil-works/pi-ai";
import {
CUA_NAVIGATION_TOOL_NAME,
CUA_PLAYWRIGHT_TOOL_NAME,
createCuaNavigationToolDefinition,
createCuaPlaywrightToolDefinition,
type ComputerToolCoordinateSystem,
type CuaBatchInput,
type CuaNavigationInput,
type CuaPlaywrightInput,
type CuaScreenshotSpec,
type CuaToolExecutorSpec,
type TSchema,
Expand All @@ -20,6 +23,7 @@ export interface ComputerToolOptions {
coordinateSystem?: ComputerToolCoordinateSystem;
screenshot?: CuaScreenshotSpec;
computerUseExtra?: boolean;
playwright?: boolean;
}

type ToolContent = Array<TextContent | ImageContent>;
Expand All @@ -35,31 +39,59 @@ export interface NavigationDetails {
url?: string;
}

/**
 * Structured details for a `playwright_execute` tool result. Library
 * consumers can read these directly instead of re-parsing the model-facing
 * tool content blocks.
 *
 * Only `success` and `statusText` are always populated; every other field is
 * omitted (not set to `undefined`) when the execution did not report it.
 */
export interface PlaywrightDetails {
/**
 * Whether the Playwright code completed without error, copied from the
 * execution result. When `false`, the failure is also surfaced to the model
 * as an `error: …` text block in the tool content.
 */
success: boolean;
/**
 * Short human-readable status: `"Playwright executed successfully."` on
 * success, otherwise `"Playwright execution failed: <error>"`.
 */
statusText: string;
/** Value returned by the executed code; present whenever the execution result is not `undefined`. */
result?: unknown;
/** Raw, untrimmed standard output; present whenever it is a non-empty string (may be whitespace-only). */
stdout?: string;
/** Raw, untrimmed standard error; present whenever it is a non-empty string (may be whitespace-only). */
stderr?: string;
/**
 * Error message reported by the execution; present whenever it is non-empty.
 * NOTE(review): expected only alongside `success === false`, but the field is
 * copied independently of `success` — confirm against the SDK contract.
 */
error?: string;
}
Comment thread
cursor[bot] marked this conversation as resolved.

type BatchTool = AgentTool<TSchema, BatchDetails>;
type NavigationTool = AgentTool<TSchema, NavigationDetails>;
type PlaywrightTool = AgentTool<TSchema, PlaywrightDetails>;
type ActionTool = AgentTool<TSchema, BatchDetails>;
export type CuaExecutorTool = BatchTool | NavigationTool | ActionTool;
export type CuaExecutorTool = BatchTool | NavigationTool | PlaywrightTool | ActionTool;
type NavigationExecutorSpec = { kind: "navigation"; definition: Tool };
type ComputerExecutorSpec = CuaToolExecutorSpec | NavigationExecutorSpec;
type PlaywrightExecutorSpec = { kind: "playwright"; definition: Tool };
type ComputerExecutorSpec = CuaToolExecutorSpec | NavigationExecutorSpec | PlaywrightExecutorSpec;

/**
 * Create the executor tools described by `args`.
 *
 * A new `InternalComputerTranslator` is constructed from the same options and
 * handed, together with `args`, to `buildCuaComputerTools`.
 *
 * @param args - Tool options; also used to construct the translator.
 * @returns The tools produced by `buildCuaComputerTools`.
 */
export function createCuaComputerTools(args: ComputerToolOptions): CuaExecutorTool[] {
  const translator = new InternalComputerTranslator(args);
  return buildCuaComputerTools(args, translator);
}

/**
 * Build executor tools against an existing translator (internal; not part of the package surface).
 *
 * The executor specs in `args.toolExecutors` are first extended with the
 * optional helper executors selected by the boolean flags in `args`, then each
 * spec is turned into a tool by `createExecutorTool`, bound to `translator`.
 *
 * NOTE(review): this span comes from a rendered diff and appears to show both
 * the pre-change and post-change form of the `args` parameter and of the
 * `return` statement — confirm which lines belong to the current revision.
 */
export function buildCuaComputerTools(
args: Pick<ComputerToolOptions, "toolExecutors" | "computerUseExtra">,
args: Pick<ComputerToolOptions, "toolExecutors" | "computerUseExtra" | "playwright">,
translator: InternalComputerTranslator,
): CuaExecutorTool[] {
return withNavigationTool(args).map((executor) => createExecutorTool(executor, translator));
return withExtraTools(args).map((executor) => createExecutorTool(executor, translator));
}

/**
 * Return a copy of `args.toolExecutors` extended with the opt-in helper
 * executors: the navigation helper when `computerUseExtra` is set and the
 * Playwright executor when `playwright` is set. A helper is skipped when an
 * executor with the same tool name is already present in `args.toolExecutors`.
 *
 * NOTE(review): this span comes from a rendered diff and appears to contain
 * both the old (`withNavigationTool`) and new (`withExtraTools`) signature, as
 * well as the old and new form of the navigation push — confirm which lines
 * belong to the current revision.
 */
function withNavigationTool(args: Pick<ComputerToolOptions, "toolExecutors" | "computerUseExtra">): ComputerExecutorSpec[] {
function withExtraTools(args: Pick<ComputerToolOptions, "toolExecutors" | "computerUseExtra" | "playwright">): ComputerExecutorSpec[] {
// Copy so the caller's array is never mutated by the pushes below.
const executors: ComputerExecutorSpec[] = [...args.toolExecutors];
// Names are snapshotted before any helper is appended, so the duplicate
// checks only consider the executors supplied by the caller.
const existing = new Set(executors.map((executor) => executor.definition.name));
if (args.computerUseExtra && !existing.has(CUA_NAVIGATION_TOOL_NAME)) {
const definition = createCuaNavigationToolDefinition();
executors.push({ kind: "navigation", definition });
executors.push({ kind: "navigation", definition: createCuaNavigationToolDefinition() });
}
if (args.playwright && !existing.has(CUA_PLAYWRIGHT_TOOL_NAME)) {
executors.push({ kind: "playwright", definition: createCuaPlaywrightToolDefinition() });
}
return executors;
}
Expand All @@ -78,6 +110,19 @@ function createExecutorTool(executor: ComputerExecutorSpec, translator: Internal
};
return tool;
}
if (isPlaywrightExecutor(executor)) {
const tool: PlaywrightTool = {
name: definition.name,
label: definition.name,
description: definition.description,
parameters: definition.parameters,
executionMode: "sequential",
async execute(_toolCallId: string, params: unknown): Promise<AgentToolResult<PlaywrightDetails>> {
return executePlaywrightTool(translator, asPlaywrightInput(params));
},
};
return tool;
}
const tool: ActionTool = {
name: definition.name,
label: definition.name,
Expand All @@ -95,6 +140,10 @@ function isNavigationExecutor(executor: ComputerExecutorSpec): executor is Navig
return "kind" in executor && executor.kind === "navigation";
}

/**
 * Type guard for the Playwright executor spec.
 *
 * @param executor - Any executor spec handled by this module.
 * @returns `true` only when the spec carries a `kind` property equal to `"playwright"`.
 */
function isPlaywrightExecutor(executor: ComputerExecutorSpec): executor is PlaywrightExecutorSpec {
  // Specs without a `kind` tag can never be the Playwright executor.
  if (!("kind" in executor)) {
    return false;
  }
  return executor.kind === "playwright";
}

async function executeBatchTool(translator: InternalComputerTranslator, params: CuaBatchInput): Promise<AgentToolResult<BatchDetails>> {
const content: ToolContent = [];
const readResults: BatchDetails["readResults"] = [];
Expand Down Expand Up @@ -149,6 +198,42 @@ async function executeNavigationTool(translator: InternalComputerTranslator, par
}
}

/**
 * Run model-supplied Playwright code through the translator and convert the
 * outcome into a tool result.
 *
 * Model-facing content is emitted in a fixed order: `result`, `stdout`,
 * `stderr`, then `error` (only when the execution reports failure). If none of
 * those produced a block, the status text is sent as the sole block so the
 * content is never empty. Output streams are skipped in the content when they
 * are blank, but the raw values are still copied into `details` whenever they
 * are non-empty strings.
 *
 * @param translator - Translator used to execute the code.
 * @param params - Validated tool input (`code` and optional `timeout_sec`).
 * @returns Tool content for the model plus structured `PlaywrightDetails`.
 * @throws Error prefixed with `playwright_execute failed:` (original error
 *   attached as `cause`) when anything inside the `try` throws; an execution
 *   that merely reports `success === false` is returned, not thrown.
 */
async function executePlaywrightTool(translator: InternalComputerTranslator, params: CuaPlaywrightInput): Promise<AgentToolResult<PlaywrightDetails>> {
  try {
    const outcome = await translator.executePlaywright(params.code, params.timeout_sec);
    const { success, result, stdout, stderr, error } = outcome;

    const blocks: ToolContent = [];
    const addText = (text: string): void => {
      blocks.push({ type: "text", text });
    };

    if (result !== undefined) addText(`result: ${formatPlaywrightResult(result)}`);
    if (stdout?.trim()) addText(`stdout:\n${stdout.trimEnd()}`);
    if (stderr?.trim()) addText(`stderr:\n${stderr.trimEnd()}`);
    if (!success) addText(`error: ${error ?? "playwright execution reported failure"}`);

    const statusText = success
      ? "Playwright executed successfully."
      : `Playwright execution failed: ${error ?? "unknown error"}`;
    // Guarantee at least one content block for the model.
    if (blocks.length === 0) addText(statusText);

    // Optional fields are omitted entirely rather than set to `undefined`.
    const details: PlaywrightDetails = {
      success,
      statusText,
      ...(result !== undefined ? { result } : {}),
      ...(stdout ? { stdout } : {}),
      ...(stderr ? { stderr } : {}),
      ...(error ? { error } : {}),
    };
    return { content: blocks, details };
  } catch (err) {
    throw new Error(`playwright_execute failed: ${errorMessage(err)}`, { cause: err });
  }
}

/**
 * Render a Playwright return value as text for the model-facing tool content.
 *
 * Strings pass through unchanged (no surrounding JSON quotes); every other
 * value is JSON-encoded.
 *
 * @param result - Value returned by the executed code.
 * @returns A string representation; never `undefined`.
 * @throws TypeError if `JSON.stringify` cannot encode the value (for example
 *   a `bigint` or a circular structure).
 */
function formatPlaywrightResult(result: unknown): string {
  if (typeof result === "string") {
    return result;
  }
  // `JSON.stringify` yields `undefined` (not a string) for values with no JSON
  // representation — `undefined`, functions, symbols — so fall back to
  // `String()` to honor the declared return type.
  const encoded: string | undefined = JSON.stringify(result);
  return encoded ?? String(result);
}
Comment thread
cursor[bot] marked this conversation as resolved.

/**
 * Extract a printable message from a caught value.
 *
 * @param err - Anything that was thrown.
 * @returns `err.message` for `Error` instances, otherwise `String(err)`.
 */
function errorMessage(err: unknown): string {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
Expand All @@ -163,3 +248,10 @@ function asNavigationInput(value: unknown): CuaNavigationInput {
}
throw new Error("invalid computer_use_extra parameters");
}

/**
 * Validate raw tool-call parameters for `playwright_execute`.
 *
 * Only the presence of a string `code` property on a non-null object is
 * checked; the value is returned as-is (same reference), without copying or
 * inspecting any other property.
 *
 * @param value - Untrusted parameters received from the tool call.
 * @returns The same value, typed as `CuaPlaywrightInput`.
 * @throws Error with message `invalid playwright_execute parameters` when the
 *   shape check fails.
 */
function asPlaywrightInput(value: unknown): CuaPlaywrightInput {
  const isObject = typeof value === "object" && value !== null;
  if (!isObject || typeof (value as { code?: unknown }).code !== "string") {
    throw new Error("invalid playwright_execute parameters");
  }
  return value as CuaPlaywrightInput;
}
16 changes: 16 additions & 0 deletions packages/agent/src/translator/translator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,17 @@ export class InternalComputerTranslator {
return { x: Math.trunc(pos.x), y: Math.trunc(pos.y) };
}

/**
 * Execute Playwright code against this translator's browser session.
 *
 * The optional timeout is truncated to whole seconds and capped at
 * `PLAYWRIGHT_MAX_TIMEOUT_SEC`. When it is omitted, or truncates to anything
 * that is not at least 1 (including `NaN`), `timeout_sec` is left out of the
 * request altogether.
 *
 * @param code - Playwright source to run.
 * @param timeoutSec - Requested timeout in seconds.
 * @returns The result of the SDK's Playwright `execute` call.
 */
async executePlaywright(code: string, timeoutSec?: number): Promise<PlaywrightExecutionResult> {
  let timeout: number | undefined;
  if (timeoutSec !== undefined) {
    const wholeSeconds = Math.trunc(timeoutSec);
    if (wholeSeconds >= 1) {
      timeout = Math.min(wholeSeconds, PLAYWRIGHT_MAX_TIMEOUT_SEC);
    }
  }
  const request = timeout === undefined ? { code } : { code, timeout_sec: timeout };
  return this.client.browsers.playwright.execute(this.sessionId, request);
}

async executeBatch(actions: CuaAction[]): Promise<BatchExecutionResult> {
const result: BatchExecutionResult = { readResults: [] };
const pending: KernelBatchAction[] = [];
Expand Down Expand Up @@ -228,6 +239,11 @@ export class InternalComputerTranslator {
type KernelBatchAction =
Parameters<Kernel["browsers"]["computer"]["batch"]>[1]["actions"][number];

export type PlaywrightExecutionResult =
Awaited<ReturnType<Kernel["browsers"]["playwright"]["execute"]>>;

const PLAYWRIGHT_MAX_TIMEOUT_SEC = 300;

const CLICK_BUTTONS: ReadonlySet<string> = new Set<CuaMouseButton>(["left", "right", "middle", "back", "forward"]);
const DRAG_BUTTONS: ReadonlySet<string> = new Set<CuaDragMouseButton>(["left", "right", "middle"]);

Expand Down
17 changes: 17 additions & 0 deletions packages/agent/test/agent.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,23 @@ describe("CuaAgent", () => {
]);
});

it("synthesizes a playwright_execute tool when requested", () => {
const runtime = resolveCuaRuntimeSpec("openai:gpt-5.5");
const agent = new CuaAgent({
browser,
client,
playwright: true,
initialState: {
model: "openai:gpt-5.5",
},
});

expect(agent.state.tools.map((tool) => tool.name)).toEqual([
...runtime.toolExecutors.map((tool) => tool.definition.name),
"playwright_execute",
]);
});

it("refreshes CUA runtime state when state.model changes", () => {
const runtime = resolveCuaRuntimeSpec("google:gemini-3-flash-preview");
const agent = new CuaAgent({
Expand Down
Loading
Loading