diff --git a/.changeset/config.json b/.changeset/config.json index 917f42b..00e3c98 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -4,7 +4,7 @@ "commit": false, "fixed": [], "linked": [ - ["@browseragentprotocol/protocol", "@browseragentprotocol/logger", "@browseragentprotocol/client", "@browseragentprotocol/server-playwright", "@browseragentprotocol/mcp"] + ["@browseragentprotocol/protocol", "@browseragentprotocol/logger", "@browseragentprotocol/client", "@browseragentprotocol/server-playwright", "@browseragentprotocol/mcp", "@browseragentprotocol/cli"] ], "access": "public", "baseBranch": "main", diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 95fbd5d..5d7b016 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,10 @@ on: pull_request: branches: [main] +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: build: runs-on: ubuntu-latest @@ -35,5 +39,66 @@ jobs: - name: Lint run: pnpm lint + test: + runs-on: ubuntu-latest + needs: build + strategy: + fail-fast: false + matrix: + node-version: ["20", "22"] + steps: + - uses: actions/checkout@v4 + + - name: Setup pnpm + uses: pnpm/action-setup@v2 + with: + version: 9.15.0 + + - name: Setup Node.js ${{ matrix.node-version }} + uses: actions/setup-node@v4 + with: + node-version: ${{ matrix.node-version }} + cache: "pnpm" + + - name: Install dependencies + run: pnpm install --frozen-lockfile + + - name: Build + run: pnpm build + - name: Test run: pnpm test + + coverage: + runs-on: ubuntu-latest + needs: build + steps: + - uses: actions/checkout@v4 + + - name: Setup pnpm + uses: pnpm/action-setup@v2 + with: + version: 9.15.0 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: "22" + cache: "pnpm" + + - name: Install dependencies + run: pnpm install --frozen-lockfile + + - name: Build + run: pnpm build + + - name: Test with coverage + run: pnpm test:coverage + + - name: Upload 
coverage + if: always() + uses: actions/upload-artifact@v4 + with: + name: coverage-report + path: coverage/ + retention-days: 14 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..db0dcff --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,128 @@ +# Contributing to Browser Agent Protocol + +Thank you for your interest in contributing to BAP. This document covers the process for contributing to this project. + +## Getting Started + +### Prerequisites + +- Node.js >= 20.0.0 +- pnpm 9.x (`corepack enable && corepack prepare pnpm@9.15.0 --activate`) +- Git + +### Setup + +```bash +git clone https://github.com/browseragentprotocol/bap.git +cd bap +pnpm install +pnpm build +``` + +### Verify your setup + +```bash +pnpm typecheck # Type checking across all packages +pnpm lint # ESLint +pnpm test # Vitest test suites +``` + +## Repository Structure + +``` +packages/ + protocol/ # Core types, schemas, and shared utilities (Zod) + logger/ # Structured logging + client/ # TypeScript WebSocket client SDK + server-playwright/ # Playwright-based BAP server + mcp/ # MCP (Model Context Protocol) bridge + cli/ # Shell CLI for browser automation + python-sdk/ # Python client SDK +``` + +**Dependency order**: `protocol` -> `logger` -> `client` / `server-playwright` -> `mcp` / `cli` + +All packages are built with `tsup` and managed with `turborepo`. + +## Development Workflow + +### 1. Create a branch + +```bash +git checkout -b feature/your-feature-name +``` + +### 2. Make changes + +- Protocol changes go in `packages/protocol/src/types/` +- Server handler changes go in `packages/server-playwright/src/server.ts` +- Client SDK changes go in `packages/client/src/index.ts` +- CLI commands go in `packages/cli/src/commands/` + +### 3. 
Build and test + +```bash +pnpm build # Build all packages (respects dependency order) +pnpm typecheck # Must pass with zero errors +pnpm lint # Must pass with zero errors (warnings are acceptable) +pnpm test # All tests must pass +``` + +### 4. Submit a pull request + +- Keep PRs focused on a single change +- Include tests for new functionality +- Update relevant README files if adding user-facing features +- Reference any related issues + +## Code Style + +- TypeScript strict mode is enabled across all packages +- ESLint with `typescript-eslint` rules +- Prettier for formatting (`pnpm format`) +- Use Zod schemas for all protocol types (no raw `interface` for wire types) +- All new protocol fields must be optional for backward compatibility + +## Testing + +Tests use [Vitest](https://vitest.dev/). Each package has its own test configuration. + +```bash +# Run all tests +pnpm test + +# Run tests for a specific package +pnpm --filter @browseragentprotocol/protocol test + +# Run with coverage +pnpm test:coverage +``` + +### Test guidelines + +- Schema validation tests go in `packages/protocol/src/__tests__/` +- CLI flag/command tests go in `packages/cli/__tests__/` +- MCP tool tests go in `packages/mcp/src/__tests__/` +- Integration tests that require a browser go in `packages/server-playwright/src/__tests__/` + +## Protocol Changes + +BAP uses JSON-RPC 2.0 over WebSocket. If you are changing the protocol: + +1. Update Zod schemas in `packages/protocol/src/types/` +2. Export new types from `packages/protocol/src/types/index.ts` +3. Implement server handling in `packages/server-playwright/src/server.ts` +4. Add client passthrough in `packages/client/src/index.ts` +5. Add schema validation tests +6. 
All new fields must be **optional** to maintain backward compatibility + +## Reporting Issues + +- Use [GitHub Issues](https://github.com/browseragentprotocol/bap/issues) +- Include BAP version, Node.js version, and browser type +- For bugs: include steps to reproduce, expected vs actual behavior +- For feature requests: describe the use case and proposed API + +## License + +By contributing, you agree that your contributions will be licensed under the [Apache License 2.0](LICENSE). diff --git a/README.md b/README.md index d880a86..e8ff8d9 100644 --- a/README.md +++ b/README.md @@ -1,58 +1,93 @@ # Browser Agent Protocol (BAP) +[![npm version](https://badge.fury.io/js/@browseragentprotocol%2Fcli.svg)](https://www.npmjs.com/package/@browseragentprotocol/cli) [![npm version](https://badge.fury.io/js/@browseragentprotocol%2Fmcp.svg)](https://www.npmjs.com/package/@browseragentprotocol/mcp) [![License: Apache-2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) -An open standard for AI agents to interact with web browsers. +An open standard for AI agents to interact with web browsers. Two interfaces: **CLI** for shell-based agents, **MCP** for protocol-native agents. -> **v0.2.0:** Renamed MCP tools, auto-reconnect, multi-context support, streaming, and more. APIs may evolve based on feedback. - -## Overview +```bash +# CLI — any agent that can run shell commands +npx @browseragentprotocol/cli open https://example.com +npx @browseragentprotocol/cli act 'click:text:"More information..."' snapshot -BAP (Browser Agent Protocol) provides a standardized way for AI agents to control web browsers. It uses JSON-RPC 2.0 over WebSocket for communication and includes semantic selectors designed for AI comprehension. +# MCP — agents with native Model Context Protocol support +npx @browseragentprotocol/mcp +```

BAP Architecture

-### Key Features +## Why BAP? -- **Semantic Selectors**: Use accessibility roles, text content, and labels instead of brittle CSS selectors -- **Accessibility-First**: Built-in support for accessibility tree inspection -- **AI-Optimized**: Designed for LLM-based agents with token-efficient observations -- **MCP Integration**: Works seamlessly with [Model Context Protocol](https://modelcontextprotocol.io) -- **Composite Actions**: Execute multi-step action sequences in a single round-trip (`agent/act`, `agent/observe`, `agent/extract`) -- **Element References**: Stable element refs (`@submitBtn`, `@e7f3a2`) that persist across observations -- **Screenshot Annotation**: Set-of-Marks style overlays with numbered badges for vision models -- **Multi-Context Support**: Parallel isolated browser sessions with `context/create`, `context/list`, `context/destroy` -- **Human-in-the-Loop Approval**: Enterprise workflow for human oversight of sensitive actions -- **Frame Support**: Explicit frame switching for iframes with `frame/list`, `frame/switch`, `frame/main` -- **Streaming Responses**: Chunked transfers for large observations with checksum verification +- **Composite Actions**: Execute multi-step flows in one command — 40x fewer tokens than one-action-at-a-time +- **Fused Operations**: Combine navigate+observe, act+observe into single server calls — 50-85% fewer roundtrips +- **Semantic Selectors**: Target elements by purpose (`role:button:"Submit"`) not position — survives redesigns +- **Structured Extraction**: Extract validated JSON from any page with a schema +- **Two Interfaces**: CLI (`bap act`) for shell-based agents, MCP tools for protocol-native agents +- **Accessibility-First**: Built on accessibility tree inspection, designed for AI comprehension +- **Element References**: Stable refs (`@e1`, `e15`) that persist across observations +- **Screenshot Annotation**: Set-of-Marks overlays with numbered badges for vision models -## Packages +## Quick Start -### 
TypeScript +### CLI — For AI Agents That Run Shell Commands -| Package | Description | npm | -|---------|-------------|-----| -| [`@browseragentprotocol/protocol`](./packages/protocol) | Protocol types, schemas, and utilities | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/protocol)](https://www.npmjs.com/package/@browseragentprotocol/protocol) | -| [`@browseragentprotocol/logger`](./packages/logger) | Pretty logging utilities with colors and icons | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/logger)](https://www.npmjs.com/package/@browseragentprotocol/logger) | -| [`@browseragentprotocol/client`](./packages/client) | TypeScript client SDK | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/client)](https://www.npmjs.com/package/@browseragentprotocol/client) | -| [`@browseragentprotocol/server-playwright`](./packages/server-playwright) | Server implementation using Playwright | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/server-playwright)](https://www.npmjs.com/package/@browseragentprotocol/server-playwright) | -| [`@browseragentprotocol/mcp`](./packages/mcp) | Model Context Protocol integration | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/mcp)](https://www.npmjs.com/package/@browseragentprotocol/mcp) | +```bash +# Open a page and observe interactive elements +bap open https://example.com +bap observe --max=20 -### Python +# Login flow in ONE command (vs 3+ separate commands) +bap act fill:role:textbox:"Email"="user@example.com" \ + fill:role:textbox:"Password"="secret" \ + click:role:button:"Sign in" -| Package | Description | PyPI | -|---------|-------------|------| -| [`browser-agent-protocol`](./packages/python-sdk) | Python SDK with async/sync APIs | [![PyPI](https://img.shields.io/pypi/v/browser-agent-protocol)](https://pypi.org/project/browser-agent-protocol/) | +# Extract structured data +bap extract --fields="title,price,rating" -## Quick Start +# Use semantic selectors +bap click 
role:button:"Get Started" +bap fill label:"Email" "user@example.com" +``` -BAP works with any MCP-compatible client. The server auto-starts — no separate setup needed. +Install globally or use via npx: + +```bash +npm i -g @browseragentprotocol/cli +# or +npx @browseragentprotocol/cli +``` + +See the full [CLI documentation](./packages/cli) for all 26 commands, selector reference, and recipes. + +### MCP — For Protocol-Native Agents + +``` +navigate({ url: "https://example.com/login" }) +observe({ includeScreenshot: true }) +act({ + steps: [ + { action: "action/fill", selector: "@e1", value: "user@example.com" }, + { action: "action/fill", selector: "@e2", value: "password123" }, + { action: "action/click", selector: "role:button:Sign in" } + ] +}) +``` + +See the [MCP documentation](./packages/mcp) for tool reference and configuration. + +## Integrations ### Claude Code +**CLI** (install skill for optimal usage): +```bash +npm i -g @browseragentprotocol/cli +bap install-skill +``` + **MCP server** (one command): ```bash claude mcp add --transport stdio bap-browser -- npx -y @browseragentprotocol/mcp @@ -92,6 +127,13 @@ Restart Claude Desktop after saving. ### Codex CLI +**CLI**: +```bash +npm i -g @browseragentprotocol/cli +bap install-skill +``` + +**MCP**: ```bash codex mcp add bap-browser -- npx -y @browseragentprotocol/mcp ``` @@ -124,147 +166,126 @@ args = ["-y", "@browseragentprotocol/mcp"] Codex Desktop browsing Hacker News with BAP

-### Browser Selection - -By default, BAP uses your locally installed Chrome. You can choose a different browser with the `--browser` flag: +### Gemini CLI +**CLI**: ```bash -npx @browseragentprotocol/mcp --browser firefox +npm i -g @browseragentprotocol/cli +bap install-skill ``` -| Value | Browser | Notes | -|---|---|---| -| `chrome` (default) | Local Chrome | Falls back to bundled Chromium if not installed | -| `chromium` | Bundled Chromium | Playwright's built-in Chromium | -| `firefox` | Firefox | Requires local Firefox | -| `webkit` | WebKit | Playwright's WebKit engine | -| `edge` | Microsoft Edge | Requires local Edge | +**MCP** — add to `~/.gemini/settings.json`: -In a JSON MCP config, pass the flag via args: ```json { "mcpServers": { "bap-browser": { "command": "npx", - "args": ["-y", "@browseragentprotocol/mcp", "--browser", "firefox"] + "args": ["-y", "@browseragentprotocol/mcp"] } } } ``` -### Using the TypeScript SDK - -#### Start the Server - -```bash -npx @browseragentprotocol/server-playwright -``` - -#### Connect from TypeScript - -```typescript -import { BAPClient, role } from "@browseragentprotocol/client"; +### Manus -const client = new BAPClient("ws://localhost:9222"); -await client.connect(); +Manus supports MCP servers via its web UI (HTTP transport only): -// Launch browser and navigate -await client.launch({ browser: "chromium", headless: false }); -await client.createPage({ url: "https://example.com" }); +1. Go to **Settings > Integrations > Custom MCP Servers** +2. Click **Add Server** +3. Set transport to **HTTP** and provide your hosted BAP MCP endpoint URL +4. Save and verify connection -// Use semantic selectors (AI-friendly) -await client.click(role("button", "Submit")); -await client.fill(role("textbox", "Email"), "user@example.com"); +> **Note:** Manus requires HTTP/SSE transport. 
To use BAP with Manus, deploy the MCP server as an HTTP endpoint using a stdio-to-HTTP bridge like [supergateway](https://www.npmjs.com/package/supergateway), then register the URL in the Manus UI. -// Get accessibility tree for AI reasoning -const { tree } = await client.accessibility(); +### Other Agents -await client.close(); -``` - -### Using the Python SDK +BAP CLI includes a built-in skill installer that supports 13 AI coding agent platforms: ```bash -pip install browser-agent-protocol +bap install-skill # Auto-detect and install to all agents +bap install-skill --dry-run # Preview what would be installed ``` -```python -import asyncio -from browseragentprotocol import BAPClient, role, label +Supported: Claude Code, Codex CLI, Gemini CLI, Cursor, GitHub Copilot, Windsurf, Roo Code, Amp, Deep Agents, OpenCode, and more. -async def main(): - async with BAPClient("ws://localhost:9222") as client: - # Launch browser and navigate - await client.launch(browser="chromium", headless=False) - await client.create_page(url="https://example.com") +### Browser Selection - # Use semantic selectors (AI-friendly) - await client.click(role("button", "Submit")) - await client.fill(label("Email"), "user@example.com") +By default, BAP uses your locally installed Chrome.
Switch browsers with: - # Get accessibility tree for AI reasoning - tree = await client.accessibility() +```bash +# CLI +bap config browser firefox -asyncio.run(main()) +# MCP — pass via args +npx @browseragentprotocol/mcp --browser firefox ``` -For synchronous usage (scripts, notebooks): +| Value | Browser | Notes | +|---|---|---| +| `chrome` (default) | Local Chrome | Falls back to bundled Chromium if not installed | +| `chromium` | Bundled Chromium | Playwright's built-in Chromium | +| `firefox` | Firefox | Requires local Firefox | +| `webkit` | WebKit | Playwright's WebKit engine | +| `edge` | Microsoft Edge | Requires local Edge | -```python -from browseragentprotocol import BAPClientSync, role +## Packages -with BAPClientSync("ws://localhost:9222") as client: - client.launch(browser="chromium", headless=True) - client.create_page(url="https://example.com") - client.click(role("button", "Submit")) -``` +### TypeScript -### Semantic Selectors +| Package | Description | npm | +|---------|-------------|-----| +| [`@browseragentprotocol/cli`](./packages/cli) | CLI for shell-based AI agents | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/cli)](https://www.npmjs.com/package/@browseragentprotocol/cli) | +| [`@browseragentprotocol/mcp`](./packages/mcp) | MCP integration for protocol-native agents | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/mcp)](https://www.npmjs.com/package/@browseragentprotocol/mcp) | +| [`@browseragentprotocol/client`](./packages/client) | TypeScript client SDK | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/client)](https://www.npmjs.com/package/@browseragentprotocol/client) | +| [`@browseragentprotocol/server-playwright`](./packages/server-playwright) | Server implementation using Playwright | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/server-playwright)](https://www.npmjs.com/package/@browseragentprotocol/server-playwright) | +| [`@browseragentprotocol/protocol`](./packages/protocol) | 
Protocol types, schemas, and utilities | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/protocol)](https://www.npmjs.com/package/@browseragentprotocol/protocol) | +| [`@browseragentprotocol/logger`](./packages/logger) | Pretty logging utilities | [![npm](https://img.shields.io/npm/v/@browseragentprotocol/logger)](https://www.npmjs.com/package/@browseragentprotocol/logger) | -BAP uses semantic selectors that are more stable and readable than CSS selectors: +### Python -```typescript -import { role, text, label, testId, ref } from "@browseragentprotocol/client"; +| Package | Description | PyPI | +|---------|-------------|------| +| [`browser-agent-protocol`](./packages/python-sdk) | Python SDK with async/sync APIs | [![PyPI](https://img.shields.io/pypi/v/browser-agent-protocol)](https://pypi.org/project/browser-agent-protocol/) | -// By accessibility role and name -await client.click(role("button", "Submit")); -await client.fill(role("textbox", "Search")); +## Architecture -// By visible text content -await client.click(text("Sign in")); +``` +AI Agent (shell) AI Agent (MCP-native) + │ │ + ▼ ▼ +@browseragentprotocol/cli @browseragentprotocol/mcp + │ │ + ▼ ▼ +@browseragentprotocol/client ───────┘ + │ + ▼ WebSocket (JSON-RPC 2.0) +@browseragentprotocol/server-playwright + │ + ▼ Playwright +Browser (Chromium / Firefox / WebKit) +``` -// By associated label -await client.fill(label("Email address"), "user@example.com"); +The CLI spawns the server as a background daemon that persists across commands. The MCP bridge runs as a stdio process managed by the host agent. 
-// By test ID (for automation) -await client.click(testId("submit-button")); +## Using the SDKs -// By stable element reference (from agent/observe) -await client.click(ref("@submitBtn")); -``` +### TypeScript -### AI Agent Methods +```typescript +import { BAPClient, role, label } from "@browseragentprotocol/client"; -BAP provides composite methods optimized for AI agents: +const client = new BAPClient("ws://localhost:9222"); +await client.connect(); -```typescript -// agent/observe - Get AI-optimized page snapshot -const observation = await client.observe({ - includeAccessibility: true, - includeInteractiveElements: true, - includeScreenshot: true, - maxElements: 50, - annotateScreenshot: true, // Set-of-Marks style -}); +await client.launch({ browser: "chromium", headless: false }); +await client.createPage({ url: "https://example.com" }); -// Interactive elements with stable refs -for (const el of observation.interactiveElements) { - console.log(`${el.ref}: ${el.role} - ${el.name}`); - // @e1: button - Submit - // @e2: textbox - Email -} +// Semantic selectors +await client.click(role("button", "Submit")); +await client.fill(role("textbox", "Email"), "user@example.com"); -// agent/act - Execute multi-step sequences atomically +// Composite actions const result = await client.act({ steps: [ { action: "action/fill", params: { selector: label("Email"), value: "user@example.com" } }, @@ -272,87 +293,55 @@ const result = await client.act({ { action: "action/click", params: { selector: role("button", "Sign In") } }, ], }); -console.log(`Completed ${result.completed}/${result.total} steps`); - -// agent/extract - Extract structured data -const data = await client.extract({ - instruction: "Extract all product names and prices", - schema: { - type: "array", - items: { - type: "object", - properties: { - name: { type: "string" }, - price: { type: "number" }, - }, - }, - }, -}); -``` -> **Note:** `agent/extract` (and `extract` in MCP) uses heuristic-based extraction (CSS
patterns). For complex pages, consider using `content` to get page content as markdown and extract data yourself. +await client.close(); +``` -## Server Options +### Python ```bash -npx @browseragentprotocol/server-playwright [options] - -Options: - -p, --port WebSocket port (default: 9222) - -h, --host Host to bind to (default: localhost) - -b, --browser Browser: chromium, firefox, webkit (default: chromium) - --headless Run in headless mode (default: true) - --no-headless Run with visible browser window - -t, --timeout Default timeout in milliseconds (default: 30000) - -d, --debug Enable debug logging - --token Authentication token for client connections - --help Show help - -v, --version Show version +pip install browser-agent-protocol ``` -## CLI Tools +```python +import asyncio +from browseragentprotocol import BAPClient, role, label -### Python CLI +async def main(): + async with BAPClient("ws://localhost:9222") as client: + await client.launch(browser="chromium", headless=False) + await client.create_page(url="https://example.com") -```bash -# Test connection to a BAP server -bap connect ws://localhost:9222 + await client.click(role("button", "Submit")) + await client.fill(label("Email"), "user@example.com") -# Get server info -bap info ws://localhost:9222 --json +asyncio.run(main()) ``` -### TypeScript Server CLI +## Selectors -```bash -# Start the server -npx @browseragentprotocol/server-playwright --port 9222 --no-headless +BAP uses semantic selectors that survive DOM changes: -# Start with debug logging -npx @browseragentprotocol/server-playwright --debug -``` +| Selector | Example | Priority | +|----------|---------|----------| +| `role:<role>:"<name>"` | `role:button:"Submit"` | Best — ARIA role + accessible name | +| `text:"<content>"` | `text:"Sign in"` | Visible text content | +| `label:"<text>"` | `label:"Email"` | Form label association | +| `placeholder:"<text>"` | `placeholder:"Search..."` | Input placeholder | +| `testid:"<id>"` | `testid:"submit-btn"` | data-testid attribute | +|
`e<N>` / `@ref` | `e15`, `@e1` | From snapshot/observe (positional) | +| `css:<selector>` | `css:.btn-primary` | Last resort — fragile | ## Development ```bash -# Clone the repository git clone https://github.com/browseragentprotocol/bap.git cd bap - -# Install dependencies pnpm install - -# Build all packages pnpm build - -# Run type checking pnpm typecheck - -# Run linting pnpm lint - -# Install Python SDK in development mode -cd packages/python-sdk && pip install -e . +pnpm test ``` ## Contributing @@ -361,7 +350,7 @@ We welcome contributions! Please open an issue or submit a pull request on GitHu ## License -This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. +Apache License 2.0 - see the [LICENSE](LICENSE) file for details. ## Links diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..218c66a --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,79 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +| ------- | ------------------ | +| 0.2.x | Yes | +| < 0.2 | No | + +## Reporting a Vulnerability + +If you discover a security vulnerability in BAP, please report it responsibly. + +**Do not open a public GitHub issue for security vulnerabilities.** + +Instead, please send a report to the maintainers via one of these channels: + +1. **GitHub Security Advisories**: Use the [private vulnerability reporting](https://github.com/browseragentprotocol/bap/security/advisories/new) feature on GitHub +2.
**Email**: Send details to the repository maintainers listed in the GitHub organization + +### What to include + +- Description of the vulnerability +- Steps to reproduce +- Affected versions and packages +- Potential impact +- Suggested fix (if any) + +### Response timeline + +- **Acknowledgment**: Within 48 hours +- **Initial assessment**: Within 1 week +- **Fix or mitigation**: Depends on severity, targeting: + - Critical: 48 hours + - High: 1 week + - Medium: 2 weeks + - Low: Next release cycle + +## Security Considerations + +BAP controls web browsers on behalf of AI agents. Operators and users should be aware of these security boundaries: + +### Authentication + +- The `--token` flag enables token-based authentication for WebSocket connections +- **Always use authentication in production** — without it, any process on the network can control the browser +- Tokens are compared using constant-time equality to prevent timing attacks + +### Network exposure + +- By default, the server binds to `localhost` only +- Do not expose BAP servers to the public internet without authentication and TLS +- Use `--host` with caution; binding to `0.0.0.0` exposes the server to all network interfaces + +### Domain restrictions + +- The `--allowed-domains` flag restricts which domains the browser can navigate to +- Use this in production to prevent navigation to unintended sites + +### Browser sandbox + +- BAP inherits Playwright's browser sandbox settings +- Chromium runs with sandbox enabled by default +- Do not disable the browser sandbox in production + +### Data handling + +- Screenshots and page content may contain sensitive data +- Storage state export (`getStorageState`) includes cookies and local storage +- Treat all browser data as potentially sensitive + +## Dependencies + +BAP depends on: +- **Playwright** for browser control +- **ws** for WebSocket transport +- **Zod** for schema validation + +We monitor dependencies for known vulnerabilities and update promptly. 
diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..8577ed5 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,198 @@ +--- +name: bap-browser +description: "Browser automation CLI with composite actions and semantic selectors. Use when the user needs to visit websites, fill forms, extract data, take screenshots, or automate multi-step browser workflows like login, checkout, or search." +license: Apache-2.0 +--- + +# BAP Browser CLI + +AI-first browser automation. Like playwright-cli but with composite actions, +semantic selectors, and structured extraction. + +## Quick Start + +```bash +bap open https://example.com +bap observe # compact interactive elements +bap click role:button:"Get Started" # semantic selector +bap close +``` + +## Composite Actions + +Execute multiple browser steps in ONE command instead of one-at-a-time: + +```bash +# Login flow — ONE command instead of 3+ separate calls +bap act fill:role:textbox:"Email"="user@example.com" \ + fill:role:textbox:"Password"="secret" \ + click:role:button:"Sign in" +``` + +Each step uses the syntax `action:selector=value` or `action:selector`. + +## Fused Operations + +Fused operations combine multiple server calls into one, cutting roundtrips by 50-85%. + +```bash +# Navigate + observe in 1 call (instead of bap goto + bap observe) +bap goto https://example.com --observe + +# Act + post-observe in 1 call (get updated page state after actions) +bap act click:role:button:"Submit" --observe + +# Control response size with --tier +bap goto https://example.com --observe --tier=minimal # refs + names only +bap goto https://example.com --observe --tier=interactive # elements + roles (default) +bap observe --tier=full # everything + metadata +``` + +**Always prefer fused calls** — `bap goto --observe` is 1 roundtrip vs 2 for `bap goto` then `bap observe`. 
+ +## Common Patterns + +```bash +# Accept cookies + navigate +bap act click:text:"Accept" goto:https://example.com/app + +# Fill and submit a search +bap act fill:role:searchbox:"Search"="query here" press:Enter + +# Checkout form +bap act fill:label:"Card number"="4111111111111111" \ + fill:label:"Expiry"="12/28" \ + fill:label:"CVV"="123" \ + click:role:button:"Pay now" + +# Login with fused observe (2 calls total) +bap goto https://app.example.com/login --observe +bap act fill:label:"Email"="user@example.com" \ + fill:label:"Password"="secret" \ + click:role:button:"Sign in" --observe +``` + +## Selectors + +BAP supports both positional refs (from snapshots) and semantic selectors: + +| Selector | Example | When to use | +|----------|---------|-------------| +| `e<N>` | `e15` | From snapshot refs (playwright-cli compatible) | +| `role:<role>:"<name>"` | `role:button:"Submit"` | When you know the element's purpose | +| `text:"<content>"` | `text:"Sign in"` | By visible text | +| `label:"<text>"` | `label:"Email"` | Form fields by label | +| `placeholder:"<text>"` | `placeholder:"Search..."` | By placeholder | +| `testid:"<id>"` | `testid:"submit-btn"` | By data-testid | + +Prefer semantic selectors (`role:`, `label:`, `text:`) — they survive page layout changes. Use `e<N>` refs from `bap observe` or `bap snapshot` when semantic selectors are unclear. + +For the full selector reference, see [references/SELECTORS.md](references/SELECTORS.md). + +## Commands + +### Navigation + +```bash +bap open [url] # Open browser +bap goto <url> # Navigate +bap goto <url> --observe # Fused navigate+observe (1 call instead of 2) +bap goto <url> --observe --tier=interactive # Fused with response tier +bap back / bap forward # History navigation +bap reload # Reload page +``` + +### Interaction + +```bash +bap click <selector> # Click element +bap fill <selector> <value> # Fill input field (clears first) +bap type <text> # Type into focused element (keystroke-by-keystroke) +bap press <key> # Press keyboard key (Enter, Tab, Escape, etc.)
+bap select <selector> <value> # Select dropdown option +bap check <selector> # Check checkbox +bap uncheck <selector> # Uncheck checkbox +bap hover <selector> # Hover over element +``` + +### Observation + +```bash +bap observe # Compact interactive elements (default max 50) +bap observe --full # Full accessibility tree +bap observe --forms # Form fields only +bap observe --max=20 # Limit number of elements returned +bap observe --tier=interactive # Response tier: full, interactive, minimal +bap snapshot # Full YAML snapshot (playwright-cli compatible) +bap screenshot [--file=F] # Save screenshot to .bap/ directory +``` + +### Structured Extraction + +```bash +bap extract --fields="title,price" # Quick field extraction +bap extract --schema=schema.json # JSON Schema-based extraction +bap extract --list="product" # Extract list of items +``` + +### Sessions and Tabs + +```bash +bap -s=<name> <command> # Run command in named session +bap sessions # List active sessions +bap tabs # List open tabs +bap tab-new [url] # Open new tab +bap tab-select <index> # Switch to tab +``` + +### Recipes + +```bash +bap recipe login --user=<u> --pass=<p> +bap recipe fill-form --data=data.json +bap recipe wait-for <selector> [--timeout=ms] +``` + +## Output Behavior + +All outputs saved to `.bap/` directory (never injected into LLM context): +- Snapshots: `.bap/snapshot-<timestamp>.yml` +- Screenshots: `.bap/screenshot-<timestamp>.png` +- Extractions: `.bap/extraction-<timestamp>.json` + +After each command, BAP prints a concise summary: +``` +### Page +- URL: https://example.com/dashboard +- Title: Dashboard +### Snapshot +Saved to .bap/snapshot-1739734242.yml +``` + +## Error Handling + +| Problem | Fix | +|---------|-----| +| `bap: command not found` | Run `npm i -g @browseragentprotocol/cli` or use `npx @browseragentprotocol/cli` prefix | +| Element not found | Run `bap observe` to get fresh refs — the DOM changed after navigation | +| Stale element ref | Refs (`e15`) invalidate after navigation. Re-run `bap observe` or `bap snapshot` | +| Browser launch fails | Run `bap config browser firefox` to switch engines, or `bap config headless true` | +| Server not responding | Run `bap close-all` to kill the daemon, then retry your command | +| Navigation timeout | Page is slow to load. Try `bap goto <url>` again or check network connectivity | +| Click intercepted / overlay | An overlay may be blocking the element. Try `bap act click:text:"Accept" click:<selector>` to dismiss it first | +| Wrong tab active | Run `bap tabs` to list open tabs, then `bap tab-select <index>` | + +## When to Use BAP vs playwright-cli + +| Scenario | Use | +|----------|-----| +| Single click or type action | Either works — BAP accepts `e15` refs | +| Multi-step flow (login, form, checkout) | **BAP** — `bap act` batches steps in one command | +| Extract structured data from page | **BAP** — `bap extract` with schema validation | +| Need selectors resilient to layout changes | **BAP** — semantic selectors | +| Quick page snapshot | Either works — same YAML format | + +## Installation + +If `bap` command is not found, use `npx @browseragentprotocol/cli` as prefix.
+For browser issues, run `bap config browser firefox` to switch engines. diff --git a/packages/cli/README.md b/packages/cli/README.md new file mode 100644 index 0000000..8e346e7 --- /dev/null +++ b/packages/cli/README.md @@ -0,0 +1,246 @@ +# @browseragentprotocol/cli + +[![npm version](https://badge.fury.io/js/@browseragentprotocol%2Fcli.svg)](https://www.npmjs.com/package/@browseragentprotocol/cli) +[![License: Apache-2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +AI-native browser automation from the command line. Like playwright-cli but with superpowers: composite actions (`bap act`), semantic selectors, and structured extraction. + +## Quick Start + +```bash +npx @browseragentprotocol/cli open https://example.com +npx @browseragentprotocol/cli observe --max=20 +npx @browseragentprotocol/cli click role:button:"Get Started" +``` + +Or install globally: + +```bash +npm i -g @browseragentprotocol/cli +bap open https://example.com +``` + +## Why BAP CLI? 
+ +### Composite Actions — 40x Token Reduction + +Execute multi-step flows in **one command** instead of one-at-a-time: + +```bash +# playwright-cli: 3 commands, 3 snapshots, 3 LLM reasoning cycles +playwright-cli fill e5 "user@example.com" +playwright-cli fill e8 "password123" +playwright-cli click e12 + +# bap: 1 command, 1 snapshot, 1 LLM reasoning cycle +bap act fill:e5="user@example.com" fill:e8="password123" click:e12 +``` + +### Semantic Selectors — Resilient to Layout Changes + +Target elements by their purpose, not their position: + +```bash +bap click role:button:"Submit" +bap fill label:"Email" "user@example.com" +bap act fill:role:textbox:"Email"="user@example.com" \ + fill:role:textbox:"Password"="secret" \ + click:role:button:"Sign in" +``` + +### Structured Extraction — Validated JSON Output + +```bash +bap extract --fields="title,price,rating" +bap extract --schema=product.json +bap extract --list="product" +``` + +## Commands + +### Navigation + +```bash +bap open [url] # Open browser, optionally navigate +bap goto # Navigate to URL +bap goto --observe # Fused: navigate + observe in 1 server call +bap back # Go back +bap forward # Go forward +bap reload # Reload page +``` + +### Interaction + +```bash +bap click # Click element +bap fill # Fill input field +bap type # Type into focused element +bap press # Press keyboard key +bap select # Select dropdown option +bap check # Check checkbox +bap uncheck # Uncheck checkbox +bap hover # Hover over element +``` + +### Observation + +```bash +bap observe # Interactive elements (default max 50) +bap observe --full # Full accessibility tree +bap observe --forms # Form fields only +bap observe --navigation # Navigation elements only +bap observe --max=20 # Limit elements +bap observe --diff # Incremental: only show changes since last observation +bap observe --tier=minimal # Response tier: full, interactive, minimal +bap snapshot [--file=F] # YAML accessibility snapshot +bap screenshot [--file=F] # PNG 
screenshot +``` + +### Composite Actions + +```bash +bap act ... # Execute multiple steps atomically +bap act --observe # Fused: act + observe in 1 server call +``` + +Step syntax: `action:selector=value` or `action:selector` + +```bash +# Login flow in one command +bap act fill:role:textbox:"Email"="user@example.com" \ + fill:role:textbox:"Password"="secret" \ + click:role:button:"Sign in" + +# Accept cookies + navigate +bap act click:text:"Accept" goto:https://example.com/app + +# Fill and submit a search +bap act fill:role:searchbox:"Search"="query here" press:Enter + +# Fused act + observe (1 server call instead of 3) +bap act click:e3 --observe --tier=interactive +``` + +### Sessions & Tabs + +```bash +bap -s= # Named session +bap sessions # List active sessions +bap tabs # List open tabs +bap tab-new [url] # Open new tab +bap tab-select # Switch to tab +bap frames # List frames +bap frame-switch # Switch to frame +``` + +### Recipes + +```bash +bap recipe login --user= --pass=

+bap recipe fill-form --data=data.json +bap recipe wait-for [--timeout=ms] +``` + +### Configuration + +```bash +bap config # View all settings +bap config browser firefox # Set default browser +bap config headless false # Disable headless mode +bap install-skill # Install skill to detected AI agents +bap skill init # Install skill to current project +``` + +## Selectors + +| Selector | Example | When to use | +|----------|---------|-------------| +| `e` | `e15` | From snapshot refs (playwright-cli compatible) | +| `role::""` | `role:button:"Submit"` | By ARIA role and name | +| `text:""` | `text:"Sign in"` | By visible text | +| `label:""` | `label:"Email"` | Form fields by label | +| `placeholder:""` | `placeholder:"Search..."` | By placeholder text | +| `testid:""` | `testid:"submit-btn"` | By data-testid | +| `css:` | `css:.btn-primary` | CSS selector | +| `xpath:` | `xpath://button` | XPath selector | +| `coords:,` | `coords:100,200` | By coordinates | + +## Global Options + +``` +-s= Named session +-p, --port Server port (default: 9222) +-b, --browser Browser: chrome, firefox, webkit, edge +--headless Headless mode (default) +--no-headless Show browser window +-v, --verbose Verbose output +--observe Fused observation (for goto, act) +--diff Incremental observation (for observe) +--tier= Response tier: full, interactive, minimal +--max= Limit elements (default: 50) +``` + +## Architecture + +BAP CLI communicates with a BAP Playwright server over WebSocket: + +``` +bap + ↕ WebSocket (JSON-RPC 2.0) +BAP Playwright Server (auto-started as background daemon) + ↕ Playwright +Browser (Chromium / Firefox / WebKit) +``` + +The server starts automatically on first use and persists across commands. Use `bap close-all` to stop it. 
+ +## Output + +Commands produce concise, AI-agent-friendly output: + +``` +### Page +- URL: https://example.com/dashboard +- Title: Dashboard +### Snapshot +[Snapshot](.bap/snapshot-2026-02-16T19-30-42.yml) +``` + +Files are saved to `.bap/` in the current directory: +- Snapshots: `.bap/snapshot-<timestamp>.yml` +- Screenshots: `.bap/screenshot-<timestamp>.png` +- Extractions: `.bap/extraction-<timestamp>.json` + +## AI Agent Integration + +BAP CLI includes a SKILL.md file that teaches AI coding agents how to use it effectively. Install it to your agent: + +```bash +bap install-skill # Auto-detect and install to all agents +bap install-skill --dry-run # Preview what would be installed +``` + +Supports 13 AI coding agent platforms: Claude Code, Codex CLI, Gemini CLI, Cursor, GitHub Copilot, Windsurf, Roo Code, Amp, Deep Agents, OpenCode, and more. + +## Migrating from playwright-cli + +BAP is a drop-in replacement for playwright-cli. All `e<N>` refs from snapshots work identically: + +| playwright-cli | bap | +|----------------|-----| +| `playwright-cli open [url]` | `bap open [url]` | +| `playwright-cli click e15` | `bap click e15` | +| `playwright-cli fill e5 "text"` | `bap fill e5 "text"` | +| `playwright-cli snapshot` | `bap snapshot` | +| `playwright-cli screenshot` | `bap screenshot` | + +BAP adds composite actions, semantic selectors, smart observation, and structured extraction on top. 
+ +## Requirements + +- Node.js >= 20.0.0 +- Playwright browsers (`npx playwright install chromium`) + +## License + +Apache-2.0 diff --git a/packages/cli/__tests__/composite-parser.test.ts b/packages/cli/__tests__/composite-parser.test.ts new file mode 100644 index 0000000..3eb2651 --- /dev/null +++ b/packages/cli/__tests__/composite-parser.test.ts @@ -0,0 +1,177 @@ +import { describe, it, expect } from "vitest"; +import { + parseCompositeStep, + parseCompositeSteps, + toExecutionSteps, +} from "../src/selectors/composite-parser.js"; + +describe("parseCompositeStep", () => { + // Parameterless actions + describe("parameterless actions", () => { + it("parses snapshot", () => { + expect(parseCompositeStep("snapshot")).toEqual({ + action: "observe/ariaSnapshot", + }); + }); + it("parses screenshot", () => { + expect(parseCompositeStep("screenshot")).toEqual({ + action: "observe/screenshot", + }); + }); + it("parses back", () => { + expect(parseCompositeStep("back")).toEqual({ + action: "page/goBack", + }); + }); + it("parses forward", () => { + expect(parseCompositeStep("forward")).toEqual({ + action: "page/goForward", + }); + }); + it("parses reload", () => { + expect(parseCompositeStep("reload")).toEqual({ + action: "page/reload", + }); + }); + }); + + // Click actions + describe("click actions", () => { + it("parses click:e15", () => { + const step = parseCompositeStep("click:e15"); + expect(step.action).toBe("action/click"); + expect(step.selector).toEqual({ type: "ref", ref: "e15" }); + expect(step.value).toBeUndefined(); + }); + it('parses click:role:button:"Submit"', () => { + const step = parseCompositeStep('click:role:button:"Submit"'); + expect(step.action).toBe("action/click"); + expect(step.selector).toEqual({ + type: "role", + role: "button", + name: "Submit", + }); + }); + it('parses click:text:"Sign in"', () => { + const step = parseCompositeStep('click:text:"Sign in"'); + expect(step.action).toBe("action/click"); + expect(step.selector).toEqual({ + 
type: "text", + value: "Sign in", + }); + }); + }); + + // Fill actions with values + describe("fill actions", () => { + it('parses fill:e5="user@example.com"', () => { + const step = parseCompositeStep('fill:e5="user@example.com"'); + expect(step.action).toBe("action/fill"); + expect(step.selector).toEqual({ type: "ref", ref: "e5" }); + expect(step.value).toBe("user@example.com"); + }); + it("parses fill:e5=password (unquoted value)", () => { + const step = parseCompositeStep("fill:e5=password"); + expect(step.action).toBe("action/fill"); + expect(step.selector).toEqual({ type: "ref", ref: "e5" }); + expect(step.value).toBe("password"); + }); + it('parses fill:role:textbox:"Email"="user@example.com"', () => { + const step = parseCompositeStep('fill:role:textbox:"Email"="user@example.com"'); + expect(step.action).toBe("action/fill"); + expect(step.selector).toEqual({ + type: "role", + role: "textbox", + name: "Email", + }); + expect(step.value).toBe("user@example.com"); + }); + it('parses fill:label:"Email"="user@example.com"', () => { + const step = parseCompositeStep('fill:label:"Email"="user@example.com"'); + expect(step.action).toBe("action/fill"); + expect(step.selector).toEqual({ + type: "label", + value: "Email", + }); + expect(step.value).toBe("user@example.com"); + }); + }); + + // Goto actions + describe("goto actions", () => { + it("parses goto:https://example.com", () => { + const step = parseCompositeStep("goto:https://example.com"); + expect(step.action).toBe("page/navigate"); + expect(step.url).toBe("https://example.com"); + }); + it("parses goto:https://example.com/path?query=1", () => { + const step = parseCompositeStep("goto:https://example.com/path?query=1"); + expect(step.action).toBe("page/navigate"); + expect(step.url).toBe("https://example.com/path?query=1"); + }); + }); + + // Press actions + describe("press actions", () => { + it("parses press:Enter", () => { + const step = parseCompositeStep("press:Enter"); + 
expect(step.action).toBe("action/press"); + expect(step.key).toBe("Enter"); + }); + it("parses press:Tab", () => { + const step = parseCompositeStep("press:Tab"); + expect(step.action).toBe("action/press"); + expect(step.key).toBe("Tab"); + }); + }); + + // Error cases + describe("error cases", () => { + it("throws on invalid step without colon", () => { + expect(() => parseCompositeStep("invalid")).toThrow(); + }); + }); +}); + +describe("parseCompositeSteps", () => { + it("parses multiple steps", () => { + const steps = parseCompositeSteps([ + 'fill:e5="user@example.com"', + 'fill:e8="password"', + "click:e12", + ]); + expect(steps).toHaveLength(3); + expect(steps[0]!.action).toBe("action/fill"); + expect(steps[1]!.action).toBe("action/fill"); + expect(steps[2]!.action).toBe("action/click"); + }); +}); + +describe("toExecutionSteps", () => { + it("converts parsed steps to execution steps", () => { + const parsed = parseCompositeSteps([ + 'fill:e5="user@example.com"', + "click:e12", + "press:Enter", + ]); + const steps = toExecutionSteps(parsed); + expect(steps).toHaveLength(3); + + expect(steps[0]!.action).toBe("action/fill"); + expect(steps[0]!.params!.selector).toEqual({ type: "ref", ref: "e5" }); + expect(steps[0]!.params!.value).toBe("user@example.com"); + + expect(steps[1]!.action).toBe("action/click"); + expect(steps[1]!.params!.selector).toEqual({ type: "ref", ref: "e12" }); + + expect(steps[2]!.action).toBe("action/press"); + expect(steps[2]!.params!.key).toBe("Enter"); + }); + + it("handles goto with URL", () => { + const parsed = parseCompositeSteps(["goto:https://example.com"]); + const steps = toExecutionSteps(parsed); + expect(steps[0]!.action).toBe("page/navigate"); + expect(steps[0]!.params!.url).toBe("https://example.com"); + }); +}); diff --git a/packages/cli/__tests__/flag-parsing.test.ts b/packages/cli/__tests__/flag-parsing.test.ts new file mode 100644 index 0000000..3e0f8bf --- /dev/null +++ b/packages/cli/__tests__/flag-parsing.test.ts @@ 
-0,0 +1,98 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; + +// Mock fs and os to avoid reading real config files +vi.mock("node:fs", () => ({ + default: { + existsSync: () => false, + readFileSync: () => "{}", + writeFileSync: () => {}, + mkdirSync: () => {}, + }, +})); + +vi.mock("node:os", () => ({ + default: { + homedir: () => "/tmp/test-home", + }, +})); + +// Import after mocks +const { parseArgs } = await import("../src/config/state.js"); + +describe("parseArgs fusion flags", () => { + describe("--observe", () => { + it("parses --observe flag", () => { + const flags = parseArgs(["act", "click:e1", "--observe"]); + expect(flags.observe).toBe(true); + expect(flags.command).toBe("act"); + }); + + it("defaults observe to undefined when not set", () => { + const flags = parseArgs(["act", "click:e1"]); + expect(flags.observe).toBeUndefined(); + }); + }); + + describe("--diff", () => { + it("parses --diff flag", () => { + const flags = parseArgs(["observe", "--diff"]); + expect(flags.diff).toBe(true); + expect(flags.command).toBe("observe"); + }); + + it("defaults diff to undefined when not set", () => { + const flags = parseArgs(["observe"]); + expect(flags.diff).toBeUndefined(); + }); + }); + + describe("--tier", () => { + it("parses --tier=minimal", () => { + const flags = parseArgs(["observe", "--tier=minimal"]); + expect(flags.tier).toBe("minimal"); + }); + + it("parses --tier=interactive", () => { + const flags = parseArgs(["observe", "--tier=interactive"]); + expect(flags.tier).toBe("interactive"); + }); + + it("parses --tier=full", () => { + const flags = parseArgs(["act", "click:e1", "--observe", "--tier=full"]); + expect(flags.tier).toBe("full"); + expect(flags.observe).toBe(true); + }); + + it("parses --tier as separate arg", () => { + const flags = parseArgs(["observe", "--tier", "minimal"]); + expect(flags.tier).toBe("minimal"); + }); + + it("defaults tier to undefined when not set", () => { + const flags = 
parseArgs(["observe"]); + expect(flags.tier).toBeUndefined(); + }); + }); + + describe("combined flags", () => { + it("parses --observe with --max", () => { + const flags = parseArgs(["act", "click:e1", "--observe", "--max=20"]); + expect(flags.observe).toBe(true); + expect(flags.max).toBe(20); + }); + + it("parses --diff with --max", () => { + const flags = parseArgs(["observe", "--diff", "--max=10"]); + expect(flags.diff).toBe(true); + expect(flags.max).toBe(10); + }); + + it("parses goto --observe with --tier", () => { + const flags = parseArgs(["goto", "https://example.com", "--observe", "--tier=interactive"]); + expect(flags.command).toBe("goto"); + expect(flags.args).toEqual(["https://example.com"]); + expect(flags.observe).toBe(true); + expect(flags.tier).toBe("interactive"); + }); + }); +}); diff --git a/packages/cli/__tests__/selectors.test.ts b/packages/cli/__tests__/selectors.test.ts new file mode 100644 index 0000000..0d7b8ee --- /dev/null +++ b/packages/cli/__tests__/selectors.test.ts @@ -0,0 +1,243 @@ +import { describe, it, expect } from "vitest"; +import { parseSelector, stripQuotes, formatSelectorForDisplay } from "../src/selectors/parser.js"; + +describe("stripQuotes", () => { + it("strips double quotes", () => { + expect(stripQuotes('"Submit"')).toBe("Submit"); + }); + it("strips single quotes", () => { + expect(stripQuotes("'Submit'")).toBe("Submit"); + }); + it("leaves unquoted strings", () => { + expect(stripQuotes("Submit")).toBe("Submit"); + }); + it("leaves mismatched quotes", () => { + expect(stripQuotes("\"Submit'")).toBe("\"Submit'"); + }); + it("handles empty string", () => { + expect(stripQuotes("")).toBe(""); + }); + it("handles single char", () => { + expect(stripQuotes("a")).toBe("a"); + }); +}); + +describe("parseSelector", () => { + // Playwright-CLI compat: e positional refs + describe("positional refs (e)", () => { + it("parses e15", () => { + expect(parseSelector("e15")).toEqual({ type: "ref", ref: "e15" }); + }); + it("parses 
e0", () => { + expect(parseSelector("e0")).toEqual({ type: "ref", ref: "e0" }); + }); + it("parses e123", () => { + expect(parseSelector("e123")).toEqual({ type: "ref", ref: "e123" }); + }); + it("does not match 'email' as ref", () => { + expect(parseSelector("email")).not.toEqual(expect.objectContaining({ type: "ref" })); + }); + }); + + // BAP stable refs + describe("stable refs (@)", () => { + it("parses @e1", () => { + expect(parseSelector("@e1")).toEqual({ type: "ref", ref: "@e1" }); + }); + it("parses @submitBtn", () => { + expect(parseSelector("@submitBtn")).toEqual({ type: "ref", ref: "@submitBtn" }); + }); + it("parses ref:@submitBtn", () => { + expect(parseSelector("ref:@submitBtn")).toEqual({ type: "ref", ref: "@submitBtn" }); + }); + }); + + // Role selectors + describe("role selectors", () => { + it('parses role:button:"Submit"', () => { + expect(parseSelector('role:button:"Submit"')).toEqual({ + type: "role", + role: "button", + name: "Submit", + }); + }); + it("parses role:button:Submit (without quotes)", () => { + expect(parseSelector("role:button:Submit")).toEqual({ + type: "role", + role: "button", + name: "Submit", + }); + }); + it('parses role:textbox:"Email"', () => { + expect(parseSelector('role:textbox:"Email"')).toEqual({ + type: "role", + role: "textbox", + name: "Email", + }); + }); + it("parses role:button (no name)", () => { + expect(parseSelector("role:button")).toEqual({ + type: "role", + role: "button", + name: undefined, + }); + }); + it("parses role:link:Home", () => { + expect(parseSelector("role:link:Home")).toEqual({ + type: "role", + role: "link", + name: "Home", + }); + }); + it('parses role with name containing colon: role:button:"Sign in: Now"', () => { + expect(parseSelector('role:button:"Sign in: Now"')).toEqual({ + type: "role", + role: "button", + name: "Sign in: Now", + }); + }); + }); + + // Text selectors + describe("text selectors", () => { + it('parses text:"Sign in"', () => { + expect(parseSelector('text:"Sign 
in"')).toEqual({ + type: "text", + value: "Sign in", + }); + }); + it("parses text:Sign in (without quotes)", () => { + expect(parseSelector("text:Sign in")).toEqual({ + type: "text", + value: "Sign in", + }); + }); + }); + + // Label selectors + describe("label selectors", () => { + it('parses label:"Email"', () => { + expect(parseSelector('label:"Email"')).toEqual({ + type: "label", + value: "Email", + }); + }); + it("parses label:Email", () => { + expect(parseSelector("label:Email")).toEqual({ + type: "label", + value: "Email", + }); + }); + }); + + // Placeholder selectors + describe("placeholder selectors", () => { + it('parses placeholder:"Search..."', () => { + expect(parseSelector('placeholder:"Search..."')).toEqual({ + type: "placeholder", + value: "Search...", + }); + }); + }); + + // TestId selectors + describe("testid selectors", () => { + it("parses testid:submit-btn", () => { + expect(parseSelector("testid:submit-btn")).toEqual({ + type: "testId", + value: "submit-btn", + }); + }); + }); + + // CSS selectors + describe("css selectors", () => { + it("parses css:.btn-primary", () => { + expect(parseSelector("css:.btn-primary")).toEqual({ + type: "css", + value: ".btn-primary", + }); + }); + it("parses #submit-btn shorthand", () => { + expect(parseSelector("#submit-btn")).toEqual({ + type: "css", + value: "#submit-btn", + }); + }); + it("parses .class shorthand", () => { + expect(parseSelector(".btn")).toEqual({ + type: "css", + value: ".btn", + }); + }); + }); + + // XPath selectors + describe("xpath selectors", () => { + it("parses xpath://button[@id='submit']", () => { + expect(parseSelector("xpath://button[@id='submit']")).toEqual({ + type: "xpath", + value: "//button[@id='submit']", + }); + }); + }); + + // Coordinates selectors + describe("coordinate selectors", () => { + it("parses coords:100,200", () => { + expect(parseSelector("coords:100,200")).toEqual({ + type: "coordinates", + x: 100, + y: 200, + }); + }); + }); + + // Default (text fallback) 
+ describe("default fallback", () => { + it("treats plain strings as text selectors", () => { + expect(parseSelector("Sign in")).toEqual({ + type: "text", + value: "Sign in", + }); + }); + }); +}); + +describe("formatSelectorForDisplay", () => { + it("formats role selector", () => { + expect( + formatSelectorForDisplay({ type: "role", role: "button", name: "Submit" }) + ).toBe('role:button:"Submit"'); + }); + it("formats role selector without name", () => { + expect( + formatSelectorForDisplay({ type: "role", role: "button" }) + ).toBe("role:button"); + }); + it("formats text selector", () => { + expect( + formatSelectorForDisplay({ type: "text", value: "Sign in" }) + ).toBe('text:"Sign in"'); + }); + it("formats ref selector", () => { + expect( + formatSelectorForDisplay({ type: "ref", ref: "@e1" }) + ).toBe("@e1"); + }); + it("formats ref selector (e15)", () => { + expect( + formatSelectorForDisplay({ type: "ref", ref: "e15" }) + ).toBe("e15"); + }); + it("formats css shorthand", () => { + expect( + formatSelectorForDisplay({ type: "css", value: "#submit" }) + ).toBe("#submit"); + }); + it("formats coordinates", () => { + expect( + formatSelectorForDisplay({ type: "coordinates", x: 100, y: 200 }) + ).toBe("coords:100,200"); + }); +}); diff --git a/packages/cli/bin/bap.js b/packages/cli/bin/bap.js new file mode 100755 index 0000000..8fb1272 --- /dev/null +++ b/packages/cli/bin/bap.js @@ -0,0 +1,2 @@ +#!/usr/bin/env node +import "../dist/cli.js"; diff --git a/packages/cli/package.json b/packages/cli/package.json new file mode 100644 index 0000000..d473456 --- /dev/null +++ b/packages/cli/package.json @@ -0,0 +1,70 @@ +{ + "name": "@browseragentprotocol/cli", + "version": "0.2.0", + "description": "BAP CLI - AI-native browser automation from the command line", + "author": "Browser Agent Protocol", + "license": "Apache-2.0", + "type": "module", + "main": "./dist/cli.js", + "types": "./dist/cli.d.ts", + "bin": { + "bap": "./bin/bap.js" + }, + "exports": { + ".": { + 
"import": { + "types": "./dist/cli.d.ts", + "default": "./dist/cli.js" + } + }, + "./package.json": "./package.json" + }, + "files": [ + "dist", + "bin", + "skills", + "README.md" + ], + "repository": { + "type": "git", + "url": "https://github.com/browseragentprotocol/bap.git", + "directory": "packages/cli" + }, + "keywords": [ + "bap", + "cli", + "browser", + "automation", + "ai", + "agent", + "playwright", + "web-scraping", + "composite-actions", + "semantic-selectors" + ], + "scripts": { + "build": "tsup src/cli.ts --format esm --dts --clean", + "typecheck": "tsc --noEmit", + "test": "vitest run", + "test:watch": "vitest", + "lint": "eslint src", + "lint:fix": "eslint src --fix", + "clean": "rm -rf dist" + }, + "dependencies": { + "@browseragentprotocol/client": "workspace:*", + "@browseragentprotocol/logger": "workspace:*", + "@browseragentprotocol/protocol": "workspace:*" + }, + "devDependencies": { + "tsup": "^8.3.0", + "typescript": "^5.7.0" + }, + "publishConfig": { + "access": "public", + "registry": "https://registry.npmjs.org/" + }, + "engines": { + "node": ">=20.0.0" + } +} diff --git a/packages/cli/skills/bap-browser/SKILL.md b/packages/cli/skills/bap-browser/SKILL.md new file mode 100644 index 0000000..8577ed5 --- /dev/null +++ b/packages/cli/skills/bap-browser/SKILL.md @@ -0,0 +1,198 @@ +--- +name: bap-browser +description: "Browser automation CLI with composite actions and semantic selectors. Use when the user needs to visit websites, fill forms, extract data, take screenshots, or automate multi-step browser workflows like login, checkout, or search." +license: Apache-2.0 +--- + +# BAP Browser CLI + +AI-first browser automation. Like playwright-cli but with composite actions, +semantic selectors, and structured extraction. 
+ +## Quick Start + +```bash +bap open https://example.com +bap observe # compact interactive elements +bap click role:button:"Get Started" # semantic selector +bap close +``` + +## Composite Actions + +Execute multiple browser steps in ONE command instead of one-at-a-time: + +```bash +# Login flow — ONE command instead of 3+ separate calls +bap act fill:role:textbox:"Email"="user@example.com" \ + fill:role:textbox:"Password"="secret" \ + click:role:button:"Sign in" +``` + +Each step uses the syntax `action:selector=value` or `action:selector`. + +## Fused Operations + +Fused operations combine multiple server calls into one, cutting roundtrips by 50-85%. + +```bash +# Navigate + observe in 1 call (instead of bap goto + bap observe) +bap goto https://example.com --observe + +# Act + post-observe in 1 call (get updated page state after actions) +bap act click:role:button:"Submit" --observe + +# Control response size with --tier +bap goto https://example.com --observe --tier=minimal # refs + names only +bap goto https://example.com --observe --tier=interactive # elements + roles (default) +bap observe --tier=full # everything + metadata +``` + +**Always prefer fused calls** — `bap goto --observe` is 1 roundtrip vs 2 for `bap goto` then `bap observe`. 
+ +## Common Patterns + +```bash +# Accept cookies + navigate +bap act click:text:"Accept" goto:https://example.com/app + +# Fill and submit a search +bap act fill:role:searchbox:"Search"="query here" press:Enter + +# Checkout form +bap act fill:label:"Card number"="4111111111111111" \ + fill:label:"Expiry"="12/28" \ + fill:label:"CVV"="123" \ + click:role:button:"Pay now" + +# Login with fused observe (2 calls total) +bap goto https://app.example.com/login --observe +bap act fill:label:"Email"="user@example.com" \ + fill:label:"Password"="secret" \ + click:role:button:"Sign in" --observe +``` + +## Selectors + +BAP supports both positional refs (from snapshots) and semantic selectors: + +| Selector | Example | When to use | +|----------|---------|-------------| +| `e<N>` | `e15` | From snapshot refs (playwright-cli compatible) | +| `role:<role>:"<name>"` | `role:button:"Submit"` | When you know the element's purpose | +| `text:"<text>"` | `text:"Sign in"` | By visible text | +| `label:"<text>"` | `label:"Email"` | Form fields by label | +| `placeholder:"<text>"` | `placeholder:"Search..."` | By placeholder | +| `testid:"<id>"` | `testid:"submit-btn"` | By data-testid | + +Prefer semantic selectors (`role:`, `label:`, `text:`) — they survive page layout changes. Use `e<N>` refs from `bap observe` or `bap snapshot` when semantic selectors are unclear. + +For the full selector reference, see [references/SELECTORS.md](references/SELECTORS.md). + +## Commands + +### Navigation + +```bash +bap open [url] # Open browser +bap goto <url> # Navigate +bap goto <url> --observe # Fused navigate+observe (1 call instead of 2) +bap goto <url> --observe --tier=interactive # Fused with response tier +bap back / bap forward # History navigation +bap reload # Reload page +``` + +### Interaction + +```bash +bap click <selector> # Click element +bap fill <selector> <value> # Fill input field (clears first) +bap type <text> # Type into focused element (keystroke-by-keystroke) +bap press <key> # Press keyboard key (Enter, Tab, Escape, etc.) 
+bap select <selector> <value> # Select dropdown option +bap check <selector> # Check checkbox +bap uncheck <selector> # Uncheck checkbox +bap hover <selector> # Hover over element +``` + +### Observation + +```bash +bap observe # Compact interactive elements (default max 50) +bap observe --full # Full accessibility tree +bap observe --forms # Form fields only +bap observe --max=20 # Limit number of elements returned +bap observe --tier=interactive # Response tier: full, interactive, minimal +bap snapshot # Full YAML snapshot (playwright-cli compatible) +bap screenshot [--file=F] # Save screenshot to .bap/ directory +``` + +### Structured Extraction + +```bash +bap extract --fields="title,price" # Quick field extraction +bap extract --schema=schema.json # JSON Schema-based extraction +bap extract --list="product" # Extract list of items +``` + +### Sessions and Tabs + +```bash +bap -s=<name> <command> # Run command in named session +bap sessions # List active sessions +bap tabs # List open tabs +bap tab-new [url] # Open new tab +bap tab-select <index> # Switch to tab +``` + +### Recipes + +```bash +bap recipe login --user=<u> --pass=<p> +bap recipe fill-form --data=data.json +bap recipe wait-for [--timeout=ms] +``` + +## Output Behavior + +All outputs are saved to the `.bap/` directory (never injected into LLM context): +- Snapshots: `.bap/snapshot-<timestamp>.yml` +- Screenshots: `.bap/screenshot-<timestamp>.png` +- Extractions: `.bap/extraction-<timestamp>.json` + +After each command, BAP prints a concise summary: +``` +### Page +- URL: https://example.com/dashboard +- Title: Dashboard +### Snapshot +Saved to .bap/snapshot-1739734242.yml +``` + +## Error Handling + +| Problem | Fix | +|---------|-----| +| `bap: command not found` | Run `npm i -g @browseragentprotocol/cli` or use `npx @browseragentprotocol/cli` prefix | +| Element not found | Run `bap observe` to get fresh refs — the DOM changed after navigation | +| Stale element ref | Refs (`e15`) invalidate after navigation. Re-run `bap observe` or `bap snapshot` | +| Browser launch fails | Run `bap config browser firefox` to switch engines, or `bap config headless true` | +| Server not responding | Run `bap close-all` to kill the daemon, then retry your command | +| Navigation timeout | Page is slow to load. Try `bap goto <url>` again or check network connectivity | +| Click intercepted / overlay | An overlay may be blocking the element. Try `bap act click:text:"Accept" click:<selector>` to dismiss it first | +| Wrong tab active | Run `bap tabs` to list open tabs, then `bap tab-select <index>` | + +## When to Use BAP vs playwright-cli + +| Scenario | Use | +|----------|-----| +| Single click or type action | Either works — BAP accepts `e15` refs | +| Multi-step flow (login, form, checkout) | **BAP** — `bap act` batches steps in one command | +| Extract structured data from page | **BAP** — `bap extract` with schema validation | +| Need selectors resilient to layout changes | **BAP** — semantic selectors | +| Quick page snapshot | Either works — same YAML format | + +## Installation + +If `bap` command is not found, use `npx @browseragentprotocol/cli` as prefix. 
+For browser issues, run `bap config browser firefox` to switch engines. diff --git a/packages/cli/skills/bap-browser/agents/openai.yaml b/packages/cli/skills/bap-browser/agents/openai.yaml new file mode 100644 index 0000000..31ddd47 --- /dev/null +++ b/packages/cli/skills/bap-browser/agents/openai.yaml @@ -0,0 +1,8 @@ +display_name: "BAP Browser" +short_description: "AI-native browser automation with composite actions, semantic selectors, and structured extraction" +default_prompt: "Use bap-browser to test the login flow on this app" +icon: "🌐" +category: "browser-automation" +allow_implicit_invocation: true +tools: + - Bash diff --git a/packages/cli/skills/bap-browser/references/MIGRATION.md b/packages/cli/skills/bap-browser/references/MIGRATION.md new file mode 100644 index 0000000..f4ab70a --- /dev/null +++ b/packages/cli/skills/bap-browser/references/MIGRATION.md @@ -0,0 +1,71 @@ +# Migrating from playwright-cli to BAP + +## Command Mapping + +| playwright-cli | bap | Notes | +|----------------|-----|-------| +| `playwright-cli open [url]` | `bap open [url]` | Identical | +| `playwright-cli goto <url>` | `bap goto <url>` | Identical | +| `playwright-cli click e15` | `bap click e15` | Identical — BAP accepts `e<N>` refs | +| `playwright-cli fill e5 "text"` | `bap fill e5 "text"` | Identical | +| `playwright-cli type "text"` | `bap type "text"` | Identical | +| `playwright-cli press Enter` | `bap press Enter` | Identical | +| `playwright-cli snapshot` | `bap snapshot` | Same YAML format | +| `playwright-cli screenshot` | `bap screenshot` | Same PNG output | +| `playwright-cli close` | `bap close` | Identical | +| `playwright-cli go-back` | `bap back` | Shortened | +| `playwright-cli go-forward` | `bap forward` | Shortened | +| `playwright-cli session-list` | `bap sessions` | Shortened | +| `playwright-cli -s=n cmd` | `bap -s=n cmd` | Identical | +| `playwright-cli tab-list` | `bap tabs` | Shortened | +| `playwright-cli eval "js"` | `bap eval "js"` | Identical | + +## What 
BAP Adds + +### Composite Actions +```bash +# playwright-cli: 3 commands, 3 snapshots, 3 LLM reasoning cycles +playwright-cli fill e5 "user@example.com" +playwright-cli fill e8 "password123" +playwright-cli click e12 + +# bap: 1 command, 1 snapshot, 1 LLM reasoning cycle +bap act fill:e5="user@example.com" fill:e8="password123" click:e12 +``` + +### Semantic Selectors +```bash +# playwright-cli: must use snapshot refs (break if page changes) +playwright-cli click e15 + +# bap: use semantic selectors (resilient to layout changes) +bap click role:button:"Submit" +bap fill label:"Email" "user@example.com" +bap click text:"Sign in" +``` + +### Structured Extraction +```bash +# playwright-cli: write JavaScript, parse output manually +playwright-cli eval "JSON.stringify({title: document.querySelector('h1').textContent})" + +# bap: declare what you want, get validated JSON +bap extract --fields="title,price,rating" +``` + +### Smart Observation +```bash +# playwright-cli: full accessibility tree (thousands of tokens) +playwright-cli snapshot + +# bap: curated interactive elements only (dozens of tokens) +bap observe --max=30 +bap observe --forms # just form fields +``` + +## Output Directory + +- playwright-cli: `.playwright-cli/` +- bap: `.bap/` + +Both use the same snapshot YAML format, so existing snapshot-parsing logic works with both. diff --git a/packages/cli/skills/bap-browser/references/SELECTORS.md b/packages/cli/skills/bap-browser/references/SELECTORS.md new file mode 100644 index 0000000..de82db3 --- /dev/null +++ b/packages/cli/skills/bap-browser/references/SELECTORS.md @@ -0,0 +1,89 @@ +# BAP Semantic Selectors Reference + +## Overview + +BAP supports two selector systems: positional refs (compatible with +playwright-cli snapshots) and semantic selectors (BAP-exclusive). + +Semantic selectors describe elements by their purpose, not their position. +They survive page layout changes, dynamic content updates, and A/B tests. 
+ +## Selector Types + +### Positional Refs (Compatibility) +``` +e15 → Element ref from last snapshot (playwright-cli compatible) +@e1 → Stable BAP element reference (persists across observations) +``` + +### Role Selector +``` +role:button:"Submit" +role:textbox:"Email" +role:link:"Home" +role:heading:"Welcome" +role:checkbox:"Remember me" +role:combobox:"Country" +role:searchbox:"Search" +role:listitem:"Item 1" +``` +Uses ARIA roles. Most reliable for interactive elements. + +### Text Selector +``` +text:"Sign in" +text:"Add to cart" +text:"Next page" +``` +Matches by visible text content. Case-sensitive exact match. + +### Label Selector +``` +label:"Email address" +label:"Password" +label:"First name" +``` +Matches form inputs by their associated `

+ bap recipe fill-form --data=data.json + bap recipe wait-for [--timeout=ms] + +${pc.cyan("CONFIGURATION")} + bap config [key] [value] View/set configuration + bap install-skill Install skill to all detected agents + bap skill init Install skill to project + +${pc.cyan("GLOBAL OPTIONS")} + -s= Named session + -p, --port Server port (default: 9222) + -b, --browser Browser: chrome, firefox, webkit, edge + --headless / --no-headless Headless mode + -v, --verbose Verbose output + -h, --help Show this help + -V, --version Show version + +${pc.dim("Docs:")} ${pc.cyan("https://github.com/browseragentprotocol/bap")} +`); +} + +function printVersion(): void { + console.log("bap-cli 0.2.0"); +} + +// ============================================================================= +// Commands that don't need a server connection +// ============================================================================= + +const NO_SERVER_COMMANDS = new Set([ + "config", "install-skill", "skill", "--help", "-h", +]); + +// ============================================================================= +// Main +// ============================================================================= + +async function main(): Promise { + const flags = parseArgs(process.argv.slice(2)); + + if (flags.help || (!flags.command && process.argv.length <= 2)) { + printHelp(); + process.exit(0); + } + + if (flags.version) { + printVersion(); + process.exit(0); + } + + if (!flags.command) { + printHelp(); + process.exit(1); + } + + const handler = getCommand(flags.command); + if (!handler) { + console.error(`Unknown command: ${flags.command}`); + console.error("Run 'bap --help' for usage."); + process.exit(1); + } + + // Some commands don't need a server (config, install-skill) + if (NO_SERVER_COMMANDS.has(flags.command)) { + // Pass a dummy client — these commands don't use it + await handler(flags.args, flags, {} as never); + return; + } + + // All other commands need a BAP server + const serverManager = new 
ServerManager({ + port: flags.port, + host: flags.host, + browser: flags.browser, + headless: flags.headless, + verbose: flags.verbose, + }); + + try { + const client = await serverManager.ensureClient(); + await handler(flags.args, flags, client); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.error(`Error: ${message}`); + process.exit(1); + } finally { + await serverManager.disconnect(); + } +} + +main(); diff --git a/packages/cli/src/commands/act.ts b/packages/cli/src/commands/act.ts new file mode 100644 index 0000000..033785b --- /dev/null +++ b/packages/cli/src/commands/act.ts @@ -0,0 +1,95 @@ +/** + * bap act ... — Execute multiple steps atomically + * + * This is the killer feature. A login flow that costs playwright-cli + * 3 commands / 3 snapshots / ~6000 tokens costs BAP 1 command / 1 snapshot / ~150 tokens. + * + * Fusion: --observe flag fuses act + post-observe into 1 server call (50% token reduction) + * + * Examples: + * bap act fill:e5="user@example.com" fill:e8="password" click:e12 + * bap act fill:role:textbox:"Email"="user@example.com" \ + * fill:role:textbox:"Password"="secret123" \ + * click:role:button:"Sign in" + * bap act click:e3 --observe # fused act+observe (1 call) + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { AgentObserveParams, AgentObserveResult } from "@browseragentprotocol/protocol"; +import type { GlobalFlags } from "../config/state.js"; +import { parseCompositeSteps, toExecutionSteps } from "../selectors/composite-parser.js"; +import { printActResult, printObserveResult } from "../output/formatter.js"; +import { writeSnapshot } from "../output/filesystem.js"; +import { register } from "./registry.js"; + +async function actCommand( + args: string[], + flags: GlobalFlags, + client: BAPClient, +): Promise { + if (args.length === 0) { + console.error("Usage: bap act ..."); + console.error(""); + console.error("Steps use the syntax: 
action:selector=value or action:selector"); + console.error("Flags: --observe (fused post-observation), --tier="); + console.error(""); + console.error("Examples:"); + console.error(' bap act fill:e5="user@example.com" fill:e8="pass" click:e12'); + console.error(' bap act fill:role:textbox:"Email"="user@example.com" \\'); + console.error(' fill:role:textbox:"Password"="secret" \\'); + console.error(' click:role:button:"Sign in"'); + console.error(' bap act click:e3 --observe # fused act+observe (1 call)'); + process.exit(1); + } + + // Parse each arg as a composite step + const parsedSteps = parseCompositeSteps(args); + const executionSteps = toExecutionSteps(parsedSteps); + + // Fusion path: --observe flag fuses act + post-observe into 1 server call + if (flags.observe) { + const postObserve: AgentObserveParams = { + includeMetadata: true, + includeInteractiveElements: true, + maxElements: flags.max ?? 50, + responseTier: (flags.tier as "full" | "interactive" | "minimal") ?? "interactive", + }; + + const result = await client.act({ + steps: executionSteps, + stopOnFirstError: true, + postObserve, + }); + + // Access fused observation from result + const postObs = (result as Record).postObservation as AgentObserveResult | undefined; + + printActResult(result, postObs?.metadata?.url, postObs?.metadata?.title); + + if (postObs) { + printObserveResult(postObs); + } + return; + } + + // Default path: 3 separate calls (act + ariaSnapshot + observe) + const result = await client.act({ + steps: executionSteps, + stopOnFirstError: true, + }); + + // Take a snapshot after execution + const snapshot = await client.ariaSnapshot(); + const snapshotPath = await writeSnapshot(snapshot.snapshot); + + // Get page metadata + const obs = await client.observe({ + includeMetadata: true, + includeInteractiveElements: false, + maxElements: 0, + }); + + printActResult(result, obs.metadata?.url, obs.metadata?.title, snapshotPath); +} + +register("act", actCommand); diff --git 
// ---- packages/cli/src/commands/back.ts ----
/**
 * bap back — Go back in history
 * bap forward — Go forward in history
 * bap reload — Reload the page
 */

import type { BAPClient } from "@browseragentprotocol/client";
import type { GlobalFlags } from "../config/state.js";
import { printPageSummary } from "../output/formatter.js";
import { register } from "./registry.js";

/** Fetch page metadata only (no elements) and print the URL/title summary. */
async function printCurrentPage(client: BAPClient): Promise<void> {
  const obs = await client.observe({
    includeMetadata: true,
    includeInteractiveElements: false,
    maxElements: 0,
  });
  printPageSummary(obs.metadata?.url, obs.metadata?.title);
}

/** `bap back`: navigate one entry back in history, then print the page summary. */
async function backCommand(
  _args: string[],
  _flags: GlobalFlags,
  client: BAPClient,
): Promise<void> {
  await client.goBack();
  await printCurrentPage(client);
}

/** `bap forward`: navigate one entry forward in history, then print the page summary. */
async function forwardCommand(
  _args: string[],
  _flags: GlobalFlags,
  client: BAPClient,
): Promise<void> {
  await client.goForward();
  await printCurrentPage(client);
}

/** `bap reload`: reload the current page, then print the page summary. */
async function reloadCommand(
  _args: string[],
  _flags: GlobalFlags,
  client: BAPClient,
): Promise<void> {
  await client.reload();
  await printCurrentPage(client);
}

register("back", backCommand);
register("forward", forwardCommand);
register("reload", reloadCommand);

// ---- packages/cli/src/commands/check.ts ----
/**
 * bap check <selector> — Check a checkbox
 * bap uncheck <selector> — Uncheck a checkbox
 */

import type { BAPClient } from "@browseragentprotocol/client";
import type { GlobalFlags } from "../config/state.js";
import { parseSelector } from "../selectors/parser.js";
import { postActionSummary } from "./helpers.js";
import { register } from "./registry.js";

/**
 * `bap check <selector>`: check the matched checkbox, then snapshot and
 * summarise the page. Exits with status 1 when no selector is given.
 */
async function checkCommand(
  args: string[],
  _flags: GlobalFlags,
  client: BAPClient,
): Promise<void> {
  const selectorStr = args[0];
  if (!selectorStr) {
    console.error("Usage: bap check <selector>");
    process.exit(1);
  }

  await client.check(parseSelector(selectorStr));
  await postActionSummary(client);
}

/**
 * `bap uncheck <selector>`: uncheck the matched checkbox, then snapshot and
 * summarise the page. Exits with status 1 when no selector is given.
 */
async function uncheckCommand(
  args: string[],
  _flags: GlobalFlags,
  client: BAPClient,
): Promise<void> {
  const selectorStr = args[0];
  if (!selectorStr) {
    console.error("Usage: bap uncheck <selector>");
    process.exit(1);
  }

  await client.uncheck(parseSelector(selectorStr));
  await postActionSummary(client);
}

register("check", checkCommand);
register("uncheck", uncheckCommand);

// ---- packages/cli/src/commands/click.ts ----
/**
 * bap click <selector> — Click an element
 */

import type { BAPClient } from "@browseragentprotocol/client";
import type { GlobalFlags } from "../config/state.js";
import { parseSelector } from "../selectors/parser.js";
import { postActionSummary } from "./helpers.js";
import { register } from "./registry.js";

/**
 * `bap click <selector>`: click the matched element, then snapshot and
 * summarise the page. Exits with status 1 when no selector is given.
 */
async function clickCommand(
  args: string[],
  _flags: GlobalFlags,
  client: BAPClient,
): Promise<void> {
  const selectorStr = args[0];
  if (!selectorStr) {
    console.error("Usage: bap click <selector>");
    process.exit(1);
  }

  await client.click(parseSelector(selectorStr));
  await postActionSummary(client);
}

register("click", clickCommand);
// ---- packages/cli/src/commands/close.ts ----
/**
 * bap close — Close browser
 * bap close-all — Close browser and kill server
 */

import type { BAPClient } from "@browseragentprotocol/client";
import type { GlobalFlags } from "../config/state.js";
import { ServerManager } from "../server/manager.js";
import { register } from "./registry.js";

/** `bap close`: close the browser through the connected client. */
async function closeCommand(
  _args: string[],
  _flags: GlobalFlags,
  client: BAPClient,
): Promise<void> {
  await client.closeBrowser();
  console.log("### Browser closed");
}

/**
 * `bap close-all`: close the browser (best effort) and then kill the server.
 * A failure to close the browser is deliberately ignored so the server is
 * still killed.
 */
async function closeAllCommand(
  _args: string[],
  _flags: GlobalFlags,
  client: BAPClient,
): Promise<void> {
  try {
    await client.closeBrowser();
  } catch {
    // Browser may already be closed
  }

  ServerManager.killServer();
  console.log("### Browser and server closed");
}

register("close", closeCommand);
register("close-all", closeAllCommand);

// ---- packages/cli/src/commands/config.ts ----
/**
 * bap config <key> [value] — View or set configuration
 *
 * Examples:
 *   bap config browser firefox
 *   bap config headless false
 *   bap config timeout 60000
 */

import type { BAPClient } from "@browseragentprotocol/client";
import type { GlobalFlags } from "../config/state.js";
import { loadConfig, saveConfig } from "../config/state.js";
import { register } from "./registry.js";

const VALID_BROWSERS = ["chrome", "chromium", "firefox", "webkit", "edge"];

/** Print the "unknown key" diagnostic and exit with status 1. */
function failUnknownKey(key: string): never {
  console.error(`Unknown config key: ${key}`);
  console.error("Valid keys: browser, headless, timeout, port");
  process.exit(1);
}

/**
 * Parse `value` as a base-10 integer within [min, max]; print a diagnostic and
 * exit with status 1 otherwise. Rejects input such as "abc" or "12abc" that
 * parseInt would turn into NaN or silently truncate.
 */
function parseIntegerSetting(key: string, value: string, min: number, max: number): number {
  const parsed = /^\d+$/.test(value) ? Number.parseInt(value, 10) : Number.NaN;
  if (!Number.isSafeInteger(parsed) || parsed < min || parsed > max) {
    console.error(`Invalid ${key}: ${value} (expected an integer >= ${min})`);
    process.exit(1);
  }
  return parsed;
}

/**
 * `bap config [key] [value]`.
 *
 * - no key: print every configuration entry
 * - key only: print that entry, or exit 1 if the key is not in the config
 * - key and value: validate the value, persist it via saveConfig and confirm
 *
 * The client argument is unused.
 */
async function configCommand(
  args: string[],
  _flags: GlobalFlags,
  _client: BAPClient,
): Promise<void> {
  const key = args[0];
  const value = args[1];

  if (!key) {
    // Show current config
    const config = loadConfig();
    console.log("### BAP Configuration");
    for (const [k, v] of Object.entries(config)) {
      console.log(`  ${k}: ${v}`);
    }
    return;
  }

  if (!value) {
    // Show specific key. Only own properties count, so names inherited from
    // Object.prototype (e.g. "toString") are reported as unknown.
    const config = loadConfig() as unknown as Record<string, unknown>;
    const isOwnKey = Object.prototype.hasOwnProperty.call(config, key);
    if (!isOwnKey || config[key] === undefined) {
      failUnknownKey(key);
    }
    console.log(`${key}: ${config[key]}`);
    return;
  }

  // Set value
  const updates: Record<string, unknown> = {};

  switch (key) {
    case "browser":
      if (!VALID_BROWSERS.includes(value)) {
        console.error("Valid browsers: chrome, chromium, firefox, webkit, edge");
        process.exit(1);
      }
      updates.browser = value;
      break;
    case "headless":
      // Require an explicit boolean so a typo is not silently stored as false.
      if (value !== "true" && value !== "false") {
        console.error(`Invalid headless: ${value} (expected true or false)`);
        process.exit(1);
      }
      updates.headless = value === "true";
      break;
    case "timeout":
      updates.timeout = parseIntegerSetting(key, value, 0, Number.MAX_SAFE_INTEGER);
      break;
    case "port":
      updates.port = parseIntegerSetting(key, value, 1, 65535);
      break;
    default:
      failUnknownKey(key);
  }

  saveConfig(updates);
  console.log(`Set ${key} = ${value}`);
}

register("config", configCommand);

// ---- packages/cli/src/commands/eval.ts ----
/**
 * bap eval "js" — Evaluate JavaScript in the page
 *
 * NOTE: The BAP protocol does not support arbitrary JS evaluation
 * (by design — security boundary). This command falls back to
 * `client.content("text")` for basic content retrieval.
 * For structured data extraction, use `bap extract` instead.
+ */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { register } from "./registry.js"; + +async function evalCommand( + args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const js = args[0]; + if (!js) { + console.error('Usage: bap eval ""'); + console.error(""); + console.error("Note: BAP does not support arbitrary JS evaluation."); + console.error("For structured data, use: bap extract --fields=\"title,price\""); + process.exit(1); + } + + // BAP protocol doesn't support eval — retrieve page content instead + console.error("Warning: BAP does not support arbitrary JS evaluation."); + console.error("Falling back to page content retrieval."); + console.error("For structured data, use: bap extract --fields=\"title,price\""); + console.error(""); + + try { + const result = await client.content("text"); + console.log(result.content); + } catch (error) { + console.error(`Failed: ${error instanceof Error ? 
error.message : String(error)}`); + process.exit(1); + } +} + +register("eval", evalCommand); diff --git a/packages/cli/src/commands/extract.ts b/packages/cli/src/commands/extract.ts new file mode 100644 index 0000000..cdb0e09 --- /dev/null +++ b/packages/cli/src/commands/extract.ts @@ -0,0 +1,72 @@ +/** + * bap extract — Structured data extraction + * + * Flags: + * --fields="title,price,rating" Quick field extraction + * --schema=schema.json JSON Schema-based extraction + * --list="product" Extract list of items + */ + +import fs from "node:fs/promises"; +import type { BAPClient } from "@browseragentprotocol/client"; +import type { ExtractionSchema } from "@browseragentprotocol/protocol"; +import type { GlobalFlags } from "../config/state.js"; +import { printExtractionResult } from "../output/formatter.js"; +import { writeExtraction } from "../output/filesystem.js"; +import { register } from "./registry.js"; + +async function extractCommand( + _args: string[], + flags: GlobalFlags, + client: BAPClient, +): Promise { + let schema: ExtractionSchema; + let mode: "single" | "list" = "single"; + let instruction: string; + + if (flags.fields) { + // Auto-generate schema from field names + const fields = flags.fields.split(",").map((f) => f.trim()); + const properties: Record = {}; + for (const field of fields) { + properties[field] = { type: "string", description: field }; + } + schema = { type: "object", properties } as ExtractionSchema; + instruction = `Extract the following fields: ${fields.join(", ")}`; + } else if (flags.schema) { + // Read schema from file + const schemaContent = await fs.readFile(flags.schema, "utf-8"); + schema = JSON.parse(schemaContent) as ExtractionSchema; + instruction = "Extract data matching the provided schema"; + if ((schema as Record).type === "array") { + mode = "list"; + } + } else if (flags.list) { + // Auto-generate list schema + schema = { + type: "array", + items: { type: "object", properties: {} }, + } as unknown as 
ExtractionSchema; + instruction = `Extract all ${flags.list} items from the page`; + mode = "list"; + } else { + console.error("Usage: bap extract --fields=... | --schema=... | --list=..."); + console.error(""); + console.error("Examples:"); + console.error(' bap extract --fields="title,price,rating"'); + console.error(" bap extract --schema=product.json"); + console.error(' bap extract --list="product"'); + process.exit(1); + } + + const result = await client.extract({ + instruction, + schema, + mode, + }); + + const filepath = await writeExtraction(result.data); + printExtractionResult(result, filepath); +} + +register("extract", extractCommand); diff --git a/packages/cli/src/commands/fill.ts b/packages/cli/src/commands/fill.ts new file mode 100644 index 0000000..897b15c --- /dev/null +++ b/packages/cli/src/commands/fill.ts @@ -0,0 +1,28 @@ +/** + * bap fill — Fill an input field + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { parseSelector } from "../selectors/parser.js"; +import { postActionSummary } from "./helpers.js"; +import { register } from "./registry.js"; + +async function fillCommand( + args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const selectorStr = args[0]; + const value = args[1]; + if (!selectorStr || value === undefined) { + console.error("Usage: bap fill "); + process.exit(1); + } + + const selector = parseSelector(selectorStr); + await client.fill(selector, value); + await postActionSummary(client); +} + +register("fill", fillCommand); diff --git a/packages/cli/src/commands/frames.ts b/packages/cli/src/commands/frames.ts new file mode 100644 index 0000000..f42df53 --- /dev/null +++ b/packages/cli/src/commands/frames.ts @@ -0,0 +1,44 @@ +/** + * bap frames — List frames in current page + * bap frame-switch — Switch to a frame + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from 
"../config/state.js"; +import { register } from "./registry.js"; + +async function framesCommand( + _args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const result = await client.listFrames(); + + console.log("### Frames"); + if (result.frames && result.frames.length > 0) { + for (const frame of result.frames) { + const main = frame.isMain ? " (main)" : ""; + console.log(` ${frame.frameId} ${frame.url ?? ""}${main}`); + } + } else { + console.log(" No frames found"); + } +} + +async function frameSwitchCommand( + args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const frameId = args[0]; + if (!frameId) { + console.error("Usage: bap frame-switch "); + process.exit(1); + } + + await client.switchFrame({ frameId }); + console.log(`### Switched to frame: ${frameId}`); +} + +register("frames", framesCommand); +register("frame-switch", frameSwitchCommand); diff --git a/packages/cli/src/commands/goto.ts b/packages/cli/src/commands/goto.ts new file mode 100644 index 0000000..b782b37 --- /dev/null +++ b/packages/cli/src/commands/goto.ts @@ -0,0 +1,51 @@ +/** + * bap goto — Navigate to URL + * + * Fusion: --observe flag fuses navigate + observe into 1 server call + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { AgentObserveResult } from "@browseragentprotocol/protocol"; +import type { GlobalFlags } from "../config/state.js"; +import { printPageSummary, printObserveResult } from "../output/formatter.js"; +import { register } from "./registry.js"; + +async function gotoCommand( + args: string[], + flags: GlobalFlags, + client: BAPClient, +): Promise { + const url = args[0]; + if (!url) { + console.error("Usage: bap goto "); + console.error("Flags: --observe (fused observation), --max=N, --tier="); + process.exit(1); + } + + // Fusion path: --observe flag fuses navigate + observe into 1 call + if (flags.observe) { + const result = await client.navigate(url, { + observe: { + includeMetadata: 
true, + includeInteractiveElements: true, + maxElements: flags.max ?? 50, + responseTier: (flags.tier as "full" | "interactive" | "minimal") ?? undefined, + }, + }); + + const observation = (result as Record).observation as AgentObserveResult | undefined; + + if (observation) { + printObserveResult(observation); + } else { + printPageSummary(result.url); + } + return; + } + + // Default path + const result = await client.navigate(url); + printPageSummary(result.url); +} + +register("goto", gotoCommand); diff --git a/packages/cli/src/commands/helpers.ts b/packages/cli/src/commands/helpers.ts new file mode 100644 index 0000000..ba1e100 --- /dev/null +++ b/packages/cli/src/commands/helpers.ts @@ -0,0 +1,27 @@ +/** + * @fileoverview Shared helpers for CLI commands + * + * Extracted the common post-action pattern (ariaSnapshot → writeSnapshot → observe → print) + * that every interaction command uses after performing its action. + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import { printPageSummary } from "../output/formatter.js"; +import { writeSnapshot } from "../output/filesystem.js"; + +/** + * Post-action summary: snapshot the page and print metadata. + * Used by click, fill, type, press, select, check, uncheck, hover. + * Makes 2 server calls: ariaSnapshot + observe(metadata only). 
+ */ +export async function postActionSummary(client: BAPClient): Promise { + const snapshot = await client.ariaSnapshot(); + const snapshotPath = await writeSnapshot(snapshot.snapshot); + + const obs = await client.observe({ + includeMetadata: true, + includeInteractiveElements: false, + maxElements: 0, + }); + printPageSummary(obs.metadata?.url, obs.metadata?.title, snapshotPath); +} diff --git a/packages/cli/src/commands/hover.ts b/packages/cli/src/commands/hover.ts new file mode 100644 index 0000000..72cab34 --- /dev/null +++ b/packages/cli/src/commands/hover.ts @@ -0,0 +1,27 @@ +/** + * bap hover — Hover over an element + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { parseSelector } from "../selectors/parser.js"; +import { postActionSummary } from "./helpers.js"; +import { register } from "./registry.js"; + +async function hoverCommand( + args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const selectorStr = args[0]; + if (!selectorStr) { + console.error("Usage: bap hover "); + process.exit(1); + } + + const selector = parseSelector(selectorStr); + await client.hover(selector); + await postActionSummary(client); +} + +register("hover", hoverCommand); diff --git a/packages/cli/src/commands/index.ts b/packages/cli/src/commands/index.ts new file mode 100644 index 0000000..53ccb2b --- /dev/null +++ b/packages/cli/src/commands/index.ts @@ -0,0 +1,32 @@ +/** + * @fileoverview Command registration — re-exports registry and triggers + * side-effect imports to register all commands. 
+ */ + +export { register, getCommand, listCommands } from "./registry.js"; +export type { CommandHandler } from "./registry.js"; + +// Register all commands — side-effect imports +import "./open.js"; +import "./goto.js"; +import "./click.js"; +import "./fill.js"; +import "./type.js"; +import "./press.js"; +import "./select.js"; +import "./check.js"; +import "./hover.js"; +import "./screenshot.js"; +import "./snapshot.js"; +import "./close.js"; +import "./back.js"; +import "./observe.js"; +import "./act.js"; +import "./extract.js"; +import "./sessions.js"; +import "./tabs.js"; +import "./frames.js"; +import "./eval.js"; +import "./config.js"; +import "./recipe.js"; +import "./install-skill.js"; diff --git a/packages/cli/src/commands/install-skill.ts b/packages/cli/src/commands/install-skill.ts new file mode 100644 index 0000000..130d432 --- /dev/null +++ b/packages/cli/src/commands/install-skill.ts @@ -0,0 +1,240 @@ +/** + * bap install-skill — Install BAP skill to detected AI coding agents + * + * Flags: + * --project Install to project-level skill directories only + * --global Install to user-global skill directories only + * --dry-run Show what would be installed without installing + */ + +import { execSync } from "node:child_process"; +import fs from "node:fs"; +import path from "node:path"; +import os from "node:os"; +import { fileURLToPath } from "node:url"; +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { register } from "./registry.js"; + +const SKILL_NAME = "bap-browser"; + +function resolveSkillSource(): string { + // Try to find the skills directory relative to this package + try { + const __dirname = path.dirname(fileURLToPath(import.meta.url)); + // In dist: packages/cli/dist/commands/ -> packages/cli/skills/ + const fromDist = path.resolve(__dirname, "../../skills", SKILL_NAME); + if (fs.existsSync(path.join(fromDist, "SKILL.md"))) { + return fromDist; + } + // In src: 
packages/cli/src/commands/ -> packages/cli/skills/ + const fromSrc = path.resolve(__dirname, "../../../skills", SKILL_NAME); + if (fs.existsSync(path.join(fromSrc, "SKILL.md"))) { + return fromSrc; + } + } catch { + // Fall through + } + + throw new Error( + "Could not find BAP skill files. " + + "Ensure @browseragentprotocol/cli is installed correctly." + ); +} + +function copyRecursive(src: string, dest: string): void { + fs.mkdirSync(dest, { recursive: true }); + const entries = fs.readdirSync(src, { withFileTypes: true }); + for (const entry of entries) { + const srcPath = path.join(src, entry.name); + const destPath = path.join(dest, entry.name); + if (entry.isDirectory()) { + copyRecursive(srcPath, destPath); + } else { + fs.copyFileSync(srcPath, destPath); + } + } +} + +interface AgentTarget { + name: string; + dir: string; + detect: () => boolean; +} + +function getGlobalAgents(): AgentTarget[] { + const home = os.homedir(); + return [ + { + name: "Claude Code", + dir: path.join(home, ".claude", "skills"), + detect: () => + commandExists("claude") || fs.existsSync(path.join(home, ".claude")), + }, + { + name: "Codex CLI", + dir: path.join(home, ".codex", "skills"), + detect: () => + commandExists("codex") || fs.existsSync(path.join(home, ".codex")), + }, + { + name: "Gemini CLI", + dir: path.join(home, ".gemini", "skills"), + detect: () => + commandExists("gemini") || fs.existsSync(path.join(home, ".gemini")), + }, + { + name: "Amp", + dir: path.join(home, ".amp", "skills"), + detect: () => fs.existsSync(path.join(home, ".amp")), + }, + { + name: "Deep Agents", + dir: path.join(home, ".deepagents", "agent", "skills"), + detect: () => + commandExists("deepagents") || + fs.existsSync(path.join(home, ".deepagents")), + }, + { + name: "OpenCode", + dir: path.join(home, ".config", "opencode", "skill"), + detect: () => + fs.existsSync(path.join(home, ".config", "opencode")), + }, + ]; +} + +function getProjectAgents(): AgentTarget[] { + const cwd = process.cwd(); 
+ return [ + { + name: "Claude Code", + dir: path.join(cwd, ".claude", "skills"), + detect: () => + fs.existsSync(path.join(cwd, ".claude")) || + fs.existsSync(path.join(cwd, "CLAUDE.md")), + }, + { + name: "Codex", + dir: path.join(cwd, ".agents", "skills"), + detect: () => + fs.existsSync(path.join(cwd, ".agents")) || + fs.existsSync(path.join(cwd, ".codex")), + }, + { + name: "Gemini CLI", + dir: path.join(cwd, ".gemini", "skills"), + detect: () => + fs.existsSync(path.join(cwd, ".gemini")) || + fs.existsSync(path.join(cwd, "GEMINI.md")), + }, + { + name: "Cursor", + dir: path.join(cwd, ".cursor", "skills"), + detect: () => fs.existsSync(path.join(cwd, ".cursor")), + }, + { + name: "GitHub Copilot", + dir: path.join(cwd, ".github", "skills"), + detect: () => fs.existsSync(path.join(cwd, ".github")), + }, + { + name: "Windsurf", + dir: path.join(cwd, ".windsurf", "skills"), + detect: () => fs.existsSync(path.join(cwd, ".windsurf")), + }, + { + name: "Roo Code", + dir: path.join(cwd, ".roo", "skills"), + detect: () => fs.existsSync(path.join(cwd, ".roo")), + }, + ]; +} + +function commandExists(cmd: string): boolean { + try { + execSync(`command -v ${cmd}`, { stdio: "ignore" }); + return true; + } catch { + return false; + } +} + +async function installSkillCommand( + _args: string[], + flags: GlobalFlags, + _client: BAPClient, +): Promise { + const skillSrc = resolveSkillSource(); + const dryRun = flags.dryRun ?? false; + const projectOnly = flags.project ?? false; + const globalOnly = flags.global ?? 
false; + + const installed: string[] = []; + const skipped: string[] = []; + + console.log(""); + console.log("### BAP Browser Skill Installer"); + console.log(""); + + // Global installations + if (!projectOnly) { + console.log("Global skills (available in all projects):"); + for (const agent of getGlobalAgents()) { + if (agent.detect()) { + if (dryRun) { + console.log(` [dry-run] Would install to ${agent.dir}/${SKILL_NAME}/`); + } else { + copyRecursive(skillSrc, path.join(agent.dir, SKILL_NAME)); + console.log(` ✓ ${agent.name} (${agent.dir}/)`); + installed.push(agent.name); + } + } else { + skipped.push(agent.name); + } + } + console.log(""); + } + + // Project-local installations + if (!globalOnly) { + console.log("Project skills (current directory):"); + for (const agent of getProjectAgents()) { + if (agent.detect()) { + if (dryRun) { + console.log(` [dry-run] Would install to ${agent.dir}/${SKILL_NAME}/`); + } else { + copyRecursive(skillSrc, path.join(agent.dir, SKILL_NAME)); + console.log(` ✓ ${agent.name} (${agent.dir}/)`); + installed.push(agent.name); + } + } + } + console.log(""); + } + + // Summary + if (dryRun) { + console.log("Dry run complete. 
No files were modified."); + } else { + console.log(`Installed to ${installed.length} location(s).`); + } + + if (skipped.length > 0) { + console.log(`Not detected: ${skipped.join(", ")}`); + } +} + +register("install-skill", installSkillCommand); + +// Also register as "skill" with subcommand "init" +register("skill", async (args, flags, client) => { + const subcommand = args[0]; + if (subcommand === "init") { + flags.project = true; + await installSkillCommand([], flags, client); + } else { + console.error("Usage: bap skill init"); + process.exit(1); + } +}); diff --git a/packages/cli/src/commands/observe.ts b/packages/cli/src/commands/observe.ts new file mode 100644 index 0000000..7907d88 --- /dev/null +++ b/packages/cli/src/commands/observe.ts @@ -0,0 +1,79 @@ +/** + * bap observe — Smart observation of interactive elements + * + * Flags: + * --full Full accessibility tree + * --forms Form fields only + * --navigation Navigation elements only + * --max=N Limit to N elements (default: 50) + * --diff Incremental mode: show only changes since last observation + * --tier=T Response tier: full, interactive, minimal + */ + +import type { BAPClient, AgentObserveParams } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { printObserveResult, printObserveChanges } from "../output/formatter.js"; +import { writeSnapshot } from "../output/filesystem.js"; +import { register } from "./registry.js"; + +async function observeCommand( + _args: string[], + flags: GlobalFlags, + client: BAPClient, +): Promise { + const params: AgentObserveParams = { + includeInteractiveElements: true, + includeMetadata: true, + maxElements: flags.max ?? 
50, + }; + + // --full: include accessibility tree + if (flags.full) { + params.includeAccessibility = true; + params.maxElements = 200; + } + + // --forms: filter to form-related roles + if (flags.forms) { + params.filterRoles = [ + "textbox", "combobox", "checkbox", "radio", + "searchbox", "spinbutton", "slider", "switch", + ]; + } + + // --navigation: filter to nav-related roles + if (flags.navigation) { + params.filterRoles = ["link", "button", "menuitem", "tab"]; + } + + // Fusion: --diff sets incremental mode + if (flags.diff) { + params.incremental = true; + } + + // Fusion: --tier sets response compression tier + if (flags.tier) { + params.responseTier = flags.tier as "full" | "interactive" | "minimal"; + } + + const result = await client.observe(params); + + // Write accessibility tree if full mode + if (flags.full && result.accessibility) { + const snapshotPath = await writeSnapshot( + JSON.stringify(result.accessibility.tree, null, 2) + ); + console.log("### Accessibility Tree"); + console.log(`[Full Tree](${snapshotPath})`); + console.log(""); + } + + // Print incremental changes if available + if (flags.diff && result.changes) { + printObserveChanges(result.changes); + } + + printObserveResult(result); +} + +register("observe", observeCommand); diff --git a/packages/cli/src/commands/open.ts b/packages/cli/src/commands/open.ts new file mode 100644 index 0000000..e411cb8 --- /dev/null +++ b/packages/cli/src/commands/open.ts @@ -0,0 +1,54 @@ +/** + * bap open [url] — Open browser, optionally navigate to URL + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { printPageSummary } from "../output/formatter.js"; +import { register } from "./registry.js"; + +/** Map user-facing browser names to Playwright browser types */ +const BROWSER_MAP: Record = { + chrome: "chromium", + chromium: "chromium", + firefox: "firefox", + webkit: "webkit", + edge: "chromium", +}; + +/** Map 
user-facing browser names to Playwright channels */
// NOTE(review): the type arguments were lost in this copy of the file;
// `Record<string, string>` is inferred from the entries — confirm.
const CHANNEL_MAP: Record<string, string> = {
  chrome: "chrome",
  edge: "msedge",
};

/**
 * Look up `key` among the map's own entries only.
 *
 * A plain index such as `map[key]` also resolves names inherited from
 * `Object.prototype` (for example "constructor"), which would hand a function
 * to the launcher when a user passes such a name as `--browser`.
 */
function lookupOwn<T>(map: Record<string, T>, key: string): T | undefined {
  return Object.hasOwn(map, key) ? map[key] : undefined;
}

/**
 * bap open [url] — launch a browser, create a page, and optionally navigate.
 *
 * Unknown `--browser` names fall back to "chromium" with no channel.
 *
 * @param args - Optional URL as the first positional argument.
 * @param flags - Supplies `browser` and `headless`.
 * @param client - Connected BAP client.
 */
async function openCommand(
  args: string[],
  flags: GlobalFlags,
  client: BAPClient,
): Promise<void> {
  const browser = lookupOwn(BROWSER_MAP, flags.browser) ?? "chromium";
  const channel = lookupOwn(CHANNEL_MAP, flags.browser);

  // Launch browser
  await client.launch({
    browser,
    channel,
    headless: flags.headless,
  });

  // Create a page
  await client.createPage();

  // Navigate if URL provided
  const url = args[0];
  if (url) {
    const result = await client.navigate(url);
    printPageSummary(result.url);
  } else {
    console.log("### Browser opened");
    console.log("Use `bap goto <url>` to navigate.");
  }
}

register("open", openCommand);
diff --git a/packages/cli/src/commands/press.ts b/packages/cli/src/commands/press.ts
new file mode 100644
index 0000000..c3f0eb3
--- /dev/null
+++ b/packages/cli/src/commands/press.ts
@@ -0,0 +1,25 @@
/**
 * bap press <key> — Press a keyboard key
 */

import type { BAPClient } from "@browseragentprotocol/client";
import type { GlobalFlags } from "../config/state.js";
import { postActionSummary } from "./helpers.js";
import { register } from "./registry.js";

/**
 * Press the key named by the first positional argument, then print the
 * post-action summary. Exits with status 1 when no key is given.
 */
async function pressCommand(
  args: string[],
  _flags: GlobalFlags,
  client: BAPClient,
): Promise<void> {
  const key = args[0];
  if (!key) {
    console.error("Usage: bap press <key>");
    process.exit(1);
  }

  await client.press(key);
  await postActionSummary(client);
}

register("press", pressCommand);
diff --git a/packages/cli/src/commands/recipe.ts b/packages/cli/src/commands/recipe.ts
new file mode 100644
index 0000000..24031ac
--- /dev/null
+++ b/packages/cli/src/commands/recipe.ts
@@ -0,0 +1,170 @@
/**
 * bap recipe — Pre-built multi-step workflows
 *
 * Recipes:
 *   bap recipe login <url> --user=<user> --pass=<pass>

+ * bap recipe fill-form --data=data.json + * bap recipe wait-for [--timeout=ms] + */ + +import fs from "node:fs/promises"; +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { parseSelector } from "../selectors/parser.js"; +import { printActResult } from "../output/formatter.js"; +import { writeSnapshot } from "../output/filesystem.js"; +import { register } from "./registry.js"; + +async function recipeCommand( + args: string[], + flags: GlobalFlags, + client: BAPClient, +): Promise { + const recipeName = args[0]; + + if (!recipeName) { + console.error("Usage: bap recipe [args]"); + console.error(""); + console.error("Available recipes:"); + console.error(" login --user= --pass=

Log in to a website"); + console.error(" fill-form --data=data.json Fill a form from JSON"); + console.error(" wait-for [--timeout=ms] Wait for element"); + process.exit(1); + } + + switch (recipeName) { + case "login": + await recipeLogin(args.slice(1), flags, client); + break; + case "fill-form": + await recipeFillForm(args.slice(1), flags, client); + break; + case "wait-for": + await recipeWaitFor(args.slice(1), flags, client); + break; + default: + console.error(`Unknown recipe: ${recipeName}`); + process.exit(1); + } +} + +async function recipeLogin( + args: string[], + flags: GlobalFlags, + client: BAPClient, +): Promise { + const url = args[0]; + if (!url || !flags.user || !flags.pass) { + console.error("Usage: bap recipe login --user= --pass=

"); + process.exit(1); + } + + // Navigate to URL + await client.navigate(url); + + // Build steps: find username/password fields, then submit + const steps = []; + + // Use custom field selectors or auto-detect from observe + const userSelector = flags.userField + ? parseSelector(flags.userField) + : { type: "label" as const, value: "Email" }; + const passSelector = flags.passField + ? parseSelector(flags.passField) + : { type: "label" as const, value: "Password" }; + + steps.push( + { action: "action/fill", params: { selector: userSelector, value: flags.user } }, + { action: "action/fill", params: { selector: passSelector, value: flags.pass } }, + { action: "action/click", params: { selector: { type: "role" as const, role: "button" as const, name: "Sign in" } } }, + ); + + const result = await client.act({ steps, stopOnFirstError: true }); + + const snapshot = await client.ariaSnapshot(); + const snapshotPath = await writeSnapshot(snapshot.snapshot); + + const meta = await client.observe({ + includeMetadata: true, + includeInteractiveElements: false, + maxElements: 0, + }); + + printActResult(result, meta.metadata?.url, meta.metadata?.title, snapshotPath); +} + +async function recipeFillForm( + args: string[], + flags: GlobalFlags, + client: BAPClient, +): Promise { + const url = args[0]; + if (!url || !flags.data) { + console.error("Usage: bap recipe fill-form --data=data.json"); + process.exit(1); + } + + // Navigate to URL + await client.navigate(url); + + // Read form data + const dataContent = await fs.readFile(flags.data, "utf-8"); + const formData = JSON.parse(dataContent) as Record; + + // Build fill steps from form data + const steps = Object.entries(formData).map(([label, value]) => ({ + action: "action/fill", + params: { + selector: { type: "label" as const, value: label }, + value, + }, + })); + + const result = await client.act({ steps, stopOnFirstError: false }); + + const snapshot = await client.ariaSnapshot(); + const snapshotPath = await 
writeSnapshot(snapshot.snapshot); + + const meta = await client.observe({ + includeMetadata: true, + includeInteractiveElements: false, + maxElements: 0, + }); + + printActResult(result, meta.metadata?.url, meta.metadata?.title, snapshotPath); +} + +async function recipeWaitFor( + args: string[], + flags: GlobalFlags, + client: BAPClient, +): Promise { + const selectorStr = args[0]; + if (!selectorStr) { + console.error("Usage: bap recipe wait-for [--timeout=ms]"); + process.exit(1); + } + + const timeout = flags.timeout ?? 30000; + const interval = 500; + const start = Date.now(); + const selector = parseSelector(selectorStr); + + while (Date.now() - start < timeout) { + try { + const result = await client.element(selector, ["visible"]); + if (result.found && result.visible) { + console.log("### Element found"); + return; + } + } catch { + // Element not found yet + } + await new Promise((resolve) => setTimeout(resolve, interval)); + } + + console.error(`Timeout: element ${selectorStr} not found within ${timeout}ms`); + process.exit(1); +} + +register("recipe", recipeCommand); diff --git a/packages/cli/src/commands/registry.ts b/packages/cli/src/commands/registry.ts new file mode 100644 index 0000000..34315bc --- /dev/null +++ b/packages/cli/src/commands/registry.ts @@ -0,0 +1,26 @@ +/** + * @fileoverview Command registry — standalone module with no circular deps. 
+ */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; + +export type CommandHandler = ( + args: string[], + flags: GlobalFlags, + client: BAPClient, +) => Promise; + +const commands = new Map(); + +export function register(name: string, handler: CommandHandler): void { + commands.set(name, handler); +} + +export function getCommand(name: string): CommandHandler | undefined { + return commands.get(name); +} + +export function listCommands(): string[] { + return [...commands.keys()].sort(); +} diff --git a/packages/cli/src/commands/screenshot.ts b/packages/cli/src/commands/screenshot.ts new file mode 100644 index 0000000..7323602 --- /dev/null +++ b/packages/cli/src/commands/screenshot.ts @@ -0,0 +1,22 @@ +/** + * bap screenshot [--file=F] — Take a screenshot + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { writeScreenshot } from "../output/filesystem.js"; +import { register } from "./registry.js"; + +async function screenshotCommand( + _args: string[], + flags: GlobalFlags, + client: BAPClient, +): Promise { + const result = await client.screenshot({ fullPage: false }); + const screenshotPath = await writeScreenshot(result.data, flags.file); + + console.log("### Screenshot"); + console.log(`[Screenshot](${screenshotPath})`); +} + +register("screenshot", screenshotCommand); diff --git a/packages/cli/src/commands/select.ts b/packages/cli/src/commands/select.ts new file mode 100644 index 0000000..16da2d2 --- /dev/null +++ b/packages/cli/src/commands/select.ts @@ -0,0 +1,28 @@ +/** + * bap select — Select a dropdown option + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { parseSelector } from "../selectors/parser.js"; +import { postActionSummary } from "./helpers.js"; +import { register } from "./registry.js"; + +async 
function selectCommand( + args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const selectorStr = args[0]; + const value = args[1]; + if (!selectorStr || !value) { + console.error("Usage: bap select "); + process.exit(1); + } + + const selector = parseSelector(selectorStr); + await client.select(selector, value); + await postActionSummary(client); +} + +register("select", selectCommand); diff --git a/packages/cli/src/commands/sessions.ts b/packages/cli/src/commands/sessions.ts new file mode 100644 index 0000000..6780621 --- /dev/null +++ b/packages/cli/src/commands/sessions.ts @@ -0,0 +1,29 @@ +/** + * bap sessions — List active sessions + * + * Sessions are managed via the -s= global flag. + * Each session maps to a BAP browser context. + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { register } from "./registry.js"; + +async function sessionsCommand( + _args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const result = await client.listContexts(); + + console.log("### Sessions"); + if (result.contexts && result.contexts.length > 0) { + for (const ctx of result.contexts) { + console.log(` ${ctx.id} (${ctx.pageCount} pages)`); + } + } else { + console.log(" No active sessions"); + } +} + +register("sessions", sessionsCommand); diff --git a/packages/cli/src/commands/snapshot.ts b/packages/cli/src/commands/snapshot.ts new file mode 100644 index 0000000..8454a85 --- /dev/null +++ b/packages/cli/src/commands/snapshot.ts @@ -0,0 +1,33 @@ +/** + * bap snapshot [--file=F] — Save accessibility snapshot (YAML) + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { writeSnapshot } from "../output/filesystem.js"; +import { printSnapshotSummary } from "../output/formatter.js"; +import { register } from "./registry.js"; + +async function snapshotCommand( + _args: 
string[], + flags: GlobalFlags, + client: BAPClient, +): Promise { + const result = await client.ariaSnapshot(); + + let snapshotPath: string; + if (flags.file) { + const fs = await import("node:fs/promises"); + const path = await import("node:path"); + const dir = path.dirname(flags.file); + await fs.mkdir(dir, { recursive: true }); + await fs.writeFile(flags.file, result.snapshot, "utf-8"); + snapshotPath = flags.file; + } else { + snapshotPath = await writeSnapshot(result.snapshot); + } + + printSnapshotSummary(snapshotPath); +} + +register("snapshot", snapshotCommand); diff --git a/packages/cli/src/commands/tabs.ts b/packages/cli/src/commands/tabs.ts new file mode 100644 index 0000000..27ea531 --- /dev/null +++ b/packages/cli/src/commands/tabs.ts @@ -0,0 +1,97 @@ +/** + * bap tabs — List open tabs + * bap tab-new [url] — Open new tab + * bap tab-select — Switch to tab + * bap tab-close [index] — Close tab + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { printPageSummary } from "../output/formatter.js"; +import { register } from "./registry.js"; + +async function tabsCommand( + _args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const result = await client.listPages(); + + console.log("### Tabs"); + if (result.pages.length > 0) { + for (let i = 0; i < result.pages.length; i++) { + const page = result.pages[i]!; + const active = page.id === result.activePage ? " *" : ""; + console.log(` [${i}] ${page.url ?? 
"about:blank"}${active}`); + } + } else { + console.log(" No open tabs"); + } +} + +async function tabNewCommand( + args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const page = await client.createPage(); + const url = args[0]; + if (url) { + const result = await client.navigate(url, { pageId: page.id }); + printPageSummary(result.url); + } else { + console.log(`### New tab opened: ${page.id}`); + } +} + +async function tabSelectCommand( + args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const indexStr = args[0]; + if (!indexStr) { + console.error("Usage: bap tab-select "); + process.exit(1); + } + + const index = parseInt(indexStr, 10); + const result = await client.listPages(); + + if (index < 0 || index >= result.pages.length) { + console.error(`Tab index ${index} out of range (0-${result.pages.length - 1})`); + process.exit(1); + } + + const page = result.pages[index]!; + await client.activatePage(page.id); + console.log(`### Switched to tab ${index}: ${page.url ?? 
"about:blank"}`); +} + +async function tabCloseCommand( + args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + const indexStr = args[0]; + + if (indexStr) { + const index = parseInt(indexStr, 10); + const result = await client.listPages(); + if (index < 0 || index >= result.pages.length) { + console.error(`Tab index ${index} out of range (0-${result.pages.length - 1})`); + process.exit(1); + } + const page = result.pages[index]!; + await client.closePage(page.id); + console.log(`### Closed tab ${index}`); + } else { + await client.closePage(); + console.log("### Closed current tab"); + } +} + +register("tabs", tabsCommand); +register("tab-new", tabNewCommand); +register("tab-select", tabSelectCommand); +register("tab-close", tabCloseCommand); diff --git a/packages/cli/src/commands/type.ts b/packages/cli/src/commands/type.ts new file mode 100644 index 0000000..31f3999 --- /dev/null +++ b/packages/cli/src/commands/type.ts @@ -0,0 +1,35 @@ +/** + * bap type — Type text into focused element + * bap type — Type text into specific element + */ + +import type { BAPClient } from "@browseragentprotocol/client"; +import type { GlobalFlags } from "../config/state.js"; +import { parseSelector } from "../selectors/parser.js"; +import { postActionSummary } from "./helpers.js"; +import { register } from "./registry.js"; + +async function typeCommand( + args: string[], + _flags: GlobalFlags, + client: BAPClient, +): Promise { + if (args.length === 0) { + console.error("Usage: bap type OR bap type "); + process.exit(1); + } + + if (args.length === 1) { + // Type into focused element — use a text selector pointing to active element + // Since the client requires a selector, we'll use the body as a fallback + const selector = parseSelector("css:body"); + await client.type(selector, args[0]!); + } else { + const selector = parseSelector(args[0]!); + await client.type(selector, args[1]!); + } + + await postActionSummary(client); +} + +register("type", 
typeCommand); diff --git a/packages/cli/src/config/state.ts b/packages/cli/src/config/state.ts new file mode 100644 index 0000000..0d5a865 --- /dev/null +++ b/packages/cli/src/config/state.ts @@ -0,0 +1,232 @@ +/** + * @fileoverview Global flags parsing and configuration management + */ + +import fs from "node:fs"; +import path from "node:path"; +import os from "node:os"; + +// ============================================================================= +// Types +// ============================================================================= + +export interface GlobalFlags { + command: string; + args: string[]; + session?: string; + port: number; + host: string; + browser: string; + headless: boolean; + verbose: boolean; + help: boolean; + version: boolean; + // observe flags + full?: boolean; + forms?: boolean; + navigation?: boolean; + max?: number; + // extract flags + fields?: string; + schema?: string; + list?: string; + // install-skill flags + project?: boolean; + global?: boolean; + dryRun?: boolean; + // fusion flags + observe?: boolean; + diff?: boolean; + tier?: string; + // output flags + file?: string; + // recipe flags + user?: string; + pass?: string; + data?: string; + timeout?: number; + userField?: string; + passField?: string; +} + +export interface BAPConfig { + browser: string; + headless: boolean; + timeout: number; + port: number; +} + +const DEFAULT_CONFIG: BAPConfig = { + browser: "chrome", + headless: true, + timeout: 30000, + port: 9222, +}; + +// ============================================================================= +// Config File Management +// ============================================================================= + +function getConfigDir(): string { + return path.join(os.homedir(), ".bap"); +} + +function getConfigPath(): string { + return path.join(getConfigDir(), "config.json"); +} + +export function loadConfig(): BAPConfig { + try { + const configPath = getConfigPath(); + if (fs.existsSync(configPath)) { + const 
raw = fs.readFileSync(configPath, "utf-8"); + return { ...DEFAULT_CONFIG, ...JSON.parse(raw) }; + } + } catch { + // Ignore errors, use defaults + } + return { ...DEFAULT_CONFIG }; +} + +export function saveConfig(config: Partial): void { + const configDir = getConfigDir(); + fs.mkdirSync(configDir, { recursive: true }); + const existing = loadConfig(); + const merged = { ...existing, ...config }; + fs.writeFileSync(getConfigPath(), JSON.stringify(merged, null, 2), "utf-8"); +} + +// ============================================================================= +// Argument Parsing +// ============================================================================= + +export function parseArgs(argv: string[]): GlobalFlags { + const config = loadConfig(); + + const flags: GlobalFlags = { + command: "", + args: [], + port: config.port, + host: "localhost", + browser: config.browser, + headless: config.headless, + verbose: false, + help: false, + version: false, + }; + + const remaining: string[] = []; + + for (let i = 0; i < argv.length; i++) { + const arg = argv[i]!; + + // Global flags + if (arg === "--help" || arg === "-h") { + flags.help = true; + } else if (arg === "--version" || arg === "-V") { + flags.version = true; + } else if (arg === "--verbose" || arg === "-v") { + flags.verbose = true; + } else if (arg === "--port" || arg === "-p") { + flags.port = parseInt(argv[++i] ?? "9222", 10); + } else if (arg.startsWith("--port=")) { + flags.port = parseInt(arg.slice(7), 10); + } else if (arg === "--browser" || arg === "-b") { + flags.browser = argv[++i] ?? 
"chrome"; + } else if (arg.startsWith("--browser=")) { + flags.browser = arg.slice(10); + } else if (arg === "--headless") { + flags.headless = true; + } else if (arg === "--no-headless") { + flags.headless = false; + } else if (arg.startsWith("-s=")) { + flags.session = arg.slice(3); + } else if (arg === "-s") { + flags.session = argv[++i]; + } + // observe flags + else if (arg === "--full") { + flags.full = true; + } else if (arg === "--forms") { + flags.forms = true; + } else if (arg === "--navigation") { + flags.navigation = true; + } else if (arg.startsWith("--max=")) { + flags.max = parseInt(arg.slice(6), 10); + } else if (arg === "--max") { + flags.max = parseInt(argv[++i] ?? "50", 10); + } + // fusion flags + else if (arg === "--observe") { + flags.observe = true; + } else if (arg === "--diff") { + flags.diff = true; + } else if (arg.startsWith("--tier=")) { + flags.tier = arg.slice(7); + } else if (arg === "--tier") { + flags.tier = argv[++i]; + } + // extract flags + else if (arg.startsWith("--fields=")) { + flags.fields = arg.slice(9); + } else if (arg === "--fields") { + flags.fields = argv[++i]; + } else if (arg.startsWith("--schema=")) { + flags.schema = arg.slice(9); + } else if (arg === "--schema") { + flags.schema = argv[++i]; + } else if (arg.startsWith("--list=")) { + flags.list = arg.slice(7); + } else if (arg === "--list") { + flags.list = argv[++i]; + } + // install-skill flags + else if (arg === "--project") { + flags.project = true; + } else if (arg === "--global") { + flags.global = true; + } else if (arg === "--dry-run") { + flags.dryRun = true; + } + // output flags + else if (arg.startsWith("--file=")) { + flags.file = arg.slice(7); + } else if (arg === "--file") { + flags.file = argv[++i]; + } + // recipe flags + else if (arg.startsWith("--user=")) { + flags.user = arg.slice(7); + } else if (arg === "--user") { + flags.user = argv[++i]; + } else if (arg.startsWith("--pass=")) { + flags.pass = arg.slice(7); + } else if (arg === "--pass") 
{ + flags.pass = argv[++i]; + } else if (arg.startsWith("--data=")) { + flags.data = arg.slice(7); + } else if (arg === "--data") { + flags.data = argv[++i]; + } else if (arg.startsWith("--timeout=")) { + flags.timeout = parseInt(arg.slice(10), 10); + } else if (arg === "--timeout") { + flags.timeout = parseInt(argv[++i] ?? "30000", 10); + } else if (arg.startsWith("--user-field=")) { + flags.userField = arg.slice(13); + } else if (arg.startsWith("--pass-field=")) { + flags.passField = arg.slice(13); + } + // positional args + else { + remaining.push(arg); + } + } + + // First positional arg is the command + if (remaining.length > 0) { + flags.command = remaining[0]!; + flags.args = remaining.slice(1); + } + + return flags; +} diff --git a/packages/cli/src/output/filesystem.ts b/packages/cli/src/output/filesystem.ts new file mode 100644 index 0000000..fd6d504 --- /dev/null +++ b/packages/cli/src/output/filesystem.ts @@ -0,0 +1,64 @@ +/** + * @fileoverview .bap/ directory management for CLI output files + */ + +import fs from "node:fs/promises"; +import path from "node:path"; + +const BAP_DIR = ".bap"; + +function timestamp(): string { + return new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); +} + +/** + * Ensure the .bap/ directory exists in the current working directory. 
+ */ +export async function ensureBapDir(): Promise { + const dir = path.resolve(process.cwd(), BAP_DIR); + await fs.mkdir(dir, { recursive: true }); + return dir; +} + +/** + * Write a YAML snapshot to .bap/snapshot-.yml + * @returns Relative path to the snapshot file (e.g., ".bap/snapshot-2026-02-16T19-30-42.yml") + */ +export async function writeSnapshot(data: string): Promise { + const dir = await ensureBapDir(); + const filename = `snapshot-${timestamp()}.yml`; + const filepath = path.join(dir, filename); + await fs.writeFile(filepath, data, "utf-8"); + return path.join(BAP_DIR, filename); +} + +/** + * Write a PNG screenshot to .bap/screenshot-.png + * @returns Relative path to the screenshot file + */ +export async function writeScreenshot(base64Data: string, customPath?: string): Promise { + if (customPath) { + const dir = path.dirname(customPath); + await fs.mkdir(dir, { recursive: true }); + await fs.writeFile(customPath, Buffer.from(base64Data, "base64")); + return customPath; + } + + const dir = await ensureBapDir(); + const filename = `screenshot-${timestamp()}.png`; + const filepath = path.join(dir, filename); + await fs.writeFile(filepath, Buffer.from(base64Data, "base64")); + return path.join(BAP_DIR, filename); +} + +/** + * Write extraction result to .bap/extraction-.json + * @returns Relative path to the extraction file + */ +export async function writeExtraction(data: unknown): Promise { + const dir = await ensureBapDir(); + const filename = `extraction-${timestamp()}.json`; + const filepath = path.join(dir, filename); + await fs.writeFile(filepath, JSON.stringify(data, null, 2), "utf-8"); + return path.join(BAP_DIR, filename); +} diff --git a/packages/cli/src/output/formatter.ts b/packages/cli/src/output/formatter.ts new file mode 100644 index 0000000..37b24ff --- /dev/null +++ b/packages/cli/src/output/formatter.ts @@ -0,0 +1,154 @@ +/** + * @fileoverview CLI output formatting + * + * Concise, AI-agent-friendly output format: + * ### Page + * 
- URL: https://example.com/dashboard
 * - Title: Dashboard
 * ### Snapshot
 * [Snapshot](.bap/snapshot-2026-02-16T19-30-42.yml)
 */

import type {
  AgentActResult,
  AgentObserveResult,
  AgentExtractResult,
  ObserveChanges,
} from "@browseragentprotocol/protocol";

/** Emit the "### Page" heading followed by whichever of URL and title are set. */
function printPageHeader(url?: string, title?: string): void {
  console.log("### Page");
  if (url) {
    console.log(`- URL: ${url}`);
  }
  if (title) {
    console.log(`- Title: ${title}`);
  }
}

/** Emit the "### Snapshot" section linking to `snapshotPath`. */
function printSnapshotLink(snapshotPath: string): void {
  console.log("### Snapshot");
  console.log(`[Snapshot](${snapshotPath})`);
}

/**
 * Print the page header, then links to the snapshot and screenshot files for
 * whichever of the two paths were supplied.
 */
export function printPageSummary(
  url?: string,
  title?: string,
  snapshotPath?: string,
  screenshotPath?: string,
): void {
  printPageHeader(url, title);

  if (snapshotPath) {
    printSnapshotLink(snapshotPath);
  }

  if (screenshotPath) {
    console.log("### Screenshot");
    console.log(`[Screenshot](${screenshotPath})`);
  }
}

/**
 * Print the outcome of an act call: page header, completed/total step counts,
 * the message of the first failed step when the run did not succeed, and an
 * optional snapshot link.
 */
export function printActResult(
  result: AgentActResult,
  url?: string,
  title?: string,
  snapshotPath?: string,
): void {
  printPageHeader(url, title);

  console.log(`### Result: ${result.completed}/${result.total} steps completed`);

  // Only an unsuccessful run is searched for its first failing step.
  const firstFailure = result.success
    ? undefined
    : result.results?.find((step) => !step.success);
  if (firstFailure?.error) {
    console.log(`### Error: ${firstFailure.error.message ?? "Unknown error"}`);
  }

  if (snapshotPath) {
    printSnapshotLink(snapshotPath);
  }
}

/**
 * Print observe result — compact list of interactive elements.
+ */ +export function printObserveResult(result: AgentObserveResult): void { + if (result.metadata) { + console.log("### Page"); + console.log(`- URL: ${result.metadata.url}`); + console.log(`- Title: ${result.metadata.title}`); + } + + if (result.interactiveElements && result.interactiveElements.length > 0) { + console.log(`### Interactive Elements (${result.interactiveElements.length})`); + for (const el of result.interactiveElements) { + const ref = el.ref ?? ""; + const name = el.name ? ` "${el.name}"` : ""; + const value = el.value ? ` [${el.value}]` : ""; + console.log(` ${ref} ${el.role}${name}${value}`); + } + } else { + console.log("### No interactive elements found"); + } +} + +/** + * Print extraction result. + */ +export function printExtractionResult( + result: AgentExtractResult, + filepath: string, +): void { + console.log("### Extraction"); + console.log(`[Data](${filepath})`); + + if (result.data) { + const preview = JSON.stringify(result.data); + if (preview.length <= 200) { + console.log(`\`\`\`json\n${JSON.stringify(result.data, null, 2)}\n\`\`\``); + } else { + console.log(`(${preview.length} bytes — see file for full data)`); + } + } +} + +/** + * Print a snapshot summary (used by snapshot/snapshot commands). + */ +export function printSnapshotSummary(snapshotPath: string): void { + console.log("### Snapshot"); + console.log(`[Snapshot](${snapshotPath})`); +} + +/** + * Print incremental observation changes (--diff mode). + */ +export function printObserveChanges(changes: ObserveChanges): void { + console.log("### Changes"); + + if (changes.added.length > 0) { + console.log(` + ${changes.added.length} added`); + for (const el of changes.added) { + const name = el.name ? ` "${el.name}"` : ""; + console.log(` + ${el.ref} ${el.role}${name}`); + } + } + + if (changes.updated.length > 0) { + console.log(` ~ ${changes.updated.length} updated`); + for (const el of changes.updated) { + const name = el.name ? 
` "${el.name}"` : ""; + console.log(` ~ ${el.ref} ${el.role}${name}`); + } + } + + if (changes.removed.length > 0) { + console.log(` - ${changes.removed.length} removed`); + for (const ref of changes.removed) { + console.log(` - ${ref}`); + } + } + + if (changes.added.length === 0 && changes.updated.length === 0 && changes.removed.length === 0) { + console.log(" (no changes)"); + } +} diff --git a/packages/cli/src/selectors/composite-parser.ts b/packages/cli/src/selectors/composite-parser.ts new file mode 100644 index 0000000..d99c47b --- /dev/null +++ b/packages/cli/src/selectors/composite-parser.ts @@ -0,0 +1,174 @@ +/** + * @fileoverview Composite step parser for `bap act` syntax + * + * Parses step strings like: + * fill:role:textbox:"Email"="user@example.com" + * click:e15 + * goto:https://example.com + * press:Enter + * snapshot + */ + +import type { ExecutionStep, BAPSelector } from "@browseragentprotocol/protocol"; +import { parseSelector, stripQuotes } from "./parser.js"; + +// ============================================================================= +// Types +// ============================================================================= + +export interface ParsedStep { + /** BAP protocol action (e.g., "action/fill", "action/click") */ + action: string; + /** Parsed selector (for actions that target elements) */ + selector?: BAPSelector; + /** Value for fill/type/select */ + value?: string; + /** URL for goto */ + url?: string; + /** Key for press */ + key?: string; +} + +// ============================================================================= +// Action Name Mapping +// ============================================================================= + +const ACTION_MAP: Record = { + click: "action/click", + dblclick: "action/dblclick", + fill: "action/fill", + type: "action/type", + press: "action/press", + select: "action/select", + check: "action/check", + uncheck: "action/uncheck", + hover: "action/hover", + scroll: "action/scroll", + goto: 
/**
 * Actions whose argument is always a bare selector. They never carry a
 * `=value` suffix, so any `=` in their argument is part of the selector
 * (e.g. the CSS attribute selector in `click:css:input[type=submit]`).
 */
const SELECTOR_ONLY_ACTIONS = new Set<string>([
  "click", "dblclick", "check", "uncheck", "hover",
]);

/**
 * Find the last `=` in a string that is NOT inside a quoted span.
 *
 * Scans right-to-left. A quote character only opens a span when a matching
 * quote exists further left; a lone quote or apostrophe (as in `don't`) is
 * treated as an ordinary character so it cannot hide the separator.
 *
 * @param s - Text following the action name (selector, optionally `=value`)
 * @returns Index of the separator, or -1 if there is none
 */
function findUnquotedEquals(s: string): number {
  for (let i = s.length - 1; i >= 0; i--) {
    const ch = s[i]!;
    if (ch === "=") {
      return i;
    }
    if ((ch === '"' || ch === "'") && i > 0) {
      const opening = s.lastIndexOf(ch, i - 1);
      if (opening !== -1) {
        // Jump to the opening quote; the loop decrement then steps past it.
        i = opening;
      }
    }
  }

  return -1;
}

/**
 * Parse a single composite step from CLI syntax.
 *
 * Resolution order:
 *  1. Parameterless actions (the whole string is the action name).
 *  2. `goto:<url>`  - everything after the first colon is the URL.
 *  3. `press:<key>` - everything after the first colon is the key.
 *  4. `<action>:<selector>=<value>` - split on the last unquoted `=`
 *     (skipped for selector-only actions such as click/hover).
 *  5. `<action>:<selector>`.
 *
 * Action names missing from ACTION_MAP fall back to `action/<name>`.
 *
 * @param raw - The raw step string (e.g., 'fill:role:textbox:"Email"="user@example.com"')
 * @returns Parsed step with action, selector, value, url, or key
 * @throws Error if the step has no colon and is not a parameterless action
 */
export function parseCompositeStep(raw: string): ParsedStep {
  // Parameterless actions: just the action name
  if (PARAMETERLESS.has(raw)) {
    return { action: ACTION_MAP[raw] ?? raw };
  }

  // Find first colon to split action:rest
  const firstColon = raw.indexOf(":");
  if (firstColon === -1) {
    throw new Error(`Invalid step: "${raw}". Expected action:target or action:target=value`);
  }

  const actionName = raw.slice(0, firstColon);
  const rest = raw.slice(firstColon + 1);
  const action = ACTION_MAP[actionName] ?? `action/${actionName}`;

  // Special case: goto takes a URL (URLs contain colons, so no further splitting)
  if (actionName === "goto") {
    return { action, url: rest };
  }

  // Special case: press takes a key name
  if (actionName === "press") {
    return { action, key: rest };
  }

  // Only value-carrying actions are split on `=`; for selector-only actions
  // an `=` belongs to the selector itself.
  const equalsIdx = SELECTOR_ONLY_ACTIONS.has(actionName) ? -1 : findUnquotedEquals(rest);

  if (equalsIdx !== -1) {
    return {
      action,
      selector: parseSelector(rest.slice(0, equalsIdx)),
      value: stripQuotes(rest.slice(equalsIdx + 1)),
    };
  }

  // No value: click, check, uncheck, hover, etc.
  return {
    action,
    selector: parseSelector(rest),
  };
}
/**
 * Turn parsed CLI steps into the ExecutionStep shape passed to client.act().
 *
 * Every step gets a `params` object. `selector`, `url` and `key` are copied
 * only when truthy, while `value` is copied whenever it is defined, so an
 * empty-string value (e.g. clearing a field) is preserved.
 *
 * @param steps - Steps produced by parseCompositeStep / parseCompositeSteps
 * @returns One ExecutionStep per input step, in the same order
 */
export function toExecutionSteps(steps: ParsedStep[]): ExecutionStep[] {
  const executionSteps: ExecutionStep[] = [];

  for (const { action, selector, value, url, key } of steps) {
    const executionStep: ExecutionStep = { action, params: {} };
    const params = executionStep.params!;

    if (selector) {
      params.selector = selector;
    }
    if (value !== undefined) {
      params.value = value;
    }
    if (url) {
      params.url = url;
    }
    if (key) {
      params.key = key;
    }

    executionSteps.push(executionStep);
  }

  return executionSteps;
}
/**
 * Parse a CLI selector string into a BAPSelector object.
 *
 * Forms are tried in this order; the first match wins:
 *   - e15                          -> { type: "ref", ref: "e15" }   (`e` followed by digits only)
 *   - @e1, @submitBtn              -> { type: "ref", ref: "@e1" }
 *   - ref:@submitBtn               -> { type: "ref", ref: "@submitBtn" }
 *   - role:button:"Submit"         -> { type: "role", role: "button", name: "Submit" }
 *   - role:button                  -> { type: "role", role: "button" }
 *   - text:"Sign in"               -> { type: "text", value: "Sign in" }
 *   - label:"Email"                -> { type: "label", value: "Email" }
 *   - placeholder:"Search..."      -> { type: "placeholder", value: "Search..." }
 *   - testid:submit-btn            -> { type: "testId", value: "submit-btn" }
 *   - testId:submit-btn            -> same; this is the spelling formatSelectorForDisplay prints
 *   - css:.btn-primary             -> { type: "css", value: ".btn-primary" }
 *   - xpath://button[@id='submit'] -> { type: "xpath", value: "//button[@id='submit']" }
 *   - coords:100,200               -> { type: "coordinates", x: 100, y: 200 }
 *   - #submit-btn / .btn           -> { type: "css", value: <as given> }
 *   - anything else                -> { type: "text", value: <as given> }
 *
 * Surrounding quotes are stripped from the role name and from text, label and
 * placeholder values. The role string is cast to AriaRole without validation.
 * A `coords:` selector whose numbers do not parse falls through to the
 * plain-text default.
 *
 * @param selector - Selector text as typed on the command line
 * @returns The corresponding BAPSelector
 */
export function parseSelector(selector: string): BAPSelector {
  // Playwright-CLI compat: positional refs such as e15
  if (/^e\d+$/.test(selector)) {
    return { type: "ref", ref: selector };
  }

  // BAP stable refs: @e1, @submitBtn
  if (selector.startsWith("@")) {
    return { type: "ref", ref: selector };
  }

  // Ref selector: ref:@submitBtn
  if (selector.startsWith("ref:")) {
    return { type: "ref", ref: selector.slice(4) };
  }

  // Role selector: role:button:"Submit" or role:button:Submit
  if (selector.startsWith("role:")) {
    const rest = selector.slice(5);
    const colonIdx = rest.indexOf(":");
    if (colonIdx === -1) {
      return { type: "role", role: rest as AriaRole };
    }
    const role = rest.slice(0, colonIdx) as AriaRole;
    // An empty name (`role:button:`) is reported as undefined, not "".
    const name = stripQuotes(rest.slice(colonIdx + 1)) || undefined;
    return { type: "role", role, name };
  }

  // Text selector: text:"content" or text:content
  if (selector.startsWith("text:")) {
    return { type: "text", value: stripQuotes(selector.slice(5)) };
  }

  // Label selector: label:"Email" or label:Email
  if (selector.startsWith("label:")) {
    return { type: "label", value: stripQuotes(selector.slice(6)) };
  }

  // Placeholder selector: placeholder:"Search..."
  if (selector.startsWith("placeholder:")) {
    return { type: "placeholder", value: stripQuotes(selector.slice(12)) };
  }

  // TestId selector: testid:submit-button. The camel-case `testId:` spelling
  // is accepted too so that formatSelectorForDisplay output parses back.
  if (selector.startsWith("testid:") || selector.startsWith("testId:")) {
    return { type: "testId", value: selector.slice(7) };
  }

  // CSS selector: css:.btn-primary
  if (selector.startsWith("css:")) {
    return { type: "css", value: selector.slice(4) };
  }

  // XPath selector: xpath://button[@id='submit']
  if (selector.startsWith("xpath:")) {
    return { type: "xpath", value: selector.slice(6) };
  }

  // Coordinates selector: coords:100,200
  if (selector.startsWith("coords:")) {
    const [rawX, rawY] = selector.slice(7).split(",");
    if (rawX && rawY) {
      const x = Number.parseInt(rawX, 10);
      const y = Number.parseInt(rawY, 10);
      if (!Number.isNaN(x) && !Number.isNaN(y)) {
        return { type: "coordinates", x, y };
      }
    }
  }

  // CSS shorthand for IDs and classes
  if (selector.startsWith("#") || selector.startsWith(".")) {
    return { type: "css", value: selector };
  }

  // Default to text selector for plain strings
  return { type: "text", value: selector };
}
selector.value + : `css:${selector.value}`; + case "xpath": + return `xpath:${selector.value}`; + case "placeholder": + return `placeholder:"${selector.value}"`; + case "ref": + return selector.ref; + case "coordinates": + return `coords:${selector.x},${selector.y}`; + case "semantic": + return `semantic:${selector.description}`; + default: + return JSON.stringify(selector); + } +} diff --git a/packages/cli/src/server/manager.ts b/packages/cli/src/server/manager.ts new file mode 100644 index 0000000..46a35a5 --- /dev/null +++ b/packages/cli/src/server/manager.ts @@ -0,0 +1,262 @@ +/** + * @fileoverview Server lifecycle management + * + * Manages the BAP Playwright server as a detached background daemon. + * Reuses existing server if one is already running on the target port. + * Extracted and adapted from packages/mcp/src/cli.ts. + */ + +import { spawn } from "node:child_process"; +import fs from "node:fs"; +import net from "node:net"; +import os from "node:os"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; +import { createClient, type BAPClient } from "@browseragentprotocol/client"; + +// ============================================================================= +// Types +// ============================================================================= + +export interface ServerManagerOptions { + port: number; + host?: string; + browser: string; + headless: boolean; + verbose: boolean; +} + +// ============================================================================= +// Server Discovery +// ============================================================================= + +/** + * Check if a port is in use by attempting a TCP connection. 
/**
 * Check if a port is in use by attempting a TCP connection.
 *
 * Never rejects: a successful connect resolves `true`; a socket error or no
 * connection within 500 ms resolves `false`. The probe socket is destroyed in
 * every case.
 *
 * @param port - TCP port to probe
 * @param host - Host to connect to (default "localhost")
 */
function isPortInUse(port: number, host: string = "localhost"): Promise<boolean> {
  return new Promise<boolean>((resolve) => {
    const socket = net.createConnection({ port, host });

    // Single teardown path shared by all three outcomes.
    const settle = (inUse: boolean): void => {
      socket.destroy();
      resolve(inUse);
    };

    socket.setTimeout(500);
    socket.on("connect", () => settle(true));
    socket.on("timeout", () => settle(false));
    socket.on("error", () => settle(false));
  });
}

/**
 * Wait for a server to become available on the given port.
 *
 * Probes with isPortInUse every `intervalMs` until a connection succeeds or
 * `timeoutMs` has elapsed. With `timeoutMs <= 0` no probe is made.
 *
 * @param port - TCP port the server is expected to listen on
 * @param host - Host to probe (default "localhost")
 * @param timeoutMs - Overall budget in milliseconds (default 15000)
 * @param intervalMs - Pause between probes in milliseconds (default 150)
 * @throws Error if no connection succeeds within the budget
 */
async function waitForServer(
  port: number,
  host: string = "localhost",
  timeoutMs: number = 15000,
  intervalMs: number = 150,
): Promise<void> {
  const start = Date.now();

  while (Date.now() - start < timeoutMs) {
    if (await isPortInUse(port, host)) {
      return;
    }
    await new Promise<void>((resolve) => setTimeout(resolve, intervalMs));
  }

  throw new Error(
    `BAP server did not start within ${timeoutMs / 1000}s on port ${port}. ` +
    `Ensure Playwright browsers are installed: npx playwright install chromium`
  );
}
/**
 * Work out how to launch server-playwright.
 *
 * Looks for `../../server-playwright/dist/cli.js` relative to this module's
 * directory and, if that file exists, runs it with `node`. If the file is
 * missing, or resolving the module location throws, falls back to
 * `npx -y @browseragentprotocol/server-playwright`.
 *
 * @returns The executable and its leading arguments
 */
function resolveServerCommand(): { command: string; args: string[] } {
  let localCli: string | undefined;

  try {
    const moduleDir = path.dirname(fileURLToPath(import.meta.url));
    const candidate = path.resolve(moduleDir, "../../server-playwright/dist/cli.js");
    localCli = fs.existsSync(candidate) ? candidate : undefined;
  } catch {
    // import.meta.url resolution failed, use the npx fallback
    localCli = undefined;
  }

  if (localCli !== undefined) {
    return { command: "node", args: [localCli] };
  }

  return { command: "npx", args: ["-y", "@browseragentprotocol/server-playwright"] };
}
/**
 * Owns the CLI's connection to a BAP server.
 *
 * ensureClient() connects to whatever is already listening on the configured
 * host/port, or spawns server-playwright as a detached background process,
 * records its PID, and connects once the port accepts connections.
 * disconnect() closes only the client; killServer() signals the process named
 * in the PID file.
 */
export class ServerManager {
  private options: Required<ServerManagerOptions>;
  private client: BAPClient | null = null;

  /**
   * @param options - Connection and launch settings; `host` defaults to
   *   "localhost" when omitted or explicitly undefined.
   */
  constructor(options: ServerManagerOptions) {
    this.options = {
      ...options,
      // Applied after the spread so an explicit `host: undefined` cannot
      // overwrite the default and produce a `ws://undefined:<port>` URL.
      host: options.host ?? "localhost",
    };
  }

  /**
   * Get a connected client, starting the server if needed.
   *
   * The client is cached, so later calls return the same instance until
   * disconnect() is called.
   *
   * @throws Error if the server process cannot be spawned, if the port does
   *   not accept connections in time, or if the client connection fails
   */
  async ensureClient(): Promise<BAPClient> {
    if (this.client) return this.client;

    const { port, host, browser, headless, verbose } = this.options;
    const url = `ws://${host}:${port}`;

    // Try to connect to existing server first
    if (await isPortInUse(port, host)) {
      if (verbose) {
        process.stderr.write(`[bap] Reusing server on ${host}:${port}\n`);
      }
      this.client = await createClient(url, { name: "bap-cli" });
      return this.client;
    }

    // Start server as detached background process
    if (verbose) {
      process.stderr.write(`[bap] Starting server on port ${port}...\n`);
    }

    const { command, args } = resolveServerCommand();
    const serverArgs = [
      ...args,
      "--port", port.toString(),
      "--host", host,
      headless ? "--headless" : "--no-headless",
      // Unknown browser names fall back to chromium.
      "--browser", BROWSER_MAP[browser] ?? "chromium",
    ];

    if (verbose) {
      serverArgs.push("--debug");
    }

    const child = spawn(command, serverArgs, {
      stdio: verbose ? ["ignore", "pipe", "pipe"] : "ignore",
      detached: true,
      env: { ...process.env },
    });

    // A failed spawn (e.g. the executable is missing) is reported through the
    // child's "error" event. Without a listener that event is thrown as an
    // uncaught exception, so convert it into a rejection of this call.
    const spawnFailure = new Promise<never>((_resolve, reject) => {
      child.once("error", (err: Error) => {
        reject(new Error(`Failed to launch BAP server via "${command}": ${err.message}`));
      });
    });

    // In verbose mode, relay both server output streams to our stderr.
    if (verbose && child.stdout) {
      child.stdout.on("data", (data: Buffer) => {
        process.stderr.write(`[bap-server] ${data.toString()}`);
      });
    }
    if (verbose && child.stderr) {
      child.stderr.on("data", (data: Buffer) => {
        process.stderr.write(`[bap-server] ${data.toString()}`);
      });
    }

    // Do not let the background server keep this process alive.
    child.unref();

    if (child.pid) {
      writePidFile(child.pid);
    }

    // Wait for server to be ready, or fail fast if it never started
    await Promise.race([waitForServer(port, host), spawnFailure]);

    if (verbose) {
      process.stderr.write(`[bap] Server ready on ws://${host}:${port}\n`);
    }

    this.client = await createClient(url, { name: "bap-cli" });
    return this.client;
  }

  /**
   * Disconnect the WebSocket client (server keeps running).
   */
  async disconnect(): Promise<void> {
    if (this.client) {
      await this.client.close();
      this.client = null;
    }
  }

  /**
   * Kill the background server process recorded in the PID file.
   *
   * Sends SIGTERM immediately and schedules a SIGKILL 500 ms later. The PID
   * file is removed right away, whether or not the process was still alive.
   * Does nothing when no PID file can be read.
   */
  static killServer(): void {
    const pid = readPidFile();
    if (pid !== null) {
      try {
        process.kill(pid, "SIGTERM");
        setTimeout(() => {
          try {
            process.kill(pid, "SIGKILL");
          } catch {
            // Already dead
          }
        }, 500);
      } catch {
        // Process already gone
      }
      removePidFile();
    }
  }
}
a/packages/client/src/index.ts b/packages/client/src/index.ts index 33c6602..7e6845f 100644 --- a/packages/client/src/index.ts +++ b/packages/client/src/index.ts @@ -533,6 +533,8 @@ export class BAPClient extends EventEmitter { waitUntil?: WaitUntilState; timeout?: number; referer?: string; + /** Fusion: run agent/observe after navigation (fused in single call) */ + observe?: AgentObserveParams; } = {} ): Promise { return this.request("page/navigate", { @@ -1087,6 +1089,9 @@ export class BAPClient extends EventEmitter { stopOnFirstError: params.stopOnFirstError, continueOnConditionFail: params.continueOnConditionFail, timeout: params.timeout, + // Fusion: observe-act-observe kernel + preObserve: params.preObserve, + postObserve: params.postObserve, }); } @@ -1128,6 +1133,9 @@ export class BAPClient extends EventEmitter { includeRefHistory: params.includeRefHistory, // Screenshot Annotation options annotateScreenshot: params.annotateScreenshot, + // Fusion options + responseTier: params.responseTier, + incremental: params.incremental, }); } diff --git a/packages/mcp/README.md b/packages/mcp/README.md index 9ab3e2d..d88bb14 100644 --- a/packages/mcp/README.md +++ b/packages/mcp/README.md @@ -77,7 +77,7 @@ npx @browseragentprotocol/mcp --url ws://localhost:9222 | Tool | Description | |------|-------------| -| `navigate` | Navigate to a URL | +| `navigate` | Navigate to a URL. Supports fused `observe` parameter to get page observation in one roundtrip | | `go_back` | Navigate back in browser history | | `go_forward` | Navigate forward in browser history | | `reload` | Reload the current page | @@ -116,8 +116,8 @@ npx @browseragentprotocol/mcp --url ws://localhost:9222 | Tool | Description | |------|-------------| -| `observe` | AI-optimized page observation with interactive elements and stable refs | -| `act` | Execute a sequence of browser actions in a single call | +| `observe` | AI-optimized page observation with interactive elements and stable refs. 
Supports `incremental` (diff mode) and `responseTier` (full/interactive/minimal) | +| `act` | Execute a sequence of browser actions in a single call. Supports fused `postObserve` to get observation in one roundtrip | | `extract` | Extract structured data from the page using schema and CSS heuristics | ### Selector Formats diff --git a/packages/mcp/src/index.ts b/packages/mcp/src/index.ts index ee4e040..368e559 100644 --- a/packages/mcp/src/index.ts +++ b/packages/mcp/src/index.ts @@ -39,6 +39,7 @@ import { type AnnotationMapping, type ExtractionSchema, type AriaRole, + type AgentObserveResult, } from "@browseragentprotocol/protocol"; // ============================================================================= @@ -216,6 +217,14 @@ const TOOLS: Tool[] = [ enum: ["load", "domcontentloaded", "networkidle"], description: "When to consider navigation complete (default: load)", }, + observe: { + type: "boolean", + description: "Fuse an observation after navigation (saves a round-trip). Returns interactive elements alongside navigate results.", + }, + observeMaxElements: { + type: "number", + description: "Max interactive elements to return in fused observation (default: 50)", + }, }, required: ["url"], }, @@ -556,6 +565,14 @@ Each step can have conditions and error handling. More efficient than calling ac type: "boolean", description: "Stop execution if any step fails (default: true)", }, + postObserve: { + type: "boolean", + description: "Fuse a post-execution observation into this call (saves a round-trip). 
Returns interactive elements alongside act results.", + }, + observeMaxElements: { + type: "number", + description: "Max interactive elements to return in fused post-observation (default: 50)", + }, }, required: ["steps"], }, @@ -594,6 +611,15 @@ RECOMMENDED: Use this before complex interactions to understand the page.`, type: "boolean", description: "Use stable element refs that persist across observations (default: true)", }, + incremental: { + type: "boolean", + description: "Return only changes since last observation (added, updated, removed elements). Useful for monitoring page state after actions.", + }, + responseTier: { + type: "string", + enum: ["full", "interactive", "minimal"], + description: "Response compression tier: 'full' (default, all data), 'interactive' (elements+metadata only), 'minimal' (refs+names only)", + }, }, }, }, @@ -938,12 +964,40 @@ export class BAPMCPServer { } const waitUntil = (args.waitUntil as WaitUntilState) ?? "load"; - const result = await client.navigate(url, { waitUntil }); + + // Fusion: navigate-observe kernel + const observeFlag = args.observe as boolean | undefined; + const result = await client.navigate(url, { + waitUntil, + ...(observeFlag ? { + observe: { + includeMetadata: true, + includeInteractiveElements: true, + maxElements: (args.observeMaxElements as number) ?? 50, + }, + } : {}), + }); + + const textParts = [`Navigated to: ${result.url}\nStatus: ${result.status}`]; + + // Append fused observation if present + const observation = (result as Record).observation as AgentObserveResult | undefined; + if (observation?.interactiveElements && observation.interactiveElements.length > 0) { + const elementList = observation.interactiveElements + .map((el: InteractiveElement) => { + const selector = formatSelectorForDisplay(el.selector); + const hints = el.actionHints.join(", "); + return `${el.ref} ${el.role}${el.name ? 
`: "${el.name}"` : ""} - ${selector} (${hints})`; + }) + .join("\n"); + textParts.push(`\nInteractive Elements (${observation.interactiveElements.length}/${observation.totalInteractiveElements ?? "?"}):\n${elementList}`); + } + return { content: [ { type: "text", - text: `Navigated to: ${result.url}\nStatus: ${result.status}`, + text: textParts.join("\n"), }, ], }; @@ -1164,9 +1218,19 @@ export class BAPMCPServer { return step; }); + // Fusion: observe-act-observe kernel + const postObserveFlag = args.postObserve as boolean | undefined; const result = await client.act({ steps, stopOnFirstError: args.stopOnFirstError as boolean ?? true, + ...(postObserveFlag ? { + postObserve: { + includeMetadata: true, + includeInteractiveElements: true, + maxElements: (args.observeMaxElements as number) ?? 50, + responseTier: "interactive" as const, + }, + } : {}), }); // Format result for AI consumption @@ -1182,11 +1246,26 @@ export class BAPMCPServer { ) .join("\n"); + const actTextParts = [`${summary}\n\n${stepDetails}\n\nTotal time: ${result.duration}ms`]; + + // Append fused post-observation if present + const postObs = (result as Record).postObservation as AgentObserveResult | undefined; + if (postObs?.interactiveElements && postObs.interactiveElements.length > 0) { + const elementList = postObs.interactiveElements + .map((el: InteractiveElement) => { + const selector = formatSelectorForDisplay(el.selector); + const hints = el.actionHints.join(", "); + return `${el.ref} ${el.role}${el.name ? `: "${el.name}"` : ""} - ${selector} (${hints})`; + }) + .join("\n"); + actTextParts.push(`\nPost-execution Elements (${postObs.interactiveElements.length}/${postObs.totalInteractiveElements ?? 
"?"}):\n${elementList}`); + } + return { content: [ { type: "text", - text: `${summary}\n\n${stepDetails}\n\nTotal time: ${result.duration}ms`, + text: actTextParts.join("\n"), }, ], isError: !result.success, @@ -1206,6 +1285,9 @@ export class BAPMCPServer { // New features annotateScreenshot: annotate ? { enabled: true } : undefined, stableRefs: args.stableRefs as boolean | undefined, + // Fusion options + incremental: args.incremental as boolean | undefined, + responseTier: args.responseTier as "full" | "interactive" | "minimal" | undefined, }); const content: Array<{ type: "text" | "image"; text?: string; data?: string; mimeType?: string }> = []; @@ -1248,6 +1330,31 @@ export class BAPMCPServer { }); } + // Incremental changes (if incremental mode was used) + if (result.changes) { + const changeParts: string[] = []; + if (result.changes.added.length > 0) { + changeParts.push(`+ ${result.changes.added.length} added: ${result.changes.added.map((el: InteractiveElement) => `${el.ref} ${el.role}`).join(", ")}`); + } + if (result.changes.updated.length > 0) { + changeParts.push(`~ ${result.changes.updated.length} updated: ${result.changes.updated.map((el: InteractiveElement) => `${el.ref} ${el.role}`).join(", ")}`); + } + if (result.changes.removed.length > 0) { + changeParts.push(`- ${result.changes.removed.length} removed: ${result.changes.removed.join(", ")}`); + } + if (changeParts.length > 0) { + content.push({ + type: "text", + text: `\nChanges:\n${changeParts.join("\n")}`, + }); + } else { + content.push({ + type: "text", + text: "\nChanges: (no changes)", + }); + } + } + // Screenshot if (result.screenshot) { const annotatedNote = result.screenshot.annotated ? 
" (annotated)" : ""; diff --git a/packages/protocol/src/__tests__/element-identity.test.ts b/packages/protocol/src/__tests__/element-identity.test.ts index 68541e6..818f8bc 100644 --- a/packages/protocol/src/__tests__/element-identity.test.ts +++ b/packages/protocol/src/__tests__/element-identity.test.ts @@ -8,6 +8,7 @@ import { createElementRegistry, cleanupStaleEntries, ELEMENT_STALE_THRESHOLD, + ELEMENT_REGISTRY_MAX_SIZE, type ElementRegistryEntry, } from "../shared/element-identity.js"; import type { ElementIdentity } from "../types/agent.js"; @@ -392,3 +393,62 @@ describe("ELEMENT_STALE_THRESHOLD", () => { expect(ELEMENT_STALE_THRESHOLD).toBe(60000); }); }); + +describe("ELEMENT_REGISTRY_MAX_SIZE", () => { + it("is 2000", () => { + expect(ELEMENT_REGISTRY_MAX_SIZE).toBe(2000); + }); +}); + +describe("cleanupStaleEntries() — max size enforcement", () => { + it("evicts oldest entries when registry exceeds max size", () => { + const registry = createElementRegistry("https://example.com"); + const now = Date.now(); + + // Fill beyond max size with fresh entries (all within time threshold) + for (let i = 0; i < ELEMENT_REGISTRY_MAX_SIZE + 100; i++) { + const entry: ElementRegistryEntry = { + ref: `@el${i}`, + selector: { type: "css", value: `.el${i}` }, + identity: { role: "button", tagName: "BUTTON" }, + lastSeen: now - i, // Older entries have higher i (older lastSeen) + }; + registry.elements.set(`@el${i}`, entry); + } + + expect(registry.elements.size).toBe(ELEMENT_REGISTRY_MAX_SIZE + 100); + + const removed = cleanupStaleEntries(registry, ELEMENT_STALE_THRESHOLD); + + // Should have evicted 100 oldest entries + expect(removed).toBe(100); + expect(registry.elements.size).toBe(ELEMENT_REGISTRY_MAX_SIZE); + + // Newest entries should survive (lowest i = newest lastSeen) + expect(registry.elements.has("@el0")).toBe(true); + expect(registry.elements.has("@el1")).toBe(true); + + // Oldest entries should be evicted (highest i = oldest lastSeen) + const lastIndex = 
ELEMENT_REGISTRY_MAX_SIZE + 99; + expect(registry.elements.has(`@el${lastIndex}`)).toBe(false); + }); + + it("does not evict when within max size", () => { + const registry = createElementRegistry("https://example.com"); + const now = Date.now(); + + for (let i = 0; i < 10; i++) { + const entry: ElementRegistryEntry = { + ref: `@el${i}`, + selector: { type: "css", value: `.el${i}` }, + identity: { role: "button", tagName: "BUTTON" }, + lastSeen: now, + }; + registry.elements.set(`@el${i}`, entry); + } + + const removed = cleanupStaleEntries(registry); + expect(removed).toBe(0); + expect(registry.elements.size).toBe(10); + }); +}); diff --git a/packages/protocol/src/__tests__/fusion-schemas.test.ts b/packages/protocol/src/__tests__/fusion-schemas.test.ts new file mode 100644 index 0000000..2a237c7 --- /dev/null +++ b/packages/protocol/src/__tests__/fusion-schemas.test.ts @@ -0,0 +1,230 @@ +import { describe, it, expect } from "vitest"; +import { + ResponseTierSchema, + ObserveChangesSchema, + AgentObserveParamsSchema, + AgentObserveResultSchema, + AgentActParamsSchema, + AgentActResultSchema, +} from "../types/agent.js"; +import { + PageNavigateParamsSchema, + PageNavigateResultSchema, +} from "../types/methods.js"; + +describe("Fusion Protocol Schema Validation", () => { + describe("ResponseTierSchema", () => { + it("accepts valid tiers", () => { + expect(ResponseTierSchema.parse("full")).toBe("full"); + expect(ResponseTierSchema.parse("interactive")).toBe("interactive"); + expect(ResponseTierSchema.parse("minimal")).toBe("minimal"); + }); + + it("rejects invalid tiers", () => { + expect(() => ResponseTierSchema.parse("compact")).toThrow(); + expect(() => ResponseTierSchema.parse("")).toThrow(); + expect(() => ResponseTierSchema.parse(123)).toThrow(); + }); + }); + + describe("ObserveChangesSchema", () => { + it("accepts valid changes with empty arrays", () => { + const result = ObserveChangesSchema.parse({ + added: [], + updated: [], + removed: [], + }); + 
expect(result.added).toHaveLength(0); + expect(result.updated).toHaveLength(0); + expect(result.removed).toHaveLength(0); + }); + + it("accepts changes with populated arrays", () => { + const result = ObserveChangesSchema.parse({ + added: [{ + ref: "@btn1", + selector: { type: "role", role: "button", name: "Submit" }, + role: "button", + name: "Submit", + tagName: "BUTTON", + actionHints: ["clickable"], + }], + updated: [], + removed: ["@btn2"], + }); + expect(result.added).toHaveLength(1); + expect(result.removed).toHaveLength(1); + }); + + it("rejects missing required fields", () => { + expect(() => ObserveChangesSchema.parse({})).toThrow(); + expect(() => ObserveChangesSchema.parse({ added: [] })).toThrow(); + expect(() => ObserveChangesSchema.parse({ added: [], updated: [] })).toThrow(); + }); + }); + + describe("AgentObserveParamsSchema — fusion fields", () => { + it("accepts responseTier", () => { + const result = AgentObserveParamsSchema.parse({ + responseTier: "interactive", + }); + expect(result.responseTier).toBe("interactive"); + }); + + it("accepts incremental flag", () => { + const result = AgentObserveParamsSchema.parse({ + incremental: true, + }); + expect(result.incremental).toBe(true); + }); + + it("accepts both fusion fields together", () => { + const result = AgentObserveParamsSchema.parse({ + responseTier: "minimal", + incremental: true, + includeMetadata: true, + maxElements: 50, + }); + expect(result.responseTier).toBe("minimal"); + expect(result.incremental).toBe(true); + }); + + it("accepts empty params (all optional)", () => { + const result = AgentObserveParamsSchema.parse({}); + expect(result.responseTier).toBeUndefined(); + expect(result.incremental).toBeUndefined(); + }); + + it("rejects invalid responseTier", () => { + expect(() => AgentObserveParamsSchema.parse({ + responseTier: "compact", + })).toThrow(); + }); + }); + + describe("AgentObserveResultSchema — fusion fields", () => { + it("accepts result with changes", () => { + const 
result = AgentObserveResultSchema.parse({ + changes: { + added: [], + updated: [], + removed: ["@btn1"], + }, + }); + expect(result.changes?.removed).toHaveLength(1); + }); + + it("accepts result without changes", () => { + const result = AgentObserveResultSchema.parse({}); + expect(result.changes).toBeUndefined(); + }); + }); + + describe("AgentActParamsSchema — fusion fields", () => { + const minStep = { action: "action/click", params: { selector: { type: "css" as const, value: ".btn" } } }; + + it("accepts preObserve param", () => { + const result = AgentActParamsSchema.parse({ + steps: [minStep], + preObserve: { includeMetadata: true, maxElements: 50 }, + }); + expect(result.preObserve).toBeDefined(); + expect(result.preObserve?.includeMetadata).toBe(true); + }); + + it("accepts postObserve param", () => { + const result = AgentActParamsSchema.parse({ + steps: [minStep], + postObserve: { responseTier: "interactive", includeInteractiveElements: true }, + }); + expect(result.postObserve).toBeDefined(); + expect(result.postObserve?.responseTier).toBe("interactive"); + }); + + it("accepts both pre and post observe", () => { + const result = AgentActParamsSchema.parse({ + steps: [minStep], + preObserve: { includeMetadata: true }, + postObserve: { includeMetadata: true, responseTier: "minimal" }, + }); + expect(result.preObserve).toBeDefined(); + expect(result.postObserve).toBeDefined(); + }); + + it("works without fusion fields (backward compatible)", () => { + const result = AgentActParamsSchema.parse({ steps: [minStep] }); + expect(result.preObserve).toBeUndefined(); + expect(result.postObserve).toBeUndefined(); + }); + }); + + describe("AgentActResultSchema — fusion fields", () => { + const minStepResult = { step: 0, success: true, duration: 50 }; + const minResult = { + success: true, + completed: 1, + total: 1, + results: [minStepResult], + duration: 100, + }; + const viewport = { width: 1280, height: 720 }; + + it("accepts result with pre/post observations", 
() => { + const result = AgentActResultSchema.parse({ + ...minResult, + preObservation: { metadata: { url: "https://example.com", title: "Before", viewport } }, + postObservation: { metadata: { url: "https://example.com", title: "After", viewport } }, + }); + expect(result.preObservation).toBeDefined(); + expect(result.postObservation).toBeDefined(); + }); + + it("works without fusion fields (backward compatible)", () => { + const result = AgentActResultSchema.parse(minResult); + expect(result.preObservation).toBeUndefined(); + expect(result.postObservation).toBeUndefined(); + }); + }); + + describe("PageNavigateParamsSchema — fusion fields", () => { + it("accepts observe param for fused navigate+observe", () => { + const result = PageNavigateParamsSchema.parse({ + url: "https://example.com", + observe: { includeMetadata: true, includeInteractiveElements: true, maxElements: 50 }, + }); + expect(result.observe).toBeDefined(); + expect(result.observe?.maxElements).toBe(50); + }); + + it("works without observe (backward compatible)", () => { + const result = PageNavigateParamsSchema.parse({ + url: "https://example.com", + }); + expect(result.observe).toBeUndefined(); + }); + }); + + describe("PageNavigateResultSchema — fusion fields", () => { + it("accepts result with observation", () => { + const result = PageNavigateResultSchema.parse({ + url: "https://example.com", + status: 200, + headers: {}, + observation: { + metadata: { url: "https://example.com", title: "Test", viewport: { width: 1280, height: 720 } }, + interactiveElements: [], + }, + }); + expect(result.observation).toBeDefined(); + }); + + it("works without observation (backward compatible)", () => { + const result = PageNavigateResultSchema.parse({ + url: "https://example.com", + status: 200, + headers: {}, + }); + expect(result.observation).toBeUndefined(); + }); + }); +}); diff --git a/packages/protocol/src/shared/element-identity.ts b/packages/protocol/src/shared/element-identity.ts index 
88ee7a9..5f9a2c0 100644 --- a/packages/protocol/src/shared/element-identity.ts +++ b/packages/protocol/src/shared/element-identity.ts @@ -203,6 +203,8 @@ export interface ElementRegistryEntry { identity: ElementIdentity; lastSeen: number; bounds?: { x: number; y: number; width: number; height: number }; + /** Fusion 4: Cached CSS path for fast selector resolution (bypasses semantic lookup) */ + cachedCssSelector?: string; } /** @@ -235,7 +237,17 @@ export function createElementRegistry(pageUrl: string): PageElementRegistry { export const ELEMENT_STALE_THRESHOLD = 60000; // 1 minute /** - * Clean up stale entries from registry + * Maximum number of entries in a single page element registry. + * Prevents unbounded memory growth on pages with heavy DOM churn. + * When exceeded, oldest entries (by lastSeen) are evicted first. + */ +export const ELEMENT_REGISTRY_MAX_SIZE = 2000; + +/** + * Clean up stale entries from registry. + * Also enforces max size by evicting oldest entries when the registry exceeds + * ELEMENT_REGISTRY_MAX_SIZE. 
+ * * @param registry The registry to clean * @param threshold Maximum age in ms for entries (default: ELEMENT_STALE_THRESHOLD) * @returns Number of entries removed @@ -247,6 +259,7 @@ export function cleanupStaleEntries( const now = Date.now(); let removed = 0; + // Phase 1: Remove time-stale entries for (const [ref, entry] of registry.elements) { if (now - entry.lastSeen > threshold) { registry.elements.delete(ref); @@ -254,5 +267,16 @@ export function cleanupStaleEntries( } } + // Phase 2: Enforce max size cap (evict oldest by lastSeen) + if (registry.elements.size > ELEMENT_REGISTRY_MAX_SIZE) { + const entries = Array.from(registry.elements.entries()) + .sort((a, b) => a[1].lastSeen - b[1].lastSeen); + const toRemove = entries.length - ELEMENT_REGISTRY_MAX_SIZE; + for (let i = 0; i < toRemove; i++) { + registry.elements.delete(entries[i]![0]); + removed++; + } + } + return removed; } diff --git a/packages/protocol/src/shared/index.ts b/packages/protocol/src/shared/index.ts index fabe156..7b8889e 100644 --- a/packages/protocol/src/shared/index.ts +++ b/packages/protocol/src/shared/index.ts @@ -58,6 +58,7 @@ export { createElementRegistry, cleanupStaleEntries, ELEMENT_STALE_THRESHOLD, + ELEMENT_REGISTRY_MAX_SIZE, type DOMElementInfo, type ElementRegistryEntry, type PageElementRegistry, diff --git a/packages/protocol/src/types/agent.ts b/packages/protocol/src/types/agent.ts index 79a1d46..97d7fb3 100644 --- a/packages/protocol/src/types/agent.ts +++ b/packages/protocol/src/types/agent.ts @@ -96,6 +96,13 @@ export const AgentActParamsSchema = z.object({ /** Global timeout for entire sequence (ms) */ timeout: z.number().optional(), + + // Fusion: observe-act-observe kernel + /** Run agent/observe before executing steps (pre-observation) */ + preObserve: z.lazy(() => AgentObserveParamsSchema).optional(), + + /** Run agent/observe after executing steps (post-observation) */ + postObserve: z.lazy(() => AgentObserveParamsSchema).optional(), }); export type 
AgentActParams = z.infer; @@ -163,6 +170,13 @@ export const AgentActResultSchema = z.object({ /** Index of first failed step (if any) */ failedAt: z.number().optional(), + + // Fusion: observe-act-observe kernel + /** Pre-execution observation result (if preObserve was requested) */ + preObservation: z.lazy(() => AgentObserveResultSchema).optional(), + + /** Post-execution observation result (if postObserve was requested) */ + postObservation: z.lazy(() => AgentObserveResultSchema).optional(), }); export type AgentActResult = z.infer; @@ -366,6 +380,19 @@ export const AnnotationMappingSchema = z.object({ }); export type AnnotationMapping = z.infer; +// ============================================================================= +// Fusion: Response Tiers & Incremental Observation +// ============================================================================= + +/** + * Response compression tier for observations + * - "full": All requested data (tree, elements, screenshot, metadata) — default + * - "interactive": Elements + metadata only (skip tree, skip screenshot) + * - "minimal": Element refs + names only (no bounds, stripped actionHints) + */ +export const ResponseTierSchema = z.enum(["full", "interactive", "minimal"]); +export type ResponseTier = z.infer; + /** * Parameters for agent/observe */ @@ -410,6 +437,13 @@ export const AgentObserveParamsSchema = z.object({ z.boolean(), AnnotationOptionsSchema, ]).optional(), + + // Fusion options + /** Response compression tier (default: "full") */ + responseTier: ResponseTierSchema.optional(), + + /** Return only changes since last observation (added, updated, removed) */ + incremental: z.boolean().optional(), }); export type AgentObserveParams = z.infer; @@ -439,6 +473,19 @@ export const ObserveScreenshotSchema = z.object({ }); export type ObserveScreenshot = z.infer; +/** + * Incremental changes between observations (for fusion) + */ +export const ObserveChangesSchema = z.object({ + /** Elements added since last 
observation */ + added: z.array(InteractiveElementSchema), + /** Elements updated since last observation (value, name, or state changed) */ + updated: z.array(InteractiveElementSchema), + /** Refs of elements removed since last observation */ + removed: z.array(z.string()), +}); +export type ObserveChanges = z.infer; + /** * Result of agent/observe */ @@ -464,6 +511,9 @@ export const AgentObserveResultSchema = z.object({ /** Mapping from annotation labels to elements (if annotateScreenshot was used) */ annotationMap: z.array(AnnotationMappingSchema).optional(), + + /** Incremental changes since last observation (if incremental: true) */ + changes: ObserveChangesSchema.optional(), }); export type AgentObserveResult = z.infer; diff --git a/packages/protocol/src/types/index.ts b/packages/protocol/src/types/index.ts index 72c8a37..ca0b990 100644 --- a/packages/protocol/src/types/index.ts +++ b/packages/protocol/src/types/index.ts @@ -476,6 +476,11 @@ export { type AnnotationLabelFormat, type AnnotationOptions, type AnnotationMapping, + // Fusion: Response tiers & incremental observation + ResponseTierSchema, + ObserveChangesSchema, + type ResponseTier, + type ObserveChanges, // Observe params and result AgentObserveParamsSchema, ObserveMetadataSchema, diff --git a/packages/protocol/src/types/methods.ts b/packages/protocol/src/types/methods.ts index d9b0a3a..d0afd32 100644 --- a/packages/protocol/src/types/methods.ts +++ b/packages/protocol/src/types/methods.ts @@ -5,6 +5,7 @@ import { z } from "zod"; import { BAPSelectorSchema } from "./selectors.js"; +import { AgentObserveParamsSchema, AgentObserveResultSchema } from "./agent.js"; import { BoundingBoxSchema, ActionOptionsSchema, @@ -97,6 +98,10 @@ export const PageNavigateParamsSchema = z.object({ waitUntil: WaitUntilStateSchema.optional(), timeout: z.number().optional(), referer: z.string().optional(), + + // Fusion: navigate-observe kernel + /** Run agent/observe after navigation completes (fused in single call) */ 
+ observe: AgentObserveParamsSchema.optional(), }); export type PageNavigateParams = z.infer; @@ -105,6 +110,10 @@ export const PageNavigateResultSchema = z.object({ url: z.string(), status: z.number(), headers: z.record(z.string()), + + // Fusion: navigate-observe kernel + /** Observation result (if observe was requested) */ + observation: AgentObserveResultSchema.optional(), }); export type PageNavigateResult = z.infer; diff --git a/packages/server-playwright/README.md b/packages/server-playwright/README.md index eb8dee7..690109b 100644 --- a/packages/server-playwright/README.md +++ b/packages/server-playwright/README.md @@ -164,6 +164,17 @@ const state = await client.getStorageState(); await client.setStorageState(state); ``` +### Fused Operations + +The server supports fused operations that combine multiple steps into single in-process calls, eliminating redundant DOM walks and WebSocket roundtrips: + +- **navigate + observe**: Pass `observe` params alongside navigate to get page observation without a second call +- **act + observe**: Pass `preObserve`/`postObserve` to get observations before/after action execution +- **Incremental observe**: Set `incremental: true` to get only changes (added/updated/removed elements) since last observation +- **Response tiers**: Set `responseTier` to `"interactive"` or `"minimal"` to reduce observation payload size +- **Selector caching**: Element CSS paths are cached in the registry for faster resolution +- **Speculative prefetch**: After click/navigate actions, the server pre-builds observations for likely next requests + ## Features - **Cross-browser**: Chromium, Firefox, and WebKit via Playwright diff --git a/packages/server-playwright/src/server.ts b/packages/server-playwright/src/server.ts index 4f18856..e745f77 100644 --- a/packages/server-playwright/src/server.ts +++ b/packages/server-playwright/src/server.ts @@ -577,6 +577,8 @@ interface FrameContext { /** Client state for a connected WebSocket client */ interface 
ClientState { + /** Unique identifier for this client connection (for log correlation) */ + clientId: string; initialized: boolean; browser: Browser | null; /** Default context (backwards compatible) */ @@ -627,6 +629,16 @@ interface ClientState { pendingApprovals: Map; /** Session-level approvals (for approve-session) */ sessionApprovals: Set; + + // Fusion: Speculative prefetch + /** Cached speculative observation (fire-and-forget after navigation/click) */ + speculativeObservation?: { + pageUrl: string; + result: AgentObserveResult; + timestamp: number; + }; + /** Timer handle for pending speculative prefetch (for cancellation on cleanup/disconnect) */ + speculativePrefetchTimer?: NodeJS.Timeout; } // ============================================================================= @@ -958,6 +970,7 @@ export class BAPPlaywrightServer extends EventEmitter { // Initialize client state with authorization and session tracking (v0.2.0) const now = Date.now(); const state: ClientState = { + clientId: randomUUID().slice(0, 8), initialized: false, browser: null, context: null, @@ -990,7 +1003,7 @@ export class BAPPlaywrightServer extends EventEmitter { this.setupSessionTimeouts(ws, state); this.clients.set(ws, state); - this.log(`Client connected (scopes: ${state.scopes.join(', ')})`); + this.log(`Client connected`, { clientId: state.clientId, scopes: state.scopes }); ws.on("message", async (data) => { try { @@ -1016,13 +1029,13 @@ export class BAPPlaywrightServer extends EventEmitter { }); ws.on("close", async () => { - this.log("Client disconnected"); + this.log("Client disconnected", { clientId: state.clientId }); await this.cleanupClient(state); this.clients.delete(ws); }); ws.on("error", (error) => { - this.log(`WebSocket error: ${error.message}`); + this.log(`WebSocket error: ${error.message}`, { clientId: state.clientId }); }); } @@ -1035,6 +1048,10 @@ export class BAPPlaywrightServer extends EventEmitter { request: JSONRPCRequest ): Promise { const { id, method, 
params } = request; + const requestId = randomUUID().slice(0, 8); + const startTime = performance.now(); + + this.log(`→ ${method}`, { clientId: state.clientId, reqId: requestId, rpcId: id }); try { // Reset idle timeout on activity (v0.2.0) @@ -1054,8 +1071,13 @@ export class BAPPlaywrightServer extends EventEmitter { } const result = await this.dispatch(ws, state, method as BAPMethod, params ?? {}); + const duration = Math.round(performance.now() - startTime); + this.log(`✓ ${method}`, { clientId: state.clientId, reqId: requestId, duration: `${duration}ms` }); return createSuccessResponse(id, result); } catch (error) { + const duration = Math.round(performance.now() - startTime); + const errMsg = error instanceof Error ? error.message : "Unknown error"; + this.log(`✗ ${method}`, { clientId: state.clientId, reqId: requestId, duration: `${duration}ms`, error: errMsg }); return this.handleError(id, error); } } @@ -1531,11 +1553,27 @@ export class BAPPlaywrightServer extends EventEmitter { referer: params.referer as string | undefined, }); - return { + const result: PageNavigateResult = { url: page.url(), status: response?.status() ?? 0, headers: response?.headers() ?? 
{}, }; + + // Fusion 2: navigate-observe kernel — fused observation after navigation + const observeParams = (params as Record).observe as AgentObserveParams | undefined; + if (observeParams) { + try { + const pageId = params.pageId as string | undefined; + (result as Record).observation = await this.handleAgentObserve( + state, + { ...observeParams, pageId } + ); + } catch { + // Non-fatal: observation failure doesn't block navigate result + } + } + + return result; } private async handlePageReload(state: ClientState, params: Record): Promise { @@ -2504,7 +2542,8 @@ export class BAPPlaywrightServer extends EventEmitter { } } - return { + // Build base result + const actResult: AgentActResult = { completed, total: params.steps.length, success: completed === params.steps.length, @@ -2512,6 +2551,42 @@ export class BAPPlaywrightServer extends EventEmitter { duration: Date.now() - startTime, failedAt, }; + + // Fusion 1: observe-act-observe kernel — pre-observation + const preObserve = (params as Record).preObserve as AgentObserveParams | undefined; + if (preObserve) { + try { + (actResult as Record).preObservation = await this.handleAgentObserve( + state, + { ...preObserve, pageId: params.pageId } + ); + } catch { + // Non-fatal: pre-observation failure doesn't block act result + } + } + + // Fusion 1: observe-act-observe kernel — post-observation + const postObserve = (params as Record).postObserve as AgentObserveParams | undefined; + if (postObserve) { + try { + (actResult as Record).postObservation = await this.handleAgentObserve( + state, + { ...postObserve, pageId: params.pageId } + ); + } catch { + // Non-fatal: post-observation failure doesn't block act result + } + } + + // Fusion 6: speculative prefetch — after act that ends with navigate/click + if (!postObserve && results.length > 0) { + const lastStep = params.steps[results.length - 1]; + if (lastStep && (lastStep.action === "page/navigate" || lastStep.action === "action/click")) { + 
this.speculativeObserve(state, params.pageId); + } + } + + return actResult; } /** @@ -2620,6 +2695,68 @@ export class BAPPlaywrightServer extends EventEmitter { return new Promise(resolve => setTimeout(resolve, ms)); } + /** + * Fusion 6: Speculative prefetch — fire-and-forget observation after act + * Builds an "interactive" tier observation that can be served from cache + * if the next call is a matching agent/observe. + * + * Guards: + * - Timer tracked on state for cancellation on cleanup/disconnect + * - Checks page still exists and URL hasn't changed before caching + * - Aborts if client state is no longer initialized (disconnected) + */ + private speculativeObserve(state: ClientState, pageId?: string): void { + // Cancel any pending speculative prefetch + if (state.speculativePrefetchTimer) { + clearTimeout(state.speculativePrefetchTimer); + state.speculativePrefetchTimer = undefined; + } + + // Snapshot the URL at call time to detect navigation during delay + let urlAtCallTime: string | undefined; + try { + const p = this.getPage(state, pageId); + urlAtCallTime = p.url(); + } catch { + return; // Page doesn't exist, skip + } + + // Fire after 200ms delay to let page settle + state.speculativePrefetchTimer = setTimeout(async () => { + state.speculativePrefetchTimer = undefined; + try { + // Guard: client may have disconnected during delay + if (!state.initialized) return; + + // Guard: page may have been closed during delay + const page = this.getPage(state, pageId); + // Guard: page may have navigated during delay + if (page.url() !== urlAtCallTime) return; + + const result = await this.handleAgentObserve(state, { + pageId, + includeMetadata: true, + includeInteractiveElements: true, + includeScreenshot: false, + includeAccessibility: false, + maxElements: 50, + responseTier: "interactive", + }); + + // Guard: check URL hasn't changed during observation + if (page.url() !== urlAtCallTime) return; + + state.speculativeObservation = { + pageUrl: page.url(), 
+ result, + timestamp: Date.now(), + }; + } catch { + // Speculative prefetch is fire-and-forget — silently ignore errors + } + }, 200); + } + /** * Get an AI-optimized observation of the page * Supports stable element refs and screenshot annotation (Set-of-Marks) @@ -2630,11 +2767,51 @@ export class BAPPlaywrightServer extends EventEmitter { ): Promise { const page = this.getPage(state, params.pageId); const pageId = params.pageId ?? state.activePage ?? ""; + const pageUrl = page.url(); + + // Fusion 6: speculative cache — check for a valid pre-built observation + // Use if: URL matches, age < 5s, not requesting tree or screenshot (those are expensive/specific) + if (state.speculativeObservation) { + const spec = state.speculativeObservation; + const age = Date.now() - spec.timestamp; + const canUse = spec.pageUrl === pageUrl + && age < 5000 + && !params.includeAccessibility + && !params.includeScreenshot + && !params.annotateScreenshot; + // Always clear the cache (one-shot) + state.speculativeObservation = undefined; + if (canUse) { + return spec.result; + } + } + const result: AgentObserveResult = {}; + // Fusion 5: response tiers — override include flags based on tier + const responseTier = params.responseTier ?? "full"; + if (responseTier === "interactive" || responseTier === "minimal") { + // Force interactive-only: skip tree and screenshot + params = { + ...params, + includeAccessibility: false, + includeScreenshot: false, + includeInteractiveElements: true, + includeMetadata: true, + }; + } + // Get or create element registry for this page let registry = state.elementRegistries.get(pageId); - const pageUrl = page.url(); + + // Snapshot previous refs BEFORE registry update (needed for incremental diff) + const previousRefs = params.incremental && registry + ? 
new Map(Array.from(registry.elements.entries()).map(([ref, entry]) => [ref, { + name: entry.identity.name, + value: undefined as string | undefined, // registry doesn't track value, diff from element list + disabled: false, + }])) + : null; // Create new registry if needed or if URL changed (navigation) if (!registry || registry.pageUrl !== pageUrl || params.refreshRefs) { @@ -2685,9 +2862,51 @@ export class BAPPlaywrightServer extends EventEmitter { interactiveElements = elements.elements; if (params.includeInteractiveElements) { - result.interactiveElements = elements.elements; + // Fusion 5: minimal tier — strip elements to essential fields only + if (responseTier === "minimal") { + result.interactiveElements = elements.elements.map(el => ({ + ref: el.ref, + selector: el.selector, + role: el.role, + name: el.name, + tagName: el.tagName, + actionHints: [], + })); + } else { + result.interactiveElements = elements.elements; + } result.totalInteractiveElements = elements.total; } + + // Fusion 3: incremental observe — compute diff against previous observation + if (params.incremental && previousRefs) { + const currentRefs = new Set(elements.elements.map(el => el.ref)); + const added: InteractiveElement[] = []; + const updated: InteractiveElement[] = []; + const removed: string[] = []; + + for (const el of elements.elements) { + if (!previousRefs.has(el.ref)) { + added.push(el); + } else { + const prev = previousRefs.get(el.ref)!; + if (prev.name !== el.name || el.disabled || el.value !== undefined) { + // Heuristic: if name changed or element has notable state, include as updated + if (prev.name !== el.name) { + updated.push(el); + } + } + } + } + + for (const [prevRef] of previousRefs) { + if (!currentRefs.has(prevRef)) { + removed.push(prevRef); + } + } + + result.changes = { added, updated, removed }; + } } // Screenshot (with optional annotation) @@ -3450,6 +3669,8 @@ export class BAPPlaywrightServer extends EventEmitter { actionHints: string[]; selectorType: 
string; selectorValue: string; + /** Fusion 4: Pre-computed CSS path for selector caching */ + cssPath: string; bounds: { x: number; y: number; width: number; height: number } | undefined; // Identity fields for stable refs testId?: string; @@ -3594,6 +3815,9 @@ export class BAPPlaywrightServer extends EventEmitter { } } + // Fusion 4: always compute CSS path for selector caching + const cssPath = getCssPath(el); + return { index, role, @@ -3605,6 +3829,7 @@ export class BAPPlaywrightServer extends EventEmitter { actionHints: hints, selectorType, selectorValue, + cssPath, bounds: opts.includeBounds ? { x: Math.round(rect.x), y: Math.round(rect.y), @@ -3704,13 +3929,14 @@ export class BAPPlaywrightServer extends EventEmitter { } } - // Update registry + // Update registry (Fusion 4: include cached CSS selector for fast resolution) registry.elements.set(ref, { ref, selector, identity, lastSeen: Date.now(), bounds: el.bounds, + cachedCssSelector: el.cssPath || undefined, }); } else { // Use simple index-based ref @@ -3866,7 +4092,12 @@ export class BAPPlaywrightServer extends EventEmitter { `Element ref not found: ${selector.ref}. 
The element may have been removed or the ref may be stale.` ); } - // Use the stored selector to find the element + // Fusion 4: Use cached CSS selector for fast resolution (bypasses semantic lookup) + // Falls back to stored selector if cache miss or stale (executeStepWithRetry handles stale elements) + if (entry.cachedCssSelector) { + return page.locator(entry.cachedCssSelector); + } + // Fallback: Use the stored semantic selector return this.resolveSelector(page, entry.selector); } @@ -4321,6 +4552,13 @@ export class BAPPlaywrightServer extends EventEmitter { // Clear session timeouts (v0.2.0) this.clearSessionTimeouts(state); + // Cancel any pending speculative prefetch + if (state.speculativePrefetchTimer) { + clearTimeout(state.speculativePrefetchTimer); + state.speculativePrefetchTimer = undefined; + } + state.speculativeObservation = undefined; + if (state.tracing && state.context) { try { await state.context.tracing.stop(); @@ -4340,7 +4578,15 @@ export class BAPPlaywrightServer extends EventEmitter { state.browser = null; state.context = null; state.pages.clear(); + state.pageToContext.clear(); state.activePage = null; + state.elementRegistries.clear(); + state.frameContexts.clear(); + state.activeStreams.clear(); + state.pendingApprovals.clear(); + state.sessionApprovals.clear(); + state.contexts.clear(); + state.defaultContextId = null; state.initialized = false; } @@ -4704,11 +4950,18 @@ export class BAPPlaywrightServer extends EventEmitter { } /** - * Log a debug message + * Log a debug message with optional structured context */ - private log(message: string): void { + private log(message: string, context?: Record): void { if (this.options.debug) { - console.log(`[BAP Server] ${message}`); + if (context) { + const ctx = Object.entries(context) + .map(([k, v]) => `${k}=${typeof v === "string" ? 
v : JSON.stringify(v)}`) + .join(" "); + console.log(`[BAP Server] ${message} ${ctx}`); + } else { + console.log(`[BAP Server] ${message}`); + } } } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1eb65ec..e7e7a2b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -45,6 +45,25 @@ importers: specifier: 4.0.18 version: 4.0.18(@types/node@22.19.8)(tsx@4.21.0) + packages/cli: + dependencies: + '@browseragentprotocol/client': + specifier: workspace:* + version: link:../client + '@browseragentprotocol/logger': + specifier: workspace:* + version: link:../logger + '@browseragentprotocol/protocol': + specifier: workspace:* + version: link:../protocol + devDependencies: + tsup: + specifier: ^8.3.0 + version: 8.5.1(postcss@8.5.6)(tsx@4.21.0)(typescript@5.9.3) + typescript: + specifier: ^5.7.0 + version: 5.9.3 + packages/client: dependencies: '@browseragentprotocol/protocol': diff --git a/skills/bap-browser/SKILL.md b/skills/bap-browser/SKILL.md index 736ab42..c0b7a9a 100644 --- a/skills/bap-browser/SKILL.md +++ b/skills/bap-browser/SKILL.md @@ -104,27 +104,65 @@ act({ This pattern turns 4+ round-trips into 2. Use it. +## Fused Operations + +Fused operations combine multiple server calls into one, cutting roundtrips by 50-85%. + +### Navigate + Observe (1 call instead of 2) +``` +navigate({ url: "https://example.com", observe: { maxElements: 30, responseTier: "interactive" } }) +``` +Returns navigation result AND observation in a single response. The `observation` field on the result contains the page elements. + +### Act + Post-Observe (1 call instead of 2) +``` +act({ + steps: [ + { action: "action/click", selector: "role:button:Submit" } + ], + postObserve: { maxElements: 30, responseTier: "interactive" } +}) +``` +Executes actions AND returns the resulting page state. The `postObservation` field on the result contains the updated elements. 
+ +### Response Tiers +Control how much data `observe` returns: + +| Tier | What's included | When to use | +|------|----------------|-------------| +| `"full"` | All fields, metadata, screenshots | First page load, debugging | +| `"interactive"` | Interactive elements, refs, roles | Most interactions (default for fused) | +| `"minimal"` | Refs and names only | Rapid polling, confirmation checks | + +``` +observe({ responseTier: "interactive", maxElements: 20 }) +``` + ## Efficiency Rules 1. **`aria_snapshot` over `accessibility`.** Same structure, ~80% fewer tokens. 2. **`observe` with `maxElements`.** Default is 50. Set it lower when you can: `maxElements: 20`. 3. **`observe` with `filterRoles`.** Focus: `filterRoles: ["button", "link", "textbox"]`. -4. **`act` over individual calls.** A login flow is 1 `act`, not 3 separate fill/click calls. -5. **`extract` over manual parsing.** Define a JSON schema. Let BAP extract. Don't scrape HTML. -6. **`content({ format: "markdown" })` over screenshots for text.** Markdown is compact and parseable. -7. **`fill` over `type` for form fields.** `fill` clears and sets; `type` sends keystrokes one at a time. +4. **`observe` with `responseTier`.** Use `"interactive"` for most flows, `"minimal"` for quick checks. +5. **Fused `navigate` + observe.** Pass `observe: {}` to `navigate` instead of calling both separately. +6. **Fused `act` + post-observe.** Pass `postObserve: {}` to `act` to get updated page state in one call. +7. **`act` over individual calls.** A login flow is 1 `act`, not 3 separate fill/click calls. +8. **`extract` over manual parsing.** Define a JSON schema. Let BAP extract. Don't scrape HTML. +9. **`content({ format: "markdown" })` over screenshots for text.** Markdown is compact and parseable. +10. **`fill` over `type` for form fields.** `fill` clears and sets; `type` sends keystrokes one at a time. 
## Recipes -### Login +### Login (fused — 2 calls total) ``` +navigate({ url: "https://app.example.com/login", observe: { maxElements: 20 } }) act({ steps: [ - { action: "page/navigate", url: "https://app.example.com/login" }, { action: "action/fill", selector: "label:Email", value: "user@example.com" }, { action: "action/fill", selector: "label:Password", value: "password123" }, { action: "action/click", selector: "role:button:Sign in" } - ] + ], + postObserve: { responseTier: "interactive" } }) ``` @@ -175,9 +213,9 @@ press({ key: "ArrowDown" }) press({ key: "Enter" }) ``` -### Google search +### Google search (fused) ``` -navigate({ url: "https://www.google.com" }) +navigate({ url: "https://www.google.com", observe: { maxElements: 10 } }) act({ steps: [ { action: "action/fill", selector: "role:combobox:Search", value: "best noise cancelling headphones 2025" }, @@ -232,6 +270,9 @@ extract({ - Use CSS selectors copied from browser DevTools. They break. - Call `accessibility` when `aria_snapshot` works. Wastes tokens. - Make individual click/fill calls when `act` can batch them. +- Call `navigate` then `observe` separately when you can fuse them with `observe: {}`. +- Call `act` then `observe` separately when you can fuse them with `postObserve: {}`. +- Use `responseTier: "full"` when `"interactive"` or `"minimal"` suffice. - Take a screenshot to read text. Use `content({ format: "markdown" })`. - Skip `observe` on pages you haven't seen. You'll guess wrong. - Parse raw HTML. Use `extract` with a schema. diff --git a/vitest.config.ts b/vitest.config.ts index 3ae5d8a..3b2f1c4 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -7,7 +7,7 @@ export default defineConfig({ include: ["packages/*/src/**/*.test.ts"], coverage: { provider: "v8", - reporter: ["text", "html"], + reporter: ["text", "html", "json-summary"], exclude: ["**/node_modules/**", "**/dist/**", "**/*.test.ts"], }, },