diff --git a/README.md b/README.md index d9397ce..e02133f 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ integrations/ │ ├── logs/ # Logging utilities │ ├── mastra/ # Mastra AI agent integration │ ├── mongodb/ # MongoDB data extraction & storage +│ ├── openai/ # OpenAI Realtime voice agent + browser agent │ ├── stripe/ # Stripe Issuing + automation │ ├── temporal/ # Temporal workflow orchestration │ ├── trigger/ # Trigger.dev background jobs & automation diff --git a/examples/integrations/openai/.env.example b/examples/integrations/openai/.env.example new file mode 100644 index 0000000..3898fbe --- /dev/null +++ b/examples/integrations/openai/.env.example @@ -0,0 +1,7 @@ +OPENAI_API_KEY= +ANTHROPIC_API_KEY= +BROWSERBASE_API_KEY= +BROWSERBASE_PROJECT_ID= +OPENAI_REALTIME_MODEL=gpt-realtime-2 +OPENAI_REALTIME_VOICE=marin +BROWSE_BIN=browse diff --git a/examples/integrations/openai/.gitignore b/examples/integrations/openai/.gitignore new file mode 100644 index 0000000..2a182d6 --- /dev/null +++ b/examples/integrations/openai/.gitignore @@ -0,0 +1,7 @@ +.next +node_modules +.env +.env.local +.env.*.local +.DS_Store +tsconfig.tsbuildinfo diff --git a/examples/integrations/openai/README.md b/examples/integrations/openai/README.md new file mode 100644 index 0000000..e125814 --- /dev/null +++ b/examples/integrations/openai/README.md @@ -0,0 +1,55 @@ +# OpenAI Realtime + Browserbase + +Give your voice agent access to the whole web. + +A **voice agent** (OpenAI Realtime, speech-to-speech) talks with the user. A **browser agent** (Claude driving a Browserbase session) operates a real browser underneath it — opening sites, clicking, and reading pages. They share one live session, so what the agent says stays in sync with what it does. + +The user watches the browser work live, and can interrupt or redirect at any time, just by talking. 
+ +## Required environment variables + +Create `.env.local` or `.env` with: + +```bash +OPENAI_API_KEY= +ANTHROPIC_API_KEY= +BROWSERBASE_API_KEY= +BROWSERBASE_PROJECT_ID= +``` + +Optional: + +```bash +OPENAI_REALTIME_MODEL=gpt-realtime-2 +OPENAI_REALTIME_VOICE=marin +BROWSE_BIN=browse +``` + +## Run locally + +```bash +pnpm install +pnpm dev +``` + +Open: + +```text +http://127.0.0.1:3002 +``` + +Click **Start voice**, allow the microphone, and just talk — ask it to open a site, search, click, or read a page. + +The browser agent uses the Browserbase [Browse CLI](https://www.npmjs.com/package/browse), which must be installed and available on your `PATH`. If it lives elsewhere, point `BROWSE_BIN` at it. + +## How it works + +Two cooperating agents joined by a server-side connection: + +- **Voice plane** — the demo page in the user's own browser connects to OpenAI Realtime over WebRTC (audio flows directly between the page and OpenAI, not through this server). The voice agent has one tool, `control_browser`, which it calls whenever the user wants something done on the web. +- **Server bridge** — the connect route creates the Realtime call, then attaches a server-side WebSocket to the same call so the backend can answer `control_browser` tool calls and speak the result back in the same conversation. +- **Browser plane** — one **persistent Claude agent runs for the whole call**. Each `control_browser` instruction is appended to the same conversation, so the agent remembers everything it has already opened and done (the user can say "go back to the first result and compare"). It drives the browser through compact tools (`navigate`, `click`, `type_text`, `press_key`, `go_back`, `read_page`) against a shared Browserbase session shown in the live-view iframe. + +Because the tool call only returns once the browser work has actually happened — and answers are grounded in (and quoted from) the live page — the spoken conversation stays in sync with the screen instead of narrating ahead of it. 
+
+This is a prototype meant to inspire people building voice agents: the same pattern works with any speech-to-speech voice runtime in front of a Browserbase-backed browser agent.
diff --git a/examples/integrations/openai/app/api/demo/control/route.ts b/examples/integrations/openai/app/api/demo/control/route.ts
new file mode 100644
index 0000000..ccf6696
--- /dev/null
+++ b/examples/integrations/openai/app/api/demo/control/route.ts
@@ -0,0 +1,41 @@
+import { NextResponse } from "next/server";
+import { z } from "zod";
+import { runDemoInstruction } from "../../../../lib/demo-controller";
+
+export const runtime = "nodejs";
+
+// Shape of the JSON body accepted by this route.
+const bodySchema = z.object({
+  demoId: z.string().uuid(),
+  instruction: z.string().min(3),
+  interrupt: z.boolean().optional()
+});
+
+/**
+ * Passes one validated instruction to `runDemoInstruction` and returns its result as JSON.
+ *
+ * Responds 400 when the body is not valid JSON or fails `bodySchema`, and 500 when
+ * `runDemoInstruction` throws. Every error response has the shape `{ error: string }`.
+ */
+export async function POST(request: Request) {
+  let payload: unknown;
+  try {
+    payload = await request.json();
+  } catch {
+    return NextResponse.json({ error: "Request body must be valid JSON." }, { status: 400 });
+  }
+
+  // Client mistakes are reported as 400 so they are not mistaken for server failures.
+  const parsed = bodySchema.safeParse(payload);
+  if (!parsed.success) {
+    return NextResponse.json({ error: parsed.error.message }, { status: 400 });
+  }
+
+  try {
+    const snapshot = await runDemoInstruction(parsed.data);
+    return NextResponse.json(snapshot);
+  } catch (error) {
+    const message = error instanceof Error ? error.message : "Demo control failed.";
+    return NextResponse.json({ error: message }, { status: 500 });
+  }
+}
diff --git a/examples/integrations/openai/app/api/demo/session/route.ts b/examples/integrations/openai/app/api/demo/session/route.ts
new file mode 100644
index 0000000..e364798
--- /dev/null
+++ b/examples/integrations/openai/app/api/demo/session/route.ts
@@ -0,0 +1,15 @@
+import { NextResponse } from "next/server";
+import { getDemoSnapshot } from "../../../../lib/demo-controller";
+
+export const runtime = "nodejs";
+
+export async function GET(request: Request) {
+  const url = new URL(request.url);
+  const demoId = url.searchParams.get("demoId");
+
+  if (!demoId) {
+    return NextResponse.json({ error: "Missing demoId." 
}, { status: 400 });
+  }
+
+  return NextResponse.json(getDemoSnapshot(demoId));
+}
diff --git a/examples/integrations/openai/app/api/demo/stream/route.ts b/examples/integrations/openai/app/api/demo/stream/route.ts
new file mode 100644
index 0000000..335f11c
--- /dev/null
+++ b/examples/integrations/openai/app/api/demo/stream/route.ts
@@ -0,0 +1,98 @@
+import { getDemoSnapshot, subscribeToDemo } from "../../../../lib/demo-controller";
+
+export const runtime = "nodejs";
+
+/**
+ * Server-sent events feed for one demo session, selected by the `demoId` query parameter.
+ *
+ * Emits a `snapshot` event immediately and again on every `subscribeToDemo` callback, plus
+ * a `ping` event every 15 seconds. Responds 400 with a plain-text body when `demoId` is missing.
+ *
+ * Teardown runs exactly once, whether the request is aborted, the consumer cancels the
+ * stream, or a write fails, so the interval and the subscription cannot outlive the stream.
+ */
+export async function GET(request: Request) {
+  const url = new URL(request.url);
+  const demoId = url.searchParams.get("demoId");
+
+  if (!demoId) {
+    return new Response("Missing demoId.", { status: 400 });
+  }
+
+  const encoder = new TextEncoder();
+
+  // Assigned in start() and shared with cancel() so both paths release the same resources.
+  let cleanup: () => void = () => {};
+
+  const stream = new ReadableStream({
+    start(controller) {
+      let closed = false;
+      let unsubscribe: () => void = () => {};
+      let heartbeat: ReturnType<typeof setInterval> | undefined;
+
+      cleanup = () => {
+        if (closed) {
+          return;
+        }
+        closed = true;
+        clearInterval(heartbeat);
+        unsubscribe();
+        request.signal.removeEventListener("abort", cleanup);
+        try {
+          controller.close();
+        } catch {
+          // The stream was already closed or cancelled by the consumer.
+        }
+      };
+
+      const send = (chunk: string) => {
+        if (closed) {
+          return;
+        }
+        try {
+          controller.enqueue(encoder.encode(chunk));
+        } catch {
+          // enqueue() throws once the stream is closed or errored; stop producing events.
+          cleanup();
+        }
+      };
+
+      const sendSnapshot = (snapshot = getDemoSnapshot(demoId)) => {
+        send(`event: snapshot\ndata: ${JSON.stringify(snapshot)}\n\n`);
+      };
+
+      // The client may have disconnected before the stream started; "abort" will not fire again.
+      if (request.signal.aborted) {
+        cleanup();
+        return;
+      }
+
+      sendSnapshot();
+      // If the first write already failed and tore the stream down, do not subscribe.
+      if (closed) {
+        return;
+      }
+
+      unsubscribe = subscribeToDemo(demoId, (snapshot) => {
+        sendSnapshot(snapshot);
+      });
+
+      heartbeat = setInterval(() => {
+        send("event: ping\ndata: {}\n\n");
+      }, 15000);
+
+      request.signal.addEventListener("abort", cleanup);
+    },
+    cancel() {
+      cleanup();
+    }
+  });
+
+  return new Response(stream, {
+    headers: {
+      "Cache-Control": "no-cache, no-transform",
+      Connection: "keep-alive",
+      "Content-Type": "text/event-stream"
+    }
+  });
+}
diff --git a/examples/integrations/openai/app/api/realtime/connect/route.ts b/examples/integrations/openai/app/api/realtime/connect/route.ts
new file mode 100644
index 0000000..2f5514a
--- /dev/null
+++ b/examples/integrations/openai/app/api/realtime/connect/route.ts
@@ -0,0 +1,57 @@
+import { NextResponse } from "next/server";
+import { attachRealtimeSideband } from "../../../../lib/openai-realtime-sideband";
+import { buildRealtimeSessionConfig } from 
"../../../../lib/realtime-config"; + +export const runtime = "nodejs"; + +function getOpenAiApiKey() { + return process.env.OPENAI_API_KEY ?? process.env.openai_key ?? null; +} + +export async function POST(request: Request) { + const apiKey = getOpenAiApiKey(); + if (!apiKey) { + return NextResponse.json({ error: "OPENAI_API_KEY is missing." }, { status: 500 }); + } + + const url = new URL(request.url); + const demoId = url.searchParams.get("demoId"); + if (!demoId) { + return NextResponse.json({ error: "Missing demoId." }, { status: 400 }); + } + + const sdp = await request.text(); + if (!sdp.trim()) { + return NextResponse.json({ error: "Missing SDP offer." }, { status: 400 }); + } + + const formData = new FormData(); + formData.set("sdp", sdp); + formData.set("session", JSON.stringify(buildRealtimeSessionConfig())); + + const response = await fetch("https://api.openai.com/v1/realtime/calls", { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}` + }, + body: formData + }); + + const answerSdp = await response.text(); + if (!response.ok) { + return new Response(answerSdp || "Failed to create Realtime call.", { status: response.status }); + } + + const location = response.headers.get("Location"); + const callId = location?.split("/").pop() ?? null; + if (callId) { + attachRealtimeSideband({ callId, demoId }); + } + + return new Response(answerSdp, { + headers: { + "Content-Type": "application/sdp", + ...(callId ? 
{ "X-OpenAI-Call-Id": callId } : {}) + } + }); +} diff --git a/examples/integrations/openai/app/demo-client.tsx b/examples/integrations/openai/app/demo-client.tsx new file mode 100644 index 0000000..5ed9202 --- /dev/null +++ b/examples/integrations/openai/app/demo-client.tsx @@ -0,0 +1,385 @@ +"use client"; + +import { useCallback, useEffect, useRef, useState } from "react"; +import type { DemoEvent, DemoSessionSnapshot } from "../lib/demo-types"; + +const EMPTY_SESSION: DemoSessionSnapshot = { + demoId: "", + activeRunId: null, + status: "idle", + busy: false, + liveViewUrl: null, + browserbaseSessionId: null, + claudeSessionId: null, + currentUrl: null, + pageTitle: null, + lastInstruction: null, + lastSummary: null, + currentStep: null, + lastNarration: null, + stepCount: 0, + error: null, + missingConfig: [], + events: [] +}; + +type VoiceStatus = "idle" | "connecting" | "connected" | "error"; + +type TranscriptLine = { + id: string; + role: "user" | "agent" | "system"; + text: string; + createdAt: string; +}; + +export function DemoClient() { + const [demoId] = useState(() => crypto.randomUUID()); + const [session, setSession] = useState({ + ...EMPTY_SESSION, + demoId + }); + const [uiError, setUiError] = useState(null); + const [transcript, setTranscript] = useState([]); + const [voiceStatus, setVoiceStatus] = useState("idle"); + const [, setCallId] = useState(null); + + const peerConnectionRef = useRef(null); + const dataChannelRef = useRef(null); + const mediaStreamRef = useRef(null); + const audioRef = useRef(null); + + const appendTranscriptLine = useCallback((role: TranscriptLine["role"], text: string) => { + const trimmed = text.trim(); + if (!trimmed) { + return; + } + + setTranscript((current) => [ + ...current, + { + id: crypto.randomUUID(), + role, + text: trimmed, + createdAt: new Date().toISOString() + } + ]); + }, []); + + const refreshSession = useCallback(async () => { + const response = await 
fetch(`/api/demo/session?demoId=${encodeURIComponent(demoId)}`); + if (!response.ok) { + return; + } + + const nextSession = (await response.json()) as DemoSessionSnapshot; + setSession(nextSession); + }, [demoId]); + + useEffect(() => { + void refreshSession(); + }, [refreshSession]); + + useEffect(() => { + const source = new EventSource(`/api/demo/stream?demoId=${encodeURIComponent(demoId)}`); + + const onSnapshot = (event: Event) => { + const messageEvent = event as MessageEvent; + const nextSession = JSON.parse(messageEvent.data) as DemoSessionSnapshot; + setSession(nextSession); + }; + + source.addEventListener("snapshot", onSnapshot); + source.onerror = () => { + void refreshSession(); + }; + + return () => { + source.removeEventListener("snapshot", onSnapshot); + source.close(); + }; + }, [demoId, refreshSession]); + + const handleRealtimeEvent = useCallback( + (rawEvent: unknown) => { + if (!rawEvent || typeof rawEvent !== "object" || !("type" in rawEvent)) { + return; + } + + const event = rawEvent as Record; + const type = String(event.type); + + if (type === "error") { + const error = event.error as { message?: string } | undefined; + appendTranscriptLine("system", error?.message ?? 
"Realtime session error."); + return; + } + + if (type === "session.created") { + appendTranscriptLine("system", "OpenAI Realtime session connected."); + return; + } + + if (type === "conversation.item.input_audio_transcription.completed") { + if (typeof event.transcript === "string") { + appendTranscriptLine("user", event.transcript); + } + return; + } + + if (type === "response.audio_transcript.done" || type === "response.text.done") { + if (typeof event.transcript === "string") { + appendTranscriptLine("agent", event.transcript); + } else if (typeof event.text === "string") { + appendTranscriptLine("agent", event.text); + } + } + }, + [appendTranscriptLine] + ); + + const stopVoice = useCallback(() => { + dataChannelRef.current?.close(); + dataChannelRef.current = null; + + peerConnectionRef.current?.close(); + peerConnectionRef.current = null; + + mediaStreamRef.current?.getTracks().forEach((track) => track.stop()); + mediaStreamRef.current = null; + + if (audioRef.current) { + audioRef.current.srcObject = null; + } + + setCallId(null); + setVoiceStatus("idle"); + appendTranscriptLine("system", "Voice session ended."); + }, [appendTranscriptLine]); + + useEffect(() => stopVoice, [stopVoice]); + + const startVoice = async () => { + setUiError(null); + setVoiceStatus("connecting"); + + try { + const peerConnection = new RTCPeerConnection(); + peerConnectionRef.current = peerConnection; + + const audioElement = new Audio(); + audioElement.autoplay = true; + audioRef.current = audioElement; + + peerConnection.ontrack = (event) => { + audioElement.srcObject = event.streams[0]; + }; + + const mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true }); + mediaStreamRef.current = mediaStream; + mediaStream.getTracks().forEach((track) => peerConnection.addTrack(track, mediaStream)); + + const dataChannel = peerConnection.createDataChannel("oai-events"); + dataChannelRef.current = dataChannel; + + dataChannel.addEventListener("open", () => { + 
setVoiceStatus("connected"); + appendTranscriptLine("system", "Data channel connected."); + dataChannel.send( + JSON.stringify({ + type: "response.create", + response: { + instructions: + "Greet the user in one short sentence and ask what they would like to do in the browser. Do not suggest, name, or assume any specific website or task — wait for the user to tell you." + } + }) + ); + }); + + dataChannel.addEventListener("message", (event) => { + try { + handleRealtimeEvent(JSON.parse(event.data)); + } catch { + appendTranscriptLine("system", "Received an unreadable Realtime event."); + } + }); + + peerConnection.addEventListener("connectionstatechange", () => { + if (peerConnection.connectionState === "failed" || peerConnection.connectionState === "disconnected") { + setVoiceStatus("error"); + } + }); + + const offer = await peerConnection.createOffer(); + await peerConnection.setLocalDescription(offer); + + const response = await fetch(`/api/realtime/connect?demoId=${encodeURIComponent(demoId)}`, { + method: "POST", + headers: { + "Content-Type": "application/sdp" + }, + body: offer.sdp + }); + + if (!response.ok) { + const message = await readErrorMessage(response); + throw new Error(message || "Realtime connection failed."); + } + + const nextCallId = response.headers.get("X-OpenAI-Call-Id"); + setCallId(nextCallId); + + await peerConnection.setRemoteDescription({ + type: "answer", + sdp: await response.text() + }); + } catch (error) { + const message = error instanceof Error ? error.message : "Voice session failed to start."; + setUiError(message); + setVoiceStatus("error"); + appendTranscriptLine("system", message); + stopVoice(); + } + }; + + const conversation = mergeConversation(session.events, transcript); + const latestConversation = [...conversation].reverse(); + + return ( +
+
+
+
A voice agent + a browser agent
+

Give your agent access to the whole web.

+

+ A voice agent talks with the user. A browser agent operates a real browser underneath it — opening sites, + clicking, and reading pages. They share one live session, so what the agent says stays in sync with what it + does. +

+

A blueprint for giving any voice agent hands on the web.

+
+ +
+
+
+ Voice session + {voiceStatus} +
+
+ + +
+
+

+ Start voice, then just talk — ask it to open a site, search, click, or read a page, and watch it browse + live. +

+ {session.busy ?

The agent is working on your request.

: null} + {uiError ?

{uiError}

: null} +
+
+ +
+
+
+
+ Live browser session + {session.pageTitle ?? "Waiting for the first instruction"} +
+
+ {session.busy ? "Running" : "Idle"} +
+
+
+ {session.currentUrl ?? "No page yet"} +
+ + {session.liveViewUrl ? ( +