From 8932824c318ced3f838aad9d87dc02c5556d52ae Mon Sep 17 00:00:00 2001 From: "rosetta-livekit-bot[bot]" <282703043+rosetta-livekit-bot[bot]@users.noreply.github.com> Date: Mon, 1 Jun 2026 02:59:44 +0000 Subject: [PATCH] feat(voice): add AgentSession.claimUserTurn --- .changeset/claim-user-turn.md | 5 +++ agents/src/voice/agent_activity.ts | 19 ++++++++++ agents/src/voice/agent_session.ts | 54 ++++++++++++++++++++++++++++- agents/src/voice/room_io/room_io.ts | 8 +++-- 4 files changed, 82 insertions(+), 4 deletions(-) create mode 100644 .changeset/claim-user-turn.md diff --git a/.changeset/claim-user-turn.md b/.changeset/claim-user-turn.md new file mode 100644 index 000000000..ae575207c --- /dev/null +++ b/.changeset/claim-user-turn.md @@ -0,0 +1,5 @@ +--- +"@livekit/agents": minor +--- + +Add AgentSession.claimUserTurn for programmatic user-driven turns. diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 0ff06b318..3d3822826 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -212,6 +212,7 @@ export class AgentActivity implements RecognitionHooks { private lock = new Mutex(); private audioStream = new MultiInputStream(); private audioStreamId?: string; + private userSpeaking = false; // default to null as None, which maps to the default provider tool choice value private toolChoice: ToolChoice | null = null; @@ -914,6 +915,10 @@ export class AgentActivity implements RecognitionHooks { this.realtimeSession?.clearAudio(); } + get isUserSpeaking(): boolean { + return this.userSpeaking; + } + say( text: string | ReadableStream, options?: { @@ -1017,6 +1022,7 @@ export class AgentActivity implements RecognitionHooks { onInputSpeechStarted(_ev: InputSpeechStartedEvent): void { this.logger.info('onInputSpeechStarted'); + this.userSpeaking = true; if (!this.vad) { this.agentSession._updateUserState('speaking'); @@ -1043,6 +1049,7 @@ export class AgentActivity implements RecognitionHooks { onInputSpeechStopped(ev: InputSpeechStoppedEvent): void { this.logger.info(ev, 'onInputSpeechStopped'); + this.userSpeaking = false; if (!this.vad) { if (this.isInterruptionDetectionEnabled && this.audioRecognition) { @@ -1129,6 +1136,7 @@ export class AgentActivity implements RecognitionHooks { // recognition hooks onStartOfSpeech(ev: VADEvent): void { + this.userSpeaking = true; let speechStartTime = Date.now(); if (ev) { // Subtract both speechDuration and inferenceDuration to correct for VAD model latency. @@ -1171,6 +1179,7 @@ export class AgentActivity implements RecognitionHooks { } onEndOfSpeech(ev: VADEvent): void { + this.userSpeaking = false; let speechEndTime = Date.now(); if (ev) { // Subtract both silenceDuration and inferenceDuration to correct for VAD model latency. @@ -1676,6 +1685,16 @@ export class AgentActivity implements RecognitionHooks { await delay(0, { signal }); } } + + if (this.agentSession._userTurnClaims > 0) { + await this.waitForOrAbort( + this.agentSession._userTurnReleased.wait().then(() => undefined), + signal, + 'error waiting for user turn claim release', + ); + agentActive = waitForAgent; + userActive = waitForUser; + } } } diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index 3fae07e88..ff69f7df7 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -43,7 +43,7 @@ import { type ResolvedSessionConnectOptions, type SessionConnectOptions, } from '../types.js'; -import { Task, asError } from '../utils.js'; +import { Event, Task, asError } from '../utils.js'; import type { VAD } from '../vad.js'; import type { Agent } from './agent.js'; import { @@ -259,6 +259,11 @@ type ActivityTransitionOptions = { waitOnEnter?: boolean; }; +export interface UserTurnClaim { + /** Release the programmatic user turn. Safe to call more than once. */ + release(): void; +} + export class AgentSession< UserData = UnknownUserData, > extends (EventEmitter as new () => TypedEmitter) { @@ -287,6 +292,12 @@ export class AgentSession< private _userState: UserState = 'listening'; private _agentState: AgentState = 'initializing'; + /** @internal */ + _userTurnClaims = 0; + + /** @internal */ + _userTurnReleased = new Event(); + private _input: AgentInput; private _output: AgentOutput; @@ -407,6 +418,7 @@ export class AgentSession< this.sessionOptions = resolvedSessionOptions; this.options = legacyVoiceOptions; this._aecWarmupRemaining = this.sessionOptions.aecWarmupDuration ?? 0; + this._userTurnReleased.set(); this._onUserInputTranscribed = this._onUserInputTranscribed.bind(this); this.on(AgentSessionEventTypes.UserInputTranscribed, this._onUserInputTranscribed); @@ -678,6 +690,42 @@ export class AgentSession< } } + /** + * Declare a programmatic user-driven turn. + * + * Pins `userState` to `"speaking"` and keeps idle detection open until the returned claim is + * released. Pass a callback to release automatically when the callback settles. + */ + claimUserTurn(callback: () => T | Promise): Promise; + claimUserTurn(): UserTurnClaim; + claimUserTurn(callback?: () => T | Promise): UserTurnClaim | Promise { + const first = this._userTurnClaims === 0; + this._userTurnClaims += 1; + if (first) { + this._userTurnReleased.clear(); + this._updateUserState('speaking', { lastSpeakingTime: Date.now() }); + } + + let released = false; + const release = () => { + if (released) { + return; + } + released = true; + this._userTurnClaims -= 1; + if (this._userTurnClaims === 0) { + this._userTurnReleased.set(); + this._updateUserState(this.activity?.isUserSpeaking ? 'speaking' : 'listening'); + } + }; + + if (callback) { + return Promise.resolve().then(callback).finally(release); + } + + return { release }; + } + commitUserTurn() { if (!this.activity) { throw new Error('AgentSession is not running'); @@ -1220,6 +1268,10 @@ export class AgentSession< state: UserState, options?: { lastSpeakingTime?: number; otelContext?: Context }, ) { + if (this._userTurnClaims > 0 && state !== 'speaking') { + return; + } + if (this._userState === state) { return; } diff --git a/agents/src/voice/room_io/room_io.ts b/agents/src/voice/room_io/room_io.ts index fd6541344..0145d1d30 100644 --- a/agents/src/voice/room_io/room_io.ts +++ b/agents/src/voice/room_io/room_io.ts @@ -46,9 +46,11 @@ import { ParticipantTranscriptionOutput, } from './_output.js'; -export const DEFAULT_TEXT_INPUT_CALLBACK: TextInputCallback = (sess, ev) => { - sess.interrupt(); - sess.generateReply({ userInput: ev.text }); +export const DEFAULT_TEXT_INPUT_CALLBACK: TextInputCallback = async (sess, ev) => { + await sess.claimUserTurn(async () => { + await sess.interrupt().await; + sess.generateReply({ userInput: ev.text }); + }); }; const DEFAULT_PARTICIPANT_KINDS: ParticipantKind[] = [