diff --git a/.changeset/dynamic-endpointing.md b/.changeset/dynamic-endpointing.md new file mode 100644 index 000000000..42730b782 --- /dev/null +++ b/.changeset/dynamic-endpointing.md @@ -0,0 +1,5 @@ +--- +'@livekit/agents': patch +--- + +Add dynamic endpointing for voice turn handling. diff --git a/agents/src/utils.ts b/agents/src/utils.ts index 8ab828930..208bd804e 100644 --- a/agents/src/utils.ts +++ b/agents/src/utils.ts @@ -351,32 +351,68 @@ export class AsyncIterableQueue implements AsyncIterableIterator { /** @internal */ export class ExpFilter { #alpha: number; + #min?: number; #max?: number; #filtered?: number = undefined; - constructor(alpha: number, max?: number) { - this.#alpha = alpha; - this.#max = max; + constructor( + alphaOrOpts: number | { alpha: number; initial?: number; minVal?: number; maxVal?: number }, + max?: number, + ) { + if (typeof alphaOrOpts === 'number') { + this.#alpha = alphaOrOpts; + this.#max = max; + return; + } + + this.#validateAlpha(alphaOrOpts.alpha); + this.#alpha = alphaOrOpts.alpha; + this.#filtered = alphaOrOpts.initial; + this.#min = alphaOrOpts.minVal; + this.#max = alphaOrOpts.maxVal; } - reset(alpha?: number) { - if (alpha) { - this.#alpha = alpha; + reset( + alphaOrOpts?: number | { alpha?: number; initial?: number; minVal?: number; maxVal?: number }, + ) { + if (typeof alphaOrOpts === 'object') { + if (alphaOrOpts.alpha !== undefined) { + this.#validateAlpha(alphaOrOpts.alpha); + this.#alpha = alphaOrOpts.alpha; + } + if (alphaOrOpts.initial !== undefined) { + this.#filtered = alphaOrOpts.initial; + } + if (alphaOrOpts.minVal !== undefined) { + this.#min = alphaOrOpts.minVal; + } + if (alphaOrOpts.maxVal !== undefined) { + this.#max = alphaOrOpts.maxVal; + } + return; + } + + if (alphaOrOpts) { + this.#alpha = alphaOrOpts; } + this.#filtered = undefined; } apply(exp: number, sample: number): number { - if (this.#filtered) { + if (this.#filtered !== undefined) { const a = this.#alpha ** exp; this.#filtered = a * this.#filtered + (1 - a) * sample; } else { this.#filtered = sample; } - if (this.#max && this.#filtered > this.#max) { + if (this.#max !== undefined && this.#filtered > this.#max) { this.#filtered = this.#max; } + if (this.#min !== undefined && this.#filtered < this.#min) { + this.#filtered = this.#min; + } return this.#filtered; } @@ -385,9 +421,19 @@ export class ExpFilter { return this.#filtered; } + get value(): number | undefined { + return this.#filtered; + } + set alpha(alpha: number) { this.#alpha = alpha; } + + #validateAlpha(alpha: number) { + if (alpha <= 0 || alpha > 1) { + throw new Error('alpha must be in (0, 1].'); + } + } } /** @internal */ diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index 758bc013c..b7199e37e 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -95,6 +95,7 @@ import { } from './generation.js'; import type { TimedString } from './io.js'; import { SpeechHandle } from './speech_handle.js'; +import { createEndpointing } from './turn_config/endpointing.js'; import { setParticipantSpanAttributes } from './utils.js'; export const agentActivityStorage = new AsyncLocalStorage(); @@ -504,12 +505,10 @@ export class AgentActivity implements RecognitionHooks { interruptionDetection: this.interruptionDetector, backchannelBoundary: this.agentSession.sessionOptions.turnHandling.interruption.backchannelBoundary, - minEndpointingDelay: - this.agent.turnHandling?.endpointing?.minDelay ?? 
- this.agentSession.sessionOptions.turnHandling.endpointing.minDelay, - maxEndpointingDelay: - this.agent.turnHandling?.endpointing?.maxDelay ?? - this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay, + endpointing: createEndpointing({ + ...this.agentSession.sessionOptions.turnHandling.endpointing, + ...(this.agent.turnHandling?.endpointing ?? {}), + }), rootSpanContext: this.agentSession.rootSpanContext, sttModel: this.stt?.label, sttProvider: this.getSttProvider(), @@ -1976,15 +1975,17 @@ export class AgentActivity implements RecognitionHooks { let replyStartedForwardingAt: number | undefined; let replyTtsGenData: _TTSGenerationData | null = null; - const onFirstFrame = (audioOut: _AudioOut | null, startedSpeakingAt?: number) => { - replyStartedSpeakingAt = startedSpeakingAt ?? Date.now(); + const onFirstFrame = (audioOut: _AudioOut | null, startedSpeakingAt: number = Date.now()) => { + replyStartedSpeakingAt = startedSpeakingAt; replyStartedForwardingAt = audioOut?.startedForwardingAt ?? replyStartedSpeakingAt; this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfAgentSpeech(); + if (this.audioRecognition) { + this.audioRecognition.onStartOfAgentSpeech(replyStartedSpeakingAt); + } + if (this.isInterruptionDetectionEnabled) { this.disableVadInterruptionSoon(); } }; @@ -2079,7 +2080,7 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { this.audioRecognition.onEndOfAgentSpeech(Date.now()); } this.restoreInterruptionByAudioActivity(); @@ -2264,15 +2265,20 @@ export class AgentActivity implements RecognitionHooks { let agentStartedSpeakingAt: number | undefined; let agentStartedForwardingAt: number | undefined; - const onFirstFrame = (audioOutRef: _AudioOut | null, startedSpeakingAt?: number) => { - agentStartedSpeakingAt = startedSpeakingAt ?? Date.now(); + const onFirstFrame = ( + audioOutRef: _AudioOut | null, + startedSpeakingAt: number = Date.now(), + ) => { + agentStartedSpeakingAt = startedSpeakingAt; agentStartedForwardingAt = audioOutRef?.startedForwardingAt ?? 
agentStartedSpeakingAt; this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - this.audioRecognition.onStartOfAgentSpeech(); + if (this.audioRecognition) { + this.audioRecognition.onStartOfAgentSpeech(agentStartedSpeakingAt); + } + if (this.isInterruptionDetectionEnabled) { this.disableVadInterruptionSoon(); } }; @@ -2435,8 +2441,10 @@ export class AgentActivity implements RecognitionHooks { if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { + if (this.audioRecognition) { this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { this.restoreInterruptionByAudioActivity(); } } @@ -2480,11 +2488,11 @@ export class AgentActivity implements RecognitionHooks { this.agentSession._updateAgentState('thinking'); } else if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); - if (this.isInterruptionDetectionEnabled && this.audioRecognition) { - { - this.audioRecognition.onEndOfAgentSpeech(Date.now()); - this.restoreInterruptionByAudioActivity(); - } + if (this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + if (this.isInterruptionDetectionEnabled) { + this.restoreInterruptionByAudioActivity(); } } @@ -2691,11 +2699,14 @@ export class AgentActivity implements RecognitionHooks { return; } - const onFirstFrame = (startedSpeakingAt?: number) => { + const onFirstFrame = (startedSpeakingAt: number = Date.now()) => { this.agentSession._updateAgentState('speaking', { startTime: startedSpeakingAt, otelContext: speechHandle._agentTurnContext, }); + if (this.audioRecognition) { + this.audioRecognition.onStartOfAgentSpeech(startedSpeakingAt); + } }; const readMessages = async ( @@ -2938,6 +2949,12 @@ export class AgentActivity implements RecognitionHooks { 'playout completed with interrupt', ); } + if (this.agentSession.agentState === 'speaking') { + this.agentSession._updateAgentState('listening'); + if (this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } + } speechHandle._markGenerationDone(); await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT); @@ -2975,6 +2992,9 @@ export class AgentActivity implements RecognitionHooks { this.agentSession._updateAgentState('thinking'); } else if (this.agentSession.agentState === 'speaking') { this.agentSession._updateAgentState('listening'); + if (this.audioRecognition) { + this.audioRecognition.onEndOfAgentSpeech(Date.now()); + } } if (toolOutput.output.length === 0) { @@ -3441,7 +3461,7 @@ export class AgentActivity implements RecognitionHooks { otelContext: this.pausedSpeech.handle._agentTurnContext, }); if (this.audioRecognition && this.pausedSpeech.agentState === 'speaking') { - this.audioRecognition.onStartOfAgentSpeech(); + this.audioRecognition.onStartOfAgentSpeech(Date.now()); } if (this.isInterruptionDetectionEnabled) { this.disableVadInterruptionSoon(); diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts index f3ed51a4e..06e3785e3 100644 --- a/agents/src/voice/audio_recognition.ts +++ b/agents/src/voice/audio_recognition.ts @@ -37,6 +37,11 @@ import { Task, cancelAndWait, delay, readStream, waitForAbort } from '../utils.j import { type VAD, type VADEvent, VADEventType } from 
'../vad.js'; import type { TurnDetectionMode } from './agent_session.js'; import type { STTNode } from './io.js'; +import { + type BaseEndpointing, + createEndpointing, + defaultEndpointingOptions, +} from './turn_config/endpointing.js'; import { setParticipantSpanAttributes } from './utils.js'; export interface EndOfTurnInfo { @@ -146,10 +151,12 @@ export interface AudioRecognitionOptions { * configures them separately. `null` (or `undefined`) disables. */ backchannelBoundary?: number | [number, number] | null; - /** Minimum endpointing delay in milliseconds. */ - minEndpointingDelay: number; - /** Maximum endpointing delay in milliseconds. */ - maxEndpointingDelay: number; + /** Endpointing delay strategy. */ + endpointing?: BaseEndpointing; + /** @deprecated Use endpointing instead. */ + minEndpointingDelay?: number; + /** @deprecated Use endpointing instead. */ + maxEndpointingDelay?: number; /** Root span context for tracing. */ rootSpanContext?: Context; /** STT model name for tracing */ @@ -178,8 +185,7 @@ export class AudioRecognition { private vad?: VAD; private turnDetector?: _TurnDetector; private turnDetectionMode?: TurnDetectionMode; - private minEndpointingDelay: number; - private maxEndpointingDelay: number; + private endpointing: BaseEndpointing; private lastLanguage?: LanguageCode; private rootSpanContext?: Context; private sttModel?: string; @@ -240,6 +246,7 @@ export class AudioRecognition { private transcriptBuffer: SpeechEvent[]; private isInterruptionEnabled: boolean; private isAgentSpeaking: boolean; + private interruptionDetected?: boolean; private interruptionStreamChannel?: StreamChannel; private closed = false; @@ -255,8 +262,13 @@ export class AudioRecognition { this.vad = opts.vad; this.turnDetector = opts.turnDetector; this.turnDetectionMode = opts.turnDetectionMode; - this.minEndpointingDelay = opts.minEndpointingDelay; - this.maxEndpointingDelay = opts.maxEndpointingDelay; + this.endpointing = + opts.endpointing ?? + createEndpointing({ + ...defaultEndpointingOptions, + minDelay: opts.minEndpointingDelay ?? defaultEndpointingOptions.minDelay, + maxDelay: opts.maxEndpointingDelay ?? defaultEndpointingOptions.maxDelay, + }); this.lastLanguage = undefined; this.rootSpanContext = opts.rootSpanContext; this.sttModel = opts.sttModel; @@ -268,6 +280,7 @@ export class AudioRecognition { this.transcriptBuffer = []; this.isInterruptionEnabled = !!(opts.interruptionDetection && opts.vad); this.isAgentSpeaking = false; + this.interruptionDetected = undefined; const rawBoundary = opts.backchannelBoundary; if (rawBoundary === undefined || rawBoundary === null) { @@ -411,8 +424,9 @@ export class AudioRecognition { this.backchannelBoundaryCallback = undefined; } - async onStartOfAgentSpeech() { + async onStartOfAgentSpeech(startedAt: number) { this.isAgentSpeaking = true; + this.endpointing.onStartOfAgentSpeech(startedAt); if (this.backchannelBoundary && this.backchannelBoundary[0] > 0) { this.cancelBackchannelBoundary(); @@ -429,6 +443,10 @@ export class AudioRecognition { async onEndOfAgentSpeech(ignoreUserTranscriptUntil: number) { this.cancelBackchannelBoundary(); + if (this.isAgentSpeaking) { + this.endpointing.onEndOfAgentSpeech(Date.now()); + } + if (!this.isInterruptionEnabled) { this.isAgentSpeaking = false; return; @@ -465,6 +483,9 @@ export class AudioRecognition { /** Start interruption inference when agent is speaking and overlap speech starts. 
*/ async onStartOfOverlapSpeech(speechDuration: number, startedAt: number, userSpeakingSpan?: Span) { if (this.isAgentSpeaking) { + if (!this.endpointing.overlapping) { + this.endpointing.onStartOfSpeech(startedAt, true); + } this.trySendInterruptionSentinel( InterruptionStreamSentinel.overlapSpeechStarted( speechDuration, @@ -863,8 +884,11 @@ export class AudioRecognition { case SpeechEventType.START_OF_SPEECH: if (this.turnDetectionMode !== 'stt') break; { - const span = this.ensureUserTurnSpan(Date.now()); + const speechStartTime = Date.now(); + const span = this.ensureUserTurnSpan(speechStartTime); const ctx = this.userTurnContext(span); + this.endpointing.onStartOfSpeech(speechStartTime, this.isAgentSpeaking); + this.interruptionDetected = undefined; otelContext.with(ctx, () => { this.hooks.onStartOfSpeech({ type: VADEventType.START_OF_SPEECH, @@ -889,8 +913,15 @@ export class AudioRecognition { case SpeechEventType.END_OF_SPEECH: if (this.turnDetectionMode !== 'stt') break; { + const speechEndTime = Date.now(); const span = this.ensureUserTurnSpan(); const ctx = this.userTurnContext(span); + if (this.speaking) { + this.endpointing.onEndOfSpeech( + speechEndTime, + this.interruptionDetected === false && this.isAgentSpeaking, + ); + } otelContext.with(ctx, () => { this.hooks.onEndOfSpeech({ type: VADEventType.END_OF_SPEECH, @@ -934,6 +965,8 @@ export class AudioRecognition { return; } + this.interruptionDetected = ev.isInterruption; + if (ev.isInterruption) { this.hooks.onInterruption(ev); } @@ -969,7 +1002,7 @@ export class AudioRecognition { speechStartTime: number | undefined, ) => async (controller: AbortController) => { - let endpointingDelay = this.minEndpointingDelay; + let endpointingDelay = this.endpointing.minDelay; const userTurnSpan = this.ensureUserTurnSpan(); const userTurnCtx = this.userTurnContext(userTurnSpan); @@ -995,7 +1028,7 @@ export class AudioRecognition { ); if (unlikelyThreshold && endOfTurnProbability < unlikelyThreshold) { - endpointingDelay = this.maxEndpointingDelay; + endpointingDelay = this.endpointing.maxDelay; } } catch (error) { this.logger.error(error, 'Error predicting end of turn'); @@ -1181,9 +1214,11 @@ export class AudioRecognition { case VADEventType.START_OF_SPEECH: this.logger.debug('VAD task: START_OF_SPEECH'); { - const startTime = Date.now() - ev.speechDuration; + const startTime = Date.now() - ev.speechDuration - ev.inferenceDuration; const span = this.ensureUserTurnSpan(startTime); const ctx = this.userTurnContext(span); + this.endpointing.onStartOfSpeech(startTime, this.isAgentSpeaking); + this.interruptionDetected = undefined; otelContext.with(ctx, () => this.hooks.onStartOfSpeech(ev)); } this.speaking = true; @@ -1211,8 +1246,15 @@ export class AudioRecognition { case VADEventType.END_OF_SPEECH: this.logger.debug('VAD task: END_OF_SPEECH'); { + const endTime = Date.now() - ev.silenceDuration - ev.inferenceDuration; const span = this.ensureUserTurnSpan(); const ctx = this.userTurnContext(span); + if (this.speaking) { + this.endpointing.onEndOfSpeech( + endTime, + this.interruptionDetected === false && this.isAgentSpeaking, + ); + } otelContext.with(ctx, () => this.hooks.onEndOfSpeech(ev)); } diff --git a/agents/src/voice/audio_recognition_backchannel.test.ts b/agents/src/voice/audio_recognition_backchannel.test.ts index 1d634e963..b80b2936f 100644 --- a/agents/src/voice/audio_recognition_backchannel.test.ts +++ b/agents/src/voice/audio_recognition_backchannel.test.ts @@ -49,7 +49,7 @@ describe('AudioRecognition backchannel boundary', 
() => { const ar = createRecognition({ backchannelBoundary: 250 }); const cb = vi.fn(); - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); ar.backchannelBoundaryCallback = cb; expect(ar.backchannelBoundaryActive).toBe(true); @@ -66,7 +66,7 @@ describe('AudioRecognition backchannel boundary', () => { const ar = createRecognition({ backchannelBoundary: [100, 999] }); const cb = vi.fn(); - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); ar.backchannelBoundaryCallback = cb; vi.advanceTimersByTime(99); @@ -81,7 +81,7 @@ describe('AudioRecognition backchannel boundary', () => { const ar = createRecognition({ backchannelBoundary: null }); const cb = vi.fn(); - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); ar.backchannelBoundaryCallback = cb; expect(ar.backchannelBoundaryActive).toBe(false); @@ -94,7 +94,7 @@ describe('AudioRecognition backchannel boundary', () => { it('disables both sides cleanly when undefined', async () => { const ar = createRecognition(); - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); expect(ar.backchannelBoundaryActive).toBe(false); await ar.close(); @@ -104,7 +104,7 @@ describe('AudioRecognition backchannel boundary', () => { const ar = createRecognition({ backchannelBoundary: [0, 500] }); const cb = vi.fn(); - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); ar.backchannelBoundaryCallback = cb; expect(ar.backchannelBoundaryActive).toBe(false); @@ -128,18 +128,18 @@ describe('AudioRecognition backchannel boundary', () => { describe('lifecycle', () => { it('re-arms the start cooldown when onStartOfAgentSpeech is called again', async () => { // Models the false-interruption resume path: the agent is paused and resumed, - // so AgentActivity calls onStartOfAgentSpeech() a second time. + // so AgentActivity calls onStartOfAgentSpeech a second time. 
+      // so AgentActivity calls onStartOfAgentSpeech a second time.
const ar = createRecognition({ backchannelBoundary: 200 }); const cb1 = vi.fn(); const cb2 = vi.fn(); - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); ar.backchannelBoundaryCallback = cb1; expect(ar.backchannelBoundaryActive).toBe(true); vi.advanceTimersByTime(50); // resume path -> re-arm the timer 50ms in - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); ar.backchannelBoundaryCallback = cb2; expect(ar.backchannelBoundaryActive).toBe(true); @@ -160,7 +160,7 @@ describe('AudioRecognition backchannel boundary', () => { const ar = createRecognition({ backchannelBoundary: 500 }); const cb = vi.fn(); - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); ar.backchannelBoundaryCallback = cb; expect(ar.backchannelBoundaryActive).toBe(true); @@ -178,7 +178,7 @@ describe('AudioRecognition backchannel boundary', () => { const ar = createRecognition({ backchannelBoundary: [1000, 500] }); const cb = vi.fn(); - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); ar.backchannelBoundaryCallback = cb; expect(ar.backchannelBoundaryActive).toBe(true); @@ -195,7 +195,7 @@ describe('AudioRecognition backchannel boundary', () => { const ar = createRecognition({ backchannelBoundary: 1000 }); const cb = vi.fn(); - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); ar.backchannelBoundaryCallback = cb; expect(ar.backchannelBoundaryActive).toBe(true); @@ -212,7 +212,7 @@ describe('AudioRecognition backchannel boundary', () => { const ar = createRecognition({ backchannelBoundary: 1000 }); const cb = vi.fn(); - await ar.onStartOfAgentSpeech(); + await ar.onStartOfAgentSpeech(Date.now()); ar.backchannelBoundaryCallback = cb; expect(ar.backchannelBoundaryActive).toBe(true); diff --git a/agents/src/voice/audio_recognition_endpointing.test.ts b/agents/src/voice/audio_recognition_endpointing.test.ts new file mode 100644 index 000000000..58002ca92 --- /dev/null +++ b/agents/src/voice/audio_recognition_endpointing.test.ts @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { ChatContext } from '../llm/chat_context.js'; +import { initializeLogger } from '../log.js'; +import { AudioRecognition, type RecognitionHooks } from './audio_recognition.js'; +import { BaseEndpointing } from './turn_config/endpointing.js'; + +function createHooks(): RecognitionHooks { + return { + onInterruption: () => {}, + onStartOfSpeech: () => {}, + onVADInferenceDone: () => {}, + onEndOfSpeech: () => {}, + onInterimTranscript: () => {}, + onFinalTranscript: () => {}, + onPreemptiveGeneration: () => {}, + retrieveChatCtx: () => ChatContext.empty(), + onEndOfTurn: async () => true, + }; +} + +class RecordingEndpointing extends BaseEndpointing { + speechStarts: Array<{ startedAt: number; overlapping: boolean }> = []; + + override onStartOfSpeech(startedAt: number, overlapping = false): void { + super.onStartOfSpeech(startedAt, overlapping); + this.speechStarts.push({ startedAt, overlapping }); + } +} + +describe('AudioRecognition endpointing integration', () => { + initializeLogger({ pretty: false, level: 'silent' }); + + it('marks endpointing overlap when audio activity starts while the agent is speaking', async () => { + const endpointing = new RecordingEndpointing({ minDelay: 300, maxDelay: 3000 }); + const recognition = new AudioRecognition({ + recognitionHooks: createHooks(), + endpointing, + }); + + await recognition.onStartOfAgentSpeech(1000); + await recognition.onStartOfOverlapSpeech(0, 1200); + await recognition.onStartOfOverlapSpeech(0, 1300); + + expect(endpointing.speechStarts).toEqual([{ startedAt: 1200, overlapping: true }]); + expect(endpointing.overlapping).toBe(true); + }); +}); diff --git a/agents/src/voice/index.ts b/agents/src/voice/index.ts index 808ac88c1..1303d85c4 100644 --- a/agents/src/voice/index.ts +++ b/agents/src/voice/index.ts @@ -32,4 +32,5 @@ export { export * from './report.js'; export * from './room_io/index.js'; export { RunContext } from './run_context.js'; +export * from './turn_config/endpointing.js'; export * as testing from './testing/index.js'; diff --git a/agents/src/voice/turn_config/endpointing.test.ts b/agents/src/voice/turn_config/endpointing.test.ts new file mode 100644 index 000000000..83be6fe15 --- /dev/null +++ b/agents/src/voice/turn_config/endpointing.test.ts @@ -0,0 +1,94 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it } from 'vitest'; +import { BaseEndpointing, DynamicEndpointing, createEndpointing } from './endpointing.js'; + +describe('DynamicEndpointing', () => { + it('initializes with configured delays', () => { + const endpointing = new DynamicEndpointing({ minDelay: 300, maxDelay: 1000 }); + + expect(endpointing.minDelay).toBe(300); + expect(endpointing.maxDelay).toBe(1000); + }); + + it('updates minDelay from pauses between utterances', () => { + const endpointing = new DynamicEndpointing({ minDelay: 300, maxDelay: 1000, alpha: 0.5 }); + + endpointing.onEndOfSpeech(100_000); + endpointing.onStartOfSpeech(100_400); + endpointing.onEndOfSpeech(100_500); + + expect(endpointing.minDelay).toBeCloseTo(350); + }); + + it('updates maxDelay from pauses before a new turn', () => { + const endpointing = new DynamicEndpointing({ minDelay: 300, maxDelay: 1000, alpha: 0.5 }); + + endpointing.onEndOfSpeech(100_000); + endpointing.onStartOfAgentSpeech(100_600); + endpointing.onStartOfSpeech(101_500); + endpointing.onEndOfSpeech(102_000); + + expect(endpointing.maxDelay).toBeCloseTo(800); + }); + + it('skips updates for ignored overlapping speech outside the grace period', () => { + const endpointing = new DynamicEndpointing({ minDelay: 300, maxDelay: 1000, alpha: 0.5 }); + + endpointing.onEndOfSpeech(100_000); + endpointing.onStartOfAgentSpeech(100_500); + endpointing.onStartOfSpeech(101_500, true); + + const previousMinDelay = endpointing.minDelay; + const previousMaxDelay = endpointing.maxDelay; + endpointing.onEndOfSpeech(101_800, true); + + expect(endpointing.minDelay).toBe(previousMinDelay); + expect(endpointing.maxDelay).toBe(previousMaxDelay); + }); + + it('updates options and clamps learned delays', () => { + const endpointing = new DynamicEndpointing({ minDelay: 300, maxDelay: 1000, alpha: 0.5 }); + + endpointing.updateOptions({ minDelay: 500, maxDelay: 2000 }); + endpointing.onEndOfSpeech(100_000); + endpointing.onStartOfSpeech(100_200); + + expect(endpointing.minDelay).toBeCloseTo(500); + + endpointing.onEndOfSpeech(101_000); + endpointing.onStartOfAgentSpeech(102_800); + endpointing.onStartOfSpeech(103_500); + + expect(endpointing.maxDelay).toBeGreaterThan(1000); + expect(endpointing.maxDelay).toBeLessThanOrEqual(2000); + }); +}); + +describe('createEndpointing', () => { + it('creates dynamic endpointing for dynamic mode', () => { + const endpointing = createEndpointing({ + mode: 'dynamic', + minDelay: 300, + maxDelay: 1000, + alpha: 0.7, + }); + + expect(endpointing).toBeInstanceOf(DynamicEndpointing); + }); + + it('creates base endpointing for fixed mode', () => { + const endpointing = createEndpointing({ + mode: 'fixed', + minDelay: 500, + maxDelay: 3000, + alpha: 0.9, + }); + + expect(endpointing).toBeInstanceOf(BaseEndpointing); + expect(endpointing).not.toBeInstanceOf(DynamicEndpointing); + expect(endpointing.minDelay).toBe(500); + expect(endpointing.maxDelay).toBe(3000); + }); +}); diff --git a/agents/src/voice/turn_config/endpointing.ts b/agents/src/voice/turn_config/endpointing.ts index f2603e00f..aafeac952 100644 --- a/agents/src/voice/turn_config/endpointing.ts +++ b/agents/src/voice/turn_config/endpointing.ts @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: 2026 LiveKit, Inc. // // SPDX-License-Identifier: Apache-2.0 +import { ExpFilter } from '../../utils.js'; + /** * Configuration for endpointing, which determines when the user's turn is complete. 
*/ @@ -24,10 +26,261 @@ export interface EndpointingOptions { * @defaultValue 3000 */ maxDelay: number; + /** + * Exponential moving average coefficient for dynamic endpointing. Higher values give more + * weight to history. + * @defaultValue 0.9 + */ + alpha: number; } export const defaultEndpointingOptions = { mode: 'fixed', minDelay: 500, maxDelay: 3000, + alpha: 0.9, } as const satisfies EndpointingOptions; + +const AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD = 250; + +export class BaseEndpointing { + protected _minDelay: number; + protected _maxDelay: number; + protected _overlapping = false; + + constructor({ minDelay, maxDelay }: { minDelay: number; maxDelay: number }) { + this._minDelay = minDelay; + this._maxDelay = maxDelay; + } + + updateOptions({ minDelay, maxDelay }: { minDelay?: number; maxDelay?: number }) { + if (minDelay !== undefined) { + this._minDelay = minDelay; + } + if (maxDelay !== undefined) { + this._maxDelay = maxDelay; + } + } + + get minDelay(): number { + return this._minDelay; + } + + get maxDelay(): number { + return this._maxDelay; + } + + get overlapping(): boolean { + return this._overlapping; + } + + onStartOfSpeech(_startedAt: number, overlapping = false): void { + this._overlapping = overlapping; + } + + onEndOfSpeech(_endedAt: number, _shouldIgnore = false): void { + this._overlapping = false; + } + + onStartOfAgentSpeech(_startedAt: number): void {} + + onEndOfAgentSpeech(_endedAt: number): void {} +} + +export class DynamicEndpointing extends BaseEndpointing { + #utterancePause: ExpFilter; + #turnPause: ExpFilter; + #utteranceStartedAt?: number; + #utteranceEndedAt?: number; + #agentSpeechStartedAt?: number; + #agentSpeechEndedAt?: number; + #speaking = false; + + constructor({ + minDelay, + maxDelay, + alpha = 0.9, + }: { + minDelay: number; + maxDelay: number; + alpha?: number; + }) { + super({ minDelay, maxDelay }); + this.#utterancePause = new ExpFilter({ + alpha, + initial: minDelay, + minVal: minDelay, + maxVal: maxDelay, + }); + this.#turnPause = new ExpFilter({ + alpha, + initial: maxDelay, + minVal: minDelay, + maxVal: maxDelay, + }); + } + + override get minDelay(): number { + return this.#utterancePause.value ?? this._minDelay; + } + + override get maxDelay(): number { + return Math.max(this.#turnPause.value ?? 
this._maxDelay, this.minDelay); + } + + get betweenUtteranceDelay(): number { + if (this.#utteranceStartedAt === undefined || this.#utteranceEndedAt === undefined) { + return 0; + } + return Math.max(0, this.#utteranceStartedAt - this.#utteranceEndedAt); + } + + get betweenTurnDelay(): number { + if (this.#agentSpeechStartedAt === undefined || this.#utteranceEndedAt === undefined) { + return 0; + } + return Math.max(0, this.#agentSpeechStartedAt - this.#utteranceEndedAt); + } + + get immediateInterruptionDelay(): [number, number] { + if (this.#utteranceStartedAt === undefined || this.#agentSpeechStartedAt === undefined) { + return [0, 0]; + } + return [this.betweenTurnDelay, Math.abs(this.betweenUtteranceDelay - this.betweenTurnDelay)]; + } + + override onStartOfAgentSpeech(startedAt: number): void { + this.#agentSpeechStartedAt = startedAt; + this.#agentSpeechEndedAt = undefined; + this._overlapping = false; + } + + override onEndOfAgentSpeech(endedAt: number): void { + if ( + this.#agentSpeechStartedAt !== undefined && + (this.#agentSpeechEndedAt === undefined || + this.#agentSpeechEndedAt < this.#agentSpeechStartedAt) + ) { + this.#agentSpeechEndedAt = endedAt; + } + this._overlapping = false; + } + + override onStartOfSpeech(startedAt: number, overlapping = false): void { + if (this._overlapping) { + return; + } + + // Audio-activity interruption can arrive before the previous utterance's end timestamp is + // finalized. In that case the stored end appears earlier than its start, so pin it just before + // agent speech to keep the immediate-interruption pause calculation meaningful. + if ( + this.#utteranceStartedAt !== undefined && + this.#utteranceEndedAt !== undefined && + this.#agentSpeechStartedAt !== undefined && + this.#utteranceEndedAt < this.#utteranceStartedAt && + overlapping + ) { + this.#utteranceEndedAt = this.#agentSpeechStartedAt - 1; + } + + this.#utteranceStartedAt = startedAt; + this._overlapping = overlapping; + this.#speaking = true; + } + + override onEndOfSpeech(endedAt: number, shouldIgnore = false): void { + if (shouldIgnore && this._overlapping) { + if ( + this.#utteranceStartedAt === undefined || + this.#agentSpeechStartedAt === undefined || + Math.abs(this.#utteranceStartedAt - this.#agentSpeechStartedAt) >= + AGENT_SPEECH_LEADING_SILENCE_GRACE_PERIOD + ) { + this._overlapping = false; + this.#speaking = false; + this.#utteranceStartedAt = undefined; + this.#utteranceEndedAt = undefined; + return; + } + } + + const agentStillSpeaking = + this.#agentSpeechStartedAt !== undefined && this.#agentSpeechEndedAt === undefined; + const betweenUtteranceDelay = this.betweenUtteranceDelay; + const betweenTurnDelay = this.betweenTurnDelay; + + if (this._overlapping || agentStillSpeaking) { + const [turnDelay, interruptionDelay] = this.immediateInterruptionDelay; + const isImmediateInterruption = + interruptionDelay > 0 && + interruptionDelay <= this.minDelay && + turnDelay > 0 && + turnDelay <= this.maxDelay && + betweenUtteranceDelay > 0; + + if (isImmediateInterruption) { + // User resumed almost immediately after the agent started, so the prior pause was probably + // part of the same user turn. Learn it as minDelay to avoid cutting in next time. + this.#utterancePause.apply(1, betweenUtteranceDelay); + } else if (betweenTurnDelay > 0) { + // User spoke after a more substantial agent-start gap. Treat it as a new turn boundary and + // learn that gap as maxDelay. 
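+      // (apply(1, x) advances the ExpFilter one step: alpha * prev + (1 - alpha) * x,
+      // clamped to the configured [minDelay, maxDelay] range.)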
+ this.#turnPause.apply(1, betweenTurnDelay); + } + } else if (betweenTurnDelay > 0) { + // Normal case: user ended, agent eventually started. Learn that turn-boundary wait. + this.#turnPause.apply(1, betweenTurnDelay); + } else { + const noAgentSpeechAroundPause = + this.#agentSpeechEndedAt === undefined && this.#agentSpeechStartedAt === undefined; + if (betweenUtteranceDelay > 0 && noAgentSpeechAroundPause) { + // User continued before any agent speech happened. Learn the pause as part of the user's + // natural speaking rhythm. + this.#utterancePause.apply(1, betweenUtteranceDelay); + } + } + + this.#utteranceEndedAt = endedAt; + this.#agentSpeechStartedAt = undefined; + this.#agentSpeechEndedAt = undefined; + this.#speaking = false; + this._overlapping = false; + } + + override updateOptions({ + minDelay, + maxDelay, + alpha, + }: { + minDelay?: number; + maxDelay?: number; + alpha?: number; + }) { + if (minDelay !== undefined) { + this._minDelay = minDelay; + this.#utterancePause.reset({ initial: minDelay, minVal: minDelay }); + this.#turnPause.reset({ minVal: minDelay }); + } + if (maxDelay !== undefined) { + this._maxDelay = maxDelay; + this.#turnPause.reset({ initial: maxDelay, maxVal: maxDelay }); + this.#utterancePause.reset({ maxVal: maxDelay }); + } + if (alpha !== undefined) { + this.#utterancePause.reset({ alpha }); + this.#turnPause.reset({ alpha }); + } + } +} + +export function createEndpointing(options: EndpointingOptions): BaseEndpointing { + if (options.mode === 'dynamic') { + return new DynamicEndpointing({ + minDelay: options.minDelay, + maxDelay: options.maxDelay, + alpha: options.alpha, + }); + } + return new BaseEndpointing({ minDelay: options.minDelay, maxDelay: options.maxDelay }); +} diff --git a/examples/src/basic_agent.ts b/examples/src/basic_agent.ts index 9153fbf95..ca5b70287 100644 --- a/examples/src/basic_agent.ts +++ b/examples/src/basic_agent.ts @@ -75,6 +75,13 @@ export default defineAgent({ falseInterruptionTimeout: 1000, mode: 'adaptive', }, + // Dynamic endpointing learns the user's pause rhythm and adapts the short/long waits + // used before committing a user turn. + endpointing: { + mode: 'dynamic', + minDelay: 300, + maxDelay: 3000, + }, // Preemptive generation speculatively starts LLM inference while the user is still // speaking to reduce time-to-first-token. See PreemptiveGenerationOptions for all // tunables (enabled, preemptiveTts, maxSpeechDuration, maxRetries). diff --git a/plugins/openai/src/stt.test.ts b/plugins/openai/src/stt.test.ts index 321fd587e..72118d526 100644 --- a/plugins/openai/src/stt.test.ts +++ b/plugins/openai/src/stt.test.ts @@ -5,7 +5,8 @@ import { VAD as BaseVAD, type VADStream } from '@livekit/agents'; import { VAD } from '@livekit/agents-plugin-silero'; import { stt } from '@livekit/agents-plugins-test'; import { describe, expect, it, vi } from 'vitest'; -import { STT, SpeechStream } from './stt.js'; +import type { SpeechStream } from './stt.js'; +import { STT } from './stt.js'; const hasOpenAIApiKey = Boolean(process.env.OPENAI_API_KEY);