Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/dynamic-endpointing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@livekit/agents': patch
---

Add dynamic endpointing for voice turn handling.
62 changes: 54 additions & 8 deletions agents/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -351,32 +351,68 @@ export class AsyncIterableQueue<T> implements AsyncIterableIterator<T> {
/** @internal */
export class ExpFilter {
#alpha: number;
#min?: number;
#max?: number;
#filtered?: number = undefined;

constructor(alpha: number, max?: number) {
this.#alpha = alpha;
this.#max = max;
constructor(
alphaOrOpts: number | { alpha: number; initial?: number; minVal?: number; maxVal?: number },
max?: number,
) {
if (typeof alphaOrOpts === 'number') {
this.#alpha = alphaOrOpts;
this.#max = max;
return;
}

this.#validateAlpha(alphaOrOpts.alpha);
this.#alpha = alphaOrOpts.alpha;
this.#filtered = alphaOrOpts.initial;
this.#min = alphaOrOpts.minVal;
this.#max = alphaOrOpts.maxVal;
}

reset(alpha?: number) {
if (alpha) {
this.#alpha = alpha;
reset(
alphaOrOpts?: number | { alpha?: number; initial?: number; minVal?: number; maxVal?: number },
) {
if (typeof alphaOrOpts === 'object') {
if (alphaOrOpts.alpha !== undefined) {
this.#validateAlpha(alphaOrOpts.alpha);
this.#alpha = alphaOrOpts.alpha;
}
if (alphaOrOpts.initial !== undefined) {
this.#filtered = alphaOrOpts.initial;
}
if (alphaOrOpts.minVal !== undefined) {
this.#min = alphaOrOpts.minVal;
}
if (alphaOrOpts.maxVal !== undefined) {
this.#max = alphaOrOpts.maxVal;
}
return;
}

if (alphaOrOpts) {
this.#alpha = alphaOrOpts;
}

this.#filtered = undefined;
}

apply(exp: number, sample: number): number {
if (this.#filtered) {
if (this.#filtered !== undefined) {
const a = this.#alpha ** exp;
this.#filtered = a * this.#filtered + (1 - a) * sample;
} else {
this.#filtered = sample;
}

if (this.#max && this.#filtered > this.#max) {
if (this.#max !== undefined && this.#filtered > this.#max) {
this.#filtered = this.#max;
}
if (this.#min !== undefined && this.#filtered < this.#min) {
this.#filtered = this.#min;
}

return this.#filtered;
}
Expand All @@ -385,9 +421,19 @@ export class ExpFilter {
return this.#filtered;
}

get value(): number | undefined {
return this.#filtered;
}

set alpha(alpha: number) {
this.#alpha = alpha;
}

#validateAlpha(alpha: number) {
if (alpha <= 0 || alpha > 1) {
throw new Error('alpha must be in (0, 1].');
}
}
}

/** @internal */
Expand Down
66 changes: 43 additions & 23 deletions agents/src/voice/agent_activity.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ import {
} from './generation.js';
import type { TimedString } from './io.js';
import { SpeechHandle } from './speech_handle.js';
import { createEndpointing } from './turn_config/endpointing.js';
import { setParticipantSpanAttributes } from './utils.js';

export const agentActivityStorage = new AsyncLocalStorage<AgentActivity>();
Expand Down Expand Up @@ -504,12 +505,10 @@ export class AgentActivity implements RecognitionHooks {
interruptionDetection: this.interruptionDetector,
backchannelBoundary:
this.agentSession.sessionOptions.turnHandling.interruption.backchannelBoundary,
minEndpointingDelay:
this.agent.turnHandling?.endpointing?.minDelay ??
this.agentSession.sessionOptions.turnHandling.endpointing.minDelay,
maxEndpointingDelay:
this.agent.turnHandling?.endpointing?.maxDelay ??
this.agentSession.sessionOptions.turnHandling.endpointing.maxDelay,
endpointing: createEndpointing({
...this.agentSession.sessionOptions.turnHandling.endpointing,
...(this.agent.turnHandling?.endpointing ?? {}),
}),
rootSpanContext: this.agentSession.rootSpanContext,
sttModel: this.stt?.label,
sttProvider: this.getSttProvider(),
Expand Down Expand Up @@ -1976,15 +1975,17 @@ export class AgentActivity implements RecognitionHooks {
let replyStartedForwardingAt: number | undefined;
let replyTtsGenData: _TTSGenerationData | null = null;

const onFirstFrame = (audioOut: _AudioOut | null, startedSpeakingAt?: number) => {
replyStartedSpeakingAt = startedSpeakingAt ?? Date.now();
const onFirstFrame = (audioOut: _AudioOut | null, startedSpeakingAt: number = Date.now()) => {
replyStartedSpeakingAt = startedSpeakingAt;
replyStartedForwardingAt = audioOut?.startedForwardingAt ?? replyStartedSpeakingAt;
this.agentSession._updateAgentState('speaking', {
startTime: startedSpeakingAt,
otelContext: speechHandle._agentTurnContext,
});
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
this.audioRecognition.onStartOfAgentSpeech();
if (this.audioRecognition) {
this.audioRecognition.onStartOfAgentSpeech(replyStartedSpeakingAt);
}
if (this.isInterruptionDetectionEnabled) {
this.disableVadInterruptionSoon();
}
};
Expand Down Expand Up @@ -2079,7 +2080,7 @@ export class AgentActivity implements RecognitionHooks {

if (this.agentSession.agentState === 'speaking') {
this.agentSession._updateAgentState('listening');
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
if (this.audioRecognition) {
this.audioRecognition.onEndOfAgentSpeech(Date.now());
}
this.restoreInterruptionByAudioActivity();
Expand Down Expand Up @@ -2264,15 +2265,20 @@ export class AgentActivity implements RecognitionHooks {

let agentStartedSpeakingAt: number | undefined;
let agentStartedForwardingAt: number | undefined;
const onFirstFrame = (audioOutRef: _AudioOut | null, startedSpeakingAt?: number) => {
agentStartedSpeakingAt = startedSpeakingAt ?? Date.now();
const onFirstFrame = (
audioOutRef: _AudioOut | null,
startedSpeakingAt: number = Date.now(),
) => {
agentStartedSpeakingAt = startedSpeakingAt;
agentStartedForwardingAt = audioOutRef?.startedForwardingAt ?? agentStartedSpeakingAt;
this.agentSession._updateAgentState('speaking', {
startTime: startedSpeakingAt,
otelContext: speechHandle._agentTurnContext,
});
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
this.audioRecognition.onStartOfAgentSpeech();
if (this.audioRecognition) {
this.audioRecognition.onStartOfAgentSpeech(agentStartedSpeakingAt);
}
if (this.isInterruptionDetectionEnabled) {
this.disableVadInterruptionSoon();
}
};
Expand Down Expand Up @@ -2435,8 +2441,10 @@ export class AgentActivity implements RecognitionHooks {

if (this.agentSession.agentState === 'speaking') {
this.agentSession._updateAgentState('listening');
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
if (this.audioRecognition) {
this.audioRecognition.onEndOfAgentSpeech(Date.now());
}
if (this.isInterruptionDetectionEnabled) {
this.restoreInterruptionByAudioActivity();
}
}
Expand Down Expand Up @@ -2480,11 +2488,11 @@ export class AgentActivity implements RecognitionHooks {
this.agentSession._updateAgentState('thinking');
} else if (this.agentSession.agentState === 'speaking') {
this.agentSession._updateAgentState('listening');
if (this.isInterruptionDetectionEnabled && this.audioRecognition) {
{
this.audioRecognition.onEndOfAgentSpeech(Date.now());
this.restoreInterruptionByAudioActivity();
}
if (this.audioRecognition) {
this.audioRecognition.onEndOfAgentSpeech(Date.now());
}
if (this.isInterruptionDetectionEnabled) {
this.restoreInterruptionByAudioActivity();
}
}

Expand Down Expand Up @@ -2691,11 +2699,14 @@ export class AgentActivity implements RecognitionHooks {
return;
}

const onFirstFrame = (startedSpeakingAt?: number) => {
const onFirstFrame = (startedSpeakingAt: number = Date.now()) => {
this.agentSession._updateAgentState('speaking', {
startTime: startedSpeakingAt,
otelContext: speechHandle._agentTurnContext,
});
if (this.audioRecognition) {
this.audioRecognition.onStartOfAgentSpeech(startedSpeakingAt);
}
};

const readMessages = async (
Expand Down Expand Up @@ -2938,6 +2949,12 @@ export class AgentActivity implements RecognitionHooks {
'playout completed with interrupt',
);
}
if (this.agentSession.agentState === 'speaking') {
this.agentSession._updateAgentState('listening');
if (this.audioRecognition) {
this.audioRecognition.onEndOfAgentSpeech(Date.now());
}
}
speechHandle._markGenerationDone();
await executeToolsTask.cancelAndWait(AgentActivity.REPLY_TASK_CANCEL_TIMEOUT);

Expand Down Expand Up @@ -2975,6 +2992,9 @@ export class AgentActivity implements RecognitionHooks {
this.agentSession._updateAgentState('thinking');
} else if (this.agentSession.agentState === 'speaking') {
this.agentSession._updateAgentState('listening');
if (this.audioRecognition) {
this.audioRecognition.onEndOfAgentSpeech(Date.now());
}
}

if (toolOutput.output.length === 0) {
Expand Down Expand Up @@ -3441,7 +3461,7 @@ export class AgentActivity implements RecognitionHooks {
otelContext: this.pausedSpeech.handle._agentTurnContext,
});
if (this.audioRecognition && this.pausedSpeech.agentState === 'speaking') {
this.audioRecognition.onStartOfAgentSpeech();
this.audioRecognition.onStartOfAgentSpeech(Date.now());
}
if (this.isInterruptionDetectionEnabled) {
this.disableVadInterruptionSoon();
Expand Down
Loading
Loading