Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/chatty-gorillas-search.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@livekit/agents": patch
---

Improve audio discard checks
37 changes: 25 additions & 12 deletions agents/src/voice/agent_activity.ts
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,7 @@ export class AgentActivity implements RecognitionHooks {
sttModel: this.stt?.label,
sttProvider: this.getSttProvider(),
getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
shouldDiscardAudioForStt: () => this.shouldDiscardInputAudio(),
});

if (reuseResources?.sttPipeline) {
Expand Down Expand Up @@ -819,11 +820,9 @@ export class AgentActivity implements RecognitionHooks {
// than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
// if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
// and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
const discardAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
transform: (frame, controller) => {
const shouldDiscardForAecWarmup =
this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
if (!shouldDiscardForAecWarmup) {
if (!this.shouldDiscardInputAudio()) {
controller.enqueue(frame);
}
},
Expand All @@ -832,22 +831,36 @@ export class AgentActivity implements RecognitionHooks {
this.audioStreamId = this.audioStream.addInputStream(audioStream);

if (this.realtimeSession && this.audioRecognition) {
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
.pipeThrough(aecWarmupAudioFilter)
.tee();
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
this.realtimeSession.setInputAudioStream(realtimeAudioStream.pipeThrough(discardAudioFilter));
this.audioRecognition.setInputAudioStream(recognitionAudioStream);
} else if (this.realtimeSession) {
this.realtimeSession.setInputAudioStream(
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
this.audioStream.stream.pipeThrough(discardAudioFilter),
);
} else if (this.audioRecognition) {
this.audioRecognition.setInputAudioStream(
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
);
this.audioRecognition.setInputAudioStream(this.audioStream.stream);
}
}

private shouldDiscardInputAudio(): boolean {
  // Decide whether incoming user audio should be dropped before it reaches
  // the realtime model / STT. Two independent conditions trigger a discard.

  // Case 1: AEC warmup. While the agent is speaking and warmup frames remain,
  // drop input — NOTE(review): presumably this is residual echo the AEC has
  // not converged on yet; confirm against the warmup counter's producer.
  if (this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0) {
    return true;
  }

  // Case 2: uninterruptible speech. If configured, drop input while a live,
  // non-interrupted speech handle that disallows interruptions is in flight.
  // Agent-level turn-handling config takes precedence over the session default.
  const discardWhileUninterruptible =
    this.agent.turnHandling?.interruption?.discardAudioIfUninterruptible ??
    this.agentSession.sessionOptions.turnHandling.interruption.discardAudioIfUninterruptible;

  const speech = this._currentSpeech;
  return (
    speech !== undefined &&
    !speech.done() &&
    !speech.interrupted &&
    !speech.allowInterruptions &&
    discardWhileUninterruptible
  );
}

detachAudioInput(): void {
if (this.audioStreamId === undefined) {
return;
Expand Down
22 changes: 20 additions & 2 deletions agents/src/voice/audio_recognition.ts
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ export interface AudioRecognitionOptions {
sttProvider?: string;
/** Getter for linked participant for span attribution */
getLinkedParticipant?: () => ParticipantLike | undefined;
/** Predicate used to skip frames for STT while still forwarding them to VAD/interruption. */
shouldDiscardAudioForStt?: (frame: AudioFrame) => boolean;
}

/**
Expand Down Expand Up @@ -308,12 +310,28 @@ export class AudioRecognition {
);
const primaryInputStream = this.deferredInputStream.stream.pipeThrough(broadcast);

const filterSttInput = (stream: ReadableStream<AudioFrame>) => {
  // Capture the optional predicate once so TypeScript narrows it to a
  // definite function, instead of re-reading the optional property per frame
  // and silencing the compiler with a non-null assertion
  // (`opts.shouldDiscardAudioForStt!`).
  const shouldDiscard = opts.shouldDiscardAudioForStt;
  if (shouldDiscard === undefined) {
    // No discard predicate configured — forward the STT stream untouched.
    return stream;
  }

  // Drop every frame the predicate marks as discardable; all other frames
  // are enqueued unchanged for the STT consumer.
  return stream.pipeThrough(
    new TransformStream<AudioFrame, AudioFrame>({
      transform: (frame, controller) => {
        if (!shouldDiscard(frame)) {
          controller.enqueue(frame);
        }
      },
    }),
  );
};

if (opts.interruptionDetection) {
const [vadInputStream, teedInput] = primaryInputStream.tee();
const [inputStream, sttInputStream] = teedInput.tee();
this.vadInputStream = vadInputStream;
this.sttInputStream = mergeReadableStreams(
sttInputStream,
filterSttInput(sttInputStream),
this.silenceAudioTransform.readable,
);
this.interruptionStreamChannel = createStreamChannel();
Expand All @@ -322,7 +340,7 @@ export class AudioRecognition {
const [vadInputStream, sttInputStream] = primaryInputStream.tee();
this.vadInputStream = vadInputStream;
this.sttInputStream = mergeReadableStreams(
sttInputStream,
filterSttInput(sttInputStream),
this.silenceAudioTransform.readable,
);
}
Expand Down
Loading