diff --git a/.changeset/chatty-gorillas-search.md b/.changeset/chatty-gorillas-search.md
new file mode 100644
index 000000000..6a4c2dbaa
--- /dev/null
+++ b/.changeset/chatty-gorillas-search.md
@@ -0,0 +1,5 @@
+---
+"@livekit/agents": patch
+---
+
+Improve audio discard checks
diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts
index 639be6405..da12153e4 100644
--- a/agents/src/voice/agent_activity.ts
+++ b/agents/src/voice/agent_activity.ts
@@ -526,6 +526,7 @@ export class AgentActivity implements RecognitionHooks {
       sttModel: this.stt?.label,
       sttProvider: this.getSttProvider(),
       getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
+      shouldDiscardAudioForStt: () => this.shouldDiscardInputAudio(),
     });

     if (reuseResources?.sttPipeline) {
@@ -819,11 +820,9 @@ export class AgentActivity implements RecognitionHooks {
     // than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
     // if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
     // and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
-    const aecWarmupAudioFilter = new TransformStream({
+    const discardAudioFilter = new TransformStream({
       transform: (frame, controller) => {
-        const shouldDiscardForAecWarmup =
-          this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
-        if (!shouldDiscardForAecWarmup) {
+        if (!this.shouldDiscardInputAudio()) {
           controller.enqueue(frame);
         }
       },
@@ -832,22 +831,36 @@
     this.audioStreamId = this.audioStream.addInputStream(audioStream);

     if (this.realtimeSession && this.audioRecognition) {
-      const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
-        .pipeThrough(aecWarmupAudioFilter)
-        .tee();
-      this.realtimeSession.setInputAudioStream(realtimeAudioStream);
+      const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
+      this.realtimeSession.setInputAudioStream(realtimeAudioStream.pipeThrough(discardAudioFilter));
       this.audioRecognition.setInputAudioStream(recognitionAudioStream);
     } else if (this.realtimeSession) {
       this.realtimeSession.setInputAudioStream(
-        this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
+        this.audioStream.stream.pipeThrough(discardAudioFilter),
       );
     } else if (this.audioRecognition) {
-      this.audioRecognition.setInputAudioStream(
-        this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
-      );
+      this.audioRecognition.setInputAudioStream(this.audioStream.stream);
     }
   }

+  private shouldDiscardInputAudio(): boolean {
+    const aecWarmupActive =
+      this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
+
+    const discardAudioIfUninterruptible =
+      this.agent.turnHandling?.interruption?.discardAudioIfUninterruptible ??
+      this.agentSession.sessionOptions.turnHandling.interruption.discardAudioIfUninterruptible;
+
+    const uninterruptibleSpeechActive =
+      this._currentSpeech !== undefined &&
+      !this._currentSpeech.done() &&
+      !this._currentSpeech.interrupted &&
+      !this._currentSpeech.allowInterruptions &&
+      discardAudioIfUninterruptible;
+
+    return aecWarmupActive || uninterruptibleSpeechActive;
+  }
+
   detachAudioInput(): void {
     if (this.audioStreamId === undefined) {
       return;
diff --git a/agents/src/voice/audio_recognition.ts b/agents/src/voice/audio_recognition.ts
index f3ed51a4e..c02e6dea5 100644
--- a/agents/src/voice/audio_recognition.ts
+++ b/agents/src/voice/audio_recognition.ts
@@ -158,6 +158,8 @@ export interface AudioRecognitionOptions {
   sttProvider?: string;
   /** Getter for linked participant for span attribution */
   getLinkedParticipant?: () => ParticipantLike | undefined;
+  /** Predicate used to skip frames for STT while still forwarding them to VAD/interruption. */
+  shouldDiscardAudioForStt?: (frame: AudioFrame) => boolean;
 }

 /**
@@ -308,12 +310,28 @@ export class AudioRecognition {
     );
     const primaryInputStream = this.deferredInputStream.stream.pipeThrough(broadcast);

+    const filterSttInput = (stream: ReadableStream) => {
+      if (!opts.shouldDiscardAudioForStt) {
+        return stream;
+      }
+
+      return stream.pipeThrough(
+        new TransformStream({
+          transform: (frame, controller) => {
+            if (!opts.shouldDiscardAudioForStt!(frame)) {
+              controller.enqueue(frame);
+            }
+          },
+        }),
+      );
+    };
+
     if (opts.interruptionDetection) {
       const [vadInputStream, teedInput] = primaryInputStream.tee();
       const [inputStream, sttInputStream] = teedInput.tee();
       this.vadInputStream = vadInputStream;
       this.sttInputStream = mergeReadableStreams(
-        sttInputStream,
+        filterSttInput(sttInputStream),
         this.silenceAudioTransform.readable,
       );
       this.interruptionStreamChannel = createStreamChannel();
@@ -322,7 +340,7 @@
       const [vadInputStream, sttInputStream] = primaryInputStream.tee();
       this.vadInputStream = vadInputStream;
       this.sttInputStream = mergeReadableStreams(
-        sttInputStream,
+        filterSttInput(sttInputStream),
         this.silenceAudioTransform.readable,
       );
     }
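Illustrative sketch (not part of the diff): the pattern used above is "tee first, then filter only the branch that should drop frames" with a predicate-driven TransformStream, so the unfiltered branch (e.g. VAD/interruption) still sees every frame. The AudioFrame type, filterFrames helper, and agentIsSpeaking predicate below are stand-ins, not names from the PR.

// Sketch only: predicate-driven frame discarding over standard web streams.
// `AudioFrame`, `filterFrames`, and `agentIsSpeaking` are hypothetical stand-ins.
type AudioFrame = { samples: Int16Array };

function filterFrames(
  stream: ReadableStream<AudioFrame>,
  shouldDiscard: (frame: AudioFrame) => boolean,
): ReadableStream<AudioFrame> {
  // Drop frames the predicate rejects; forward everything else unchanged.
  return stream.pipeThrough(
    new TransformStream<AudioFrame, AudioFrame>({
      transform: (frame, controller) => {
        if (!shouldDiscard(frame)) {
          controller.enqueue(frame);
        }
      },
    }),
  );
}

// Tee first, then filter one branch: pipeThrough locks its source stream, so
// applying the filter after tee() keeps the other consumer unfiltered and
// avoids holding a lock on the shared source.
declare const source: ReadableStream<AudioFrame>;
declare const agentIsSpeaking: () => boolean;
const [realtimeBranch, recognitionBranch] = source.tee();
const filteredRealtime = filterFrames(realtimeBranch, () => agentIsSpeaking());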