Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/chatty-gorillas-search.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@livekit/agents": patch
---

Improve audio discard checks
37 changes: 25 additions & 12 deletions agents/src/voice/agent_activity.ts
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,7 @@ export class AgentActivity implements RecognitionHooks {
sttModel: this.stt?.label,
sttProvider: this.getSttProvider(),
getLinkedParticipant: () => this.agentSession._roomIO?.linkedParticipant,
shouldDiscardAudioForStt: () => this.shouldDiscardInputAudio(),
});

if (reuseResources?.sttPipeline) {
Expand Down Expand Up @@ -819,11 +820,9 @@ export class AgentActivity implements RecognitionHooks {
// than on the source audioStream via pipeThrough. pipeThrough locks its source stream, so
// if it were applied directly on audioStream, that lock would survive MultiInputStream.close()
// and make audioStream permanently locked for subsequent attachAudioInput calls (e.g. handoff).
const aecWarmupAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
const discardAudioFilter = new TransformStream<AudioFrame, AudioFrame>({
transform: (frame, controller) => {
const shouldDiscardForAecWarmup =
this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0;
if (!shouldDiscardForAecWarmup) {
if (!this.shouldDiscardInputAudio()) {
controller.enqueue(frame);
}
},
Expand All @@ -832,22 +831,36 @@ export class AgentActivity implements RecognitionHooks {
this.audioStreamId = this.audioStream.addInputStream(audioStream);

if (this.realtimeSession && this.audioRecognition) {
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream
.pipeThrough(aecWarmupAudioFilter)
.tee();
this.realtimeSession.setInputAudioStream(realtimeAudioStream);
const [realtimeAudioStream, recognitionAudioStream] = this.audioStream.stream.tee();
this.realtimeSession.setInputAudioStream(realtimeAudioStream.pipeThrough(discardAudioFilter));
this.audioRecognition.setInputAudioStream(recognitionAudioStream);
} else if (this.realtimeSession) {
this.realtimeSession.setInputAudioStream(
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
this.audioStream.stream.pipeThrough(discardAudioFilter),
);
} else if (this.audioRecognition) {
this.audioRecognition.setInputAudioStream(
this.audioStream.stream.pipeThrough(aecWarmupAudioFilter),
);
this.audioRecognition.setInputAudioStream(this.audioStream.stream);
}
}

private shouldDiscardInputAudio(): boolean {
  // Decide whether incoming user audio should be dropped before it reaches
  // the realtime model / STT. Two independent conditions trigger a discard.

  // Case 1: AEC warmup. While the agent is speaking and warmup frames remain,
  // drop input — NOTE(review): presumably this is residual echo the AEC has
  // not converged on yet; confirm against the warmup counter's producer.
  if (this.agentSession.agentState === 'speaking' && this.agentSession._aecWarmupRemaining > 0) {
    return true;
  }

  // Case 2: uninterruptible speech. If configured, drop input while a live,
  // non-interrupted speech handle that disallows interruptions is in flight.
  // Agent-level turn-handling config takes precedence over the session default.
  const discardWhileUninterruptible =
    this.agent.turnHandling?.interruption?.discardAudioIfUninterruptible ??
    this.agentSession.sessionOptions.turnHandling.interruption.discardAudioIfUninterruptible;

  const speech = this._currentSpeech;
  return (
    speech !== undefined &&
    !speech.done() &&
    !speech.interrupted &&
    !speech.allowInterruptions &&
    discardWhileUninterruptible
  );
}

detachAudioInput(): void {
if (this.audioStreamId === undefined) {
return;
Expand Down
22 changes: 20 additions & 2 deletions agents/src/voice/audio_recognition.ts
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ export interface AudioRecognitionOptions {
sttProvider?: string;
/** Getter for linked participant for span attribution */
getLinkedParticipant?: () => ParticipantLike | undefined;
/** Predicate used to skip frames for STT while still forwarding them to VAD/interruption. */
shouldDiscardAudioForStt?: (frame: AudioFrame) => boolean;
}

/**
Expand Down Expand Up @@ -308,12 +310,28 @@ export class AudioRecognition {
);
const primaryInputStream = this.deferredInputStream.stream.pipeThrough(broadcast);

const filterSttInput = (stream: ReadableStream<AudioFrame>) => {
  // Capture the optional predicate once so TypeScript narrows it to a
  // definite function, instead of re-reading the optional property per frame
  // and silencing the compiler with a non-null assertion
  // (`opts.shouldDiscardAudioForStt!`).
  const shouldDiscard = opts.shouldDiscardAudioForStt;
  if (shouldDiscard === undefined) {
    // No discard predicate configured — forward the STT stream untouched.
    return stream;
  }

  // Drop every frame the predicate marks as discardable; all other frames
  // are enqueued unchanged for the STT consumer.
  return stream.pipeThrough(
    new TransformStream<AudioFrame, AudioFrame>({
      transform: (frame, controller) => {
        if (!shouldDiscard(frame)) {
          controller.enqueue(frame);
        }
      },
    }),
  );
};

if (opts.interruptionDetection) {
const [vadInputStream, teedInput] = primaryInputStream.tee();
const [inputStream, sttInputStream] = teedInput.tee();
this.vadInputStream = vadInputStream;
this.sttInputStream = mergeReadableStreams(
sttInputStream,
filterSttInput(sttInputStream),
this.silenceAudioTransform.readable,
);
this.interruptionStreamChannel = createStreamChannel();
Expand All @@ -322,7 +340,7 @@ export class AudioRecognition {
const [vadInputStream, sttInputStream] = primaryInputStream.tee();
this.vadInputStream = vadInputStream;
this.sttInputStream = mergeReadableStreams(
sttInputStream,
filterSttInput(sttInputStream),
this.silenceAudioTransform.readable,
);
}
Expand Down
Loading