From f65efd6df8a42ee6450a363aa9728da01e97fd75 Mon Sep 17 00:00:00 2001 From: lquerel Date: Sun, 29 Mar 2026 21:55:29 -0700 Subject: [PATCH 01/18] Add processor-local wakeups and fix workspace checks --- .../src/processors/batch_processor/mod.rs | 219 +++++----- .../durable_buffer_processor/mod.rs | 346 ++++++++++------ .../processors/log_sampling_processor/mod.rs | 1 + .../src/processors/retry_processor/mod.rs | 1 + .../crates/engine/src/control.rs | 13 + .../crates/engine/src/effect_handler.rs | 29 +- rust/otap-dataflow/crates/engine/src/lib.rs | 2 + .../crates/engine/src/local/processor.rs | 14 +- .../crates/engine/src/message.rs | 391 +++++++++++++++++- .../crates/engine/src/node_local_scheduler.rs | 322 +++++++++++++++ .../crates/engine/src/processor.rs | 15 +- .../crates/engine/src/shared/processor.rs | 14 +- .../crates/pdata/src/validation/collector.rs | 4 + .../crates/pdata/src/validation/scenarios.rs | 8 + 14 files changed, 1127 insertions(+), 252 deletions(-) create mode 100644 rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs index 5c9b76d542..84634b103f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs @@ -38,7 +38,7 @@ use otap_df_engine::MessageSourceLocalEffectHandlerExtension; use otap_df_engine::{ ConsumerEffectHandlerExtension, Interests, ProducerEffectHandlerExtension, config::ProcessorConfig, - control::{AckMsg, CallData, NackMsg, NodeControlMsg}, + control::{AckMsg, CallData, NackMsg, NodeControlMsg, WakeupSlot}, error::{Error as EngineError, ProcessorErrorKind}, local::processor as local, message::Message, @@ -78,6 +78,25 @@ pub const DEFAULT_MAX_BATCH_DURATION_MS: u64 = 200; const LOG_MSG_BATCHING_FAILED_PREFIX: &str = "OTAP batch processor: low-level 
batching failed for"; const LOG_MSG_BATCHING_FAILED_SUFFIX: &str = "; dropping"; +const WAKEUP_SLOT_OTAP_LOGS: WakeupSlot = WakeupSlot(0); +const WAKEUP_SLOT_OTAP_METRICS: WakeupSlot = WakeupSlot(1); +const WAKEUP_SLOT_OTAP_TRACES: WakeupSlot = WakeupSlot(2); +const WAKEUP_SLOT_OTLP_LOGS: WakeupSlot = WakeupSlot(3); +const WAKEUP_SLOT_OTLP_METRICS: WakeupSlot = WakeupSlot(4); +const WAKEUP_SLOT_OTLP_TRACES: WakeupSlot = WakeupSlot(5); + +const fn signal_from_wakeup_slot(slot: WakeupSlot) -> Option<(SignalFormat, SignalType)> { + match slot { + WAKEUP_SLOT_OTAP_LOGS => Some((SignalFormat::OtapRecords, SignalType::Logs)), + WAKEUP_SLOT_OTAP_METRICS => Some((SignalFormat::OtapRecords, SignalType::Metrics)), + WAKEUP_SLOT_OTAP_TRACES => Some((SignalFormat::OtapRecords, SignalType::Traces)), + WAKEUP_SLOT_OTLP_LOGS => Some((SignalFormat::OtlpBytes, SignalType::Logs)), + WAKEUP_SLOT_OTLP_METRICS => Some((SignalFormat::OtlpBytes, SignalType::Metrics)), + WAKEUP_SLOT_OTLP_TRACES => Some((SignalFormat::OtlpBytes, SignalType::Traces)), + _ => None, + } +} + /// How to size a batch. /// /// Note: these are not always supported. In the present code, the only @@ -149,9 +168,9 @@ trait Batcher { records: Vec, ) -> Result, PDataError>; - /// We are using an empty DelayData request as a one-shot - /// timer. This returns the appropriate empty request. - /// TODO: Add proper one-shot timer and cancellation, see #1472. + fn wakeup_slot(signal: SignalType) -> WakeupSlot; + + /// Returns the appropriate empty request payload for this signal. 
fn empty(signal: SignalType) -> T; } @@ -743,6 +762,14 @@ impl Batcher for SignalBuffer { SignalType::Traces => OtapArrowRecords::Traces(otap_df_pdata::otap::Traces::default()), } } + + fn wakeup_slot(signal: SignalType) -> WakeupSlot { + match signal { + SignalType::Logs => WAKEUP_SLOT_OTAP_LOGS, + SignalType::Metrics => WAKEUP_SLOT_OTAP_METRICS, + SignalType::Traces => WAKEUP_SLOT_OTAP_TRACES, + } + } } impl Batcher for SignalBuffer { @@ -763,6 +790,14 @@ impl Batcher for SignalBuffer { SignalType::Traces => OtlpProtoBytes::ExportTracesRequest(Bytes::new()), } } + + fn wakeup_slot(signal: SignalType) -> WakeupSlot { + match signal { + SignalType::Logs => WAKEUP_SLOT_OTLP_LOGS, + SignalType::Metrics => WAKEUP_SLOT_OTLP_METRICS, + SignalType::Traces => WAKEUP_SLOT_OTLP_TRACES, + } + } } impl<'a, T: OtapPayloadHelpers> BatchProcessorSignal<'a, T> @@ -848,6 +883,8 @@ where return Ok(()); } + let _ = effect.cancel_wakeup(SignalBuffer::::wakeup_slot(self.signal)); + // If this is a timer-based flush and we were called too soon, // skip. this may happen if the batch for which the timer was set // flushes for size before the timer. 
@@ -1099,6 +1136,32 @@ impl local::Processor for BatchProcessor { message: e.to_string(), } }), + NodeControlMsg::Wakeup { slot, when } => { + let Some((format, signal)) = signal_from_wakeup_slot(slot) else { + return Ok(()); + }; + + match format { + SignalFormat::OtapRecords => { + if let Some(mut otap_format) = self.otap_format() { + otap_format + .for_signal(signal) + .flush_signal_impl(effect, when, FlushReason::Timer) + .await?; + } + } + SignalFormat::OtlpBytes => { + if let Some(mut otlp_format) = self.otlp_format() { + otlp_format + .for_signal(signal) + .flush_signal_impl(effect, when, FlushReason::Timer) + .await?; + } + } + }; + + Ok(()) + } NodeControlMsg::DelayedData { data, when } => { let signal = data.signal_type(); @@ -1326,18 +1389,11 @@ where self.arrival = Some(now); effect - .delay_data( - now + timeout, - Box::new(OtapPdata::new( - Context::default(), - Self::empty(signal).into(), - )), - ) - .await + .set_wakeup(Self::wakeup_slot(signal), now + timeout) .map_err(|_| EngineError::ProcessorError { processor: effect.processor_id(), kind: ProcessorErrorKind::Other, - error: "could not set one-shot timer".into(), + error: "could not set wakeup".into(), source_detail: "".into(), }) } @@ -1367,12 +1423,12 @@ mod tests { use otap_df_engine::config::ProcessorConfig; use otap_df_engine::context::ControllerContext; use otap_df_engine::control::{ - NodeControlMsg, PipelineCompletionMsg, RuntimeControlMsg, pipeline_completion_msg_channel, + NodeControlMsg, PipelineCompletionMsg, pipeline_completion_msg_channel, runtime_ctrl_msg_channel, }; use otap_df_engine::message::Message; use otap_df_engine::node::Node; - use otap_df_engine::testing::liveness::{next_completion, next_runtime_control}; + use otap_df_engine::testing::liveness::next_completion; use otap_df_engine::testing::processor::TestRuntime; use otap_df_engine::testing::test_node; use otap_df_otap::pdata::OtapPdata; @@ -1620,7 +1676,7 @@ mod tests { #[derive(Clone)] enum TestEvent { 
Input(OtlpProtoMessage), - Elapsed, // Signal to deliver all pending DelayedData messages + Elapsed, // Signal to deliver due wakeups } /// Policy for acking or nacking an output @@ -1657,6 +1713,17 @@ mod tests { otap_to_otlp(&rec) } + const fn all_wakeup_slots() -> [WakeupSlot; 6] { + [ + WAKEUP_SLOT_OTAP_LOGS, + WAKEUP_SLOT_OTAP_METRICS, + WAKEUP_SLOT_OTAP_TRACES, + WAKEUP_SLOT_OTLP_LOGS, + WAKEUP_SLOT_OTLP_METRICS, + WAKEUP_SLOT_OTLP_TRACES, + ] + } + fn run_batch_processor_test( events: impl Iterator, subscribe: bool, @@ -1686,10 +1753,8 @@ mod tests { phase .run_test(move |mut ctx| async move { - let (runtime_ctrl_tx, mut runtime_ctrl_rx) = runtime_ctrl_msg_channel(10); let (pipeline_completion_tx, mut pipeline_completion_rx) = pipeline_completion_msg_channel(10); - ctx.set_runtime_ctrl_sender(runtime_ctrl_tx); ctx.set_pipeline_completion_sender(pipeline_completion_tx); // Track outputs by event position @@ -1702,16 +1767,11 @@ mod tests { let mut received_acks: Vec = Vec::new(); let mut received_nacks: Vec = Vec::new(); - // Track latest DelayedData message - let mut pending_delay: Option<(Instant, Box)> = None; let mut input_idx = 0; let mut total_outputs = 0; // Process each event in sequence for (event_idx, event) in events.into_iter().enumerate() { - // Determine if this is an elapsed event - let is_elapsed = matches!(event, TestEvent::Elapsed); - // Process the event match event { TestEvent::Input(input_otlp) => { @@ -1744,20 +1804,15 @@ mod tests { input_idx += 1; } TestEvent::Elapsed => { - // Elapsed event - no input to process - } - } - - // If this is an Elapsed event, deliver the pending DelayedData if present - if is_elapsed { - if let Some((when, data)) = pending_delay.take() { - // Note we deliver "when" exactly as the DelayData requested, - // which is a future timestamp; however it's the deadline requested, - // and since "when" passes through, the comparison is succesful using - // the expected instant. 
- let delayed_msg = - Message::Control(NodeControlMsg::DelayedData { when, data }); - ctx.process(delayed_msg).await.expect("process delayed"); + let when = Instant::now() + Duration::from_secs(1); + for slot in all_wakeup_slots() { + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot, + when, + })) + .await + .expect("process wakeup"); + } } } @@ -1799,22 +1854,6 @@ mod tests { } } - // Drain control channel for DelayData requests and acks/nacks - loop { - match runtime_ctrl_rx.try_recv() { - Ok(RuntimeControlMsg::DelayData { when, data, .. }) => { - looped += 1; - pending_delay = Some((when, data)); - } - Ok(_) => { - panic!("unexpected case"); - } - Err(_) => { - break; - } - } - } - loop { match pipeline_completion_rx.try_recv() { Ok(PipelineCompletionMsg::DeliverAck { ack }) => { @@ -2010,11 +2049,11 @@ mod tests { test_timer_flush(datagen.generate_logs().into(), true); } - // The processor schedules one-shot DelayedData wakeups without cancelling older - // ones. This test proves that a stale wakeup is ignored and that the current - // wakeup still flushes the buffered input later. + // The processor replaces wakeups per slot. This test proves that an early + // wakeup is ignored and that the current wakeup still flushes the buffered + // input later. #[test] - fn test_timer_flush_ignores_stale_delayed_wakeup() { + fn test_timer_flush_ignores_stale_wakeup() { let (telemetry_registry, metrics_reporter, phase) = setup_test_runtime(json!({ "otap": { "min_size": 5, @@ -2026,9 +2065,6 @@ mod tests { phase .run_test(move |mut ctx| async move { - let (runtime_ctrl_tx, mut runtime_ctrl_rx) = runtime_ctrl_msg_channel(10); - ctx.set_runtime_ctrl_sender(runtime_ctrl_tx); - let mut datagen = DataGenerator::new(1); let first = datagen.generate_logs(); let second = datagen.generate_logs(); @@ -2043,20 +2079,6 @@ mod tests { "first input should remain buffered" ); - let RuntimeControlMsg::DelayData { - when: stale_when, - data: stale_data, - .. 
- } = next_runtime_control( - &mut runtime_ctrl_rx, - Duration::from_secs(1), - "initial batch timer wakeup", - ) - .await - else { - panic!("expected initial DelayData"); - }; - // The second input takes the buffer over the min size, so the processor flushes // before the original timer fires. let rec = encode_logs_otap_batch(&second).expect("encode logs"); @@ -2075,37 +2097,25 @@ mod tests { "new post-flush batch should remain buffered" ); - let RuntimeControlMsg::DelayData { - when: current_when, - data: current_data, - .. - } = next_runtime_control( - &mut runtime_ctrl_rx, - Duration::from_secs(1), - "replacement batch timer wakeup", - ) - .await - else { - panic!("expected replacement DelayData"); - }; - - ctx.process(Message::Control(NodeControlMsg::DelayedData { + let stale_when = Instant::now(); + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: WAKEUP_SLOT_OTAP_LOGS, when: stale_when, - data: stale_data, })) .await - .expect("process stale delayed data"); + .expect("process stale wakeup"); assert!( ctx.drain_pdata().await.is_empty(), "stale wakeup should be ignored" ); - ctx.process(Message::Control(NodeControlMsg::DelayedData { + let current_when = Instant::now() + Duration::from_secs(1); + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: WAKEUP_SLOT_OTAP_LOGS, when: current_when, - data: current_data, })) .await - .expect("process current delayed data"); + .expect("process current wakeup"); let final_flush = ctx.drain_pdata().await; assert_eq!( final_flush.len(), @@ -2691,9 +2701,6 @@ mod tests { phase .run_test(move |mut ctx| async move { - let (pipeline_tx, mut pipeline_rx) = runtime_ctrl_msg_channel(10); - ctx.set_runtime_ctrl_sender(pipeline_tx); - // Create test data let mut datagen = DataGenerator::new(1); let logs1: OtlpProtoMessage = datagen.generate_logs().into(); @@ -2704,8 +2711,6 @@ mod tests { let otap_message2 = otlp_to_otap(&logs2); let mut outputs = Vec::new(); - let mut pending_delays: Vec<(Instant, Box)> = 
Vec::new(); - // Send both ctx.process(Message::PData(OtapPdata::new_default(otlp_message1.into()))) .await @@ -2715,23 +2720,17 @@ mod tests { .await .expect("process otlp"); - // Drain control channel for DelayData - while let Ok(RuntimeControlMsg::DelayData { when, data, .. }) = - pipeline_rx.try_recv() - { - pending_delays.push((when, data)); - } - assert!( ctx.drain_pdata().await.is_empty(), "no outputs before timeout" ); - // Trigger timeout - for (when, data) in pending_delays { - ctx.process(Message::Control(NodeControlMsg::DelayedData { when, data })) + // Trigger timeout for both active batching slots. + let when = Instant::now() + Duration::from_secs(1); + for slot in [WAKEUP_SLOT_OTLP_LOGS, WAKEUP_SLOT_OTAP_LOGS] { + ctx.process(Message::Control(NodeControlMsg::Wakeup { slot, when })) .await - .expect("process delayed"); + .expect("process wakeup"); } // Drain outputs after timeout diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index 17689eeeca..549a9fe3a3 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -43,7 +43,7 @@ //! - `TimerTick`: Poll storage for bundles, send downstream //! - `Ack`: Extract BundleRef from calldata, call handle.ack() //! - `Nack (permanent)`: Call handle.reject() — no retry -//! - `Nack (transient)`: Call handle.defer() and schedule retry via delay_data() +//! - `Nack (transient)`: Call handle.defer() and schedule retry via a wakeup //! - `Shutdown`: Flush storage engine //! //! 
# Retry Behavior and Error Handling @@ -104,7 +104,7 @@ use otap_df_config::node::NodeUserConfig; use otap_df_engine::config::ProcessorConfig; use otap_df_engine::context::PipelineContext; use otap_df_engine::control::Context8u8; -use otap_df_engine::control::{AckMsg, CallData, NackMsg, NodeControlMsg}; +use otap_df_engine::control::{AckMsg, CallData, NackMsg, NodeControlMsg, WakeupSlot}; use otap_df_engine::error::Error; use otap_df_engine::local::processor::EffectHandler; use otap_df_engine::message::Message; @@ -318,42 +318,6 @@ fn decode_bundle_ref(calldata: &CallData) -> Option { }) } -/// Encode a retry ticket into CallData for DelayedData scheduling. -/// -/// Layout: [segment_seq (u64), bundle_index (u32), retry_count (u32) packed into u64] -fn encode_retry_ticket(bundle_ref: BundleRef, retry_count: u32) -> CallData { - // Pack bundle_index (low 32 bits) and retry_count (high 32 bits) into one u64 - let packed = (bundle_ref.bundle_index.raw() as u64) | ((retry_count as u64) << 32); - smallvec![ - Context8u8::from(bundle_ref.segment_seq.raw()), - Context8u8::from(packed), - ] -} - -/// Decode a retry ticket from CallData. -/// -/// Returns (BundleRef, retry_count) if valid. -fn decode_retry_ticket(calldata: &CallData) -> Option<(BundleRef, u32)> { - if calldata.len() < 2 { - return None; - } - let segment_seq = SegmentSeq::new(u64::from(calldata[0])); - let packed = u64::from(calldata[1]); - let bundle_index = BundleIndex::new((packed & 0xFFFF_FFFF) as u32); - let retry_count = (packed >> 32) as u32; - Some(( - BundleRef { - segment_seq, - bundle_index, - }, - retry_count, - )) -} - -// ───────────────────────────────────────────────────────────────────────────── -// Pending Bundle Tracking -// ───────────────────────────────────────────────────────────────────────────── - /// State for tracking a pending downstream delivery. /// /// Holds the Quiver bundle handle to keep the bundle claimed while in-flight. 
@@ -370,6 +334,13 @@ struct PendingBundle { signal_type: SignalType, } +/// Local retry state held between wakeup scheduling and wakeup delivery. +#[derive(Clone, Copy)] +struct RetryWakeup { + bundle_ref: BundleRef, + retry_count: u32, +} + /// Result of attempting to process a bundle with non-blocking send. enum ProcessBundleResult { /// Bundle was successfully sent downstream. @@ -440,11 +411,19 @@ pub struct DurableBuffer { /// Key is the (segment_seq, bundle_index) pair encoded as a u128 for fast lookup. pending_bundles: HashMap<(u64, u32), PendingBundle>, - /// Bundles scheduled for retry via delay_data. + /// Bundles scheduled for retry via node-local wakeups. /// These are skipped by poll_next_bundle to enforce backoff. - /// Removed when the delay fires and claim_bundle is called. retry_scheduled: HashSet<(u64, u32)>, + /// Wakeup slot assigned to each bundle currently waiting for retry. + retry_wakeup_slots: HashMap<(u64, u32), WakeupSlot>, + + /// Retry state keyed by wakeup slot. + retry_wakeups: HashMap, + + /// Monotonic slot allocator for retry wakeups. + next_retry_wakeup_slot: u64, + /// Configuration. config: DurableBufferConfig, @@ -538,6 +517,9 @@ impl DurableBuffer { engine_state: EngineState::Uninitialized, pending_bundles: HashMap::new(), retry_scheduled: HashSet::new(), + retry_wakeup_slots: HashMap::new(), + retry_wakeups: HashMap::new(), + next_retry_wakeup_slot: 0, config, core_id, num_cores, @@ -633,14 +615,13 @@ impl DurableBuffer { self.pending_bundles.len() < self.config.max_in_flight } - /// Schedule a retry for a bundle via delay_data. + /// Schedule a retry for a bundle via a processor-local wakeup. /// - /// This is the single point of coordination between `delay_data` scheduling - /// and `retry_scheduled` tracking. Always use this method instead of calling - /// `delay_data` directly to ensure the two stay in sync. + /// This is the single point of coordination between wakeup scheduling and + /// `retry_scheduled` tracking. 
Always use this method to keep the two in sync. /// /// Returns true if scheduling succeeded, false if it failed (caller should - /// let poll_next_bundle pick up the bundle instead). + /// let `poll_next_bundle` pick up the bundle instead). async fn schedule_retry( &mut self, bundle_ref: BundleRef, @@ -649,40 +630,45 @@ impl DurableBuffer { effect_handler: &mut EffectHandler, ) -> bool { let key = (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()); - - // Create a lightweight retry ticket - // TODO(#1472): Replace with proper timer support when available. - // Currently we abuse delay_data() with an empty payload as a workaround - // for the lack of a native "schedule callback" primitive. - let retry_ticket = OtapPdata::new( - Default::default(), - OtapPayload::empty(SignalType::Traces), // Signal type doesn't matter for empty payload - ); - let calldata = encode_retry_ticket(bundle_ref, retry_count); - let mut retry_ticket = Box::new(retry_ticket); - effect_handler.subscribe_to(Interests::empty(), calldata, &mut retry_ticket); - + let (slot, is_new_slot) = match self.retry_wakeup_slots.entry(key) { + Entry::Occupied(entry) => (*entry.get(), false), + Entry::Vacant(entry) => { + let slot = WakeupSlot(self.next_retry_wakeup_slot); + self.next_retry_wakeup_slot = self.next_retry_wakeup_slot.saturating_add(1); + let _ = entry.insert(slot); + (slot, true) + } + }; let retry_at = Instant::now() + delay; - if effect_handler - .delay_data(retry_at, retry_ticket) - .await - .is_ok() - { + if effect_handler.set_wakeup(slot, retry_at).is_ok() { // Track that this bundle is scheduled - poll_next_bundle will skip it let _ = self.retry_scheduled.insert(key); + let _ = self.retry_wakeups.insert( + slot, + RetryWakeup { + bundle_ref, + retry_count, + }, + ); true } else { - // Failed to schedule - don't add to retry_scheduled, poll will pick it up + if is_new_slot { + let _ = self.retry_wakeup_slots.remove(&key); + } false } } - /// Remove a bundle from 
retry_scheduled tracking. - /// - /// Call this when the delay has fired and we're about to process the retry. - fn unschedule_retry(&mut self, bundle_ref: BundleRef) { - let key = (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()); + /// Remove retry-wakeup tracking for a bundle now being resumed. + fn take_retry_wakeup(&mut self, slot: WakeupSlot) -> Option { + let wakeup = self.retry_wakeups.remove(&slot)?; + let key = ( + wakeup.bundle_ref.segment_seq.raw(), + wakeup.bundle_ref.bundle_index.raw(), + ); let _ = self.retry_scheduled.remove(&key); + let _ = self.retry_wakeup_slots.remove(&key); + Some(wakeup) } /// Lazily initialize the Quiver engine on first use. @@ -1501,10 +1487,9 @@ impl DurableBuffer { /// For permanent NACKs (e.g., malformed data that will never succeed), the bundle /// is rejected immediately without retry. /// - /// For transient NACKs, schedules a retry with exponential backoff using `delay_data()`. - /// The bundle is deferred in Quiver (releasing the claim) and a lightweight - /// retry ticket is scheduled. When the delay expires, `handle_delayed_retry` - /// will re-claim the bundle and attempt redelivery. + /// For transient NACKs, schedules a retry with exponential backoff using a + /// processor-local wakeup. The bundle is deferred in Quiver (releasing the + /// claim) and local retry state is retained until the wakeup fires. async fn handle_nack( &mut self, nack: NackMsg, @@ -1601,29 +1586,26 @@ impl DurableBuffer { Ok(()) } - /// Handle a delayed retry ticket. + /// Handle a retry wakeup. /// /// Re-claims the bundle from Quiver and attempts redelivery downstream. 
- async fn handle_delayed_retry( + async fn handle_retry_wakeup( &mut self, - retry_ticket: Box, + slot: WakeupSlot, effect_handler: &mut EffectHandler, ) -> Result<(), Error> { - // Decode the retry ticket - let Some(calldata) = retry_ticket.source_route() else { - otel_warn!("durable_buffer.retry.missing_calldata"); - return Ok(()); - }; - - let Some((bundle_ref, retry_count)) = decode_retry_ticket(&calldata.calldata) else { - otel_warn!("durable_buffer.retry.invalid_calldata"); + let Some(RetryWakeup { + bundle_ref, + retry_count, + }) = self.take_retry_wakeup(slot) + else { + otel_warn!("durable_buffer.retry.unknown_wakeup", wakeup_slot = slot.0); return Ok(()); }; // Check max_in_flight limit if !self.can_send_more() { // At capacity - re-schedule with a short delay. - // Bundle stays in retry_scheduled (wasn't removed yet). otel_debug!( "durable_buffer.retry.deferred", segment_seq = bundle_ref.segment_seq.raw(), @@ -1632,8 +1614,6 @@ impl DurableBuffer { max_in_flight = self.config.max_in_flight ); - // Re-schedule - note: bundle is still in retry_scheduled, schedule_retry - // will just update it (insert is idempotent for HashSet) if !self .schedule_retry( bundle_ref, @@ -1643,17 +1623,11 @@ impl DurableBuffer { ) .await { - // Failed to re-schedule - remove from retry_scheduled so poll can pick it up - self.unschedule_retry(bundle_ref); otel_warn!("durable_buffer.retry.reschedule_failed"); } return Ok(()); } - // Backoff period has elapsed and we have capacity - remove from retry_scheduled. - // This allows poll_next_bundle to see it again if claim_bundle fails. - self.unschedule_retry(bundle_ref); - // Re-claim the bundle from Quiver let claim_result = { let (engine, subscriber_id) = self.engine()?; @@ -1924,15 +1898,10 @@ impl otap_df_engine::local::processor::Processor for DurableBuffer { Ok(()) } NodeControlMsg::DrainIngress { .. } => Ok(()), - NodeControlMsg::DelayedData { data, .. 
} => { - // Check if this is a retry ticket (has BundleRef + retry_count in calldata) - if let Some(route) = data.source_route() { - if decode_retry_ticket(&route.calldata).is_some() { - // This is a retry ticket - handle retry - return self.handle_delayed_retry(data, effect_handler).await; - } - } - // Not a retry ticket - shouldn't happen, but handle gracefully + NodeControlMsg::Wakeup { slot, .. } => { + self.handle_retry_wakeup(slot, effect_handler).await + } + NodeControlMsg::DelayedData { .. } => { otel_warn!("durable_buffer.delayed_data.unexpected"); Ok(()) } @@ -2015,45 +1984,166 @@ mod tests { } #[test] - fn test_retry_ticket_encoding_roundtrip() { + fn test_take_retry_wakeup_clears_tracking() { + use otap_df_engine::context::ControllerContext; + use otap_df_telemetry::registry::TelemetryRegistryHandle; + + let registry = TelemetryRegistryHandle::default(); + let controller_ctx = ControllerContext::new(registry); + let pipeline_ctx = + controller_ctx.pipeline_context_with("test".into(), "test".into(), 0, 1, 0); + + let config = DurableBufferConfig { + path: std::path::PathBuf::from("/tmp/test-retry-wakeup"), + retention_size_cap: byte_unit::Byte::from_u64(256 * 1024 * 1024), + max_age: None, + size_cap_policy: SizeCapPolicy::Backpressure, + poll_interval: Duration::from_millis(100), + otlp_handling: OtlpHandling::PassThrough, + max_segment_open_duration: Duration::from_secs(1), + initial_retry_interval: Duration::from_secs(1), + max_retry_interval: Duration::from_secs(30), + retry_multiplier: 2.0, + max_in_flight: 1000, + }; + + let mut processor = DurableBuffer::new(config, &pipeline_ctx).unwrap(); let bundle_ref = BundleRef { segment_seq: SegmentSeq::new(98765), bundle_index: BundleIndex::new(123), }; - let retry_count = 7u32; - - let calldata = encode_retry_ticket(bundle_ref, retry_count); - let decoded = decode_retry_ticket(&calldata); + let key = (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()); + let slot = WakeupSlot(7); + let _ = 
processor.retry_scheduled.insert(key); + let _ = processor.retry_wakeup_slots.insert(key, slot); + let _ = processor.retry_wakeups.insert( + slot, + RetryWakeup { + bundle_ref, + retry_count: 3, + }, + ); - assert!(decoded.is_some()); - let (decoded_ref, decoded_count) = decoded.unwrap(); - assert_eq!(decoded_ref.segment_seq.raw(), 98765); - assert_eq!(decoded_ref.bundle_index.raw(), 123); - assert_eq!(decoded_count, 7); + let taken = processor + .take_retry_wakeup(slot) + .expect("retry wakeup should exist"); + assert_eq!(taken.bundle_ref.segment_seq.raw(), 98765); + assert_eq!(taken.bundle_ref.bundle_index.raw(), 123); + assert_eq!(taken.retry_count, 3); + assert!(!processor.retry_scheduled.contains(&key)); + assert!(!processor.retry_wakeup_slots.contains_key(&key)); + assert!(!processor.retry_wakeups.contains_key(&slot)); } #[test] - fn test_retry_ticket_encoding_max_values() { - let bundle_ref = BundleRef { - segment_seq: SegmentSeq::new(u64::MAX), - bundle_index: BundleIndex::new(u32::MAX), - }; - let retry_count = u32::MAX; + fn test_take_retry_wakeup_unknown_slot_is_ignored() { + use otap_df_engine::context::ControllerContext; + use otap_df_telemetry::registry::TelemetryRegistryHandle; + + let registry = TelemetryRegistryHandle::default(); + let controller_ctx = ControllerContext::new(registry); + let pipeline_ctx = + controller_ctx.pipeline_context_with("test".into(), "test".into(), 0, 1, 0); - let calldata = encode_retry_ticket(bundle_ref, retry_count); - let decoded = decode_retry_ticket(&calldata); + let config = DurableBufferConfig { + path: std::path::PathBuf::from("/tmp/test-retry-wakeup-miss"), + retention_size_cap: byte_unit::Byte::from_u64(256 * 1024 * 1024), + max_age: None, + size_cap_policy: SizeCapPolicy::Backpressure, + poll_interval: Duration::from_millis(100), + otlp_handling: OtlpHandling::PassThrough, + max_segment_open_duration: Duration::from_secs(1), + initial_retry_interval: Duration::from_secs(1), + max_retry_interval: 
Duration::from_secs(30), + retry_multiplier: 2.0, + max_in_flight: 1000, + }; - assert!(decoded.is_some()); - let (decoded_ref, decoded_count) = decoded.unwrap(); - assert_eq!(decoded_ref.segment_seq.raw(), u64::MAX); - assert_eq!(decoded_ref.bundle_index.raw(), u32::MAX); - assert_eq!(decoded_count, u32::MAX); + let mut processor = DurableBuffer::new(config, &pipeline_ctx).unwrap(); + assert!(processor.take_retry_wakeup(WakeupSlot(999)).is_none()); } #[test] - fn test_decode_retry_ticket_empty_calldata() { - let calldata: CallData = smallvec![]; - assert!(decode_retry_ticket(&calldata).is_none()); + fn test_retry_wakeup_resumes_retry_logic() { + use otap_df_config::node::NodeUserConfig; + use otap_df_engine::config::ProcessorConfig; + use otap_df_engine::context::ControllerContext; + use otap_df_engine::control::pipeline_completion_msg_channel; + use otap_df_engine::message::Message; + use otap_df_engine::testing::processor::TestRuntime; + use otap_df_engine::testing::test_node; + use otap_df_otap::testing::next_nack; + use otap_df_pdata::encode::encode_logs_otap_batch; + use otap_df_pdata::testing::fixtures::DataGenerator; + use serde_json::json; + + let rt = TestRuntime::new(); + let controller = ControllerContext::new(rt.metrics_registry()); + let pipeline_ctx = controller.pipeline_context_with("grp".into(), "pipe".into(), 0, 1, 0); + let temp_dir = tempfile::tempdir().expect("tempdir"); + + let mut node_config = NodeUserConfig::new_processor_config(DURABLE_BUFFER_URN); + node_config.config = json!({ + "path": temp_dir.path(), + "retention_size_cap": "256 MiB", + "poll_interval": "100ms", + "max_segment_open_duration": "1s", + "initial_retry_interval": "1s", + "max_retry_interval": "30s", + "retry_multiplier": 2.0, + "max_in_flight": 1000 + }); + + let processor = create_durable_buffer( + pipeline_ctx, + test_node("durable-buffer-retry-wakeup"), + Arc::new(node_config), + &ProcessorConfig::new("durable-buffer-retry-wakeup"), + ) + .expect("create durable 
buffer"); + + rt.set_processor(processor) + .run_test(move |mut ctx| async move { + let (pipeline_completion_tx, _pipeline_completion_rx) = + pipeline_completion_msg_channel(10); + ctx.set_pipeline_completion_sender(pipeline_completion_tx); + + let mut datagen = DataGenerator::new(1); + let input = datagen.generate_logs(); + let rec = encode_logs_otap_batch(&input).expect("encode logs"); + ctx.process(Message::PData(OtapPdata::new_default(rec.into()))) + .await + .expect("process input"); + + ctx.process(Message::Control(NodeControlMsg::TimerTick {})) + .await + .expect("process timer tick"); + let mut outputs = ctx.drain_pdata().await; + assert_eq!(outputs.len(), 1, "timer tick should emit one bundle"); + + let sent = outputs.pop().expect("sent bundle"); + let (_, nack) = + next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); + ctx.process(Message::Control(NodeControlMsg::Nack(nack))) + .await + .expect("process nack"); + assert!( + ctx.drain_pdata().await.is_empty(), + "nack should defer delivery until wakeup" + ); + + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: WakeupSlot(0), + when: Instant::now() + Duration::from_secs(1), + })) + .await + .expect("process retry wakeup"); + + let retried = ctx.drain_pdata().await; + assert_eq!(retried.len(), 1, "wakeup should resume retry delivery"); + assert_eq!(retried[0].signal_type(), SignalType::Logs); + }) + .validate(|_| async {}); } #[test] diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs index 7409b2ac9d..45f1dd162f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs @@ -199,6 +199,7 @@ impl local::Processor for LogSamplingProcessor { | NodeControlMsg::Ack(_) | NodeControlMsg::Nack(_) | NodeControlMsg::DrainIngress { .. 
} + | NodeControlMsg::Wakeup { .. } | NodeControlMsg::DelayedData { .. } => Ok(()), }, } diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs index 83dd314126..3bc2ccdb00 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs @@ -651,6 +651,7 @@ impl Processor for RetryProcessor { NodeControlMsg::TimerTick { .. } => { unreachable!("unused"); } + NodeControlMsg::Wakeup { .. } => Ok(()), NodeControlMsg::DrainIngress { .. } => Ok(()), NodeControlMsg::Shutdown { .. } => Ok(()), }, diff --git a/rust/otap-dataflow/crates/engine/src/control.rs b/rust/otap-dataflow/crates/engine/src/control.rs index 09a30065ad..8135fbfa0b 100644 --- a/rust/otap-dataflow/crates/engine/src/control.rs +++ b/rust/otap-dataflow/crates/engine/src/control.rs @@ -75,6 +75,11 @@ impl From for f64 { /// numbers, deadline, num_items, etc. pub type CallData = SmallVec<[Context8u8; 3]>; +/// Opaque key used to identify a node-local scheduled wakeup. +#[repr(transparent)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct WakeupSlot(pub u64); + /// Engine-managed call data envelope. Wraps the CallData with an envelope /// containing timestamp. Lives on the forward path (in context stack frames). #[derive(Clone, Debug, Default, PartialEq)] @@ -222,6 +227,14 @@ pub enum NodeControlMsg { metrics_reporter: MetricsReporter, }, + /// A processor-local wakeup scheduled by the processor effect handler. + Wakeup { + /// Scheduled wakeup slot. + slot: WakeupSlot, + /// Original scheduled wakeup instant. + when: Instant, + }, + /// Delayed data returning to the node which delayed it. 
DelayedData { /// When resumed diff --git a/rust/otap-dataflow/crates/engine/src/effect_handler.rs b/rust/otap-dataflow/crates/engine/src/effect_handler.rs index d93060af4d..b02c137842 100644 --- a/rust/otap-dataflow/crates/engine/src/effect_handler.rs +++ b/rust/otap-dataflow/crates/engine/src/effect_handler.rs @@ -4,13 +4,15 @@ //! Common foundation of all effect handlers. use crate::Interests; +use crate::WakeupError; use crate::completion_emission_metrics::CompletionEmissionMetricsHandle; use crate::control::{ AckMsg, NackMsg, PipelineCompletionMsg, PipelineCompletionMsgSender, RuntimeControlMsg, - RuntimeCtrlMsgSender, + RuntimeCtrlMsgSender, WakeupSlot, }; use crate::error::Error; use crate::node::NodeId; +use crate::node_local_scheduler::NodeLocalSchedulerHandle; use otap_df_channel::error::SendError; use otap_df_telemetry::error::Error as TelemetryError; use otap_df_telemetry::metrics::{MetricSet, MetricSetHandler}; @@ -58,6 +60,8 @@ pub(crate) struct EffectHandlerCore { pub(crate) source_tag: SourceTagging, /// Precomputed node interests derived from metric level. node_interests: Interests, + /// Optional processor-local wakeup scheduler. + local_scheduler: Option, } impl EffectHandlerCore { @@ -71,6 +75,7 @@ impl EffectHandlerCore { completion_emission_metrics: None, source_tag: SourceTagging::Disabled, node_interests: Interests::empty(), + local_scheduler: None, } } @@ -103,6 +108,11 @@ impl EffectHandlerCore { self.completion_emission_metrics = completion_emission_metrics; } + /// Sets the processor-local wakeup scheduler for this effect handler. + pub(crate) fn set_local_scheduler(&mut self, local_scheduler: NodeLocalSchedulerHandle) { + self.local_scheduler = Some(local_scheduler); + } + /// Returns outgoing messages source tagging mode. #[must_use] pub const fn source_tagging(&self) -> SourceTagging { @@ -397,6 +407,23 @@ impl EffectHandlerCore { }) } + /// Set or replace a processor-local wakeup. 
+ pub fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + self.local_scheduler + .as_ref() + .expect("node-local scheduler not set for processor effect handler") + .set_wakeup(slot, when) + } + + /// Cancel a previously scheduled processor-local wakeup. + #[must_use] + pub fn cancel_wakeup(&self, slot: WakeupSlot) -> bool { + self.local_scheduler + .as_ref() + .expect("node-local scheduler not set for processor effect handler") + .cancel_wakeup(slot) + } + /// Notifies the runtime control manager that this receiver has completed /// ingress drain. pub async fn notify_receiver_drained(&self) -> Result<(), Error> { diff --git a/rust/otap-dataflow/crates/engine/src/lib.rs b/rust/otap-dataflow/crates/engine/src/lib.rs index 7cadf00111..ea2a8ca7c2 100644 --- a/rust/otap-dataflow/crates/engine/src/lib.rs +++ b/rust/otap-dataflow/crates/engine/src/lib.rs @@ -68,6 +68,7 @@ pub mod engine_metrics; pub mod entity_context; pub mod local; pub mod node; +mod node_local_scheduler; pub mod output_router; pub mod pipeline_ctrl; mod pipeline_metrics; @@ -78,6 +79,7 @@ pub mod terminal_state; pub mod testing; pub mod topic; pub mod wiring_contract; +pub use node_local_scheduler::WakeupError; /// Trait for factory types that expose a name. /// diff --git a/rust/otap-dataflow/crates/engine/src/local/processor.rs b/rust/otap-dataflow/crates/engine/src/local/processor.rs index f78dc66665..b115d9636c 100644 --- a/rust/otap-dataflow/crates/engine/src/local/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/local/processor.rs @@ -33,7 +33,8 @@ //! in parallel on different cores, each with its own processor instance. 
use crate::Interests; -use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender}; +use crate::WakeupError; +use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender, WakeupSlot}; use crate::effect_handler::{ EffectHandlerCore, SourceTagging, TelemetryTimerCancelHandle, TimerCancelHandle, }; @@ -262,6 +263,17 @@ impl EffectHandler { self.core.delay_data(when, data).await } + /// Set or replace a processor-local wakeup. + pub fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + self.core.set_wakeup(slot, when) + } + + /// Cancel a previously scheduled processor-local wakeup. + #[must_use] + pub fn cancel_wakeup(&self, slot: WakeupSlot) -> bool { + self.core.cancel_wakeup(slot) + } + /// Reports metrics collected by the processor. #[allow(dead_code)] // Will be used in the future. ToDo report metrics from channel and messages. pub(crate) fn report_metrics( diff --git a/rust/otap-dataflow/crates/engine/src/message.rs b/rust/otap-dataflow/crates/engine/src/message.rs index 884e3fa5ac..c6a8957f5e 100644 --- a/rust/otap-dataflow/crates/engine/src/message.rs +++ b/rust/otap-dataflow/crates/engine/src/message.rs @@ -6,6 +6,7 @@ use crate::clock; use crate::control::{AckMsg, NackMsg, NodeControlMsg}; use crate::local::message::{LocalReceiver, LocalSender}; +use crate::node_local_scheduler::NodeLocalSchedulerHandle; use crate::shared::message::{SharedReceiver, SharedSender}; use crate::{Interests, ReceivedAtNode}; use otap_df_channel::error::{RecvError, SendError}; @@ -244,6 +245,7 @@ impl ChannelReceiver for SharedReceiver { /// /// This enum lets the shared core express that difference explicitly without /// forking the whole receive loop. +#[derive(Clone, Copy)] enum DrainPolicy { /// Respect the caller's admission flag even after shutdown has been /// latched. 
@@ -257,6 +259,7 @@ enum DrainPolicy { struct InboxCore { control_rx: Option, pdata_rx: Option, + local_scheduler: Option, /// Once a Shutdown is seen, this is set to `Some(instant)` representing the drain deadline. shutting_down_deadline: Option, /// Holds the ControlMsg::Shutdown until after we’ve drained pdata. @@ -270,10 +273,17 @@ struct InboxCore { } impl InboxCore { - fn new(control_rx: ControlRx, pdata_rx: PDataRx, node_id: usize, interests: Interests) -> Self { + fn new( + control_rx: ControlRx, + pdata_rx: PDataRx, + local_scheduler: Option, + node_id: usize, + interests: Interests, + ) -> Self { Self { control_rx: Some(control_rx), pdata_rx: Some(pdata_rx), + local_scheduler, shutting_down_deadline: None, pending_shutdown: None, node_id, @@ -285,6 +295,9 @@ impl InboxCore { fn shutdown(&mut self) { self.shutting_down_deadline = None; self.consecutive_control = 0; + if let Some(local_scheduler) = &self.local_scheduler { + local_scheduler.begin_shutdown(); + } drop(self.control_rx.take().expect("control_rx must exist")); drop(self.pdata_rx.take().expect("pdata_rx must exist")); } @@ -330,6 +343,33 @@ where accept_pdata || matches!(policy, DrainPolicy::ForceDrainDuringShutdown) } + fn shutdown_drain_complete(&self) -> bool { + self.pdata_rx + .as_ref() + .expect("pdata_rx must exist") + .is_empty() + && self + .local_scheduler + .as_ref() + .map(NodeLocalSchedulerHandle::is_drained) + .unwrap_or(true) + } + + fn pop_local_due(&mut self, now: Instant) -> Option> { + self.local_scheduler + .as_ref() + .and_then(|scheduler| scheduler.pop_due(now)) + .map(|(slot, when)| self.control_message(NodeControlMsg::Wakeup { slot, when })) + } + + fn next_local_expiry_sleep(&self, now: Instant) -> Option { + self.local_scheduler + .as_ref() + .and_then(NodeLocalSchedulerHandle::next_expiry) + .filter(|when| *when > now) + .map(clock::sleep_until) + } + async fn recv_with_policy( &mut self, accept_pdata: bool, @@ -373,12 +413,7 @@ where // only after the bounded pdata 
backlog is empty. This keeps the // channel-level drain contract explicit: upstream work that was // already accepted into the channel gets a chance to run first. - if self - .pdata_rx - .as_ref() - .expect("pdata_rx must exist") - .is_empty() - { + if self.shutdown_drain_complete() { let shutdown = self .pending_shutdown .take() @@ -392,6 +427,9 @@ where sleep_until_deadline = Some(clock::sleep_until(dl)); } + let now = clock::now(); + let mut sleep_until_local = self.next_local_expiry_sleep(now); + // Even while draining we cap control preference. This prevents a // sustained Ack/Nack or shutdown-control burst from starving the // already buffered pdata that shutdown is trying to drain. @@ -415,6 +453,28 @@ where } } + if !self + .control_rx + .as_ref() + .expect("control_rx must exist") + .is_empty() + { + match self + .control_rx + .as_mut() + .expect("control_rx must exist") + .try_recv() + { + Ok(msg) => return Ok(self.control_message(msg)), + Err(RecvError::Empty) => {} + Err(e) => return Err(e), + } + } + + if let Some(msg) = self.pop_local_due(now) { + return Ok(msg); + } + // Drain pdata (gated by accept_pdata) and deliver control messages. // Honoring accept_pdata during draining lets stateful processors // receive Ack/Nack to reduce in-flight state and reopen capacity. @@ -445,6 +505,22 @@ where Ok(msg) => return Ok(self.control_message(msg)), Err(e) => return Err(e), }, + + _ = async { + if let Some(delay) = sleep_until_local.as_mut() { + delay.await; + } + }, if sleep_until_local.is_some() => { + continue; + }, + + _ = async { + if let Some(local_scheduler) = self.local_scheduler.as_ref() { + local_scheduler.wait_for_change().await; + } + }, if self.local_scheduler.is_some() => { + continue; + }, } } else { tokio::select! 
{ @@ -473,11 +549,30 @@ where return Ok(Message::Control(shutdown)); } }, + + _ = async { + if let Some(delay) = sleep_until_local.as_mut() { + delay.await; + } + }, if sleep_until_local.is_some() => { + continue; + }, + + _ = async { + if let Some(local_scheduler) = self.local_scheduler.as_ref() { + local_scheduler.wait_for_change().await; + } + }, if self.local_scheduler.is_some() => { + continue; + }, } } } // Normal mode: no shutdown yet + let now = clock::now(); + let mut sleep_until_local = self.next_local_expiry_sleep(now); + if accept_pdata && self.consecutive_control >= CONTROL_BURST_LIMIT { match self .pdata_rx @@ -491,6 +586,43 @@ where } } + if !self + .control_rx + .as_ref() + .expect("control_rx must exist") + .is_empty() + { + match self + .control_rx + .as_mut() + .expect("control_rx must exist") + .try_recv() + { + Ok(NodeControlMsg::Shutdown { deadline, reason }) => { + if deadline <= clock::now() { + self.shutdown(); + return Ok(Message::Control(NodeControlMsg::Shutdown { + deadline, + reason, + })); + } + if let Some(local_scheduler) = &self.local_scheduler { + local_scheduler.begin_shutdown(); + } + self.shutting_down_deadline = Some(deadline); + self.pending_shutdown = Some(NodeControlMsg::Shutdown { deadline, reason }); + continue; + } + Ok(msg) => return Ok(self.control_message(msg)), + Err(RecvError::Empty) => {} + Err(e) => return Err(e), + } + } + + if let Some(msg) = self.pop_local_due(now) { + return Ok(msg); + } + if accept_pdata && self.consecutive_control >= CONTROL_BURST_LIMIT { tokio::select! 
{ biased; @@ -514,6 +646,9 @@ where self.shutdown(); return Ok(Message::Control(NodeControlMsg::Shutdown { deadline, reason })); } + if let Some(local_scheduler) = &self.local_scheduler { + local_scheduler.begin_shutdown(); + } self.shutting_down_deadline = Some(deadline); self.pending_shutdown = Some(NodeControlMsg::Shutdown { deadline, reason }); continue; @@ -521,6 +656,22 @@ where Ok(msg) => return Ok(self.control_message(msg)), Err(e) => return Err(e), }, + + _ = async { + if let Some(delay) = sleep_until_local.as_mut() { + delay.await; + } + }, if sleep_until_local.is_some() => { + continue; + }, + + _ = async { + if let Some(local_scheduler) = self.local_scheduler.as_ref() { + local_scheduler.wait_for_change().await; + } + }, if self.local_scheduler.is_some() => { + continue; + }, } } else { tokio::select! { @@ -536,6 +687,9 @@ where self.shutdown(); return Ok(Message::Control(NodeControlMsg::Shutdown { deadline, reason })); } + if let Some(local_scheduler) = &self.local_scheduler { + local_scheduler.begin_shutdown(); + } self.shutting_down_deadline = Some(deadline); self.pending_shutdown = Some(NodeControlMsg::Shutdown { deadline, reason }); continue; @@ -550,6 +704,22 @@ where Err(RecvError::Closed) => return Ok(self.closed_pdata_shutdown()), Err(e) => return Err(e), } + }, + + _ = async { + if let Some(delay) = sleep_until_local.as_mut() { + delay.await; + } + }, if sleep_until_local.is_some() => { + continue; + }, + + _ = async { + if let Some(local_scheduler) = self.local_scheduler.as_ref() { + local_scheduler.wait_for_change().await; + } + }, if self.local_scheduler.is_some() => { + continue; } } } @@ -574,9 +744,33 @@ impl ProcessorInbox { pdata_rx: Receiver, node_id: usize, interests: Interests, + ) -> Self { + Self::new_with_local_scheduler( + control_rx, + pdata_rx, + NodeLocalSchedulerHandle::new(32), + node_id, + interests, + ) + } + + /// Creates a new processor inbox with an explicit processor-local scheduler. 
+ #[must_use] + pub(crate) fn new_with_local_scheduler( + control_rx: Receiver>, + pdata_rx: Receiver, + local_scheduler: NodeLocalSchedulerHandle, + node_id: usize, + interests: Interests, ) -> Self { Self { - core: InboxCore::new(control_rx, pdata_rx, node_id, interests), + core: InboxCore::new( + control_rx, + pdata_rx, + Some(local_scheduler), + node_id, + interests, + ), } } } @@ -613,7 +807,7 @@ impl ExporterInbox { interests: Interests, ) -> Self { Self { - core: InboxCore::new(control_rx, pdata_rx, node_id, interests), + core: InboxCore::new(control_rx, pdata_rx, None, node_id, interests), } } } @@ -665,6 +859,185 @@ impl ExporterInbox { } } +/// Backward-compatible exporter inbox alias. +pub type ExporterMessageChannel< + PData, + ControlRx = Receiver>, + PDataRx = Receiver, +> = ExporterInbox; + /// Send-friendly exporter inbox type for shared exporter runtimes. pub(crate) type SharedExporterInbox = ExporterInbox>, SharedReceiver>; + +#[cfg(test)] +mod tests { + use super::*; + use crate::WakeupError; + use crate::local::message::LocalReceiver; + use crate::testing::TestMsg; + use otap_df_channel::mpsc; + use std::time::Duration; + + fn local_processor_inbox( + wakeup_capacity: usize, + ) -> ( + mpsc::Sender>, + mpsc::Sender, + NodeLocalSchedulerHandle, + ProcessorInbox, + ) { + let (control_tx, control_rx) = mpsc::Channel::>::new(64); + let (pdata_tx, pdata_rx) = mpsc::Channel::::new(64); + let scheduler = NodeLocalSchedulerHandle::new(wakeup_capacity); + let inbox = ProcessorInbox::new_with_local_scheduler( + Receiver::Local(LocalReceiver::mpsc(control_rx)), + Receiver::Local(LocalReceiver::mpsc(pdata_rx)), + scheduler.clone(), + 7, + Interests::empty(), + ); + (control_tx, pdata_tx, scheduler, inbox) + } + + #[tokio::test] + async fn processor_inbox_emits_due_wakeup_as_control_message() { + let (_control_tx, _pdata_tx, scheduler, mut inbox) = local_processor_inbox(4); + let when = Instant::now(); + scheduler + .set_wakeup(crate::control::WakeupSlot(0), 
when) + .expect("wakeup should schedule"); + + let message = tokio::time::timeout(Duration::from_millis(50), inbox.recv_when(true)) + .await + .expect("inbox should wake") + .expect("message should arrive"); + assert!(matches!( + message, + Message::Control(NodeControlMsg::Wakeup { + slot: crate::control::WakeupSlot(0), + when: observed, + }) if observed == when + )); + } + + #[tokio::test] + async fn processor_inbox_wakeup_preserves_control_fairness() { + let (_control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(64); + pdata_tx + .send_async(TestMsg::new("pdata")) + .await + .expect("pdata should enqueue"); + let when = Instant::now(); + for slot in 0..40 { + scheduler + .set_wakeup(crate::control::WakeupSlot(slot), when) + .expect("wakeup should schedule"); + } + + let mut wakeups = 0usize; + let mut saw_pdata = false; + while wakeups <= CONTROL_BURST_LIMIT { + match inbox.recv_when(true).await.expect("message should arrive") { + Message::PData(TestMsg(value)) => { + assert_eq!(value, "pdata"); + saw_pdata = true; + break; + } + Message::Control(NodeControlMsg::Wakeup { .. 
}) => { + wakeups += 1; + } + other => panic!("unexpected message {other:?}"), + } + } + + assert!( + saw_pdata, + "pdata should not starve behind processor-local wakeups" + ); + } + + #[tokio::test] + async fn processor_inbox_rejects_wakeups_after_shutdown_latch() { + let (control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(4); + pdata_tx + .send_async(TestMsg::new("buffered")) + .await + .expect("pdata should enqueue"); + control_tx + .send_async(NodeControlMsg::Shutdown { + deadline: Instant::now() + Duration::from_secs(1), + reason: "shutdown".to_owned(), + }) + .await + .expect("shutdown should enqueue"); + control_tx + .send_async(NodeControlMsg::Config { + config: serde_json::json!({"mode": "draining"}), + }) + .await + .expect("config should enqueue"); + + let first = inbox + .recv_when(false) + .await + .expect("control should arrive after shutdown latch"); + assert!(matches!( + first, + Message::Control(NodeControlMsg::Config { .. }) + )); + assert_eq!( + scheduler.set_wakeup(crate::control::WakeupSlot(1), Instant::now()), + Err(WakeupError::ShuttingDown) + ); + } + + #[tokio::test] + async fn processor_inbox_drops_pending_wakeups_on_shutdown_latch() { + let (control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(4); + pdata_tx + .send_async(TestMsg::new("buffered")) + .await + .expect("pdata should enqueue"); + scheduler + .set_wakeup(crate::control::WakeupSlot(2), Instant::now()) + .expect("wakeup should schedule"); + control_tx + .send_async(NodeControlMsg::Shutdown { + deadline: Instant::now() + Duration::from_secs(1), + reason: "shutdown".to_owned(), + }) + .await + .expect("shutdown should enqueue"); + control_tx + .send_async(NodeControlMsg::Config { + config: serde_json::json!({"drop": true}), + }) + .await + .expect("config should enqueue"); + + let first = inbox + .recv_when(false) + .await + .expect("control should arrive after shutdown latch"); + assert!(matches!( + first, + Message::Control(NodeControlMsg::Config 
{ .. }) + )); + + let drained = inbox + .recv_when(true) + .await + .expect("buffered pdata should drain"); + assert!(matches!(drained, Message::PData(TestMsg(ref value)) if value == "buffered")); + + let shutdown = inbox + .recv_when(true) + .await + .expect("shutdown should follow drain"); + assert!(matches!( + shutdown, + Message::Control(NodeControlMsg::Shutdown { .. }) + )); + } +} diff --git a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs new file mode 100644 index 0000000000..8cbc0fb658 --- /dev/null +++ b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs @@ -0,0 +1,322 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Node-local wakeup scheduling for processor inboxes. + +use crate::control::WakeupSlot; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, HashMap}; +use std::sync::{Arc, Mutex}; +use std::time::Instant; +use tokio::sync::Notify; + +/// Error returned when a wakeup request cannot be accepted. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum WakeupError { + /// The processor has already latched shutdown. + ShuttingDown, + /// The bounded live wakeup slot set is full. 
+ Capacity, +} + +#[derive(Clone, Copy, Debug)] +struct WakeupState { + when: Instant, + generation: u64, + sequence: u64, +} + +#[derive(Debug)] +struct ScheduledWakeup { + slot: WakeupSlot, + when: Instant, + generation: u64, + sequence: u64, +} + +impl Ord for ScheduledWakeup { + fn cmp(&self, other: &Self) -> Ordering { + other + .when + .cmp(&self.when) + .then_with(|| other.sequence.cmp(&self.sequence)) + } +} + +impl PartialOrd for ScheduledWakeup { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for ScheduledWakeup { + fn eq(&self, other: &Self) -> bool { + self.slot == other.slot + && self.when == other.when + && self.generation == other.generation + && self.sequence == other.sequence + } +} + +impl Eq for ScheduledWakeup {} + +struct NodeLocalScheduler { + wakeup_capacity: usize, + next_sequence: u64, + wakeups: BinaryHeap, + wakeup_state: HashMap, + shutting_down: bool, +} + +impl NodeLocalScheduler { + fn new(wakeup_capacity: usize) -> Self { + Self { + wakeup_capacity, + next_sequence: 0, + wakeups: BinaryHeap::new(), + wakeup_state: HashMap::new(), + shutting_down: false, + } + } + + fn next_sequence(&mut self) -> u64 { + let next = self.next_sequence; + self.next_sequence = self.next_sequence.saturating_add(1); + next + } + + fn set_wakeup(&mut self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + if self.shutting_down { + return Err(WakeupError::ShuttingDown); + } + + let sequence = self.next_sequence(); + let generation = if let Some(state) = self.wakeup_state.get_mut(&slot) { + state.when = when; + state.generation = state.generation.saturating_add(1); + state.sequence = sequence; + state.generation + } else { + if self.wakeup_state.len() >= self.wakeup_capacity { + return Err(WakeupError::Capacity); + } + let _ = self.wakeup_state.insert( + slot, + WakeupState { + when, + generation: 0, + sequence, + }, + ); + 0 + }; + + self.wakeups.push(ScheduledWakeup { + slot, + when, + 
generation, + sequence, + }); + Ok(()) + } + + fn cancel_wakeup(&mut self, slot: WakeupSlot) -> bool { + if self.shutting_down { + return false; + } + self.wakeup_state.remove(&slot).is_some() + } + + fn discard_stale_wakeup_head(&mut self) { + while let Some(head) = self.wakeups.peek() { + let Some(state) = self.wakeup_state.get(&head.slot) else { + let _ = self.wakeups.pop(); + continue; + }; + if state.generation != head.generation || state.when != head.when { + let _ = self.wakeups.pop(); + continue; + } + break; + } + } + + fn next_expiry(&mut self) -> Option { + self.discard_stale_wakeup_head(); + self.wakeups.peek().map(|wakeup| wakeup.when) + } + + fn pop_due(&mut self, now: Instant) -> Option<(WakeupSlot, Instant)> { + self.discard_stale_wakeup_head(); + + let next_due = self.wakeups.peek().map(|wakeup| wakeup.when)?; + if next_due > now { + return None; + } + + let wakeup = self.wakeups.pop().expect("wakeup must exist"); + let _ = self.wakeup_state.remove(&wakeup.slot); + Some((wakeup.slot, wakeup.when)) + } + + fn begin_shutdown(&mut self) { + if self.shutting_down { + return; + } + self.shutting_down = true; + self.wakeup_state.clear(); + self.wakeups.clear(); + } + + fn is_drained(&self) -> bool { + self.wakeup_state.is_empty() + } +} + +/// Shared handle used by the processor inbox and the processor effect handler. 
+pub(crate) struct NodeLocalSchedulerHandle { + inner: Arc>, + notify: Arc, +} + +impl Clone for NodeLocalSchedulerHandle { + fn clone(&self) -> Self { + Self { + inner: Arc::clone(&self.inner), + notify: Arc::clone(&self.notify), + } + } +} + +impl NodeLocalSchedulerHandle { + pub(crate) fn new(wakeup_capacity: usize) -> Self { + Self { + inner: Arc::new(Mutex::new(NodeLocalScheduler::new(wakeup_capacity))), + notify: Arc::new(Notify::new()), + } + } + + fn with_scheduler(&self, f: impl FnOnce(&mut NodeLocalScheduler) -> R) -> R { + let mut guard = self + .inner + .lock() + .unwrap_or_else(|poisoned| poisoned.into_inner()); + f(&mut guard) + } + + pub(crate) fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + let result = self.with_scheduler(|scheduler| scheduler.set_wakeup(slot, when)); + if result.is_ok() { + self.notify.notify_one(); + } + result + } + + #[must_use] + pub(crate) fn cancel_wakeup(&self, slot: WakeupSlot) -> bool { + let changed = self.with_scheduler(|scheduler| scheduler.cancel_wakeup(slot)); + if changed { + self.notify.notify_one(); + } + changed + } + + pub(crate) fn next_expiry(&self) -> Option { + self.with_scheduler(NodeLocalScheduler::next_expiry) + } + + pub(crate) fn pop_due(&self, now: Instant) -> Option<(WakeupSlot, Instant)> { + self.with_scheduler(|scheduler| scheduler.pop_due(now)) + } + + pub(crate) fn begin_shutdown(&self) { + self.with_scheduler(NodeLocalScheduler::begin_shutdown); + self.notify.notify_waiters(); + } + + pub(crate) fn is_drained(&self) -> bool { + self.with_scheduler(|scheduler| scheduler.is_drained()) + } + + pub(crate) async fn wait_for_change(&self) { + self.notify.notified().await; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::Duration; + + #[test] + fn set_wakeup_schedules_a_wakeup() { + let mut scheduler = NodeLocalScheduler::new(2); + let now = Instant::now(); + let when = now + Duration::from_secs(1); + + 
assert_eq!(scheduler.set_wakeup(WakeupSlot(7), when), Ok(())); + assert_eq!(scheduler.next_expiry(), Some(when)); + assert_eq!(scheduler.pop_due(now), None); + assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(7), when))); + assert_eq!(scheduler.next_expiry(), None); + } + + #[test] + fn setting_same_slot_replaces_previous_due_time() { + let mut scheduler = NodeLocalScheduler::new(2); + let now = Instant::now(); + let later = now + Duration::from_secs(10); + let sooner = now + Duration::from_secs(1); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(3), later), Ok(())); + assert_eq!(scheduler.set_wakeup(WakeupSlot(3), sooner), Ok(())); + assert_eq!(scheduler.next_expiry(), Some(sooner)); + assert_eq!(scheduler.pop_due(sooner), Some((WakeupSlot(3), sooner))); + assert_eq!(scheduler.pop_due(later), None); + } + + #[test] + fn cancel_wakeup_removes_pending_wakeup() { + let mut scheduler = NodeLocalScheduler::new(2); + let when = Instant::now() + Duration::from_secs(1); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(5), when), Ok(())); + assert!(scheduler.cancel_wakeup(WakeupSlot(5))); + assert!(!scheduler.cancel_wakeup(WakeupSlot(5))); + assert_eq!(scheduler.next_expiry(), None); + assert_eq!(scheduler.pop_due(when), None); + } + + #[test] + fn capacity_is_enforced_on_distinct_live_slots() { + let mut scheduler = NodeLocalScheduler::new(1); + let when = Instant::now() + Duration::from_secs(1); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(0), when), Ok(())); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(1), when), + Err(WakeupError::Capacity) + ); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(0), when + Duration::from_secs(1)), + Ok(()) + ); + } + + #[test] + fn stale_heap_entries_are_ignored() { + let mut scheduler = NodeLocalScheduler::new(2); + let now = Instant::now(); + let first = now + Duration::from_secs(5); + let replacement = now + Duration::from_secs(1); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(9), first), Ok(())); + 
assert_eq!(scheduler.set_wakeup(WakeupSlot(9), replacement), Ok(())); + assert_eq!( + scheduler.pop_due(replacement), + Some((WakeupSlot(9), replacement)) + ); + assert_eq!(scheduler.pop_due(first), None); + assert_eq!(scheduler.next_expiry(), None); + } +} diff --git a/rust/otap-dataflow/crates/engine/src/processor.rs b/rust/otap-dataflow/crates/engine/src/processor.rs index c0fbc9d97d..995b209488 100644 --- a/rust/otap-dataflow/crates/engine/src/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/processor.rs @@ -24,6 +24,7 @@ use crate::local::message::{LocalReceiver, LocalSender}; use crate::local::processor as local; use crate::message::{Message, ProcessorInbox, Receiver, Sender}; use crate::node::{Node, NodeId, NodeWithPDataReceiver, NodeWithPDataSender}; +use crate::node_local_scheduler::NodeLocalSchedulerHandle; use crate::shared::message::{SharedReceiver, SharedSender}; use crate::shared::processor as shared; use otap_df_channel::error::SendError; @@ -325,6 +326,7 @@ impl ProcessorWrapper { match self { ProcessorWrapper::Local { node_id, + runtime_config, processor, control_receiver, pdata_senders, @@ -333,7 +335,9 @@ impl ProcessorWrapper { source_tag, .. 
} => { - let inbox = ProcessorInbox::new( + let local_scheduler = + NodeLocalSchedulerHandle::new(runtime_config.control_channel.capacity); + let inbox = ProcessorInbox::new_with_local_scheduler( Receiver::Local(control_receiver), pdata_receiver.ok_or_else(|| Error::ProcessorError { processor: node_id.clone(), @@ -341,6 +345,7 @@ impl ProcessorWrapper { error: "The pdata receiver must be defined at this stage".to_owned(), source_detail: String::new(), })?, + local_scheduler.clone(), node_id.index, node_interests, ); @@ -352,6 +357,7 @@ impl ProcessorWrapper { metrics_reporter, ); effect_handler.set_source_tagging(source_tag); + effect_handler.core.set_local_scheduler(local_scheduler); Ok(ProcessorWrapperRuntime::Local { processor, effect_handler, @@ -360,6 +366,7 @@ impl ProcessorWrapper { } ProcessorWrapper::Shared { node_id, + runtime_config, processor, control_receiver, pdata_senders, @@ -368,7 +375,9 @@ impl ProcessorWrapper { source_tag, .. } => { - let inbox = ProcessorInbox::new( + let local_scheduler = + NodeLocalSchedulerHandle::new(runtime_config.control_channel.capacity); + let inbox = ProcessorInbox::new_with_local_scheduler( Receiver::Shared(control_receiver), Receiver::Shared(pdata_receiver.ok_or_else(|| Error::ProcessorError { processor: node_id.clone(), @@ -376,6 +385,7 @@ impl ProcessorWrapper { error: "The pdata receiver must be defined at this stage".to_owned(), source_detail: String::new(), })?), + local_scheduler.clone(), node_id.index, node_interests, ); @@ -387,6 +397,7 @@ impl ProcessorWrapper { metrics_reporter, ); effect_handler.set_source_tagging(source_tag); + effect_handler.core.set_local_scheduler(local_scheduler); Ok(ProcessorWrapperRuntime::Shared { processor, effect_handler, diff --git a/rust/otap-dataflow/crates/engine/src/shared/processor.rs b/rust/otap-dataflow/crates/engine/src/shared/processor.rs index 247714b0f3..3c9db25aaf 100644 --- a/rust/otap-dataflow/crates/engine/src/shared/processor.rs +++ 
b/rust/otap-dataflow/crates/engine/src/shared/processor.rs @@ -32,7 +32,8 @@ //! in parallel on different cores, each with its own processor instance. use crate::Interests; -use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender}; +use crate::WakeupError; +use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender, WakeupSlot}; use crate::effect_handler::{ EffectHandlerCore, SourceTagging, TelemetryTimerCancelHandle, TimerCancelHandle, }; @@ -235,6 +236,17 @@ impl EffectHandler { self.core.delay_data(when, data).await } + /// Set or replace a processor-local wakeup. + pub fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + self.core.set_wakeup(slot, when) + } + + /// Cancel a previously scheduled processor-local wakeup. + #[must_use] + pub fn cancel_wakeup(&self, slot: WakeupSlot) -> bool { + self.core.cancel_wakeup(slot) + } + /// Reports metrics collected by the processor. #[allow(dead_code)] // Will be used in the future. ToDo report metrics from channel and messages. pub(crate) fn report_metrics( diff --git a/rust/otap-dataflow/crates/pdata/src/validation/collector.rs b/rust/otap-dataflow/crates/pdata/src/validation/collector.rs index 308ddca3cb..810fcf30f0 100644 --- a/rust/otap-dataflow/crates/pdata/src/validation/collector.rs +++ b/rust/otap-dataflow/crates/pdata/src/validation/collector.rs @@ -40,6 +40,10 @@ pub static COLLECTOR_PATH: LazyLock = LazyLock::new(|| { path }); +pub(super) fn collector_available() -> bool { + Path::new(COLLECTOR_PATH.as_str()).exists() +} + /// Helper function to spawn an async task that reads lines from a buffer and logs them with a prefix. /// Optionally checks for a message substring and sends a signal when it matches. 
async fn spawn_line_reader( diff --git a/rust/otap-dataflow/crates/pdata/src/validation/scenarios.rs b/rust/otap-dataflow/crates/pdata/src/validation/scenarios.rs index 2b6404cf66..018b4a66a9 100644 --- a/rust/otap-dataflow/crates/pdata/src/validation/scenarios.rs +++ b/rust/otap-dataflow/crates/pdata/src/validation/scenarios.rs @@ -28,6 +28,14 @@ pub async fn run_single_round_trip_test( I::Response: std::fmt::Debug + PartialEq + Default, F: FnOnce() -> I::Request + 'static, { + if !super::collector::collector_available() { + eprintln!( + "Skipping validation test because collector binary is unavailable at '{}'.", + super::collector::COLLECTOR_PATH.as_str() + ); + return; + } + match run_single_round_trip::(create_request, expected_error).await { Ok(_) => {} Err(err) => { From 09d650f697ab5df50d80987d38b5880c90f04c22 Mon Sep 17 00:00:00 2001 From: lquerel Date: Tue, 31 Mar 2026 08:33:08 -0700 Subject: [PATCH 02/18] Handle wakeups in control channel benchmark --- .../benchmarks/benches/control_channel/main.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs b/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs index a1641640ae..c1b2edde94 100644 --- a/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs +++ b/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs @@ -140,7 +140,8 @@ async fn consume_current_local( NodeControlMsg::Config { .. } => observed.configs += 1, NodeControlMsg::DrainIngress { .. } | NodeControlMsg::Shutdown { .. } - | NodeControlMsg::DelayedData { .. } => { + | NodeControlMsg::DelayedData { .. } + | NodeControlMsg::Wakeup { .. } => { panic!("unexpected message in benchmark current local receiver"); } } @@ -203,7 +204,8 @@ async fn consume_current_shared( NodeControlMsg::Config { .. } => observed.configs += 1, NodeControlMsg::DrainIngress { .. } | NodeControlMsg::Shutdown { .. } - | NodeControlMsg::DelayedData { .. 
} => { + | NodeControlMsg::DelayedData { .. } + | NodeControlMsg::Wakeup { .. } => { panic!("unexpected message in benchmark current shared receiver"); } } From 44d64d4253b95aaec66dffc359575357da8a539c Mon Sep 17 00:00:00 2001 From: lquerel Date: Wed, 1 Apr 2026 19:23:52 -0700 Subject: [PATCH 03/18] Add batch wakeup liveness test --- .../otap/tests/core_node_liveness_tests.rs | 256 ++++++++++++++++++ 1 file changed, 256 insertions(+) diff --git a/rust/otap-dataflow/crates/otap/tests/core_node_liveness_tests.rs b/rust/otap-dataflow/crates/otap/tests/core_node_liveness_tests.rs index 79e6d9d8c6..264d42c0cd 100644 --- a/rust/otap-dataflow/crates/otap/tests/core_node_liveness_tests.rs +++ b/rust/otap-dataflow/crates/otap/tests/core_node_liveness_tests.rs @@ -29,6 +29,7 @@ use otap_df_otap::OTAP_PIPELINE_FACTORY; use otap_df_state::store::ObservedStateStore; use otap_df_telemetry::InternalTelemetrySystem; use serde_json::json; +use std::collections::HashMap; use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::time::{Duration, Instant}; @@ -52,6 +53,27 @@ fn fake_receiver_config( }) } +fn rate_limited_fake_receiver_config( + max_signal_count: u64, + max_batch_size: usize, + signals_per_second: usize, + enable_ack_nack: bool, +) -> serde_json::Value { + json!({ + "traffic_config": { + "signals_per_second": signals_per_second, + "max_signal_count": max_signal_count, + "max_batch_size": max_batch_size, + "metric_weight": 0, + "trace_weight": 0, + "log_weight": 100 + }, + "data_source": "static", + "generation_strategy": "pre_generated", + "enable_ack_nack": enable_ack_nack + }) +} + fn build_retry_pipeline_config( pipeline_group_id: &PipelineGroupId, pipeline_id: &PipelineId, @@ -126,6 +148,44 @@ fn build_batch_pipeline_config( .expect("failed to build batch liveness pipeline config") } +fn build_otlp_batch_local_wakeup_pipeline_config( + pipeline_group_id: &PipelineGroupId, + pipeline_id: &PipelineId, + counter_id: &str, +) -> 
PipelineConfig { + PipelineConfigBuilder::new() + .add_receiver( + "fake_receiver", + OTAP_FAKE_DATA_GENERATOR_URN, + Some(rate_limited_fake_receiver_config(5, 1, 1, true)), + ) + .add_processor( + "batch", + OTAP_BATCH_PROCESSOR_URN, + Some(json!({ + "format": "otlp", + "otlp": { + "min_size": 262144, + "sizer": "bytes" + }, + "max_batch_duration": "250ms" + })), + ) + .add_exporter( + "counting_exporter", + COUNTING_EXPORTER_URN, + Some(json!({"counter_id": counter_id})), + ) + .one_of("fake_receiver", ["batch"]) + .one_of("batch", ["counting_exporter"]) + .build( + PipelineType::Otap, + pipeline_group_id.clone(), + pipeline_id.clone(), + ) + .expect("failed to build local wakeup batch liveness pipeline config") +} + fn run_pipeline_with_condition( config: PipelineConfig, pipeline_group_id: &PipelineGroupId, @@ -218,6 +278,148 @@ fn run_pipeline_with_condition( ); } +#[derive(Debug, Default)] +struct BatchMetricsSnapshot { + fields: HashMap, +} + +impl BatchMetricsSnapshot { + fn get(&self, field: &str) -> u64 { + self.fields.get(field).copied().unwrap_or(u64::MAX) + } + + fn assert_eq(&self, field: &str, expected: u64) { + let actual = self.get(field); + assert_eq!( + actual, expected, + "{field}: expected {expected}, got {actual}" + ); + } +} + +fn capture_batch_metrics( + registry: &otap_df_telemetry::registry::TelemetryRegistryHandle, +) -> BatchMetricsSnapshot { + let mut snapshot = BatchMetricsSnapshot::default(); + registry.visit_current_metrics(|desc, _attrs, iter| { + if desc.name == "otap.processor.batch" { + for (field, value) in iter { + let _ = snapshot + .fields + .insert(field.name.to_owned(), value.to_u64_lossy()); + } + } + }); + snapshot +} + +fn run_pipeline_and_capture_batch_metrics( + config: PipelineConfig, + pipeline_group_id: &PipelineGroupId, + pipeline_id: &PipelineId, + max_duration: Duration, + shutdown_deadline: Duration, + shutdown_condition: F, +) -> BatchMetricsSnapshot +where + F: Fn() -> bool + Send + 'static, +{ + let 
telemetry_system = InternalTelemetrySystem::default(); + let registry = telemetry_system.registry(); + let collector = telemetry_system.collector(); + let controller_ctx = ControllerContext::new(registry.clone()); + let pipeline_ctx = controller_ctx.pipeline_context_with( + pipeline_group_id.clone(), + pipeline_id.clone(), + 0, + 1, + 0, + ); + let pipeline_entity_key = pipeline_ctx.register_pipeline_entity(); + let channel_capacity_policy = ChannelCapacityPolicy::default(); + let runtime_pipeline = OTAP_PIPELINE_FACTORY + .build( + pipeline_ctx.clone(), + config, + channel_capacity_policy.clone(), + TelemetryPolicy::default(), + None, + ) + .expect("failed to build runtime pipeline"); + + let (runtime_ctrl_tx, runtime_ctrl_rx) = + runtime_ctrl_msg_channel(channel_capacity_policy.control.pipeline); + let (pipeline_completion_tx, pipeline_completion_rx) = + pipeline_completion_msg_channel(channel_capacity_policy.control.completion); + let runtime_ctrl_tx_for_shutdown = runtime_ctrl_tx.clone(); + + let observed_state_store = + ObservedStateStore::new(&ObservedStateSettings::default(), registry.clone()); + let pipeline_key = DeployedPipelineKey { + pipeline_group_id: pipeline_group_id.clone(), + pipeline_id: pipeline_id.clone(), + core_id: 0, + }; + let metrics_reporter = telemetry_system.reporter(); + let event_reporter = observed_state_store.reporter(SendPolicy::default()); + + let capture_registry = registry.clone(); + let capture_collector = collector.clone(); + let shutdown_handle = std::thread::spawn(move || { + let start = Instant::now(); + let poll_interval = Duration::from_millis(10); + loop { + if start.elapsed() >= max_duration || shutdown_condition() { + break; + } + capture_collector.collect_pending(); + std::thread::sleep(poll_interval); + } + + let telemetry_wait = Duration::from_millis(1500); + let telemetry_start = Instant::now(); + while telemetry_start.elapsed() < telemetry_wait { + capture_collector.collect_pending(); + 
std::thread::sleep(poll_interval); + } + capture_collector.collect_pending(); + + let snapshot = capture_batch_metrics(&capture_registry); + + let deadline = Instant::now() + shutdown_deadline; + let _ = runtime_ctrl_tx_for_shutdown.try_send(RuntimeControlMsg::Shutdown { + deadline, + reason: "batch metrics capture shutdown".to_owned(), + }); + + snapshot + }); + + let run_result = { + let _pipeline_entity_guard = + set_pipeline_entity_key(pipeline_ctx.metrics_registry(), pipeline_entity_key); + runtime_pipeline.run_forever( + pipeline_key, + pipeline_ctx, + event_reporter, + metrics_reporter, + Duration::from_secs(1), + runtime_ctrl_tx, + runtime_ctrl_rx, + pipeline_completion_tx, + pipeline_completion_rx, + ) + }; + let snapshot = shutdown_handle + .join() + .expect("batch metrics capture thread should succeed"); + assert!( + run_result.is_ok(), + "pipeline failed to shut down cleanly: {run_result:?}" + ); + snapshot +} + // This pipeline starts with a downstream exporter that transiently Nacks every // request. Once retries are demonstrably happening, the exporter flips to Ack // mode and the pipeline must eventually drain all admitted work. @@ -305,3 +507,57 @@ fn test_batch_pipeline_eventually_flushes_partial_batch() { ); counting_exporter::unregister_counter(test_id); } + +// This exercises a batch pipeline where: +// - the traffic generator emits 5 single-item OTLP log batches at 1 signal/sec +// - the batch processor uses byte sizing with a 256 KiB minimum size +// - each generated batch is intentionally tiny, so a size-triggered flush is +// impossible under this setup +// +// The test waits for all 5 items to reach the downstream counting exporter, +// then captures the batch processor metrics from the in-process telemetry +// registry. 
Under these inputs, the runtime guarantees we expect are: +// - the pipeline makes forward progress without any size-based flushes +// - every generated item is eventually exported downstream +// - every flush is attributed to `flushes.timer`, which means the processor's +// node-local wakeup path fired and delivered the timeout back through the +// real inbox/runtime path +// - the processor emits 5 output log batches after consuming 5 input log +// batches, so the wakeup-triggered flushes are producing real downstream +// pdata batches rather than being dropped internally +#[test] +fn test_batch_pipeline_uses_timer_wakeup_metrics_with_otlp_bytes_config() { + let pipeline_group_id: PipelineGroupId = "liveness-group".into(); + let pipeline_id: PipelineId = "batch-pipeline-local-wakeup".into(); + let test_id = "batch-pipeline-local-wakeup"; + let delivered_items = Arc::new(AtomicU64::new(0)); + counting_exporter::register_counter(test_id, delivered_items.clone()); + + let config = + build_otlp_batch_local_wakeup_pipeline_config(&pipeline_group_id, &pipeline_id, test_id); + let metrics = run_pipeline_and_capture_batch_metrics( + config, + &pipeline_group_id, + &pipeline_id, + Duration::from_secs(8), + Duration::from_secs(2), + { + let delivered_items = delivered_items.clone(); + move || delivered_items.load(Ordering::Acquire) >= 5 + }, + ); + + assert_eq!( + delivered_items.load(Ordering::Acquire), + 5, + "the local wakeup pipeline should export every generated item" + ); + metrics.assert_eq("consumed.items.logs", 5); + metrics.assert_eq("consumed.batches.logs", 5); + metrics.assert_eq("produced.items.logs", 5); + metrics.assert_eq("produced.batches.logs", 5); + metrics.assert_eq("flushes.size", 0); + metrics.assert_eq("flushes.timer", 5); + + counting_exporter::unregister_counter(test_id); +} From 24408dc509d1d40e4e5310b4f45f9b0db2ea2ffa Mon Sep 17 00:00:00 2001 From: lquerel Date: Thu, 2 Apr 2026 12:03:56 -0700 Subject: [PATCH 04/18] Document processor wakeup API 
--- .../otap-dataflow/crates/engine/src/control.rs | 18 ++++++++++++++++-- .../crates/engine/src/effect_handler.rs | 13 +++++++++++++ .../crates/engine/src/local/processor.rs | 8 +++++++- .../crates/engine/src/shared/processor.rs | 8 +++++++- 4 files changed, 43 insertions(+), 4 deletions(-) diff --git a/rust/otap-dataflow/crates/engine/src/control.rs b/rust/otap-dataflow/crates/engine/src/control.rs index 8135fbfa0b..cf7df79b61 100644 --- a/rust/otap-dataflow/crates/engine/src/control.rs +++ b/rust/otap-dataflow/crates/engine/src/control.rs @@ -75,7 +75,13 @@ impl From for f64 { /// numbers, deadline, num_items, etc. pub type CallData = SmallVec<[Context8u8; 3]>; -/// Opaque key used to identify a node-local scheduled wakeup. +/// Opaque key used to identify a processor-local scheduled wakeup. +/// +/// Slots are scoped to a single processor instance. They do not need to be +/// globally unique across the pipeline, so processors can define local +/// constants such as `WakeupSlot(0)` for their own internal timers. +/// +/// Re-scheduling the same slot replaces the previous wakeup for that slot. #[repr(transparent)] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct WakeupSlot(pub u64); @@ -228,10 +234,18 @@ pub enum NodeControlMsg { }, /// A processor-local wakeup scheduled by the processor effect handler. + /// + /// This is delivered back through the processor inbox as normal control + /// traffic. The slot identifies which logical wakeup fired; processors are + /// expected to interpret the slot according to their own local namespace. + /// + /// Wakeups are best-effort runtime signals rather than durable work items: + /// once processor shutdown is latched, pending wakeups are dropped and no + /// further wakeups are accepted. Wakeup { /// Scheduled wakeup slot. slot: WakeupSlot, - /// Original scheduled wakeup instant. + /// Scheduled due time currently associated with this slot. 
when: Instant, }, diff --git a/rust/otap-dataflow/crates/engine/src/effect_handler.rs b/rust/otap-dataflow/crates/engine/src/effect_handler.rs index b02c137842..e3c468e311 100644 --- a/rust/otap-dataflow/crates/engine/src/effect_handler.rs +++ b/rust/otap-dataflow/crates/engine/src/effect_handler.rs @@ -408,6 +408,15 @@ impl EffectHandlerCore { } /// Set or replace a processor-local wakeup. + /// + /// Wakeups are keyed by [`WakeupSlot`]. Scheduling the same slot again + /// replaces the previous due time for that slot. + /// + /// # Errors + /// + /// Returns [`WakeupError::ShuttingDown`] once processor shutdown has been + /// latched. Returns [`WakeupError::Capacity`] if the processor has reached + /// its configured live wakeup-slot capacity. pub fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { self.local_scheduler .as_ref() @@ -416,6 +425,10 @@ impl EffectHandlerCore { } /// Cancel a previously scheduled processor-local wakeup. + /// + /// Returns `true` when a live wakeup for `slot` was removed. Returns + /// `false` when the slot was not scheduled or when shutdown has already + /// been latched for the processor. #[must_use] pub fn cancel_wakeup(&self, slot: WakeupSlot) -> bool { self.local_scheduler diff --git a/rust/otap-dataflow/crates/engine/src/local/processor.rs b/rust/otap-dataflow/crates/engine/src/local/processor.rs index b115d9636c..19d94e57f4 100644 --- a/rust/otap-dataflow/crates/engine/src/local/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/local/processor.rs @@ -69,7 +69,13 @@ pub trait Processor { /// - Transform the message and return a new message /// - Filter the message by returning None /// - Split the message into multiple messages by returning a vector - /// - Handle control messages (e.g., Config, TimerTick, Shutdown) + /// - Handle control messages (e.g., Config, TimerTick, Wakeup, Shutdown) + /// + /// Processor-local wakeups are scheduled through + /// [`EffectHandler::set_wakeup`]. 
They are delivered back to the processor + /// as `Message::Control(NodeControlMsg::Wakeup { .. })` through the normal + /// inbox path and participate in the same control-vs-pdata fairness rules + /// as other control traffic. /// /// # Parameters /// diff --git a/rust/otap-dataflow/crates/engine/src/shared/processor.rs b/rust/otap-dataflow/crates/engine/src/shared/processor.rs index 3c9db25aaf..2c199a6023 100644 --- a/rust/otap-dataflow/crates/engine/src/shared/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/shared/processor.rs @@ -68,7 +68,13 @@ pub trait Processor { /// - Transform the message and return a new message /// - Filter the message by returning None /// - Split the message into multiple messages by returning a vector - /// - Handle control messages (e.g., Config, TimerTick, Shutdown) + /// - Handle control messages (e.g., Config, TimerTick, Wakeup, Shutdown) + /// + /// Processor-local wakeups are scheduled through + /// [`EffectHandler::set_wakeup`]. They are delivered back to the processor + /// as `Message::Control(NodeControlMsg::Wakeup { .. })` through the normal + /// inbox path and participate in the same control-vs-pdata fairness rules + /// as other control traffic. /// /// # Parameters /// From 207af003add6dcf5abcfcaf663d15c96c3a00e55 Mon Sep 17 00:00:00 2001 From: lquerel Date: Thu, 2 Apr 2026 15:01:34 -0700 Subject: [PATCH 05/18] Use indexed heap for local wakeups --- .../crates/engine/src/node_local_scheduler.rs | 293 ++++++++++++------ 1 file changed, 201 insertions(+), 92 deletions(-) diff --git a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs index 8cbc0fb658..2ed31fe7da 100644 --- a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs +++ b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs @@ -4,8 +4,7 @@ //! Node-local wakeup scheduling for processor inboxes. 
use crate::control::WakeupSlot; -use std::cmp::Ordering; -use std::collections::{BinaryHeap, HashMap}; +use std::collections::HashMap; use std::sync::{Arc, Mutex}; use std::time::Instant; use tokio::sync::Notify; @@ -20,51 +19,17 @@ pub enum WakeupError { } #[derive(Clone, Copy, Debug)] -struct WakeupState { - when: Instant, - generation: u64, - sequence: u64, -} - -#[derive(Debug)] struct ScheduledWakeup { slot: WakeupSlot, when: Instant, - generation: u64, sequence: u64, } -impl Ord for ScheduledWakeup { - fn cmp(&self, other: &Self) -> Ordering { - other - .when - .cmp(&self.when) - .then_with(|| other.sequence.cmp(&self.sequence)) - } -} - -impl PartialOrd for ScheduledWakeup { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for ScheduledWakeup { - fn eq(&self, other: &Self) -> bool { - self.slot == other.slot - && self.when == other.when - && self.generation == other.generation - && self.sequence == other.sequence - } -} - -impl Eq for ScheduledWakeup {} - struct NodeLocalScheduler { wakeup_capacity: usize, next_sequence: u64, - wakeups: BinaryHeap, - wakeup_state: HashMap, + wakeups: Vec, + wakeup_indices: HashMap, shutting_down: bool, } @@ -73,8 +38,8 @@ impl NodeLocalScheduler { Self { wakeup_capacity, next_sequence: 0, - wakeups: BinaryHeap::new(), - wakeup_state: HashMap::new(), + wakeups: Vec::new(), + wakeup_indices: HashMap::new(), shutting_down: false, } } @@ -85,38 +50,123 @@ impl NodeLocalScheduler { next } + fn wakeup_precedes(left: &ScheduledWakeup, right: &ScheduledWakeup) -> bool { + left.when < right.when || (left.when == right.when && left.sequence < right.sequence) + } + + fn swap_entries(&mut self, left: usize, right: usize) { + if left == right { + return; + } + + self.wakeups.swap(left, right); + + let left_slot = self.wakeups[left].slot; + let right_slot = self.wakeups[right].slot; + let _ = self + .wakeup_indices + .insert(left_slot, left) + .expect("left slot index should exist"); + let 
_ = self + .wakeup_indices + .insert(right_slot, right) + .expect("right slot index should exist"); + } + + fn sift_up(&mut self, mut index: usize) { + while index > 0 { + let parent = (index - 1) / 2; + if !Self::wakeup_precedes(&self.wakeups[index], &self.wakeups[parent]) { + break; + } + self.swap_entries(index, parent); + index = parent; + } + } + + fn sift_down(&mut self, mut index: usize) { + let len = self.wakeups.len(); + loop { + let left = index * 2 + 1; + if left >= len { + break; + } + + let right = left + 1; + let mut smallest = left; + if right < len && Self::wakeup_precedes(&self.wakeups[right], &self.wakeups[left]) { + smallest = right; + } + + if !Self::wakeup_precedes(&self.wakeups[smallest], &self.wakeups[index]) { + break; + } + + self.swap_entries(index, smallest); + index = smallest; + } + } + + fn repair_heap_at(&mut self, index: usize) { + if index > 0 { + let parent = (index - 1) / 2; + if Self::wakeup_precedes(&self.wakeups[index], &self.wakeups[parent]) { + self.sift_up(index); + return; + } + } + self.sift_down(index); + } + + fn remove_heap_entry(&mut self, index: usize) -> ScheduledWakeup { + let last = self + .wakeups + .len() + .checked_sub(1) + .expect("heap entry removal requires a non-empty heap"); + + if index == last { + return self.wakeups.pop().expect("last wakeup should exist"); + } + + self.wakeups.swap(index, last); + let removed = self.wakeups.pop().expect("removed wakeup should exist"); + + let moved_slot = self.wakeups[index].slot; + let _ = self + .wakeup_indices + .insert(moved_slot, index) + .expect("moved slot index should exist"); + self.repair_heap_at(index); + removed + } + fn set_wakeup(&mut self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { if self.shutting_down { return Err(WakeupError::ShuttingDown); } let sequence = self.next_sequence(); - let generation = if let Some(state) = self.wakeup_state.get_mut(&slot) { - state.when = when; - state.generation = state.generation.saturating_add(1); - 
state.sequence = sequence; - state.generation + if let Some(&index) = self.wakeup_indices.get(&slot) { + self.wakeups[index].when = when; + self.wakeups[index].sequence = sequence; + self.repair_heap_at(index); } else { - if self.wakeup_state.len() >= self.wakeup_capacity { + if self.wakeup_indices.len() >= self.wakeup_capacity { return Err(WakeupError::Capacity); } - let _ = self.wakeup_state.insert( + let index = self.wakeups.len(); + self.wakeups.push(ScheduledWakeup { slot, - WakeupState { - when, - generation: 0, - sequence, - }, + when, + sequence, + }); + assert!( + self.wakeup_indices.insert(slot, index).is_none(), + "new wakeup slot should not already exist" ); - 0 - }; - - self.wakeups.push(ScheduledWakeup { - slot, - when, - generation, - sequence, - }); + self.sift_up(index); + } Ok(()) } @@ -124,38 +174,63 @@ impl NodeLocalScheduler { if self.shutting_down { return false; } - self.wakeup_state.remove(&slot).is_some() - } - - fn discard_stale_wakeup_head(&mut self) { - while let Some(head) = self.wakeups.peek() { - let Some(state) = self.wakeup_state.get(&head.slot) else { - let _ = self.wakeups.pop(); - continue; - }; - if state.generation != head.generation || state.when != head.when { - let _ = self.wakeups.pop(); - continue; + + let Some(index) = self.wakeup_indices.remove(&slot) else { + return false; + }; + + let removed = self.remove_heap_entry(index); + debug_assert_eq!(removed.slot, slot); + true + } + + #[cfg(debug_assertions)] + fn assert_consistent(&self) { + assert_eq!(self.wakeups.len(), self.wakeup_indices.len()); + + for (index, wakeup) in self.wakeups.iter().enumerate() { + assert_eq!( + self.wakeup_indices.get(&wakeup.slot).copied(), + Some(index), + "heap index must match map entry" + ); + + if index > 0 { + let parent = (index - 1) / 2; + assert!( + !Self::wakeup_precedes(&self.wakeups[index], &self.wakeups[parent]), + "heap child must not precede parent" + ); } - break; } } fn next_expiry(&mut self) -> Option { - 
self.discard_stale_wakeup_head(); - self.wakeups.peek().map(|wakeup| wakeup.when) + #[cfg(debug_assertions)] + self.assert_consistent(); + self.wakeups.first().map(|wakeup| wakeup.when) } fn pop_due(&mut self, now: Instant) -> Option<(WakeupSlot, Instant)> { - self.discard_stale_wakeup_head(); + #[cfg(debug_assertions)] + self.assert_consistent(); - let next_due = self.wakeups.peek().map(|wakeup| wakeup.when)?; + let next_due = self.wakeups.first().map(|wakeup| wakeup.when)?; if next_due > now { return None; } - let wakeup = self.wakeups.pop().expect("wakeup must exist"); - let _ = self.wakeup_state.remove(&wakeup.slot); + let slot = self + .wakeups + .first() + .expect("due wakeup should exist") + .slot; + let removed_index = self + .wakeup_indices + .remove(&slot) + .expect("due wakeup slot index should exist"); + debug_assert_eq!(removed_index, 0); + let wakeup = self.remove_heap_entry(0); Some((wakeup.slot, wakeup.when)) } @@ -164,12 +239,12 @@ impl NodeLocalScheduler { return; } self.shutting_down = true; - self.wakeup_state.clear(); + self.wakeup_indices.clear(); self.wakeups.clear(); } fn is_drained(&self) -> bool { - self.wakeup_state.is_empty() + self.wakeup_indices.is_empty() } } @@ -248,6 +323,16 @@ mod tests { use super::*; use std::time::Duration; + fn assert_heap_bound(scheduler: &NodeLocalScheduler) { + assert_eq!( + scheduler.wakeups.len(), + scheduler.wakeup_indices.len(), + "scheduler should keep exactly one heap entry per live slot" + ); + #[cfg(debug_assertions)] + scheduler.assert_consistent(); + } + #[test] fn set_wakeup_schedules_a_wakeup() { let mut scheduler = NodeLocalScheduler::new(2); @@ -255,9 +340,11 @@ mod tests { let when = now + Duration::from_secs(1); assert_eq!(scheduler.set_wakeup(WakeupSlot(7), when), Ok(())); + assert_heap_bound(&scheduler); assert_eq!(scheduler.next_expiry(), Some(when)); assert_eq!(scheduler.pop_due(now), None); assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(7), when))); + assert_heap_bound(&scheduler); 
assert_eq!(scheduler.next_expiry(), None); } @@ -270,8 +357,11 @@ mod tests { assert_eq!(scheduler.set_wakeup(WakeupSlot(3), later), Ok(())); assert_eq!(scheduler.set_wakeup(WakeupSlot(3), sooner), Ok(())); + assert_heap_bound(&scheduler); + assert_eq!(scheduler.wakeups.len(), 1); assert_eq!(scheduler.next_expiry(), Some(sooner)); assert_eq!(scheduler.pop_due(sooner), Some((WakeupSlot(3), sooner))); + assert_heap_bound(&scheduler); assert_eq!(scheduler.pop_due(later), None); } @@ -281,7 +371,9 @@ mod tests { let when = Instant::now() + Duration::from_secs(1); assert_eq!(scheduler.set_wakeup(WakeupSlot(5), when), Ok(())); + assert_heap_bound(&scheduler); assert!(scheduler.cancel_wakeup(WakeupSlot(5))); + assert_heap_bound(&scheduler); assert!(!scheduler.cancel_wakeup(WakeupSlot(5))); assert_eq!(scheduler.next_expiry(), None); assert_eq!(scheduler.pop_due(when), None); @@ -301,22 +393,39 @@ mod tests { scheduler.set_wakeup(WakeupSlot(0), when + Duration::from_secs(1)), Ok(()) ); + assert_heap_bound(&scheduler); } #[test] - fn stale_heap_entries_are_ignored() { + fn repeated_reschedules_keep_single_heap_entry() { let mut scheduler = NodeLocalScheduler::new(2); let now = Instant::now(); - let first = now + Duration::from_secs(5); - let replacement = now + Duration::from_secs(1); + for offset in (1..=32).rev() { + let when = now + Duration::from_secs(offset); + assert_eq!(scheduler.set_wakeup(WakeupSlot(9), when), Ok(())); + assert_heap_bound(&scheduler); + assert_eq!(scheduler.wakeups.len(), 1); + assert_eq!(scheduler.next_expiry(), Some(when)); + } - assert_eq!(scheduler.set_wakeup(WakeupSlot(9), first), Ok(())); - assert_eq!(scheduler.set_wakeup(WakeupSlot(9), replacement), Ok(())); - assert_eq!( - scheduler.pop_due(replacement), - Some((WakeupSlot(9), replacement)) - ); - assert_eq!(scheduler.pop_due(first), None); + let expected = now + Duration::from_secs(1); + assert_eq!(scheduler.pop_due(expected), Some((WakeupSlot(9), expected))); 
assert_eq!(scheduler.next_expiry(), None); } + + #[test] + fn equal_deadlines_follow_schedule_sequence() { + let mut scheduler = NodeLocalScheduler::new(4); + let when = Instant::now() + Duration::from_secs(1); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(1), when), Ok(())); + assert_eq!(scheduler.set_wakeup(WakeupSlot(2), when), Ok(())); + assert_eq!(scheduler.set_wakeup(WakeupSlot(3), when), Ok(())); + assert_heap_bound(&scheduler); + + assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(1), when))); + assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(2), when))); + assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(3), when))); + assert_heap_bound(&scheduler); + } } From 9237dc00b35260354dac4e264ae4ca6185f74b5b Mon Sep 17 00:00:00 2001 From: lquerel Date: Thu, 2 Apr 2026 15:11:47 -0700 Subject: [PATCH 06/18] Document local wakeup guarantees --- rust/otap-dataflow/crates/engine/README.md | 44 +++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/rust/otap-dataflow/crates/engine/README.md b/rust/otap-dataflow/crates/engine/README.md index 59180340c7..282f206573 100644 --- a/rust/otap-dataflow/crates/engine/README.md +++ b/rust/otap-dataflow/crates/engine/README.md @@ -231,7 +231,7 @@ closed normal admission. The current message families are: - **Node control messages**: `Ack`, `Nack`, `Config`, `TimerTick`, - `CollectTelemetry`, `DelayedData`, `DrainIngress`, `Shutdown` + `CollectTelemetry`, `Wakeup`, `DelayedData`, `DrainIngress`, `Shutdown` - **Runtime control messages**: `StartTimer`, `CancelTimer`, `StartTelemetryTimer`, `CancelTelemetryTimer`, `DelayData`, `ReceiverDrained`, `Shutdown` @@ -274,6 +274,47 @@ behavior: empty and close. The later sections on Ack/Nack delivery and graceful shutdown cover this in more detail. 
+### Processor-Local Wakeups + +Processors can schedule local wakeups through the processor effect handler: + +- `set_wakeup(slot, when)` schedules or replaces the wakeup for `slot` +- `cancel_wakeup(slot)` removes the wakeup for `slot` if one is live + +This API is intentionally processor-local: + +- `WakeupSlot` is scoped to one processor instance, not globally across the + pipeline +- a processor can define its own slot constants such as `WakeupSlot(0)` +- the engine does not interpret slot meaning; it only routes the slot back to + the originating processor + +Wakeups are delivered through `ProcessorInbox` as +`NodeControlMsg::Wakeup { slot, when }`. They therefore participate in the +same receive loop and the same bounded fairness policy as other control +traffic. + +The current runtime properties and guarantees are: + +- **Keyed replacement:** there is at most one live wakeup per slot; scheduling + the same slot again replaces the previous due time +- **Cancellation:** canceling a live slot prevents that wakeup from being + delivered later +- **Bounded live state:** scheduler state is bounded by the number of live + wakeup slots accepted for the processor +- **Deterministic ordering:** if two wakeups have the same due time, they are + delivered in schedule order +- **No payload retention:** wakeups carry only `(slot, when)` and do not retain + deferred `pdata` +- **Shutdown rejection and drop:** once processor shutdown is latched, new + wakeups are rejected and pending wakeups are dropped immediately +- **No flush-on-shutdown guarantee:** pending wakeups are not drained or forced + through during shutdown + +Wakeups are best-effort runtime scheduling signals. They are not durable work +items and are not part of the runtime-control delayed-data mechanism that is +still used by retry-oriented flows outside this API. 
+ ## Runtime Properties The runtime is organized around a small set of guarantees: @@ -316,6 +357,7 @@ In practice, effect handlers are how nodes: - subscribe to Ack/Nack interests on the forward path - emit Ack/Nack outcomes onto the pipeline-completion channel - schedule or cancel timers on the runtime-control channel +- schedule or cancel processor-local wakeups - return delayed data - report `ReceiverDrained` - create listeners and sockets with engine-defined socket options From 6c05e1982610ec5401a4ff45d909091b2fbb5681 Mon Sep 17 00:00:00 2001 From: lquerel Date: Thu, 2 Apr 2026 15:41:01 -0700 Subject: [PATCH 07/18] Derive batch wakeup slots from signal keys --- .../src/processors/batch_processor/mod.rs | 96 ++++++++++++------- 1 file changed, 64 insertions(+), 32 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs index 84634b103f..0feee78806 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs @@ -78,21 +78,29 @@ pub const DEFAULT_MAX_BATCH_DURATION_MS: u64 = 200; const LOG_MSG_BATCHING_FAILED_PREFIX: &str = "OTAP batch processor: low-level batching failed for"; const LOG_MSG_BATCHING_FAILED_SUFFIX: &str = "; dropping"; -const WAKEUP_SLOT_OTAP_LOGS: WakeupSlot = WakeupSlot(0); -const WAKEUP_SLOT_OTAP_METRICS: WakeupSlot = WakeupSlot(1); -const WAKEUP_SLOT_OTAP_TRACES: WakeupSlot = WakeupSlot(2); -const WAKEUP_SLOT_OTLP_LOGS: WakeupSlot = WakeupSlot(3); -const WAKEUP_SLOT_OTLP_METRICS: WakeupSlot = WakeupSlot(4); -const WAKEUP_SLOT_OTLP_TRACES: WakeupSlot = WakeupSlot(5); +// Encodes each supported (format, signal) pair into a distinct batch-local +// wakeup slot. 
+const fn wakeup_slot(format: SignalFormat, signal: SignalType) -> WakeupSlot { + let format_base = match format { + SignalFormat::OtapRecords => 0, + SignalFormat::OtlpBytes => 3, + }; + let signal_offset = match signal { + SignalType::Logs => 0, + SignalType::Metrics => 1, + SignalType::Traces => 2, + }; + WakeupSlot(format_base + signal_offset) +} const fn signal_from_wakeup_slot(slot: WakeupSlot) -> Option<(SignalFormat, SignalType)> { - match slot { - WAKEUP_SLOT_OTAP_LOGS => Some((SignalFormat::OtapRecords, SignalType::Logs)), - WAKEUP_SLOT_OTAP_METRICS => Some((SignalFormat::OtapRecords, SignalType::Metrics)), - WAKEUP_SLOT_OTAP_TRACES => Some((SignalFormat::OtapRecords, SignalType::Traces)), - WAKEUP_SLOT_OTLP_LOGS => Some((SignalFormat::OtlpBytes, SignalType::Logs)), - WAKEUP_SLOT_OTLP_METRICS => Some((SignalFormat::OtlpBytes, SignalType::Metrics)), - WAKEUP_SLOT_OTLP_TRACES => Some((SignalFormat::OtlpBytes, SignalType::Traces)), + match slot.0 { + 0 => Some((SignalFormat::OtapRecords, SignalType::Logs)), + 1 => Some((SignalFormat::OtapRecords, SignalType::Metrics)), + 2 => Some((SignalFormat::OtapRecords, SignalType::Traces)), + 3 => Some((SignalFormat::OtlpBytes, SignalType::Logs)), + 4 => Some((SignalFormat::OtlpBytes, SignalType::Metrics)), + 5 => Some((SignalFormat::OtlpBytes, SignalType::Traces)), _ => None, } } @@ -764,11 +772,7 @@ impl Batcher for SignalBuffer { } fn wakeup_slot(signal: SignalType) -> WakeupSlot { - match signal { - SignalType::Logs => WAKEUP_SLOT_OTAP_LOGS, - SignalType::Metrics => WAKEUP_SLOT_OTAP_METRICS, - SignalType::Traces => WAKEUP_SLOT_OTAP_TRACES, - } + wakeup_slot(SignalFormat::OtapRecords, signal) } } @@ -792,11 +796,7 @@ impl Batcher for SignalBuffer { } fn wakeup_slot(signal: SignalType) -> WakeupSlot { - match signal { - SignalType::Logs => WAKEUP_SLOT_OTLP_LOGS, - SignalType::Metrics => WAKEUP_SLOT_OTLP_METRICS, - SignalType::Traces => WAKEUP_SLOT_OTLP_TRACES, - } + wakeup_slot(SignalFormat::OtlpBytes, signal) } } @@ 
-1715,12 +1715,12 @@ mod tests { const fn all_wakeup_slots() -> [WakeupSlot; 6] { [ - WAKEUP_SLOT_OTAP_LOGS, - WAKEUP_SLOT_OTAP_METRICS, - WAKEUP_SLOT_OTAP_TRACES, - WAKEUP_SLOT_OTLP_LOGS, - WAKEUP_SLOT_OTLP_METRICS, - WAKEUP_SLOT_OTLP_TRACES, + wakeup_slot(SignalFormat::OtapRecords, SignalType::Logs), + wakeup_slot(SignalFormat::OtapRecords, SignalType::Metrics), + wakeup_slot(SignalFormat::OtapRecords, SignalType::Traces), + wakeup_slot(SignalFormat::OtlpBytes, SignalType::Logs), + wakeup_slot(SignalFormat::OtlpBytes, SignalType::Metrics), + wakeup_slot(SignalFormat::OtlpBytes, SignalType::Traces), ] } @@ -2049,6 +2049,35 @@ mod tests { test_timer_flush(datagen.generate_logs().into(), true); } + #[test] + fn test_wakeup_slot_round_trip_and_uniqueness() { + let slots = [ + (SignalFormat::OtapRecords, SignalType::Logs), + (SignalFormat::OtapRecords, SignalType::Metrics), + (SignalFormat::OtapRecords, SignalType::Traces), + (SignalFormat::OtlpBytes, SignalType::Logs), + (SignalFormat::OtlpBytes, SignalType::Metrics), + (SignalFormat::OtlpBytes, SignalType::Traces), + ]; + + for (expected_format, expected_signal) in slots { + let slot = wakeup_slot(expected_format, expected_signal); + assert_eq!( + signal_from_wakeup_slot(slot), + Some((expected_format, expected_signal)) + ); + } + + let mut unique = std::collections::HashSet::new(); + for (format, signal) in slots { + assert!( + unique.insert(wakeup_slot(format, signal)), + "slot mapping should be unique for each format/signal pair" + ); + } + assert_eq!(unique.len(), 6); + } + // The processor replaces wakeups per slot. This test proves that an early // wakeup is ignored and that the current wakeup still flushes the buffered // input later. 
@@ -2099,7 +2128,7 @@ mod tests { let stale_when = Instant::now(); ctx.process(Message::Control(NodeControlMsg::Wakeup { - slot: WAKEUP_SLOT_OTAP_LOGS, + slot: wakeup_slot(SignalFormat::OtapRecords, SignalType::Logs), when: stale_when, })) .await @@ -2111,7 +2140,7 @@ mod tests { let current_when = Instant::now() + Duration::from_secs(1); ctx.process(Message::Control(NodeControlMsg::Wakeup { - slot: WAKEUP_SLOT_OTAP_LOGS, + slot: wakeup_slot(SignalFormat::OtapRecords, SignalType::Logs), when: current_when, })) .await @@ -2727,7 +2756,10 @@ mod tests { // Trigger timeout for both active batching slots. let when = Instant::now() + Duration::from_secs(1); - for slot in [WAKEUP_SLOT_OTLP_LOGS, WAKEUP_SLOT_OTAP_LOGS] { + for slot in [ + wakeup_slot(SignalFormat::OtlpBytes, SignalType::Logs), + wakeup_slot(SignalFormat::OtapRecords, SignalType::Logs), + ] { ctx.process(Message::Control(NodeControlMsg::Wakeup { slot, when })) .await .expect("process wakeup"); From aeb0527bfd021b435a4ecc9549ef53a68bc8e54e Mon Sep 17 00:00:00 2001 From: lquerel Date: Thu, 2 Apr 2026 16:40:36 -0700 Subject: [PATCH 08/18] Handle durable buffer wakeup overflow --- .../durable_buffer_processor/mod.rs | 640 ++++++++++++++---- 1 file changed, 514 insertions(+), 126 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index 549a9fe3a3..c9efc3480e 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -73,7 +73,7 @@ mod bundle_adapter; mod config; use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -112,6 +112,7 @@ use otap_df_engine::node::NodeId; use 
otap_df_engine::processor::ProcessorWrapper; use otap_df_engine::{ ConsumerEffectHandlerExtension, Interests, ProcessorFactory, ProducerEffectHandlerExtension, + WakeupError, }; use otap_df_pdata::{OtapArrowRecords, OtapPayload}; use otap_df_telemetry::instrument::{Counter, Gauge, ObserveCounter}; @@ -318,6 +319,10 @@ fn decode_bundle_ref(calldata: &CallData) -> Option { }) } +fn retry_key(bundle_ref: BundleRef) -> (u64, u32) { + (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()) +} + /// State for tracking a pending downstream delivery. /// /// Holds the Quiver bundle handle to keep the bundle claimed while in-flight. @@ -341,6 +346,21 @@ struct RetryWakeup { retry_count: u32, } +#[derive(Clone, Copy)] +struct OverflowRetry { + bundle_ref: BundleRef, + retry_count: u32, + retry_at: Instant, + sequence: u64, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +struct OverflowRetryOrder { + retry_at: Instant, + sequence: u64, + key: (u64, u32), +} + /// Result of attempting to process a bundle with non-blocking send. enum ProcessBundleResult { /// Bundle was successfully sent downstream. @@ -411,19 +431,44 @@ pub struct DurableBuffer { /// Key is the (segment_seq, bundle_index) pair encoded as a u128 for fast lookup. pending_bundles: HashMap<(u64, u32), PendingBundle>, - /// Bundles scheduled for retry via node-local wakeups. - /// These are skipped by poll_next_bundle to enforce backoff. + /// Bundles currently held out of the normal poll loop while backoff is active. + /// + /// Invariant: every key here is deferred for retry either by an armed wakeup + /// (`retry_wakeup_slots` + `retry_wakeups`) or by the local overflow queue + /// (`retry_overflow` + `retry_overflow_order`). retry_scheduled: HashSet<(u64, u32)>, /// Wakeup slot assigned to each bundle currently waiting for retry. + /// + /// Invariant: this only contains bundles that successfully acquired a slot + /// in the engine wakeup scheduler. 
Keys present only in `retry_overflow` + /// intentionally have no slot entry yet. retry_wakeup_slots: HashMap<(u64, u32), WakeupSlot>, /// Retry state keyed by wakeup slot. + /// + /// Invariant: for every `(key -> slot)` in `retry_wakeup_slots`, there is a + /// matching `(slot -> RetryWakeup)` entry here describing the same bundle. retry_wakeups: HashMap, + /// Retry state held locally while wakeup scheduling is at capacity. + /// + /// Guarantee: overflowed retries remain deferred and keep their target due + /// time even when the engine wakeup scheduler is full. + retry_overflow: HashMap<(u64, u32), OverflowRetry>, + + /// Due-order index for locally deferred retries. + /// + /// Invariant: this contains exactly one ordering key for each entry in + /// `retry_overflow`, using `sequence` as a deterministic tie-breaker. + retry_overflow_order: BTreeSet, + /// Monotonic slot allocator for retry wakeups. next_retry_wakeup_slot: u64, + /// Monotonic tie-breaker for locally deferred retry ordering. + next_retry_overflow_sequence: u64, + /// Configuration. config: DurableBufferConfig, @@ -519,7 +564,10 @@ impl DurableBuffer { retry_scheduled: HashSet::new(), retry_wakeup_slots: HashMap::new(), retry_wakeups: HashMap::new(), + retry_overflow: HashMap::new(), + retry_overflow_order: BTreeSet::new(), next_retry_wakeup_slot: 0, + next_retry_overflow_sequence: 0, config, core_id, num_cores, @@ -615,13 +663,131 @@ impl DurableBuffer { self.pending_bundles.len() < self.config.max_in_flight } + /// Allocates a new processor-local wakeup slot for a retrying bundle. + /// + /// Guarantee: slot values are never reused within one processor instance, + /// which keeps stale wakeups from aliasing a newer retry. + fn next_retry_wakeup_slot(&mut self) -> WakeupSlot { + let slot = WakeupSlot(self.next_retry_wakeup_slot); + self.next_retry_wakeup_slot = self.next_retry_wakeup_slot.saturating_add(1); + slot + } + + /// Builds the deterministic ordering key for an overflowed retry. 
+ fn overflow_retry_order(key: (u64, u32), retry: OverflowRetry) -> OverflowRetryOrder { + OverflowRetryOrder { + retry_at: retry.retry_at, + sequence: retry.sequence, + key, + } + } + + /// Removes one overflowed retry from both local indexes. + /// + /// Invariant preserved: `retry_overflow` and `retry_overflow_order` stay in + /// lockstep after every insertion/removal. + fn remove_retry_overflow(&mut self, key: (u64, u32)) -> Option { + let retry = self.retry_overflow.remove(&key)?; + let _ = self + .retry_overflow_order + .remove(&Self::overflow_retry_order(key, retry)); + Some(retry) + } + + /// Defers a retry in local processor state when the engine wakeup scheduler + /// has no free slot. + /// + /// Guarantees: + /// - the bundle remains in `retry_scheduled`, so `poll_next_bundle()` keeps + /// skipping it + /// - the most recent `(retry_count, retry_at)` replaces any older local + /// overflow record for the same bundle + /// - equal due times are processed deterministically by `sequence` + fn insert_retry_overflow( + &mut self, + bundle_ref: BundleRef, + retry_count: u32, + retry_at: Instant, + ) { + let key = retry_key(bundle_ref); + let _ = self.remove_retry_overflow(key); + let retry = OverflowRetry { + bundle_ref, + retry_count, + retry_at, + sequence: self.next_retry_overflow_sequence, + }; + self.next_retry_overflow_sequence = self.next_retry_overflow_sequence.saturating_add(1); + let _ = self.retry_scheduled.insert(key); + let _ = self.retry_overflow.insert(key, retry); + let _ = self + .retry_overflow_order + .insert(Self::overflow_retry_order(key, retry)); + } + + /// Pops the next locally deferred retry only when its due time has arrived. + /// + /// Guarantee: returning a retry clears all local overflow bookkeeping for + /// that bundle so it can be resumed exactly once. 
+ fn take_due_retry_overflow(&mut self, now: Instant) -> Option { + let order = *self.retry_overflow_order.first()?; + if order.retry_at > now { + return None; + } + + let _ = self.retry_overflow_order.remove(&order); + let retry = self.retry_overflow.remove(&order.key)?; + let _ = self.retry_scheduled.remove(&order.key); + + Some(RetryWakeup { + bundle_ref: retry.bundle_ref, + retry_count: retry.retry_count, + }) + } + + /// Opportunistically moves overflowed retries back into engine wakeup slots. + /// + /// Guarantees: + /// - never drops a deferred retry when slot acquisition fails + /// - preserves retry due time when promotion succeeds + /// - stops as soon as the scheduler reports `Capacity` or shutdown + fn promote_retry_overflow_to_wakeups(&mut self, effect_handler: &mut EffectHandler) { + while let Some(order) = self.retry_overflow_order.first().copied() { + let Some(retry) = self.retry_overflow.get(&order.key).copied() else { + let _ = self.retry_overflow_order.remove(&order); + continue; + }; + + let slot = self.next_retry_wakeup_slot(); + match effect_handler.set_wakeup(slot, retry.retry_at) { + Ok(()) => { + let _ = self.retry_overflow_order.remove(&order); + let _ = self.retry_overflow.remove(&order.key); + let _ = self.retry_wakeup_slots.insert(order.key, slot); + let _ = self.retry_wakeups.insert( + slot, + RetryWakeup { + bundle_ref: retry.bundle_ref, + retry_count: retry.retry_count, + }, + ); + } + Err(WakeupError::Capacity | WakeupError::ShuttingDown) => break, + } + } + } + /// Schedule a retry for a bundle via a processor-local wakeup. /// /// This is the single point of coordination between wakeup scheduling and - /// `retry_scheduled` tracking. Always use this method to keep the two in sync. + /// `retry_scheduled` tracking. Always use this method to keep the state in sync. /// - /// Returns true if scheduling succeeded, false if it failed (caller should - /// let `poll_next_bundle` pick up the bundle instead). 
+ /// Guarantees: + /// - on success, the bundle remains deferred until either a wakeup or local + /// overflow retry resumes it + /// - wakeup-capacity exhaustion falls back to local overflow state instead + /// of immediate re-polling + /// - returns `false` only when the processor is already shutting down async fn schedule_retry( &mut self, bundle_ref: BundleRef, @@ -629,48 +795,185 @@ impl DurableBuffer { delay: Duration, effect_handler: &mut EffectHandler, ) -> bool { - let key = (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()); - let (slot, is_new_slot) = match self.retry_wakeup_slots.entry(key) { - Entry::Occupied(entry) => (*entry.get(), false), - Entry::Vacant(entry) => { - let slot = WakeupSlot(self.next_retry_wakeup_slot); - self.next_retry_wakeup_slot = self.next_retry_wakeup_slot.saturating_add(1); - let _ = entry.insert(slot); - (slot, true) - } + let key = retry_key(bundle_ref); + let _ = self.remove_retry_overflow(key); + let (slot, is_new_slot) = if let Some(slot) = self.retry_wakeup_slots.get(&key).copied() { + (slot, false) + } else { + let slot = self.next_retry_wakeup_slot(); + let _ = self.retry_wakeup_slots.insert(key, slot); + (slot, true) }; let retry_at = Instant::now() + delay; - if effect_handler.set_wakeup(slot, retry_at).is_ok() { - // Track that this bundle is scheduled - poll_next_bundle will skip it - let _ = self.retry_scheduled.insert(key); - let _ = self.retry_wakeups.insert( - slot, - RetryWakeup { - bundle_ref, - retry_count, - }, - ); - true - } else { - if is_new_slot { - let _ = self.retry_wakeup_slots.remove(&key); + match effect_handler.set_wakeup(slot, retry_at) { + Ok(()) => { + // Track that this bundle is scheduled - poll_next_bundle will skip it + let _ = self.retry_scheduled.insert(key); + let _ = self.retry_wakeups.insert( + slot, + RetryWakeup { + bundle_ref, + retry_count, + }, + ); + true + } + Err(WakeupError::Capacity) => { + if is_new_slot { + let _ = self.retry_wakeup_slots.remove(&key); + } + 
self.insert_retry_overflow(bundle_ref, retry_count, retry_at); + true + } + Err(WakeupError::ShuttingDown) => { + if is_new_slot { + let _ = self.retry_wakeup_slots.remove(&key); + } + false } - false } } /// Remove retry-wakeup tracking for a bundle now being resumed. + /// + /// Guarantee: taking a wakeup clears the armed-wakeup bookkeeping for that + /// bundle before retry resumption starts. fn take_retry_wakeup(&mut self, slot: WakeupSlot) -> Option { let wakeup = self.retry_wakeups.remove(&slot)?; - let key = ( - wakeup.bundle_ref.segment_seq.raw(), - wakeup.bundle_ref.bundle_index.raw(), - ); + let key = retry_key(wakeup.bundle_ref); let _ = self.retry_scheduled.remove(&key); let _ = self.retry_wakeup_slots.remove(&key); Some(wakeup) } + /// Resumes one deferred retry, either by sending it downstream again or by + /// re-deferring it if downstream/backpressure constraints still apply. + /// + /// Guarantees: + /// - respects `max_in_flight` + /// - never falls back to hot polling on wakeup-capacity exhaustion + /// - preserves retry semantics for both armed wakeups and local overflow retries + async fn resume_retry( + &mut self, + bundle_ref: BundleRef, + retry_count: u32, + effect_handler: &mut EffectHandler, + ) -> Result<(), Error> { + if !self.can_send_more() { + otel_debug!( + "durable_buffer.retry.deferred", + segment_seq = bundle_ref.segment_seq.raw(), + bundle_index = bundle_ref.bundle_index.raw(), + in_flight = self.pending_bundles.len(), + max_in_flight = self.config.max_in_flight + ); + + if !self + .schedule_retry( + bundle_ref, + retry_count, + self.config.poll_interval, + effect_handler, + ) + .await + { + otel_warn!("durable_buffer.retry.reschedule_failed"); + } + return Ok(()); + } + + let claim_result = { + let (engine, subscriber_id) = self.engine()?; + engine.claim_bundle(subscriber_id, bundle_ref) + }; + + match claim_result { + Ok(handle) => match self.try_process_bundle_handle_with_retry_count( + handle, + retry_count, + effect_handler, 
+ ) { + ProcessBundleResult::Sent => { + otel_debug!( + "durable_buffer.retry.sent", + segment_seq = bundle_ref.segment_seq.raw(), + bundle_index = bundle_ref.bundle_index.raw(), + retry_count = retry_count + ); + } + ProcessBundleResult::Skipped => { + otel_warn!( + "durable_buffer.retry.skipped", + segment_seq = bundle_ref.segment_seq.raw(), + bundle_index = bundle_ref.bundle_index.raw() + ); + } + ProcessBundleResult::Backpressure => { + otel_debug!( + "durable_buffer.retry.backpressure", + segment_seq = bundle_ref.segment_seq.raw(), + bundle_index = bundle_ref.bundle_index.raw() + ); + + if !self + .schedule_retry( + bundle_ref, + retry_count, + self.config.poll_interval, + effect_handler, + ) + .await + { + otel_warn!("durable_buffer.retry.reschedule_failed"); + } + } + ProcessBundleResult::Error(e) => { + return Err(e); + } + }, + Err(e) => { + otel_debug!( + "durable_buffer.retry.claim_failed", + segment_seq = bundle_ref.segment_seq.raw(), + bundle_index = bundle_ref.bundle_index.raw(), + error = %e + ); + } + } + + Ok(()) + } + + /// Runs due overflowed retries directly from `TimerTick` while there is + /// time left in the tick budget. + /// + /// Guarantee: overdue retries still make forward progress even if the wakeup + /// scheduler remains full for an extended period. + async fn handle_due_retry_overflow( + &mut self, + deadline: Instant, + effect_handler: &mut EffectHandler, + ) -> Result<(), Error> { + loop { + if Instant::now() >= deadline || !self.can_send_more() { + break; + } + + let Some(RetryWakeup { + bundle_ref, + retry_count, + }) = self.take_due_retry_overflow(Instant::now()) + else { + break; + }; + + self.resume_retry(bundle_ref, retry_count, effect_handler) + .await?; + } + + Ok(()) + } + /// Lazily initialize the Quiver engine on first use. 
async fn ensure_engine_initialized(&mut self) -> Result<(), Error> { match &self.engine_state { @@ -1197,6 +1500,15 @@ impl DurableBuffer { let deadline = Instant::now() + drain_budget; let mut bundles_processed = 0usize; + // First, resume any overflowed retries whose backoff has elapsed. + // This preserves the retry delay guarantee even when the shared wakeup + // scheduler is saturated and some retries had to stay local. + self.handle_due_retry_overflow(deadline, effect_handler) + .await?; + // If wakeup capacity became available while handling due overflowed + // retries, move waiting retries back to the normal wakeup path. + self.promote_retry_overflow_to_wakeups(effect_handler); + // Track the first skipped bundle to detect when we've cycled through all // available bundles without making progress (all are in-flight or scheduled). let mut first_skipped: Option<(u64, u32)> = None; @@ -1344,10 +1656,10 @@ impl DurableBuffer { // Skip if this bundle is scheduled for retry (waiting for backoff). // This enforces the exponential backoff - poll_next_bundle() returns - // deferred bundles immediately, but we should wait for delay_data to fire. + // deferred bundles immediately, but we should wait for the retry delay. if self.retry_scheduled.contains(&key) { // Bundle is waiting for backoff. Release the claim; it will be - // re-claimed when the delay_data retry ticket fires. + // re-claimed when a retry wakeup or due overflow retry resumes it. drop(handle); // Implicit defer return ProcessBundleResult::Skipped; } @@ -1600,99 +1912,13 @@ impl DurableBuffer { }) = self.take_retry_wakeup(slot) else { otel_warn!("durable_buffer.retry.unknown_wakeup", wakeup_slot = slot.0); + self.promote_retry_overflow_to_wakeups(effect_handler); return Ok(()); }; - // Check max_in_flight limit - if !self.can_send_more() { - // At capacity - re-schedule with a short delay. 
- otel_debug!( - "durable_buffer.retry.deferred", - segment_seq = bundle_ref.segment_seq.raw(), - bundle_index = bundle_ref.bundle_index.raw(), - in_flight = self.pending_bundles.len(), - max_in_flight = self.config.max_in_flight - ); - - if !self - .schedule_retry( - bundle_ref, - retry_count, - self.config.poll_interval, - effect_handler, - ) - .await - { - otel_warn!("durable_buffer.retry.reschedule_failed"); - } - return Ok(()); - } - - // Re-claim the bundle from Quiver - let claim_result = { - let (engine, subscriber_id) = self.engine()?; - engine.claim_bundle(subscriber_id, bundle_ref) - }; - - match claim_result { - Ok(handle) => { - // Successfully re-claimed, now send downstream - match self.try_process_bundle_handle_with_retry_count( - handle, - retry_count, - effect_handler, - ) { - ProcessBundleResult::Sent => { - otel_debug!( - "durable_buffer.retry.sent", - segment_seq = bundle_ref.segment_seq.raw(), - bundle_index = bundle_ref.bundle_index.raw(), - retry_count = retry_count - ); - } - ProcessBundleResult::Skipped => { - // Shouldn't happen - we just claimed it and removed from retry_scheduled - otel_warn!( - "durable_buffer.retry.skipped", - segment_seq = bundle_ref.segment_seq.raw(), - bundle_index = bundle_ref.bundle_index.raw() - ); - } - ProcessBundleResult::Backpressure => { - // Channel full - the handle was dropped (deferred). - // Re-schedule retry with a short delay. - otel_debug!( - "durable_buffer.retry.backpressure", - segment_seq = bundle_ref.segment_seq.raw(), - bundle_index = bundle_ref.bundle_index.raw() - ); - - // Short delay for backpressure (not exponential - this isn't a failure). - // If scheduling fails, poll will pick it up. 
- let _ = self - .schedule_retry( - bundle_ref, - retry_count, - self.config.poll_interval, - effect_handler, - ) - .await; - } - ProcessBundleResult::Error(e) => { - return Err(e); - } - } - } - Err(e) => { - // Claim failed - bundle may have been resolved or segment dropped - otel_debug!( - "durable_buffer.retry.claim_failed", - segment_seq = bundle_ref.segment_seq.raw(), - bundle_index = bundle_ref.bundle_index.raw(), - error = %e - ); - } - } + self.resume_retry(bundle_ref, retry_count, effect_handler) + .await?; + self.promote_retry_overflow_to_wakeups(effect_handler); Ok(()) } @@ -2063,6 +2289,64 @@ mod tests { assert!(processor.take_retry_wakeup(WakeupSlot(999)).is_none()); } + /// Scenario: a retry was deferred in local overflow state because wakeup + /// capacity was exhausted, and its due time has now arrived. + /// Guarantees: taking that retry clears all overflow bookkeeping, removes it + /// from `retry_scheduled`, and returns the original `(bundle_ref, retry_count)`. + #[test] + fn test_take_due_retry_overflow_clears_tracking() { + use otap_df_engine::context::ControllerContext; + use otap_df_telemetry::registry::TelemetryRegistryHandle; + + let registry = TelemetryRegistryHandle::default(); + let controller_ctx = ControllerContext::new(registry); + let pipeline_ctx = + controller_ctx.pipeline_context_with("test".into(), "test".into(), 0, 1, 0); + + let config = DurableBufferConfig { + path: std::path::PathBuf::from("/tmp/test-retry-overflow"), + retention_size_cap: byte_unit::Byte::from_u64(256 * 1024 * 1024), + max_age: None, + size_cap_policy: SizeCapPolicy::Backpressure, + poll_interval: Duration::from_millis(100), + otlp_handling: OtlpHandling::PassThrough, + max_segment_open_duration: Duration::from_secs(1), + initial_retry_interval: Duration::from_secs(1), + max_retry_interval: Duration::from_secs(30), + retry_multiplier: 2.0, + max_in_flight: 1000, + }; + + let mut processor = DurableBuffer::new(config, &pipeline_ctx).unwrap(); + let 
bundle_ref = BundleRef { + segment_seq: SegmentSeq::new(321), + bundle_index: BundleIndex::new(7), + }; + let key = retry_key(bundle_ref); + let retry_at = Instant::now(); + + processor.insert_retry_overflow(bundle_ref, 4, retry_at); + + assert!(processor.retry_scheduled.contains(&key)); + assert!(processor.retry_overflow.contains_key(&key)); + assert_eq!(processor.retry_overflow_order.len(), 1); + + let retry = processor + .take_due_retry_overflow(retry_at + Duration::from_millis(1)) + .expect("retry should be due"); + + assert_eq!(retry.bundle_ref.segment_seq.raw(), 321); + assert_eq!(retry.bundle_ref.bundle_index.raw(), 7); + assert_eq!(retry.retry_count, 4); + assert!(!processor.retry_scheduled.contains(&key)); + assert!(!processor.retry_overflow.contains_key(&key)); + assert!(processor.retry_overflow_order.is_empty()); + } + + /// Scenario: one transient NACK arms a normal processor-local wakeup and the + /// wakeup control message is later delivered through the processor inbox. + /// Guarantees: the retry stays deferred until the wakeup arrives, and that + /// wakeup resumes normal downstream delivery exactly once. #[test] fn test_retry_wakeup_resumes_retry_logic() { use otap_df_config::node::NodeUserConfig; @@ -2146,6 +2430,110 @@ mod tests { .validate(|_| async {}); } + /// Scenario: two transient NACKs occur while the processor only has wakeup + /// capacity for one armed retry. + /// Guarantees: the overflowed retry remains deferred instead of hot-repolling, + /// a later `TimerTick` resumes it once due, and the originally armed wakeup + /// path still resumes its retry independently. 
+ #[test] + fn test_retry_capacity_overflow_uses_local_deferral() { + use otap_df_config::node::NodeUserConfig; + use otap_df_engine::config::ProcessorConfig; + use otap_df_engine::context::ControllerContext; + use otap_df_engine::control::pipeline_completion_msg_channel; + use otap_df_engine::message::Message; + use otap_df_engine::testing::processor::TestRuntime; + use otap_df_engine::testing::test_node; + use otap_df_otap::testing::next_nack; + use otap_df_pdata::encode::encode_logs_otap_batch; + use otap_df_pdata::testing::fixtures::DataGenerator; + use serde_json::json; + + let rt = TestRuntime::new(); + let controller = ControllerContext::new(rt.metrics_registry()); + let pipeline_ctx = controller.pipeline_context_with("grp".into(), "pipe".into(), 0, 1, 0); + let temp_dir = tempfile::tempdir().expect("tempdir"); + + let mut node_config = NodeUserConfig::new_processor_config(DURABLE_BUFFER_URN); + node_config.config = json!({ + "path": temp_dir.path(), + "retention_size_cap": "256 MiB", + "poll_interval": "50ms", + "max_segment_open_duration": "1s", + "initial_retry_interval": "100ms", + "max_retry_interval": "100ms", + "retry_multiplier": 2.0, + "max_in_flight": 1000 + }); + + let processor = create_durable_buffer( + pipeline_ctx, + test_node("durable-buffer-retry-overflow"), + Arc::new(node_config), + &ProcessorConfig::with_channel_capacities("durable-buffer-retry-overflow", 1, 100), + ) + .expect("create durable buffer"); + + rt.set_processor(processor) + .run_test(move |mut ctx| async move { + let (pipeline_completion_tx, _pipeline_completion_rx) = + pipeline_completion_msg_channel(10); + ctx.set_pipeline_completion_sender(pipeline_completion_tx); + + let mut datagen = DataGenerator::new(2); + for _ in 0..2 { + let input = datagen.generate_logs(); + let rec = encode_logs_otap_batch(&input).expect("encode logs"); + ctx.process(Message::PData(OtapPdata::new_default(rec.into()))) + .await + .expect("process input"); + } + + 
ctx.process(Message::Control(NodeControlMsg::TimerTick {})) + .await + .expect("process timer tick"); + let mut outputs = ctx.drain_pdata().await; + assert_eq!(outputs.len(), 2, "timer tick should emit two bundles"); + + for sent in outputs.drain(..) { + let (_, nack) = + next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); + ctx.process(Message::Control(NodeControlMsg::Nack(nack))) + .await + .expect("process nack"); + } + + ctx.process(Message::Control(NodeControlMsg::TimerTick {})) + .await + .expect("process immediate timer tick"); + assert!( + ctx.drain_pdata().await.is_empty(), + "capacity overflow retries should stay deferred until due" + ); + + ctx.sleep(Duration::from_millis(200)).await; + ctx.process(Message::Control(NodeControlMsg::TimerTick {})) + .await + .expect("process due timer tick"); + let overflow_retry = ctx.drain_pdata().await; + assert_eq!( + overflow_retry.len(), + 1, + "a due overflow retry should resume on timer tick even without wakeup capacity" + ); + + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: WakeupSlot(0), + when: Instant::now(), + })) + .await + .expect("process armed retry wakeup"); + let wakeup_retry = ctx.drain_pdata().await; + assert_eq!(wakeup_retry.len(), 1, "armed wakeup should still resume retry delivery"); + }) + .validate(|_| async {}); + } + #[test] fn test_backoff_calculation() { use otap_df_engine::context::ControllerContext; From fc65d842dbd14705122636d378f24f2073c80958 Mon Sep 17 00:00:00 2001 From: lquerel Date: Thu, 2 Apr 2026 17:05:47 -0700 Subject: [PATCH 09/18] Add wakeup regression tests --- .../src/processors/batch_processor/mod.rs | 63 +++++++ .../durable_buffer_processor/mod.rs | 174 ++++++++++++++++++ .../crates/engine/src/message.rs | 36 ++++ .../crates/engine/src/node_local_scheduler.rs | 36 ++++ 4 files changed, 309 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs 
b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs index 0feee78806..fa19d2e7bd 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs @@ -2164,6 +2164,69 @@ mod tests { }); } + /// Scenario: buffered input has armed a real batch wakeup, but the processor + /// first receives a foreign wakeup slot that does not decode to any local + /// `(format, signal)` timer. + /// Guarantees: the foreign wakeup is ignored without flushing or corrupting + /// state, and the real/current wakeup still flushes the buffered input later. + #[test] + fn test_unknown_wakeup_slot_is_ignored_without_side_effects() { + let (telemetry_registry, metrics_reporter, phase) = setup_test_runtime(json!({ + "otap": { + "min_size": 5, + "max_size": 10, + "sizer": "items", + }, + "max_batch_duration": "50ms" + })); + + phase + .run_test(move |mut ctx| async move { + let mut datagen = DataGenerator::new(1); + let input = datagen.generate_logs(); + + let rec = encode_logs_otap_batch(&input).expect("encode logs"); + ctx.process(Message::PData(OtapPdata::new_default(rec.into()))) + .await + .expect("process input"); + assert!( + ctx.drain_pdata().await.is_empty(), + "input should remain buffered until the real wakeup" + ); + + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: WakeupSlot(99), + when: Instant::now(), + })) + .await + .expect("process unknown wakeup"); + assert!( + ctx.drain_pdata().await.is_empty(), + "foreign wakeup should be ignored" + ); + + let current_when = Instant::now() + Duration::from_secs(1); + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: wakeup_slot(SignalFormat::OtapRecords, SignalType::Logs), + when: current_when, + })) + .await + .expect("process current wakeup"); + let flushed = ctx.drain_pdata().await; + assert_eq!(flushed.len(), 1, "real wakeup should flush buffered input"); + + 
ctx.process(Message::Control(NodeControlMsg::CollectTelemetry { + metrics_reporter, + })) + .await + .expect("collect telemetry"); + }) + .validate(move |_| async move { + tokio::time::sleep(Duration::from_millis(50)).await; + verify_item_metrics(&telemetry_registry, SignalType::Logs, 3); + }); + } + // A partial batch that never reached the size threshold must still flush on // Shutdown, and its downstream Ack must release the upstream completion state // rather than leaving correlated requests stuck. diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index c9efc3480e..07c0a34370 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -2343,6 +2343,73 @@ mod tests { assert!(processor.retry_overflow_order.is_empty()); } + /// Scenario: multiple retries overflow the wakeup scheduler and are stored + /// locally with the same due timestamp. + /// Guarantees: equal-deadline overflow retries are resumed in insertion + /// order using the local sequence tie-breaker. 
+ #[test] + fn test_equal_deadline_overflow_retries_follow_sequence_order() { + use otap_df_engine::context::ControllerContext; + use otap_df_telemetry::registry::TelemetryRegistryHandle; + + let registry = TelemetryRegistryHandle::default(); + let controller_ctx = ControllerContext::new(registry); + let pipeline_ctx = + controller_ctx.pipeline_context_with("test".into(), "test".into(), 0, 1, 0); + + let config = DurableBufferConfig { + path: std::path::PathBuf::from("/tmp/test-retry-overflow-order"), + retention_size_cap: byte_unit::Byte::from_u64(256 * 1024 * 1024), + max_age: None, + size_cap_policy: SizeCapPolicy::Backpressure, + poll_interval: Duration::from_millis(100), + otlp_handling: OtlpHandling::PassThrough, + max_segment_open_duration: Duration::from_secs(1), + initial_retry_interval: Duration::from_secs(1), + max_retry_interval: Duration::from_secs(30), + retry_multiplier: 2.0, + max_in_flight: 1000, + }; + + let mut processor = DurableBuffer::new(config, &pipeline_ctx).unwrap(); + let retry_at = Instant::now(); + let first = BundleRef { + segment_seq: SegmentSeq::new(111), + bundle_index: BundleIndex::new(1), + }; + let second = BundleRef { + segment_seq: SegmentSeq::new(111), + bundle_index: BundleIndex::new(2), + }; + let third = BundleRef { + segment_seq: SegmentSeq::new(111), + bundle_index: BundleIndex::new(3), + }; + + processor.insert_retry_overflow(first, 1, retry_at); + processor.insert_retry_overflow(second, 2, retry_at); + processor.insert_retry_overflow(third, 3, retry_at); + + let first_retry = processor + .take_due_retry_overflow(retry_at + Duration::from_millis(1)) + .expect("first retry should be due"); + let second_retry = processor + .take_due_retry_overflow(retry_at + Duration::from_millis(1)) + .expect("second retry should be due"); + let third_retry = processor + .take_due_retry_overflow(retry_at + Duration::from_millis(1)) + .expect("third retry should be due"); + + assert_eq!(first_retry.bundle_ref.bundle_index.raw(), 1); + 
assert_eq!(first_retry.retry_count, 1); + assert_eq!(second_retry.bundle_ref.bundle_index.raw(), 2); + assert_eq!(second_retry.retry_count, 2); + assert_eq!(third_retry.bundle_ref.bundle_index.raw(), 3); + assert_eq!(third_retry.retry_count, 3); + assert!(processor.retry_overflow.is_empty()); + assert!(processor.retry_overflow_order.is_empty()); + } + /// Scenario: one transient NACK arms a normal processor-local wakeup and the /// wakeup control message is later delivered through the processor inbox. /// Guarantees: the retry stays deferred until the wakeup arrives, and that @@ -2430,6 +2497,113 @@ mod tests { .validate(|_| async {}); } + /// Scenario: an unrelated wakeup arrives while durable-buffer still has one + /// armed retry and one overflowed retry pending. + /// Guarantees: the unrelated wakeup does not cause early redelivery or lose + /// either deferred retry; the overflowed retry still resumes on `TimerTick` + /// once due, and the armed retry still resumes through its wakeup path. 
+ #[test] + fn test_unknown_wakeup_does_not_lose_overflowed_retry() { + use otap_df_config::node::NodeUserConfig; + use otap_df_engine::config::ProcessorConfig; + use otap_df_engine::context::ControllerContext; + use otap_df_engine::control::pipeline_completion_msg_channel; + use otap_df_engine::message::Message; + use otap_df_engine::testing::processor::TestRuntime; + use otap_df_engine::testing::test_node; + use otap_df_otap::testing::next_nack; + use otap_df_pdata::encode::encode_logs_otap_batch; + use otap_df_pdata::testing::fixtures::DataGenerator; + use serde_json::json; + + let rt = TestRuntime::new(); + let controller = ControllerContext::new(rt.metrics_registry()); + let pipeline_ctx = controller.pipeline_context_with("grp".into(), "pipe".into(), 0, 1, 0); + let temp_dir = tempfile::tempdir().expect("tempdir"); + + let mut node_config = NodeUserConfig::new_processor_config(DURABLE_BUFFER_URN); + node_config.config = json!({ + "path": temp_dir.path(), + "retention_size_cap": "256 MiB", + "poll_interval": "50ms", + "max_segment_open_duration": "1s", + "initial_retry_interval": "100ms", + "max_retry_interval": "100ms", + "retry_multiplier": 2.0, + "max_in_flight": 1000 + }); + + let processor = create_durable_buffer( + pipeline_ctx, + test_node("durable-buffer-unknown-wakeup"), + Arc::new(node_config), + &ProcessorConfig::with_channel_capacities("durable-buffer-unknown-wakeup", 1, 100), + ) + .expect("create durable buffer"); + + rt.set_processor(processor) + .run_test(move |mut ctx| async move { + let (pipeline_completion_tx, _pipeline_completion_rx) = + pipeline_completion_msg_channel(10); + ctx.set_pipeline_completion_sender(pipeline_completion_tx); + + let mut datagen = DataGenerator::new(2); + for _ in 0..2 { + let input = datagen.generate_logs(); + let rec = encode_logs_otap_batch(&input).expect("encode logs"); + ctx.process(Message::PData(OtapPdata::new_default(rec.into()))) + .await + .expect("process input"); + } + + 
ctx.process(Message::Control(NodeControlMsg::TimerTick {})) + .await + .expect("process timer tick"); + let mut outputs = ctx.drain_pdata().await; + assert_eq!(outputs.len(), 2, "timer tick should emit two bundles"); + + for sent in outputs.drain(..) { + let (_, nack) = + next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); + ctx.process(Message::Control(NodeControlMsg::Nack(nack))) + .await + .expect("process nack"); + } + + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: WakeupSlot(999), + when: Instant::now(), + })) + .await + .expect("process unknown wakeup"); + assert!( + ctx.drain_pdata().await.is_empty(), + "unknown wakeup should not redeliver deferred retries" + ); + + ctx.sleep(Duration::from_millis(200)).await; + ctx.process(Message::Control(NodeControlMsg::TimerTick {})) + .await + .expect("process due timer tick"); + let overflow_retry = ctx.drain_pdata().await; + assert_eq!( + overflow_retry.len(), + 1, + "overflowed retry should still resume on timer tick" + ); + + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: WakeupSlot(0), + when: Instant::now(), + })) + .await + .expect("process armed retry wakeup"); + let wakeup_retry = ctx.drain_pdata().await; + assert_eq!(wakeup_retry.len(), 1, "armed retry should still resume"); + }) + .validate(|_| async {}); + } + /// Scenario: two transient NACKs occur while the processor only has wakeup /// capacity for one armed retry. /// Guarantees: the overflowed retry remains deferred instead of hot-repolling, diff --git a/rust/otap-dataflow/crates/engine/src/message.rs b/rust/otap-dataflow/crates/engine/src/message.rs index c6a8957f5e..d92a6f33ac 100644 --- a/rust/otap-dataflow/crates/engine/src/message.rs +++ b/rust/otap-dataflow/crates/engine/src/message.rs @@ -957,6 +957,42 @@ mod tests { ); } + /// Scenario: a normal control message is already buffered in the processor + /// inbox when a processor-local wakeup also becomes due. 
+ /// Guarantees: the buffered control message is delivered first, and the + /// due wakeup follows as ordinary control traffic rather than bypassing the + /// existing control queue. + #[tokio::test] + async fn processor_inbox_keeps_buffered_control_ahead_of_due_wakeups() { + let (control_tx, _pdata_tx, scheduler, mut inbox) = local_processor_inbox(4); + let when = Instant::now(); + + control_tx + .send_async(NodeControlMsg::Config { + config: serde_json::json!({"mode": "keep-control-order"}), + }) + .await + .expect("config should enqueue"); + scheduler + .set_wakeup(crate::control::WakeupSlot(0), when) + .expect("wakeup should schedule"); + + let first = inbox.recv_when(true).await.expect("message should arrive"); + assert!(matches!( + first, + Message::Control(NodeControlMsg::Config { .. }) + )); + + let second = inbox.recv_when(true).await.expect("message should arrive"); + assert!(matches!( + second, + Message::Control(NodeControlMsg::Wakeup { + slot: crate::control::WakeupSlot(0), + when: observed, + }) if observed == when + )); + } + #[tokio::test] async fn processor_inbox_rejects_wakeups_after_shutdown_latch() { let (control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(4); diff --git a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs index 2ed31fe7da..56b9c0bc71 100644 --- a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs +++ b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs @@ -379,6 +379,42 @@ mod tests { assert_eq!(scheduler.pop_due(when), None); } + /// Scenario: a wakeup is rescheduled after heap reordering and then + /// canceled while it is tracked at a moved, non-root heap index. + /// Guarantees: cancellation removes the correct slot, preserves heap/index + /// consistency, and leaves the remaining wakeups due in the expected order. 
+ #[test] + fn cancel_after_reschedule_removes_the_moved_entry() { + let mut scheduler = NodeLocalScheduler::new(4); + let now = Instant::now(); + let first = now + Duration::from_secs(1); + let second = now + Duration::from_secs(10); + let third = now + Duration::from_secs(20); + let fourth = now + Duration::from_secs(30); + let moved = now + Duration::from_secs(2); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(1), first), Ok(())); + assert_eq!(scheduler.set_wakeup(WakeupSlot(2), second), Ok(())); + assert_eq!(scheduler.set_wakeup(WakeupSlot(3), third), Ok(())); + assert_eq!(scheduler.set_wakeup(WakeupSlot(4), fourth), Ok(())); + assert_eq!(scheduler.set_wakeup(WakeupSlot(3), moved), Ok(())); + + let moved_index = scheduler + .wakeup_indices + .get(&WakeupSlot(3)) + .copied() + .expect("rescheduled slot should still be tracked"); + assert!(moved_index > 0, "rescheduled slot should be a non-root entry"); + + assert!(scheduler.cancel_wakeup(WakeupSlot(3))); + assert_heap_bound(&scheduler); + assert_eq!(scheduler.pop_due(first), Some((WakeupSlot(1), first))); + assert_eq!(scheduler.pop_due(moved), None); + assert_eq!(scheduler.pop_due(second), Some((WakeupSlot(2), second))); + assert_eq!(scheduler.pop_due(fourth), Some((WakeupSlot(4), fourth))); + assert_eq!(scheduler.next_expiry(), None); + } + #[test] fn capacity_is_enforced_on_distinct_live_slots() { let mut scheduler = NodeLocalScheduler::new(1); From dfc9519a5582d195944acba0cafce49a8fbc55c6 Mon Sep 17 00:00:00 2001 From: lquerel Date: Thu, 2 Apr 2026 17:17:57 -0700 Subject: [PATCH 10/18] Document wakeup test guarantees --- .../src/processors/durable_buffer_processor/mod.rs | 6 +++++- .../crates/engine/src/node_local_scheduler.rs | 11 +++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index 07c0a34370..5d593ef1cb 100644 
--- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -2703,7 +2703,11 @@ mod tests { .await .expect("process armed retry wakeup"); let wakeup_retry = ctx.drain_pdata().await; - assert_eq!(wakeup_retry.len(), 1, "armed wakeup should still resume retry delivery"); + assert_eq!( + wakeup_retry.len(), + 1, + "armed wakeup should still resume retry delivery" + ); }) .validate(|_| async {}); } diff --git a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs index 56b9c0bc71..5e2e8b085d 100644 --- a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs +++ b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs @@ -220,11 +220,7 @@ impl NodeLocalScheduler { return None; } - let slot = self - .wakeups - .first() - .expect("due wakeup should exist") - .slot; + let slot = self.wakeups.first().expect("due wakeup should exist").slot; let removed_index = self .wakeup_indices .remove(&slot) @@ -404,7 +400,10 @@ mod tests { .get(&WakeupSlot(3)) .copied() .expect("rescheduled slot should still be tracked"); - assert!(moved_index > 0, "rescheduled slot should be a non-root entry"); + assert!( + moved_index > 0, + "rescheduled slot should be a non-root entry" + ); assert!(scheduler.cancel_wakeup(WakeupSlot(3))); assert_heap_bound(&scheduler); From 8146d390bc47db0efea539ab418aa407417311a9 Mon Sep 17 00:00:00 2001 From: lquerel Date: Thu, 2 Apr 2026 18:18:46 -0700 Subject: [PATCH 11/18] Document batch wakeup slot test --- .../crates/core-nodes/src/processors/batch_processor/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs index fa19d2e7bd..d16986d0cc 100644 --- 
a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs @@ -2049,6 +2049,10 @@ mod tests { test_timer_flush(datagen.generate_logs().into(), true); } + /// Scenario: the batch processor derives wakeup slots from the supported + /// `(format, signal)` pairs used by its internal timers. + /// Guarantees: every supported pair round-trips through the encoder/decoder, + /// and each pair maps to a distinct wakeup slot. #[test] fn test_wakeup_slot_round_trip_and_uniqueness() { let slots = [ From 5725ef2dc0a1a74266bde224169f8307c71e7725 Mon Sep 17 00:00:00 2001 From: lquerel Date: Fri, 3 Apr 2026 10:23:38 -0700 Subject: [PATCH 12/18] Refine local wakeup API and durable retry state --- .../src/processors/batch_processor/mod.rs | 14 +- .../durable_buffer_processor/mod.rs | 524 +++--------------- .../retry_wakeup_state.rs | 460 +++++++++++++++ rust/otap-dataflow/crates/engine/README.md | 17 +- .../crates/engine/src/control.rs | 15 +- .../crates/engine/src/effect_handler.rs | 16 +- rust/otap-dataflow/crates/engine/src/lib.rs | 2 +- .../crates/engine/src/local/processor.rs | 8 +- .../crates/engine/src/message.rs | 24 +- .../crates/engine/src/node_local_scheduler.rs | 159 ++++-- .../crates/engine/src/shared/processor.rs | 8 +- 11 files changed, 722 insertions(+), 525 deletions(-) create mode 100644 rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs index d16986d0cc..749dd215ec 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs @@ -1136,7 +1136,7 @@ impl local::Processor for BatchProcessor { message: e.to_string(), } }), - NodeControlMsg::Wakeup { slot, 
when } => { + NodeControlMsg::Wakeup { slot, when, .. } => { let Some((format, signal)) = signal_from_wakeup_slot(slot) else { return Ok(()); }; @@ -1390,6 +1390,7 @@ where effect .set_wakeup(Self::wakeup_slot(signal), now + timeout) + .map(|_| ()) .map_err(|_| EngineError::ProcessorError { processor: effect.processor_id(), kind: ProcessorErrorKind::Other, @@ -1809,6 +1810,7 @@ mod tests { ctx.process(Message::Control(NodeControlMsg::Wakeup { slot, when, + revision: 0, })) .await .expect("process wakeup"); @@ -2134,6 +2136,7 @@ mod tests { ctx.process(Message::Control(NodeControlMsg::Wakeup { slot: wakeup_slot(SignalFormat::OtapRecords, SignalType::Logs), when: stale_when, + revision: 0, })) .await .expect("process stale wakeup"); @@ -2146,6 +2149,7 @@ mod tests { ctx.process(Message::Control(NodeControlMsg::Wakeup { slot: wakeup_slot(SignalFormat::OtapRecords, SignalType::Logs), when: current_when, + revision: 1, })) .await .expect("process current wakeup"); @@ -2201,6 +2205,7 @@ mod tests { ctx.process(Message::Control(NodeControlMsg::Wakeup { slot: WakeupSlot(99), when: Instant::now(), + revision: 0, })) .await .expect("process unknown wakeup"); @@ -2213,6 +2218,7 @@ mod tests { ctx.process(Message::Control(NodeControlMsg::Wakeup { slot: wakeup_slot(SignalFormat::OtapRecords, SignalType::Logs), when: current_when, + revision: 1, })) .await .expect("process current wakeup"); @@ -2827,7 +2833,11 @@ mod tests { wakeup_slot(SignalFormat::OtlpBytes, SignalType::Logs), wakeup_slot(SignalFormat::OtapRecords, SignalType::Logs), ] { - ctx.process(Message::Control(NodeControlMsg::Wakeup { slot, when })) + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot, + when, + revision: 0, + })) .await .expect("process wakeup"); } diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index 5d593ef1cb..539c4d6299 100644 --- 
a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -71,9 +71,10 @@ mod bundle_adapter; mod config; +mod retry_wakeup_state; use std::collections::hash_map::Entry; -use std::collections::{BTreeSet, HashMap, HashSet}; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -97,6 +98,9 @@ use bundle_adapter::{ OtapRecordBundleAdapter, OtlpBytesAdapter, convert_bundle_to_pdata, signal_type_from_slot_id, }; pub use config::{DurableBufferConfig, OtlpHandling, SizeCapPolicy}; +use retry_wakeup_state::RetryWakeupState; +#[cfg(test)] +use retry_wakeup_state::{retry_key, retry_wakeup_slot}; use otap_df_config::SignalType; use otap_df_config::error::Error as ConfigError; @@ -104,7 +108,9 @@ use otap_df_config::node::NodeUserConfig; use otap_df_engine::config::ProcessorConfig; use otap_df_engine::context::PipelineContext; use otap_df_engine::control::Context8u8; -use otap_df_engine::control::{AckMsg, CallData, NackMsg, NodeControlMsg, WakeupSlot}; +use otap_df_engine::control::{ + AckMsg, CallData, NackMsg, NodeControlMsg, WakeupRevision, WakeupSlot, +}; use otap_df_engine::error::Error; use otap_df_engine::local::processor::EffectHandler; use otap_df_engine::message::Message; @@ -112,7 +118,6 @@ use otap_df_engine::node::NodeId; use otap_df_engine::processor::ProcessorWrapper; use otap_df_engine::{ ConsumerEffectHandlerExtension, Interests, ProcessorFactory, ProducerEffectHandlerExtension, - WakeupError, }; use otap_df_pdata::{OtapArrowRecords, OtapPayload}; use otap_df_telemetry::instrument::{Counter, Gauge, ObserveCounter}; @@ -319,10 +324,6 @@ fn decode_bundle_ref(calldata: &CallData) -> Option { }) } -fn retry_key(bundle_ref: BundleRef) -> (u64, u32) { - (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()) -} - /// State for tracking a pending downstream delivery. 
/// /// Holds the Quiver bundle handle to keep the bundle claimed while in-flight. @@ -339,28 +340,6 @@ struct PendingBundle { signal_type: SignalType, } -/// Local retry state held between wakeup scheduling and wakeup delivery. -#[derive(Clone, Copy)] -struct RetryWakeup { - bundle_ref: BundleRef, - retry_count: u32, -} - -#[derive(Clone, Copy)] -struct OverflowRetry { - bundle_ref: BundleRef, - retry_count: u32, - retry_at: Instant, - sequence: u64, -} - -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] -struct OverflowRetryOrder { - retry_at: Instant, - sequence: u64, - key: (u64, u32), -} - /// Result of attempting to process a bundle with non-blocking send. enum ProcessBundleResult { /// Bundle was successfully sent downstream. @@ -431,43 +410,8 @@ pub struct DurableBuffer { /// Key is the (segment_seq, bundle_index) pair encoded as a u128 for fast lookup. pending_bundles: HashMap<(u64, u32), PendingBundle>, - /// Bundles currently held out of the normal poll loop while backoff is active. - /// - /// Invariant: every key here is deferred for retry either by an armed wakeup - /// (`retry_wakeup_slots` + `retry_wakeups`) or by the local overflow queue - /// (`retry_overflow` + `retry_overflow_order`). - retry_scheduled: HashSet<(u64, u32)>, - - /// Wakeup slot assigned to each bundle currently waiting for retry. - /// - /// Invariant: this only contains bundles that successfully acquired a slot - /// in the engine wakeup scheduler. Keys present only in `retry_overflow` - /// intentionally have no slot entry yet. - retry_wakeup_slots: HashMap<(u64, u32), WakeupSlot>, - - /// Retry state keyed by wakeup slot. - /// - /// Invariant: for every `(key -> slot)` in `retry_wakeup_slots`, there is a - /// matching `(slot -> RetryWakeup)` entry here describing the same bundle. - retry_wakeups: HashMap, - - /// Retry state held locally while wakeup scheduling is at capacity. 
- /// - /// Guarantee: overflowed retries remain deferred and keep their target due - /// time even when the engine wakeup scheduler is full. - retry_overflow: HashMap<(u64, u32), OverflowRetry>, - - /// Due-order index for locally deferred retries. - /// - /// Invariant: this contains exactly one ordering key for each entry in - /// `retry_overflow`, using `sequence` as a deterministic tie-breaker. - retry_overflow_order: BTreeSet, - - /// Monotonic slot allocator for retry wakeups. - next_retry_wakeup_slot: u64, - - /// Monotonic tie-breaker for locally deferred retry ordering. - next_retry_overflow_sequence: u64, + /// Processor-local wakeup bookkeeping for retry deferral and overflow. + retry_wakeup_state: RetryWakeupState, /// Configuration. config: DurableBufferConfig, @@ -561,13 +505,7 @@ impl DurableBuffer { Ok(Self { engine_state: EngineState::Uninitialized, pending_bundles: HashMap::new(), - retry_scheduled: HashSet::new(), - retry_wakeup_slots: HashMap::new(), - retry_wakeups: HashMap::new(), - retry_overflow: HashMap::new(), - retry_overflow_order: BTreeSet::new(), - next_retry_wakeup_slot: 0, - next_retry_overflow_sequence: 0, + retry_wakeup_state: RetryWakeupState::new(), config, core_id, num_cores, @@ -663,120 +601,6 @@ impl DurableBuffer { self.pending_bundles.len() < self.config.max_in_flight } - /// Allocates a new processor-local wakeup slot for a retrying bundle. - /// - /// Guarantee: slot values are never reused within one processor instance, - /// which keeps stale wakeups from aliasing a newer retry. - fn next_retry_wakeup_slot(&mut self) -> WakeupSlot { - let slot = WakeupSlot(self.next_retry_wakeup_slot); - self.next_retry_wakeup_slot = self.next_retry_wakeup_slot.saturating_add(1); - slot - } - - /// Builds the deterministic ordering key for an overflowed retry. 
- fn overflow_retry_order(key: (u64, u32), retry: OverflowRetry) -> OverflowRetryOrder { - OverflowRetryOrder { - retry_at: retry.retry_at, - sequence: retry.sequence, - key, - } - } - - /// Removes one overflowed retry from both local indexes. - /// - /// Invariant preserved: `retry_overflow` and `retry_overflow_order` stay in - /// lockstep after every insertion/removal. - fn remove_retry_overflow(&mut self, key: (u64, u32)) -> Option { - let retry = self.retry_overflow.remove(&key)?; - let _ = self - .retry_overflow_order - .remove(&Self::overflow_retry_order(key, retry)); - Some(retry) - } - - /// Defers a retry in local processor state when the engine wakeup scheduler - /// has no free slot. - /// - /// Guarantees: - /// - the bundle remains in `retry_scheduled`, so `poll_next_bundle()` keeps - /// skipping it - /// - the most recent `(retry_count, retry_at)` replaces any older local - /// overflow record for the same bundle - /// - equal due times are processed deterministically by `sequence` - fn insert_retry_overflow( - &mut self, - bundle_ref: BundleRef, - retry_count: u32, - retry_at: Instant, - ) { - let key = retry_key(bundle_ref); - let _ = self.remove_retry_overflow(key); - let retry = OverflowRetry { - bundle_ref, - retry_count, - retry_at, - sequence: self.next_retry_overflow_sequence, - }; - self.next_retry_overflow_sequence = self.next_retry_overflow_sequence.saturating_add(1); - let _ = self.retry_scheduled.insert(key); - let _ = self.retry_overflow.insert(key, retry); - let _ = self - .retry_overflow_order - .insert(Self::overflow_retry_order(key, retry)); - } - - /// Pops the next locally deferred retry only when its due time has arrived. - /// - /// Guarantee: returning a retry clears all local overflow bookkeeping for - /// that bundle so it can be resumed exactly once. 
- fn take_due_retry_overflow(&mut self, now: Instant) -> Option { - let order = *self.retry_overflow_order.first()?; - if order.retry_at > now { - return None; - } - - let _ = self.retry_overflow_order.remove(&order); - let retry = self.retry_overflow.remove(&order.key)?; - let _ = self.retry_scheduled.remove(&order.key); - - Some(RetryWakeup { - bundle_ref: retry.bundle_ref, - retry_count: retry.retry_count, - }) - } - - /// Opportunistically moves overflowed retries back into engine wakeup slots. - /// - /// Guarantees: - /// - never drops a deferred retry when slot acquisition fails - /// - preserves retry due time when promotion succeeds - /// - stops as soon as the scheduler reports `Capacity` or shutdown - fn promote_retry_overflow_to_wakeups(&mut self, effect_handler: &mut EffectHandler) { - while let Some(order) = self.retry_overflow_order.first().copied() { - let Some(retry) = self.retry_overflow.get(&order.key).copied() else { - let _ = self.retry_overflow_order.remove(&order); - continue; - }; - - let slot = self.next_retry_wakeup_slot(); - match effect_handler.set_wakeup(slot, retry.retry_at) { - Ok(()) => { - let _ = self.retry_overflow_order.remove(&order); - let _ = self.retry_overflow.remove(&order.key); - let _ = self.retry_wakeup_slots.insert(order.key, slot); - let _ = self.retry_wakeups.insert( - slot, - RetryWakeup { - bundle_ref: retry.bundle_ref, - retry_count: retry.retry_count, - }, - ); - } - Err(WakeupError::Capacity | WakeupError::ShuttingDown) => break, - } - } - } - /// Schedule a retry for a bundle via a processor-local wakeup. 
/// /// This is the single point of coordination between wakeup scheduling and @@ -795,55 +619,9 @@ impl DurableBuffer { delay: Duration, effect_handler: &mut EffectHandler, ) -> bool { - let key = retry_key(bundle_ref); - let _ = self.remove_retry_overflow(key); - let (slot, is_new_slot) = if let Some(slot) = self.retry_wakeup_slots.get(&key).copied() { - (slot, false) - } else { - let slot = self.next_retry_wakeup_slot(); - let _ = self.retry_wakeup_slots.insert(key, slot); - (slot, true) - }; let retry_at = Instant::now() + delay; - match effect_handler.set_wakeup(slot, retry_at) { - Ok(()) => { - // Track that this bundle is scheduled - poll_next_bundle will skip it - let _ = self.retry_scheduled.insert(key); - let _ = self.retry_wakeups.insert( - slot, - RetryWakeup { - bundle_ref, - retry_count, - }, - ); - true - } - Err(WakeupError::Capacity) => { - if is_new_slot { - let _ = self.retry_wakeup_slots.remove(&key); - } - self.insert_retry_overflow(bundle_ref, retry_count, retry_at); - true - } - Err(WakeupError::ShuttingDown) => { - if is_new_slot { - let _ = self.retry_wakeup_slots.remove(&key); - } - false - } - } - } - - /// Remove retry-wakeup tracking for a bundle now being resumed. - /// - /// Guarantee: taking a wakeup clears the armed-wakeup bookkeeping for that - /// bundle before retry resumption starts. 
- fn take_retry_wakeup(&mut self, slot: WakeupSlot) -> Option { - let wakeup = self.retry_wakeups.remove(&slot)?; - let key = retry_key(wakeup.bundle_ref); - let _ = self.retry_scheduled.remove(&key); - let _ = self.retry_wakeup_slots.remove(&key); - Some(wakeup) + self.retry_wakeup_state + .schedule_at(bundle_ref, retry_count, retry_at, effect_handler) } /// Resumes one deferred retry, either by sending it downstream again or by @@ -959,15 +737,14 @@ impl DurableBuffer { break; } - let Some(RetryWakeup { - bundle_ref, - retry_count, - }) = self.take_due_retry_overflow(Instant::now()) + let Some(retry) = self + .retry_wakeup_state + .take_due_retry_overflow(Instant::now()) else { break; }; - self.resume_retry(bundle_ref, retry_count, effect_handler) + self.resume_retry(retry.bundle_ref(), retry.retry_count(), effect_handler) .await?; } @@ -1503,11 +1280,11 @@ impl DurableBuffer { // First, resume any overflowed retries whose backoff has elapsed. // This preserves the retry delay guarantee even when the shared wakeup // scheduler is saturated and some retries had to stay local. - self.handle_due_retry_overflow(deadline, effect_handler) - .await?; + self.handle_due_retry_overflow(deadline, effect_handler).await?; // If wakeup capacity became available while handling due overflowed // retries, move waiting retries back to the normal wakeup path. - self.promote_retry_overflow_to_wakeups(effect_handler); + self.retry_wakeup_state + .promote_overflow_to_wakeups(effect_handler); // Track the first skipped bundle to detect when we've cycled through all // available bundles without making progress (all are in-flight or scheduled). 
@@ -1562,7 +1339,7 @@ impl DurableBuffer { "durable_buffer.drain.all_blocked", bundles_processed = bundles_processed, in_flight = self.pending_bundles.len(), - retry_scheduled = self.retry_scheduled.len() + retry_scheduled = self.retry_wakeup_state.scheduled_len() ); break; } @@ -1657,7 +1434,7 @@ impl DurableBuffer { // Skip if this bundle is scheduled for retry (waiting for backoff). // This enforces the exponential backoff - poll_next_bundle() returns // deferred bundles immediately, but we should wait for the retry delay. - if self.retry_scheduled.contains(&key) { + if self.retry_wakeup_state.is_deferred_key(key) { // Bundle is waiting for backoff. Release the claim; it will be // re-claimed when a retry wakeup or due overflow retry resumes it. drop(handle); // Implicit defer @@ -1904,21 +1681,25 @@ impl DurableBuffer { async fn handle_retry_wakeup( &mut self, slot: WakeupSlot, + revision: WakeupRevision, effect_handler: &mut EffectHandler, ) -> Result<(), Error> { - let Some(RetryWakeup { - bundle_ref, - retry_count, - }) = self.take_retry_wakeup(slot) + let Some(retry) = self.retry_wakeup_state.take_retry_wakeup(slot, revision) else { - otel_warn!("durable_buffer.retry.unknown_wakeup", wakeup_slot = slot.0); - self.promote_retry_overflow_to_wakeups(effect_handler); + otel_warn!( + "durable_buffer.retry.unknown_wakeup", + wakeup_slot = slot.0.to_string(), + wakeup_revision = revision + ); + self.retry_wakeup_state + .promote_overflow_to_wakeups(effect_handler); return Ok(()); }; - self.resume_retry(bundle_ref, retry_count, effect_handler) + self.resume_retry(retry.bundle_ref(), retry.retry_count(), effect_handler) .await?; - self.promote_retry_overflow_to_wakeups(effect_handler); + self.retry_wakeup_state + .promote_overflow_to_wakeups(effect_handler); Ok(()) } @@ -2124,8 +1905,9 @@ impl otap_df_engine::local::processor::Processor for DurableBuffer { Ok(()) } NodeControlMsg::DrainIngress { .. } => Ok(()), - NodeControlMsg::Wakeup { slot, .. 
} => { - self.handle_retry_wakeup(slot, effect_handler).await + NodeControlMsg::Wakeup { slot, revision, .. } => { + self.handle_retry_wakeup(slot, revision, effect_handler) + .await } NodeControlMsg::DelayedData { .. } => { otel_warn!("durable_buffer.delayed_data.unexpected"); @@ -2209,207 +1991,6 @@ mod tests { assert!(decode_bundle_ref(&calldata).is_none()); } - #[test] - fn test_take_retry_wakeup_clears_tracking() { - use otap_df_engine::context::ControllerContext; - use otap_df_telemetry::registry::TelemetryRegistryHandle; - - let registry = TelemetryRegistryHandle::default(); - let controller_ctx = ControllerContext::new(registry); - let pipeline_ctx = - controller_ctx.pipeline_context_with("test".into(), "test".into(), 0, 1, 0); - - let config = DurableBufferConfig { - path: std::path::PathBuf::from("/tmp/test-retry-wakeup"), - retention_size_cap: byte_unit::Byte::from_u64(256 * 1024 * 1024), - max_age: None, - size_cap_policy: SizeCapPolicy::Backpressure, - poll_interval: Duration::from_millis(100), - otlp_handling: OtlpHandling::PassThrough, - max_segment_open_duration: Duration::from_secs(1), - initial_retry_interval: Duration::from_secs(1), - max_retry_interval: Duration::from_secs(30), - retry_multiplier: 2.0, - max_in_flight: 1000, - }; - - let mut processor = DurableBuffer::new(config, &pipeline_ctx).unwrap(); - let bundle_ref = BundleRef { - segment_seq: SegmentSeq::new(98765), - bundle_index: BundleIndex::new(123), - }; - let key = (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()); - let slot = WakeupSlot(7); - let _ = processor.retry_scheduled.insert(key); - let _ = processor.retry_wakeup_slots.insert(key, slot); - let _ = processor.retry_wakeups.insert( - slot, - RetryWakeup { - bundle_ref, - retry_count: 3, - }, - ); - - let taken = processor - .take_retry_wakeup(slot) - .expect("retry wakeup should exist"); - assert_eq!(taken.bundle_ref.segment_seq.raw(), 98765); - assert_eq!(taken.bundle_ref.bundle_index.raw(), 123); - 
assert_eq!(taken.retry_count, 3); - assert!(!processor.retry_scheduled.contains(&key)); - assert!(!processor.retry_wakeup_slots.contains_key(&key)); - assert!(!processor.retry_wakeups.contains_key(&slot)); - } - - #[test] - fn test_take_retry_wakeup_unknown_slot_is_ignored() { - use otap_df_engine::context::ControllerContext; - use otap_df_telemetry::registry::TelemetryRegistryHandle; - - let registry = TelemetryRegistryHandle::default(); - let controller_ctx = ControllerContext::new(registry); - let pipeline_ctx = - controller_ctx.pipeline_context_with("test".into(), "test".into(), 0, 1, 0); - - let config = DurableBufferConfig { - path: std::path::PathBuf::from("/tmp/test-retry-wakeup-miss"), - retention_size_cap: byte_unit::Byte::from_u64(256 * 1024 * 1024), - max_age: None, - size_cap_policy: SizeCapPolicy::Backpressure, - poll_interval: Duration::from_millis(100), - otlp_handling: OtlpHandling::PassThrough, - max_segment_open_duration: Duration::from_secs(1), - initial_retry_interval: Duration::from_secs(1), - max_retry_interval: Duration::from_secs(30), - retry_multiplier: 2.0, - max_in_flight: 1000, - }; - - let mut processor = DurableBuffer::new(config, &pipeline_ctx).unwrap(); - assert!(processor.take_retry_wakeup(WakeupSlot(999)).is_none()); - } - - /// Scenario: a retry was deferred in local overflow state because wakeup - /// capacity was exhausted, and its due time has now arrived. - /// Guarantees: taking that retry clears all overflow bookkeeping, removes it - /// from `retry_scheduled`, and returns the original `(bundle_ref, retry_count)`. 
- #[test] - fn test_take_due_retry_overflow_clears_tracking() { - use otap_df_engine::context::ControllerContext; - use otap_df_telemetry::registry::TelemetryRegistryHandle; - - let registry = TelemetryRegistryHandle::default(); - let controller_ctx = ControllerContext::new(registry); - let pipeline_ctx = - controller_ctx.pipeline_context_with("test".into(), "test".into(), 0, 1, 0); - - let config = DurableBufferConfig { - path: std::path::PathBuf::from("/tmp/test-retry-overflow"), - retention_size_cap: byte_unit::Byte::from_u64(256 * 1024 * 1024), - max_age: None, - size_cap_policy: SizeCapPolicy::Backpressure, - poll_interval: Duration::from_millis(100), - otlp_handling: OtlpHandling::PassThrough, - max_segment_open_duration: Duration::from_secs(1), - initial_retry_interval: Duration::from_secs(1), - max_retry_interval: Duration::from_secs(30), - retry_multiplier: 2.0, - max_in_flight: 1000, - }; - - let mut processor = DurableBuffer::new(config, &pipeline_ctx).unwrap(); - let bundle_ref = BundleRef { - segment_seq: SegmentSeq::new(321), - bundle_index: BundleIndex::new(7), - }; - let key = retry_key(bundle_ref); - let retry_at = Instant::now(); - - processor.insert_retry_overflow(bundle_ref, 4, retry_at); - - assert!(processor.retry_scheduled.contains(&key)); - assert!(processor.retry_overflow.contains_key(&key)); - assert_eq!(processor.retry_overflow_order.len(), 1); - - let retry = processor - .take_due_retry_overflow(retry_at + Duration::from_millis(1)) - .expect("retry should be due"); - - assert_eq!(retry.bundle_ref.segment_seq.raw(), 321); - assert_eq!(retry.bundle_ref.bundle_index.raw(), 7); - assert_eq!(retry.retry_count, 4); - assert!(!processor.retry_scheduled.contains(&key)); - assert!(!processor.retry_overflow.contains_key(&key)); - assert!(processor.retry_overflow_order.is_empty()); - } - - /// Scenario: multiple retries overflow the wakeup scheduler and are stored - /// locally with the same due timestamp. 
- /// Guarantees: equal-deadline overflow retries are resumed in insertion - /// order using the local sequence tie-breaker. - #[test] - fn test_equal_deadline_overflow_retries_follow_sequence_order() { - use otap_df_engine::context::ControllerContext; - use otap_df_telemetry::registry::TelemetryRegistryHandle; - - let registry = TelemetryRegistryHandle::default(); - let controller_ctx = ControllerContext::new(registry); - let pipeline_ctx = - controller_ctx.pipeline_context_with("test".into(), "test".into(), 0, 1, 0); - - let config = DurableBufferConfig { - path: std::path::PathBuf::from("/tmp/test-retry-overflow-order"), - retention_size_cap: byte_unit::Byte::from_u64(256 * 1024 * 1024), - max_age: None, - size_cap_policy: SizeCapPolicy::Backpressure, - poll_interval: Duration::from_millis(100), - otlp_handling: OtlpHandling::PassThrough, - max_segment_open_duration: Duration::from_secs(1), - initial_retry_interval: Duration::from_secs(1), - max_retry_interval: Duration::from_secs(30), - retry_multiplier: 2.0, - max_in_flight: 1000, - }; - - let mut processor = DurableBuffer::new(config, &pipeline_ctx).unwrap(); - let retry_at = Instant::now(); - let first = BundleRef { - segment_seq: SegmentSeq::new(111), - bundle_index: BundleIndex::new(1), - }; - let second = BundleRef { - segment_seq: SegmentSeq::new(111), - bundle_index: BundleIndex::new(2), - }; - let third = BundleRef { - segment_seq: SegmentSeq::new(111), - bundle_index: BundleIndex::new(3), - }; - - processor.insert_retry_overflow(first, 1, retry_at); - processor.insert_retry_overflow(second, 2, retry_at); - processor.insert_retry_overflow(third, 3, retry_at); - - let first_retry = processor - .take_due_retry_overflow(retry_at + Duration::from_millis(1)) - .expect("first retry should be due"); - let second_retry = processor - .take_due_retry_overflow(retry_at + Duration::from_millis(1)) - .expect("second retry should be due"); - let third_retry = processor - .take_due_retry_overflow(retry_at + 
Duration::from_millis(1)) - .expect("third retry should be due"); - - assert_eq!(first_retry.bundle_ref.bundle_index.raw(), 1); - assert_eq!(first_retry.retry_count, 1); - assert_eq!(second_retry.bundle_ref.bundle_index.raw(), 2); - assert_eq!(second_retry.retry_count, 2); - assert_eq!(third_retry.bundle_ref.bundle_index.raw(), 3); - assert_eq!(third_retry.retry_count, 3); - assert!(processor.retry_overflow.is_empty()); - assert!(processor.retry_overflow_order.is_empty()); - } - /// Scenario: one transient NACK arms a normal processor-local wakeup and the /// wakeup control message is later delivered through the processor inbox. /// Guarantees: the retry stays deferred until the wakeup arrives, and that @@ -2475,6 +2056,9 @@ mod tests { let sent = outputs.pop().expect("sent bundle"); let (_, nack) = next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); + let bundle_ref = + decode_bundle_ref(&nack.unwind.route.calldata).expect("bundle ref in nack"); + let armed_slot = retry_wakeup_slot(retry_key(bundle_ref)); ctx.process(Message::Control(NodeControlMsg::Nack(nack))) .await .expect("process nack"); @@ -2484,8 +2068,9 @@ mod tests { ); ctx.process(Message::Control(NodeControlMsg::Wakeup { - slot: WakeupSlot(0), + slot: armed_slot, when: Instant::now() + Duration::from_secs(1), + revision: 0, })) .await .expect("process retry wakeup"); @@ -2561,18 +2146,26 @@ mod tests { .expect("process timer tick"); let mut outputs = ctx.drain_pdata().await; assert_eq!(outputs.len(), 2, "timer tick should emit two bundles"); + let mut armed_slot = None; - for sent in outputs.drain(..) 
{ + for (index, sent) in outputs.drain(..).enumerate() { let (_, nack) = next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); + if index == 0 { + let bundle_ref = decode_bundle_ref(&nack.unwind.route.calldata) + .expect("bundle ref in nack"); + armed_slot = Some(retry_wakeup_slot(retry_key(bundle_ref))); + } ctx.process(Message::Control(NodeControlMsg::Nack(nack))) .await .expect("process nack"); } + let armed_slot = armed_slot.expect("first nack should arm a wakeup"); ctx.process(Message::Control(NodeControlMsg::Wakeup { slot: WakeupSlot(999), when: Instant::now(), + revision: 0, })) .await .expect("process unknown wakeup"); @@ -2593,8 +2186,9 @@ mod tests { ); ctx.process(Message::Control(NodeControlMsg::Wakeup { - slot: WakeupSlot(0), + slot: armed_slot, when: Instant::now(), + revision: 0, })) .await .expect("process armed retry wakeup"); @@ -2668,14 +2262,21 @@ mod tests { .expect("process timer tick"); let mut outputs = ctx.drain_pdata().await; assert_eq!(outputs.len(), 2, "timer tick should emit two bundles"); + let mut armed_slot = None; - for sent in outputs.drain(..) 
{ + for (index, sent) in outputs.drain(..).enumerate() { let (_, nack) = next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); + if index == 0 { + let bundle_ref = decode_bundle_ref(&nack.unwind.route.calldata) + .expect("bundle ref in nack"); + armed_slot = Some(retry_wakeup_slot(retry_key(bundle_ref))); + } ctx.process(Message::Control(NodeControlMsg::Nack(nack))) .await .expect("process nack"); } + let armed_slot = armed_slot.expect("first nack should arm a wakeup"); ctx.process(Message::Control(NodeControlMsg::TimerTick {})) .await @@ -2697,8 +2298,9 @@ mod tests { ); ctx.process(Message::Control(NodeControlMsg::Wakeup { - slot: WakeupSlot(0), + slot: armed_slot, when: Instant::now(), + revision: 0, })) .await .expect("process armed retry wakeup"); diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs new file mode 100644 index 0000000000..8f55a0972b --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs @@ -0,0 +1,460 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use otap_df_engine::control::{WakeupRevision, WakeupSlot}; +use otap_df_engine::local::processor::EffectHandler; +use otap_df_engine::{WakeupError, WakeupSetOutcome}; +use otap_df_otap::pdata::OtapPdata; +use quiver::subscriber::BundleRef; +use std::collections::{BTreeSet, HashMap, HashSet}; +use std::time::Instant; + +/// Convert a Quiver bundle identity into the stable key used by retry state. +pub(super) fn retry_key(bundle_ref: BundleRef) -> (u64, u32) { + (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()) +} + +/// Encodes a durable-buffer bundle identity into a processor-local wakeup slot. 
+/// +/// Layout: `[segment_seq:u64 | bundle_index:u64]` +pub(super) const fn retry_wakeup_slot(key: (u64, u32)) -> WakeupSlot { + WakeupSlot(((key.0 as u128) << 64) | (key.1 as u128)) +} + +/// Retry state for a bundle that has already acquired an engine wakeup slot. +/// +/// This is the armed phase of retry deferral: +/// - Quiver has released the bundle claim via implicit defer +/// - the processor has successfully registered a node-local wakeup for that bundle +/// - the wakeup slot is the bundle key encoded directly via `retry_wakeup_slot(...)` +/// - `revision` is the current scheduler revision for that slot +/// +/// The struct intentionally keeps only the minimum information needed to resume +/// the retry when the matching wakeup fires. +#[derive(Clone, Copy)] +pub(super) struct RetryWakeup { + bundle_ref: BundleRef, + retry_count: u32, + revision: WakeupRevision, +} + +impl RetryWakeup { + const fn new(bundle_ref: BundleRef, retry_count: u32, revision: WakeupRevision) -> Self { + Self { + bundle_ref, + retry_count, + revision, + } + } + + pub(super) const fn bundle_ref(self) -> BundleRef { + self.bundle_ref + } + + pub(super) const fn retry_count(self) -> u32 { + self.retry_count + } +} + +/// Retry state for a bundle that could not acquire an engine wakeup slot yet. +/// +/// This is the local overflow phase of retry deferral, used when +/// `EffectHandler::set_wakeup(...)` returns `WakeupError::Capacity`. +/// +/// Guarantees supported by this representation: +/// - the bundle still remains deferred and is kept out of `poll_next_bundle()` +/// through `retry_scheduled` +/// - the intended retry deadline is preserved in `retry_at` +/// - equal deadlines are ordered deterministically by `sequence` +/// +/// `OverflowRetry` is stored in `retry_overflow` and indexed for due-order by +/// a matching `OverflowRetryOrder` entry in `retry_overflow_order`. 
+#[derive(Clone, Copy)]
+struct OverflowRetry {
+    bundle_ref: BundleRef,
+    retry_count: u32,
+    retry_at: Instant,
+    sequence: u64,
+}
+
+/// Ordering/index key for `retry_overflow_order`.
+///
+/// This is kept separate from `OverflowRetry` so the processor can maintain:
+/// - a keyed lookup map (`retry_overflow`) for exact replacement/removal
+/// - an ordered set (`retry_overflow_order`) for "next due" selection
+///
+/// Ordering is lexicographic by `(retry_at, sequence, key)`, which means:
+/// - earlier deadlines are resumed first
+/// - equal deadlines use insertion sequence as a deterministic tie-breaker
+/// - `key` keeps the ordering total and points back to the authoritative
+///   `OverflowRetry` stored in the map
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+struct OverflowRetryOrder {
+    retry_at: Instant,
+    sequence: u64,
+    key: (u64, u32),
+}
+
+/// Local wakeup bookkeeping for durable-buffer retry deferral.
+///
+/// This state owns the invariants around armed retries, overflow retries, and
+/// deferred bundle membership. `DurableBuffer` itself still owns retry policy,
+/// Quiver interaction, and downstream resend behavior.
+pub(super) struct RetryWakeupState {
+    /// Bundles currently held out of the normal poll loop while backoff is active.
+    ///
+    /// Invariant: every key here is deferred for retry either by an armed wakeup
+    /// (`retry_wakeups`) or by the local overflow queue
+    /// (`retry_overflow` + `retry_overflow_order`).
+    retry_scheduled: HashSet<(u64, u32)>,
+
+    /// Retry state keyed by wakeup slot.
+    ///
+    /// Invariant: each slot is the encoded bundle key for the matching
+    /// `RetryWakeup.bundle_ref`, and `RetryWakeup.revision` is the only wakeup
+    /// revision for that slot that is allowed to resume the retry.
+    retry_wakeups: HashMap<WakeupSlot, RetryWakeup>,
+
+    /// Retry state held locally while wakeup scheduling is at capacity.
+    ///
+    /// Guarantee: overflowed retries remain deferred and keep their target due
+    /// time even when the engine wakeup scheduler is full.
+    retry_overflow: HashMap<(u64, u32), OverflowRetry>,
+
+    /// Due-order index for locally deferred retries.
+    ///
+    /// Invariant: this contains exactly one ordering key for each entry in
+    /// `retry_overflow`, using `sequence` as a deterministic tie-breaker.
+    retry_overflow_order: BTreeSet<OverflowRetryOrder>,
+
+    /// Monotonic tie-breaker for locally deferred retry ordering.
+    next_retry_overflow_sequence: u64,
+}
+
+impl RetryWakeupState {
+    pub(super) fn new() -> Self {
+        Self {
+            retry_scheduled: HashSet::new(),
+            retry_wakeups: HashMap::new(),
+            retry_overflow: HashMap::new(),
+            retry_overflow_order: BTreeSet::new(),
+            next_retry_overflow_sequence: 0,
+        }
+    }
+
+    pub(super) fn scheduled_len(&self) -> usize {
+        self.retry_scheduled.len()
+    }
+
+    pub(super) fn is_deferred_key(&self, key: (u64, u32)) -> bool {
+        self.retry_scheduled.contains(&key)
+    }
+
+    fn overflow_retry_order(key: (u64, u32), retry: OverflowRetry) -> OverflowRetryOrder {
+        OverflowRetryOrder {
+            retry_at: retry.retry_at,
+            sequence: retry.sequence,
+            key,
+        }
+    }
+
+    /// Removes one overflowed retry from both local indexes.
+    ///
+    /// Invariant preserved: `retry_overflow` and `retry_overflow_order` stay in
+    /// lockstep after every insertion/removal.
+    fn remove_retry_overflow(&mut self, key: (u64, u32)) -> Option<OverflowRetry> {
+        let retry = self.retry_overflow.remove(&key)?;
+        let _ = self
+            .retry_overflow_order
+            .remove(&Self::overflow_retry_order(key, retry));
+        Some(retry)
+    }
+
+    /// Defers a retry in local processor state when the engine wakeup scheduler
+    /// has no free slot.
+    ///
+    /// Guarantees:
+    /// - the bundle remains in `retry_scheduled`, so `poll_next_bundle()` keeps
+    ///   skipping it
+    /// - the most recent `(retry_count, retry_at)` replaces any older local
+    ///   overflow record for the same bundle
+    /// - equal due times are processed deterministically by `sequence`
+    fn insert_retry_overflow(&mut self, bundle_ref: BundleRef, retry_count: u32, retry_at: Instant) {
+        let key = retry_key(bundle_ref);
+        let _ = self.remove_retry_overflow(key);
+        let retry = OverflowRetry {
+            bundle_ref,
+            retry_count,
+            retry_at,
+            sequence: self.next_retry_overflow_sequence,
+        };
+        self.next_retry_overflow_sequence = self.next_retry_overflow_sequence.saturating_add(1);
+        let _ = self.retry_scheduled.insert(key);
+        let _ = self.retry_overflow.insert(key, retry);
+        let _ = self
+            .retry_overflow_order
+            .insert(Self::overflow_retry_order(key, retry));
+    }
+
+    /// Pops the next locally deferred retry only when its due time has arrived.
+    ///
+    /// Guarantee: returning a retry clears all local overflow bookkeeping for
+    /// that bundle so it can be resumed exactly once.
+    pub(super) fn take_due_retry_overflow(&mut self, now: Instant) -> Option<RetryWakeup> {
+        let order = *self.retry_overflow_order.first()?;
+        if order.retry_at > now {
+            return None;
+        }
+
+        let _ = self.retry_overflow_order.remove(&order);
+        let retry = self.retry_overflow.remove(&order.key)?;
+        let _ = self.retry_scheduled.remove(&order.key);
+
+        Some(RetryWakeup::new(retry.bundle_ref, retry.retry_count, 0))
+    }
+
+    /// Opportunistically moves overflowed retries back into engine wakeup slots.
+    ///
+    /// Guarantees:
+    /// - never drops a deferred retry when slot acquisition fails
+    /// - preserves retry due time when promotion succeeds
+    /// - stops as soon as the scheduler reports `Capacity` or shutdown
+    pub(super) fn promote_overflow_to_wakeups(
+        &mut self,
+        effect_handler: &mut EffectHandler<OtapPdata>,
+    ) {
+        while let Some(order) = self.retry_overflow_order.first().copied() {
+            let Some(retry) = self.retry_overflow.get(&order.key).copied() else {
+                let _ = self.retry_overflow_order.remove(&order);
+                continue;
+            };
+
+            let slot = retry_wakeup_slot(order.key);
+            match effect_handler.set_wakeup(slot, retry.retry_at) {
+                Ok(outcome) => {
+                    let _ = self.retry_overflow_order.remove(&order);
+                    let _ = self.retry_overflow.remove(&order.key);
+                    let _ = self.retry_wakeups.insert(
+                        slot,
+                        RetryWakeup::new(
+                            retry.bundle_ref,
+                            retry.retry_count,
+                            outcome.revision(),
+                        ),
+                    );
+                }
+                Err(WakeupError::Capacity | WakeupError::ShuttingDown) => break,
+            }
+        }
+    }
+
+    /// Schedule or re-schedule retry deferral for a bundle.
+    ///
+    /// Guarantees:
+    /// - on success, the bundle remains deferred until either an armed wakeup
+    ///   or local overflow retry resumes it
+    /// - wakeup-capacity exhaustion falls back to local overflow state instead
+    ///   of immediate re-polling
+    /// - returns `false` only when the processor is already shutting down
+    pub(super) fn schedule_at(
+        &mut self,
+        bundle_ref: BundleRef,
+        retry_count: u32,
+        retry_at: Instant,
+        effect_handler: &mut EffectHandler<OtapPdata>,
+    ) -> bool {
+        let key = retry_key(bundle_ref);
+        let _ = self.remove_retry_overflow(key);
+        let slot = retry_wakeup_slot(key);
+        match effect_handler.set_wakeup(slot, retry_at) {
+            Ok(WakeupSetOutcome::Inserted { revision } | WakeupSetOutcome::Replaced { revision }) => {
+                let _ = self.retry_scheduled.insert(key);
+                let _ = self
+                    .retry_wakeups
+                    .insert(slot, RetryWakeup::new(bundle_ref, retry_count, revision));
+                true
+            }
+            Err(WakeupError::Capacity) => {
+                self.insert_retry_overflow(bundle_ref, retry_count, retry_at);
+                true
+            }
+            Err(WakeupError::ShuttingDown) => false,
+        }
+    }
+
+    /// Remove retry-wakeup tracking for a bundle now being resumed.
+    ///
+    /// Guarantee: taking a wakeup clears the armed-wakeup bookkeeping for that
+    /// bundle before retry resumption starts.
+    pub(super) fn take_retry_wakeup(
+        &mut self,
+        slot: WakeupSlot,
+        revision: WakeupRevision,
+    ) -> Option<RetryWakeup> {
+        let wakeup = self.retry_wakeups.get(&slot).copied()?;
+        if wakeup.revision != revision {
+            return None;
+        }
+
+        let wakeup = self
+            .retry_wakeups
+            .remove(&slot)
+            .expect("matching wakeup should still exist");
+        let key = retry_key(wakeup.bundle_ref);
+        let _ = self.retry_scheduled.remove(&key);
+        Some(wakeup)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use quiver::segment::SegmentSeq;
+    use quiver::subscriber::BundleIndex;
+    use std::time::Duration;
+
+    /// Scenario: a retry is armed in the engine wakeup scheduler for one
+    /// bundle, and the matching wakeup later arrives with the same revision.
+ /// Guarantees: taking that wakeup clears the armed retry bookkeeping and + /// returns the original `(bundle_ref, retry_count)` exactly once. + #[test] + fn take_retry_wakeup_clears_tracking() { + let mut state = RetryWakeupState::new(); + let bundle_ref = BundleRef { + segment_seq: SegmentSeq::new(98765), + bundle_index: BundleIndex::new(123), + }; + let key = retry_key(bundle_ref); + let slot = retry_wakeup_slot(key); + let _ = state.retry_scheduled.insert(key); + let _ = state + .retry_wakeups + .insert(slot, RetryWakeup::new(bundle_ref, 3, 17)); + + let taken = state + .take_retry_wakeup(slot, 17) + .expect("retry wakeup should exist"); + assert_eq!(taken.bundle_ref().segment_seq.raw(), 98765); + assert_eq!(taken.bundle_ref().bundle_index.raw(), 123); + assert_eq!(taken.retry_count(), 3); + assert!(!state.retry_scheduled.contains(&key)); + assert!(!state.retry_wakeups.contains_key(&slot)); + } + + /// Scenario: the processor receives a wakeup for a slot that has no armed + /// retry state. + /// Guarantees: the unknown wakeup is ignored and does not mutate retry + /// bookkeeping. + #[test] + fn take_retry_wakeup_unknown_slot_is_ignored() { + let mut state = RetryWakeupState::new(); + assert!(state.take_retry_wakeup(WakeupSlot(999), 0).is_none()); + } + + /// Scenario: a slot has been rescheduled, so the processor still has armed + /// retry state for that slot but the arriving wakeup carries an older + /// revision. + /// Guarantees: the stale wakeup is ignored, and the current armed retry + /// state remains available for the matching revision. 
+ #[test] + fn take_retry_wakeup_stale_revision_is_ignored() { + let mut state = RetryWakeupState::new(); + let bundle_ref = BundleRef { + segment_seq: SegmentSeq::new(123), + bundle_index: BundleIndex::new(9), + }; + let key = retry_key(bundle_ref); + let slot = retry_wakeup_slot(key); + + let _ = state.retry_scheduled.insert(key); + let _ = state + .retry_wakeups + .insert(slot, RetryWakeup::new(bundle_ref, 2, 5)); + + assert!(state.take_retry_wakeup(slot, 4).is_none()); + assert!(state.retry_scheduled.contains(&key)); + assert!(state.retry_wakeups.contains_key(&slot)); + } + + /// Scenario: a retry was deferred in local overflow state because wakeup + /// capacity was exhausted, and its due time has now arrived. + /// Guarantees: taking that retry clears all overflow bookkeeping, removes it + /// from `retry_scheduled`, and returns the original `(bundle_ref, retry_count)`. + #[test] + fn take_due_retry_overflow_clears_tracking() { + let mut state = RetryWakeupState::new(); + let bundle_ref = BundleRef { + segment_seq: SegmentSeq::new(321), + bundle_index: BundleIndex::new(7), + }; + let key = retry_key(bundle_ref); + let retry_at = Instant::now(); + + state.insert_retry_overflow(bundle_ref, 4, retry_at); + + assert!(state.retry_scheduled.contains(&key)); + assert!(state.retry_overflow.contains_key(&key)); + assert_eq!(state.retry_overflow_order.len(), 1); + + let retry = state + .take_due_retry_overflow(retry_at + Duration::from_millis(1)) + .expect("retry should be due"); + + assert_eq!(retry.bundle_ref().segment_seq.raw(), 321); + assert_eq!(retry.bundle_ref().bundle_index.raw(), 7); + assert_eq!(retry.retry_count(), 4); + assert!(!state.retry_scheduled.contains(&key)); + assert!(!state.retry_overflow.contains_key(&key)); + assert!(state.retry_overflow_order.is_empty()); + } + + /// Scenario: multiple retries overflow the wakeup scheduler and are stored + /// locally with the same due timestamp. 
+ /// Guarantees: equal-deadline overflow retries are resumed in insertion + /// order using the local sequence tie-breaker. + #[test] + fn equal_deadline_overflow_retries_follow_sequence_order() { + let mut state = RetryWakeupState::new(); + let retry_at = Instant::now(); + let first = BundleRef { + segment_seq: SegmentSeq::new(111), + bundle_index: BundleIndex::new(1), + }; + let second = BundleRef { + segment_seq: SegmentSeq::new(222), + bundle_index: BundleIndex::new(2), + }; + let third = BundleRef { + segment_seq: SegmentSeq::new(333), + bundle_index: BundleIndex::new(3), + }; + + state.insert_retry_overflow(first, 1, retry_at); + state.insert_retry_overflow(second, 2, retry_at); + state.insert_retry_overflow(third, 3, retry_at); + + assert_eq!( + state + .take_due_retry_overflow(retry_at + Duration::from_millis(1)) + .expect("first retry") + .bundle_ref(), + first + ); + assert_eq!( + state + .take_due_retry_overflow(retry_at + Duration::from_millis(1)) + .expect("second retry") + .bundle_ref(), + second + ); + assert_eq!( + state + .take_due_retry_overflow(retry_at + Duration::from_millis(1)) + .expect("third retry") + .bundle_ref(), + third + ); + assert!(state.retry_overflow.is_empty()); + assert!(state.retry_overflow_order.is_empty()); + } +} diff --git a/rust/otap-dataflow/crates/engine/README.md b/rust/otap-dataflow/crates/engine/README.md index 282f206573..1989460fd3 100644 --- a/rust/otap-dataflow/crates/engine/README.md +++ b/rust/otap-dataflow/crates/engine/README.md @@ -278,7 +278,9 @@ behavior: Processors can schedule local wakeups through the processor effect handler: -- `set_wakeup(slot, when)` schedules or replaces the wakeup for `slot` +- `set_wakeup(slot, when)` schedules or replaces the wakeup for `slot` and + returns whether that slot was inserted or replaced, along with the accepted + wakeup revision - `cancel_wakeup(slot)` removes the wakeup for `slot` if one is live This API is intentionally processor-local: @@ -286,26 +288,31 @@ This 
API is intentionally processor-local: - `WakeupSlot` is scoped to one processor instance, not globally across the pipeline - a processor can define its own slot constants such as `WakeupSlot(0)` +- a processor can also encode compact structured local identifiers directly in + the widened `WakeupSlot(pub u128)` payload when that is more natural - the engine does not interpret slot meaning; it only routes the slot back to the originating processor Wakeups are delivered through `ProcessorInbox` as -`NodeControlMsg::Wakeup { slot, when }`. They therefore participate in the -same receive loop and the same bounded fairness policy as other control +`NodeControlMsg::Wakeup { slot, when, revision }`. They therefore participate +in the same receive loop and the same bounded fairness policy as other control traffic. The current runtime properties and guarantees are: - **Keyed replacement:** there is at most one live wakeup per slot; scheduling the same slot again replaces the previous due time +- **Revisioned delivery:** every accepted schedule gets a scheduler-assigned + revision; re-scheduling a live slot gives it a new revision so processors can + ignore stale wakeups for reused slots - **Cancellation:** canceling a live slot prevents that wakeup from being delivered later - **Bounded live state:** scheduler state is bounded by the number of live wakeup slots accepted for the processor - **Deterministic ordering:** if two wakeups have the same due time, they are delivered in schedule order -- **No payload retention:** wakeups carry only `(slot, when)` and do not retain - deferred `pdata` +- **No payload retention:** wakeups carry only `(slot, when, revision)` and do + not retain deferred `pdata` - **Shutdown rejection and drop:** once processor shutdown is latched, new wakeups are rejected and pending wakeups are dropped immediately - **No flush-on-shutdown guarantee:** pending wakeups are not drained or forced diff --git a/rust/otap-dataflow/crates/engine/src/control.rs 
b/rust/otap-dataflow/crates/engine/src/control.rs index cf7df79b61..d0711b8dd8 100644 --- a/rust/otap-dataflow/crates/engine/src/control.rs +++ b/rust/otap-dataflow/crates/engine/src/control.rs @@ -82,9 +82,18 @@ pub type CallData = SmallVec<[Context8u8; 3]>; /// constants such as `WakeupSlot(0)` for their own internal timers. /// /// Re-scheduling the same slot replaces the previous wakeup for that slot. +/// The widened `u128` payload lets processors encode compact structured local +/// identifiers directly when that is more natural than allocating slot numbers. #[repr(transparent)] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] -pub struct WakeupSlot(pub u64); +pub struct WakeupSlot(pub u128); + +/// Monotonic wakeup revision assigned by the scheduler each time a slot is set. +/// +/// Re-scheduling an existing slot gives it a new revision. Processors can use +/// the revision carried back in [`NodeControlMsg::Wakeup`] to distinguish a +/// current wakeup from a stale delivery for the same slot. +pub type WakeupRevision = u64; /// Engine-managed call data envelope. Wraps the CallData with an envelope /// containing timestamp. Lives on the forward path (in context stack frames). @@ -238,6 +247,8 @@ pub enum NodeControlMsg { /// This is delivered back through the processor inbox as normal control /// traffic. The slot identifies which logical wakeup fired; processors are /// expected to interpret the slot according to their own local namespace. + /// The revision changes every time the slot is (re-)scheduled and allows + /// processors to ignore stale wakeups for a reused slot. /// /// Wakeups are best-effort runtime signals rather than durable work items: /// once processor shutdown is latched, pending wakeups are dropped and no @@ -247,6 +258,8 @@ pub enum NodeControlMsg { slot: WakeupSlot, /// Scheduled due time currently associated with this slot. when: Instant, + /// Scheduler-assigned revision for this slot schedule. 
+ revision: WakeupRevision, }, /// Delayed data returning to the node which delayed it. diff --git a/rust/otap-dataflow/crates/engine/src/effect_handler.rs b/rust/otap-dataflow/crates/engine/src/effect_handler.rs index e3c468e311..23a39a13e5 100644 --- a/rust/otap-dataflow/crates/engine/src/effect_handler.rs +++ b/rust/otap-dataflow/crates/engine/src/effect_handler.rs @@ -4,7 +4,7 @@ //! Common foundation of all effect handlers. use crate::Interests; -use crate::WakeupError; +use crate::{WakeupError, WakeupSetOutcome}; use crate::completion_emission_metrics::CompletionEmissionMetricsHandle; use crate::control::{ AckMsg, NackMsg, PipelineCompletionMsg, PipelineCompletionMsgSender, RuntimeControlMsg, @@ -410,14 +410,24 @@ impl EffectHandlerCore { /// Set or replace a processor-local wakeup. /// /// Wakeups are keyed by [`WakeupSlot`]. Scheduling the same slot again - /// replaces the previous due time for that slot. + /// replaces the previous due time for that slot and assigns a new + /// scheduler revision for that slot. + /// + /// The returned [`WakeupSetOutcome`] tells the caller whether the slot was + /// newly inserted or whether an existing live wakeup was replaced, and + /// carries the accepted revision that will later be returned in + /// `NodeControlMsg::Wakeup`. /// /// # Errors /// /// Returns [`WakeupError::ShuttingDown`] once processor shutdown has been /// latched. Returns [`WakeupError::Capacity`] if the processor has reached /// its configured live wakeup-slot capacity. 
- pub fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + pub fn set_wakeup( + &self, + slot: WakeupSlot, + when: Instant, + ) -> Result { self.local_scheduler .as_ref() .expect("node-local scheduler not set for processor effect handler") diff --git a/rust/otap-dataflow/crates/engine/src/lib.rs b/rust/otap-dataflow/crates/engine/src/lib.rs index ea2a8ca7c2..67c79b9cf3 100644 --- a/rust/otap-dataflow/crates/engine/src/lib.rs +++ b/rust/otap-dataflow/crates/engine/src/lib.rs @@ -79,7 +79,7 @@ pub mod terminal_state; pub mod testing; pub mod topic; pub mod wiring_contract; -pub use node_local_scheduler::WakeupError; +pub use node_local_scheduler::{WakeupError, WakeupSetOutcome}; /// Trait for factory types that expose a name. /// diff --git a/rust/otap-dataflow/crates/engine/src/local/processor.rs b/rust/otap-dataflow/crates/engine/src/local/processor.rs index 19d94e57f4..0fd4f151a5 100644 --- a/rust/otap-dataflow/crates/engine/src/local/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/local/processor.rs @@ -33,7 +33,7 @@ //! in parallel on different cores, each with its own processor instance. use crate::Interests; -use crate::WakeupError; +use crate::{WakeupError, WakeupSetOutcome}; use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender, WakeupSlot}; use crate::effect_handler::{ EffectHandlerCore, SourceTagging, TelemetryTimerCancelHandle, TimerCancelHandle, @@ -270,7 +270,11 @@ impl EffectHandler { } /// Set or replace a processor-local wakeup. 
- pub fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + pub fn set_wakeup( + &self, + slot: WakeupSlot, + when: Instant, + ) -> Result { self.core.set_wakeup(slot, when) } diff --git a/rust/otap-dataflow/crates/engine/src/message.rs b/rust/otap-dataflow/crates/engine/src/message.rs index d92a6f33ac..aad6c3b773 100644 --- a/rust/otap-dataflow/crates/engine/src/message.rs +++ b/rust/otap-dataflow/crates/engine/src/message.rs @@ -359,7 +359,13 @@ where self.local_scheduler .as_ref() .and_then(|scheduler| scheduler.pop_due(now)) - .map(|(slot, when)| self.control_message(NodeControlMsg::Wakeup { slot, when })) + .map(|(slot, when, revision)| { + self.control_message(NodeControlMsg::Wakeup { + slot, + when, + revision, + }) + }) } fn next_local_expiry_sleep(&self, now: Instant) -> Option { @@ -904,9 +910,10 @@ mod tests { async fn processor_inbox_emits_due_wakeup_as_control_message() { let (_control_tx, _pdata_tx, scheduler, mut inbox) = local_processor_inbox(4); let when = Instant::now(); - scheduler + let outcome = scheduler .set_wakeup(crate::control::WakeupSlot(0), when) .expect("wakeup should schedule"); + let revision = outcome.revision(); let message = tokio::time::timeout(Duration::from_millis(50), inbox.recv_when(true)) .await @@ -917,7 +924,8 @@ mod tests { Message::Control(NodeControlMsg::Wakeup { slot: crate::control::WakeupSlot(0), when: observed, - }) if observed == when + revision: observed_revision, + }) if observed == when && observed_revision == revision )); } @@ -930,7 +938,7 @@ mod tests { .expect("pdata should enqueue"); let when = Instant::now(); for slot in 0..40 { - scheduler + let _ = scheduler .set_wakeup(crate::control::WakeupSlot(slot), when) .expect("wakeup should schedule"); } @@ -973,9 +981,10 @@ mod tests { }) .await .expect("config should enqueue"); - scheduler + let outcome = scheduler .set_wakeup(crate::control::WakeupSlot(0), when) .expect("wakeup should schedule"); + let revision = 
outcome.revision(); let first = inbox.recv_when(true).await.expect("message should arrive"); assert!(matches!( @@ -989,7 +998,8 @@ mod tests { Message::Control(NodeControlMsg::Wakeup { slot: crate::control::WakeupSlot(0), when: observed, - }) if observed == when + revision: observed_revision, + }) if observed == when && observed_revision == revision )); } @@ -1035,7 +1045,7 @@ mod tests { .send_async(TestMsg::new("buffered")) .await .expect("pdata should enqueue"); - scheduler + let _ = scheduler .set_wakeup(crate::control::WakeupSlot(2), Instant::now()) .expect("wakeup should schedule"); control_tx diff --git a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs index 5e2e8b085d..609bc3023f 100644 --- a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs +++ b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs @@ -3,7 +3,7 @@ //! Node-local wakeup scheduling for processor inboxes. -use crate::control::WakeupSlot; +use crate::control::{WakeupRevision, WakeupSlot}; use std::collections::HashMap; use std::sync::{Arc, Mutex}; use std::time::Instant; @@ -18,16 +18,41 @@ pub enum WakeupError { Capacity, } +/// Outcome of setting a processor-local wakeup slot. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum WakeupSetOutcome { + /// A new live slot was inserted into the scheduler. + Inserted { + /// Scheduler-assigned revision for the live wakeup now stored in the slot. + revision: WakeupRevision, + }, + /// An existing live slot was updated in place. + Replaced { + /// Scheduler-assigned revision for the replacement wakeup now stored in the slot. + revision: WakeupRevision, + }, +} + +impl WakeupSetOutcome { + /// Returns the scheduler-assigned revision for the accepted wakeup. 
+ #[must_use] + pub const fn revision(self) -> WakeupRevision { + match self { + Self::Inserted { revision } | Self::Replaced { revision } => revision, + } + } +} + #[derive(Clone, Copy, Debug)] struct ScheduledWakeup { slot: WakeupSlot, when: Instant, - sequence: u64, + revision: WakeupRevision, } struct NodeLocalScheduler { wakeup_capacity: usize, - next_sequence: u64, + next_revision: WakeupRevision, wakeups: Vec, wakeup_indices: HashMap, shutting_down: bool, @@ -37,21 +62,21 @@ impl NodeLocalScheduler { fn new(wakeup_capacity: usize) -> Self { Self { wakeup_capacity, - next_sequence: 0, + next_revision: 0, wakeups: Vec::new(), wakeup_indices: HashMap::new(), shutting_down: false, } } - fn next_sequence(&mut self) -> u64 { - let next = self.next_sequence; - self.next_sequence = self.next_sequence.saturating_add(1); + fn next_revision(&mut self) -> WakeupRevision { + let next = self.next_revision; + self.next_revision = self.next_revision.saturating_add(1); next } fn wakeup_precedes(left: &ScheduledWakeup, right: &ScheduledWakeup) -> bool { - left.when < right.when || (left.when == right.when && left.sequence < right.sequence) + left.when < right.when || (left.when == right.when && left.revision < right.revision) } fn swap_entries(&mut self, left: usize, right: usize) { @@ -141,16 +166,21 @@ impl NodeLocalScheduler { removed } - fn set_wakeup(&mut self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + fn set_wakeup( + &mut self, + slot: WakeupSlot, + when: Instant, + ) -> Result { if self.shutting_down { return Err(WakeupError::ShuttingDown); } - let sequence = self.next_sequence(); + let revision = self.next_revision(); if let Some(&index) = self.wakeup_indices.get(&slot) { self.wakeups[index].when = when; - self.wakeups[index].sequence = sequence; + self.wakeups[index].revision = revision; self.repair_heap_at(index); + Ok(WakeupSetOutcome::Replaced { revision }) } else { if self.wakeup_indices.len() >= self.wakeup_capacity { return 
Err(WakeupError::Capacity); @@ -159,15 +189,15 @@ impl NodeLocalScheduler { self.wakeups.push(ScheduledWakeup { slot, when, - sequence, + revision, }); assert!( self.wakeup_indices.insert(slot, index).is_none(), "new wakeup slot should not already exist" ); self.sift_up(index); + Ok(WakeupSetOutcome::Inserted { revision }) } - Ok(()) } fn cancel_wakeup(&mut self, slot: WakeupSlot) -> bool { @@ -211,7 +241,7 @@ impl NodeLocalScheduler { self.wakeups.first().map(|wakeup| wakeup.when) } - fn pop_due(&mut self, now: Instant) -> Option<(WakeupSlot, Instant)> { + fn pop_due(&mut self, now: Instant) -> Option<(WakeupSlot, Instant, WakeupRevision)> { #[cfg(debug_assertions)] self.assert_consistent(); @@ -227,7 +257,7 @@ impl NodeLocalScheduler { .expect("due wakeup slot index should exist"); debug_assert_eq!(removed_index, 0); let wakeup = self.remove_heap_entry(0); - Some((wakeup.slot, wakeup.when)) + Some((wakeup.slot, wakeup.when, wakeup.revision)) } fn begin_shutdown(&mut self) { @@ -275,7 +305,11 @@ impl NodeLocalSchedulerHandle { f(&mut guard) } - pub(crate) fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + pub(crate) fn set_wakeup( + &self, + slot: WakeupSlot, + when: Instant, + ) -> Result { let result = self.with_scheduler(|scheduler| scheduler.set_wakeup(slot, when)); if result.is_ok() { self.notify.notify_one(); @@ -296,7 +330,7 @@ impl NodeLocalSchedulerHandle { self.with_scheduler(NodeLocalScheduler::next_expiry) } - pub(crate) fn pop_due(&self, now: Instant) -> Option<(WakeupSlot, Instant)> { + pub(crate) fn pop_due(&self, now: Instant) -> Option<(WakeupSlot, Instant, WakeupRevision)> { self.with_scheduler(|scheduler| scheduler.pop_due(now)) } @@ -335,11 +369,14 @@ mod tests { let now = Instant::now(); let when = now + Duration::from_secs(1); - assert_eq!(scheduler.set_wakeup(WakeupSlot(7), when), Ok(())); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(7), when), + Ok(WakeupSetOutcome::Inserted { revision: 0 }) + ); 
assert_heap_bound(&scheduler); assert_eq!(scheduler.next_expiry(), Some(when)); assert_eq!(scheduler.pop_due(now), None); - assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(7), when))); + assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(7), when, 0))); assert_heap_bound(&scheduler); assert_eq!(scheduler.next_expiry(), None); } @@ -351,12 +388,18 @@ mod tests { let later = now + Duration::from_secs(10); let sooner = now + Duration::from_secs(1); - assert_eq!(scheduler.set_wakeup(WakeupSlot(3), later), Ok(())); - assert_eq!(scheduler.set_wakeup(WakeupSlot(3), sooner), Ok(())); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(3), later), + Ok(WakeupSetOutcome::Inserted { revision: 0 }) + ); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(3), sooner), + Ok(WakeupSetOutcome::Replaced { revision: 1 }) + ); assert_heap_bound(&scheduler); assert_eq!(scheduler.wakeups.len(), 1); assert_eq!(scheduler.next_expiry(), Some(sooner)); - assert_eq!(scheduler.pop_due(sooner), Some((WakeupSlot(3), sooner))); + assert_eq!(scheduler.pop_due(sooner), Some((WakeupSlot(3), sooner, 1))); assert_heap_bound(&scheduler); assert_eq!(scheduler.pop_due(later), None); } @@ -366,7 +409,10 @@ mod tests { let mut scheduler = NodeLocalScheduler::new(2); let when = Instant::now() + Duration::from_secs(1); - assert_eq!(scheduler.set_wakeup(WakeupSlot(5), when), Ok(())); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(5), when), + Ok(WakeupSetOutcome::Inserted { revision: 0 }) + ); assert_heap_bound(&scheduler); assert!(scheduler.cancel_wakeup(WakeupSlot(5))); assert_heap_bound(&scheduler); @@ -389,11 +435,26 @@ mod tests { let fourth = now + Duration::from_secs(30); let moved = now + Duration::from_secs(2); - assert_eq!(scheduler.set_wakeup(WakeupSlot(1), first), Ok(())); - assert_eq!(scheduler.set_wakeup(WakeupSlot(2), second), Ok(())); - assert_eq!(scheduler.set_wakeup(WakeupSlot(3), third), Ok(())); - assert_eq!(scheduler.set_wakeup(WakeupSlot(4), fourth), Ok(())); - 
assert_eq!(scheduler.set_wakeup(WakeupSlot(3), moved), Ok(())); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(1), first), + Ok(WakeupSetOutcome::Inserted { revision: 0 }) + ); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(2), second), + Ok(WakeupSetOutcome::Inserted { revision: 1 }) + ); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(3), third), + Ok(WakeupSetOutcome::Inserted { revision: 2 }) + ); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(4), fourth), + Ok(WakeupSetOutcome::Inserted { revision: 3 }) + ); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(3), moved), + Ok(WakeupSetOutcome::Replaced { revision: 4 }) + ); let moved_index = scheduler .wakeup_indices @@ -407,10 +468,10 @@ mod tests { assert!(scheduler.cancel_wakeup(WakeupSlot(3))); assert_heap_bound(&scheduler); - assert_eq!(scheduler.pop_due(first), Some((WakeupSlot(1), first))); + assert_eq!(scheduler.pop_due(first), Some((WakeupSlot(1), first, 0))); assert_eq!(scheduler.pop_due(moved), None); - assert_eq!(scheduler.pop_due(second), Some((WakeupSlot(2), second))); - assert_eq!(scheduler.pop_due(fourth), Some((WakeupSlot(4), fourth))); + assert_eq!(scheduler.pop_due(second), Some((WakeupSlot(2), second, 1))); + assert_eq!(scheduler.pop_due(fourth), Some((WakeupSlot(4), fourth, 3))); assert_eq!(scheduler.next_expiry(), None); } @@ -419,14 +480,17 @@ mod tests { let mut scheduler = NodeLocalScheduler::new(1); let when = Instant::now() + Duration::from_secs(1); - assert_eq!(scheduler.set_wakeup(WakeupSlot(0), when), Ok(())); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(0), when), + Ok(WakeupSetOutcome::Inserted { revision: 0 }) + ); assert_eq!( scheduler.set_wakeup(WakeupSlot(1), when), Err(WakeupError::Capacity) ); assert_eq!( scheduler.set_wakeup(WakeupSlot(0), when + Duration::from_secs(1)), - Ok(()) + Ok(WakeupSetOutcome::Replaced { revision: 1 }) ); assert_heap_bound(&scheduler); } @@ -437,14 +501,18 @@ mod tests { let now = Instant::now(); for offset in (1..=32).rev() { let when = now + 
Duration::from_secs(offset); - assert_eq!(scheduler.set_wakeup(WakeupSlot(9), when), Ok(())); + let outcome = scheduler + .set_wakeup(WakeupSlot(9), when) + .expect("wakeup should schedule"); + let expected_revision: WakeupRevision = 32 - offset; + assert_eq!(outcome.revision(), expected_revision); assert_heap_bound(&scheduler); assert_eq!(scheduler.wakeups.len(), 1); assert_eq!(scheduler.next_expiry(), Some(when)); } let expected = now + Duration::from_secs(1); - assert_eq!(scheduler.pop_due(expected), Some((WakeupSlot(9), expected))); + assert_eq!(scheduler.pop_due(expected), Some((WakeupSlot(9), expected, 31))); assert_eq!(scheduler.next_expiry(), None); } @@ -453,14 +521,23 @@ mod tests { let mut scheduler = NodeLocalScheduler::new(4); let when = Instant::now() + Duration::from_secs(1); - assert_eq!(scheduler.set_wakeup(WakeupSlot(1), when), Ok(())); - assert_eq!(scheduler.set_wakeup(WakeupSlot(2), when), Ok(())); - assert_eq!(scheduler.set_wakeup(WakeupSlot(3), when), Ok(())); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(1), when), + Ok(WakeupSetOutcome::Inserted { revision: 0 }) + ); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(2), when), + Ok(WakeupSetOutcome::Inserted { revision: 1 }) + ); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(3), when), + Ok(WakeupSetOutcome::Inserted { revision: 2 }) + ); assert_heap_bound(&scheduler); - assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(1), when))); - assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(2), when))); - assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(3), when))); + assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(1), when, 0))); + assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(2), when, 1))); + assert_eq!(scheduler.pop_due(when), Some((WakeupSlot(3), when, 2))); assert_heap_bound(&scheduler); } } diff --git a/rust/otap-dataflow/crates/engine/src/shared/processor.rs b/rust/otap-dataflow/crates/engine/src/shared/processor.rs index 2c199a6023..353af95325 100644 --- 
a/rust/otap-dataflow/crates/engine/src/shared/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/shared/processor.rs @@ -32,7 +32,7 @@ //! in parallel on different cores, each with its own processor instance. use crate::Interests; -use crate::WakeupError; +use crate::{WakeupError, WakeupSetOutcome}; use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender, WakeupSlot}; use crate::effect_handler::{ EffectHandlerCore, SourceTagging, TelemetryTimerCancelHandle, TimerCancelHandle, @@ -243,7 +243,11 @@ impl EffectHandler { } /// Set or replace a processor-local wakeup. - pub fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + pub fn set_wakeup( + &self, + slot: WakeupSlot, + when: Instant, + ) -> Result { self.core.set_wakeup(slot, when) } From bc2f1945c354a6c597cc7610537a537ac83621b5 Mon Sep 17 00:00:00 2001 From: lquerel Date: Fri, 3 Apr 2026 14:53:55 -0700 Subject: [PATCH 13/18] Fix wakeup and exporter test regressions --- .../src/exporters/otap_exporter/mod.rs | 6 ++++++ .../src/processors/batch_processor/mod.rs | 4 ++-- .../processors/durable_buffer_processor/mod.rs | 10 +++++++--- .../retry_wakeup_state.rs | 17 ++++++++++------- .../crates/engine/src/effect_handler.rs | 2 +- .../crates/engine/src/local/processor.rs | 2 +- .../crates/engine/src/node_local_scheduler.rs | 8 ++++++-- .../crates/engine/src/shared/processor.rs | 2 +- 8 files changed, 34 insertions(+), 17 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/otap_exporter/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/exporters/otap_exporter/mod.rs index 190467917b..8652b56030 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/otap_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/otap_exporter/mod.rs @@ -1280,11 +1280,13 @@ mod tests { let (batch_received_tx, mut batch_received_rx) = tokio::sync::mpsc::channel(1); let (server_shutdown_tx, server_shutdown_rx) = tokio::sync::oneshot::channel(); + let 
(server_ready_tx, server_ready_rx) = tokio::sync::oneshot::channel(); // Start gRPC server that returns errors let listening_addr: SocketAddr = format!("{grpc_addr}:{grpc_port}").parse().unwrap(); let server_handle = tokio_rt.spawn(async move { let tcp_listener = TcpListener::bind(listening_addr).await.unwrap(); + let _ = server_ready_tx.send(()); let tcp_stream = TcpListenerStream::new(tcp_listener); let error_service = ArrowLogsServiceServer::new(ArrowLogsServiceGrpcErrorMock { sender: batch_received_tx, @@ -1316,6 +1318,10 @@ mod tests { }); tokio::join!(local_set, async { + server_ready_rx + .await + .expect("server should bind before exporter traffic starts"); + // Send a batch with ACK/NACK subscription let log_message = create_otap_batch(LOG_BATCH_ID, ArrowPayloadType::Logs); let pdata = OtapPdata::new_default(log_message.into()).test_subscribe_to( diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs index 749dd215ec..41bc3c4aab 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs @@ -2838,8 +2838,8 @@ mod tests { when, revision: 0, })) - .await - .expect("process wakeup"); + .await + .expect("process wakeup"); } // Drain outputs after timeout diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index 539c4d6299..55db50efbf 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -324,6 +324,10 @@ fn decode_bundle_ref(calldata: &CallData) -> Option { }) } +// ───────────────────────────────────────────────────────────────────────────── +// Pending Bundle Tracking +// 
───────────────────────────────────────────────────────────────────────────── + /// State for tracking a pending downstream delivery. /// /// Holds the Quiver bundle handle to keep the bundle claimed while in-flight. @@ -1280,7 +1284,8 @@ impl DurableBuffer { // First, resume any overflowed retries whose backoff has elapsed. // This preserves the retry delay guarantee even when the shared wakeup // scheduler is saturated and some retries had to stay local. - self.handle_due_retry_overflow(deadline, effect_handler).await?; + self.handle_due_retry_overflow(deadline, effect_handler) + .await?; // If wakeup capacity became available while handling due overflowed // retries, move waiting retries back to the normal wakeup path. self.retry_wakeup_state @@ -1684,8 +1689,7 @@ impl DurableBuffer { revision: WakeupRevision, effect_handler: &mut EffectHandler, ) -> Result<(), Error> { - let Some(retry) = self.retry_wakeup_state.take_retry_wakeup(slot, revision) - else { + let Some(retry) = self.retry_wakeup_state.take_retry_wakeup(slot, revision) else { otel_warn!( "durable_buffer.retry.unknown_wakeup", wakeup_slot = slot.0.to_string(), diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs index 8f55a0972b..5ac0e81b23 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs @@ -179,7 +179,12 @@ impl RetryWakeupState { /// - the most recent `(retry_count, retry_at)` replaces any older local /// overflow record for the same bundle /// - equal due times are processed deterministically by `sequence` - fn insert_retry_overflow(&mut self, bundle_ref: BundleRef, retry_count: u32, retry_at: Instant) { + fn insert_retry_overflow( + &mut self, + bundle_ref: BundleRef, + 
retry_count: u32, + retry_at: Instant, + ) { let key = retry_key(bundle_ref); let _ = self.remove_retry_overflow(key); let retry = OverflowRetry { @@ -236,11 +241,7 @@ impl RetryWakeupState { let _ = self.retry_overflow.remove(&order.key); let _ = self.retry_wakeups.insert( slot, - RetryWakeup::new( - retry.bundle_ref, - retry.retry_count, - outcome.revision(), - ), + RetryWakeup::new(retry.bundle_ref, retry.retry_count, outcome.revision()), ); } Err(WakeupError::Capacity | WakeupError::ShuttingDown) => break, @@ -267,7 +268,9 @@ impl RetryWakeupState { let _ = self.remove_retry_overflow(key); let slot = retry_wakeup_slot(key); match effect_handler.set_wakeup(slot, retry_at) { - Ok(WakeupSetOutcome::Inserted { revision } | WakeupSetOutcome::Replaced { revision }) => { + Ok( + WakeupSetOutcome::Inserted { revision } | WakeupSetOutcome::Replaced { revision }, + ) => { let _ = self.retry_scheduled.insert(key); let _ = self .retry_wakeups diff --git a/rust/otap-dataflow/crates/engine/src/effect_handler.rs b/rust/otap-dataflow/crates/engine/src/effect_handler.rs index 23a39a13e5..6e65ad60d6 100644 --- a/rust/otap-dataflow/crates/engine/src/effect_handler.rs +++ b/rust/otap-dataflow/crates/engine/src/effect_handler.rs @@ -4,7 +4,6 @@ //! Common foundation of all effect handlers. 
use crate::Interests; -use crate::{WakeupError, WakeupSetOutcome}; use crate::completion_emission_metrics::CompletionEmissionMetricsHandle; use crate::control::{ AckMsg, NackMsg, PipelineCompletionMsg, PipelineCompletionMsgSender, RuntimeControlMsg, @@ -13,6 +12,7 @@ use crate::control::{ use crate::error::Error; use crate::node::NodeId; use crate::node_local_scheduler::NodeLocalSchedulerHandle; +use crate::{WakeupError, WakeupSetOutcome}; use otap_df_channel::error::SendError; use otap_df_telemetry::error::Error as TelemetryError; use otap_df_telemetry::metrics::{MetricSet, MetricSetHandler}; diff --git a/rust/otap-dataflow/crates/engine/src/local/processor.rs b/rust/otap-dataflow/crates/engine/src/local/processor.rs index 0fd4f151a5..7fef84bbf7 100644 --- a/rust/otap-dataflow/crates/engine/src/local/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/local/processor.rs @@ -33,7 +33,6 @@ //! in parallel on different cores, each with its own processor instance. use crate::Interests; -use crate::{WakeupError, WakeupSetOutcome}; use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender, WakeupSlot}; use crate::effect_handler::{ EffectHandlerCore, SourceTagging, TelemetryTimerCancelHandle, TimerCancelHandle, @@ -43,6 +42,7 @@ use crate::message::{Message, Sender}; use crate::node::NodeId; use crate::output_router::OutputRouter; use crate::process_duration::ComputeDuration; +use crate::{WakeupError, WakeupSetOutcome}; use async_trait::async_trait; use otap_df_config::PortName; use otap_df_telemetry::error::Error as TelemetryError; diff --git a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs index 609bc3023f..019bdfa6a8 100644 --- a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs +++ b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs @@ -175,8 +175,8 @@ impl NodeLocalScheduler { return Err(WakeupError::ShuttingDown); } - let revision = self.next_revision(); if let 
Some(&index) = self.wakeup_indices.get(&slot) { + let revision = self.next_revision(); self.wakeups[index].when = when; self.wakeups[index].revision = revision; self.repair_heap_at(index); @@ -185,6 +185,7 @@ impl NodeLocalScheduler { if self.wakeup_indices.len() >= self.wakeup_capacity { return Err(WakeupError::Capacity); } + let revision = self.next_revision(); let index = self.wakeups.len(); self.wakeups.push(ScheduledWakeup { slot, @@ -512,7 +513,10 @@ mod tests { } let expected = now + Duration::from_secs(1); - assert_eq!(scheduler.pop_due(expected), Some((WakeupSlot(9), expected, 31))); + assert_eq!( + scheduler.pop_due(expected), + Some((WakeupSlot(9), expected, 31)) + ); assert_eq!(scheduler.next_expiry(), None); } diff --git a/rust/otap-dataflow/crates/engine/src/shared/processor.rs b/rust/otap-dataflow/crates/engine/src/shared/processor.rs index 353af95325..a5647766d1 100644 --- a/rust/otap-dataflow/crates/engine/src/shared/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/shared/processor.rs @@ -32,7 +32,6 @@ //! in parallel on different cores, each with its own processor instance. 
use crate::Interests; -use crate::{WakeupError, WakeupSetOutcome}; use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender, WakeupSlot}; use crate::effect_handler::{ EffectHandlerCore, SourceTagging, TelemetryTimerCancelHandle, TimerCancelHandle, @@ -42,6 +41,7 @@ use crate::message::Message; use crate::node::NodeId; use crate::output_router::OutputRouter; use crate::shared::message::SharedSender; +use crate::{WakeupError, WakeupSetOutcome}; use async_trait::async_trait; use otap_df_config::PortName; use otap_df_telemetry::error::Error as TelemetryError; From 9d5fe47bce0c17d01e159299d9d0675324cffa8f Mon Sep 17 00:00:00 2001 From: lquerel Date: Fri, 3 Apr 2026 17:06:46 -0700 Subject: [PATCH 14/18] Simplify durable buffer retry scheduling --- .../deferred_retry_state.rs | 501 ++++++++++++++++++ .../durable_buffer_processor/mod.rs | 241 ++++----- .../retry_wakeup_state.rs | 463 ---------------- 3 files changed, 597 insertions(+), 608 deletions(-) create mode 100644 rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs delete mode 100644 rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs new file mode 100644 index 0000000000..f68ce5c109 --- /dev/null +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs @@ -0,0 +1,501 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Deferred retry scheduling for `durable_buffer_processor`. +//! +//! Durable buffer needs local per-bundle retry state even though it only uses +//! one engine wakeup slot. This module owns that state and keeps the two layers +//! aligned: +//! +//! 
- the processor tracks every deferred bundle locally by bundle identity +//! - a local ordered index gives the next retry to resume +//! - the single engine wakeup slot is always armed to the earliest deferred +//! retry deadline +//! - wakeup revisions are used to ignore stale wakeups after re-arming +//! +//! This keeps retry scheduling on one mechanism under heavy NACK pressure +//! instead of splitting between per-bundle wakeups and a separate overflow +//! path. +//! +//! Guarantees: +//! +//! - a deferred bundle is held out of the normal poll loop until it is resumed +//! or explicitly re-deferred +//! - due retries are resumed in deadline order, with deterministic ordering for +//! equal deadlines +//! - durable buffer retry scheduling does not depend on having one engine +//! wakeup slot per deferred bundle + +use otap_df_engine::WakeupError; +use otap_df_engine::control::{WakeupRevision, WakeupSlot}; +use otap_df_engine::local::processor::EffectHandler; +use otap_df_otap::pdata::OtapPdata; +use quiver::subscriber::BundleRef; +use std::collections::{BTreeSet, HashMap, HashSet}; +use std::time::Instant; + +/// Durable buffer uses one processor-local wakeup slot for "the earliest retry +/// currently pending in local state". +pub(super) const RETRY_WAKEUP_SLOT: WakeupSlot = WakeupSlot(0); + +/// Convert a Quiver bundle identity into the stable key used by retry state. +pub(super) fn retry_key(bundle_ref: BundleRef) -> (u64, u32) { + (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()) +} + +/// Local deferred retry state for one bundle. +/// +/// Durable buffer keeps retry scheduling state locally and only uses the engine +/// wakeup API to re-arm the earliest pending retry deadline. 
+#[derive(Clone, Copy)] +pub(super) struct DeferredRetry { + bundle_ref: BundleRef, + retry_count: u32, + retry_at: Instant, + sequence: u64, +} + +impl DeferredRetry { + const fn new( + bundle_ref: BundleRef, + retry_count: u32, + retry_at: Instant, + sequence: u64, + ) -> Self { + Self { + bundle_ref, + retry_count, + retry_at, + sequence, + } + } + + pub(super) const fn bundle_ref(self) -> BundleRef { + self.bundle_ref + } + + pub(super) const fn retry_count(self) -> u32 { + self.retry_count + } +} + +/// Tracks the engine wakeup currently armed for the earliest deferred retry. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +struct ArmedRetryWakeup { + when: Instant, + revision: WakeupRevision, +} + +/// Ordering/index key for deferred retries. +/// +/// Ordering is lexicographic by `(retry_at, sequence, key)`, which means: +/// - earlier retry deadlines are resumed first +/// - equal deadlines use insertion sequence as a deterministic tie-breaker +/// - `key` keeps the ordering total and points back to the authoritative +/// `DeferredRetry` stored in the map +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +struct DeferredRetryOrder { + retry_at: Instant, + sequence: u64, + key: (u64, u32), +} + +/// Local deferred-retry scheduling state for durable buffer. +/// +/// The processor keeps all retry deadlines locally and uses one engine wakeup +/// slot for "the earliest retry currently pending". This keeps the heavy-NACK +/// path on a single scheduling mechanism instead of splitting between many +/// armed wakeups and a separate overflow queue. +pub(super) struct DeferredRetryState { + /// Bundles currently held out of the normal poll loop while retry backoff + /// is active. + /// + /// Invariant: every key here appears exactly once in `deferred`. + retry_scheduled: HashSet<(u64, u32)>, + + /// Authoritative retry state keyed by bundle identity. 
+    ///
+    /// Invariant: every deferred bundle appears exactly once here and exactly
+    /// once in `deferred_order`.
+    deferred: HashMap<(u64, u32), DeferredRetry>,
+
+    /// Due-order index for deferred retries.
+    deferred_order: BTreeSet<DeferredRetryOrder>,
+
+    /// Engine wakeup currently armed for the earliest deferred retry, if any.
+    ///
+    /// Invariant: when present, `when` equals the earliest currently armed
+    /// retry deadline as seen by the engine wakeup API, and `revision` is the
+    /// only wakeup revision allowed to trigger that arm.
+    armed_wakeup: Option<ArmedRetryWakeup>,
+
+    /// Monotonic tie-breaker for equal-deadline ordering.
+    next_sequence: u64,
+}
+
+impl DeferredRetryState {
+    pub(super) fn new() -> Self {
+        Self {
+            retry_scheduled: HashSet::new(),
+            deferred: HashMap::new(),
+            deferred_order: BTreeSet::new(),
+            armed_wakeup: None,
+            next_sequence: 0,
+        }
+    }
+
+    pub(super) fn scheduled_len(&self) -> usize {
+        self.retry_scheduled.len()
+    }
+
+    pub(super) fn is_deferred_key(&self, key: (u64, u32)) -> bool {
+        self.retry_scheduled.contains(&key)
+    }
+
+    fn deferred_order(key: (u64, u32), retry: DeferredRetry) -> DeferredRetryOrder {
+        DeferredRetryOrder {
+            retry_at: retry.retry_at,
+            sequence: retry.sequence,
+            key,
+        }
+    }
+
+    fn remove_deferred(&mut self, key: (u64, u32)) -> Option<DeferredRetry> {
+        let retry = self.deferred.remove(&key)?;
+        let _ = self
+            .deferred_order
+            .remove(&Self::deferred_order(key, retry));
+        let _ = self.retry_scheduled.remove(&key);
+        Some(retry)
+    }
+
+    fn insert_deferred(&mut self, bundle_ref: BundleRef, retry_count: u32, retry_at: Instant) {
+        let key = retry_key(bundle_ref);
+        let _ = self.remove_deferred(key);
+        let retry = DeferredRetry::new(bundle_ref, retry_count, retry_at, self.next_sequence);
+        self.next_sequence = self.next_sequence.saturating_add(1);
+        let _ = self.retry_scheduled.insert(key);
+        let _ = self.deferred.insert(key, retry);
+        let _ = self.deferred_order.insert(Self::deferred_order(key, retry));
+    }
+
+    fn desired_wakeup_at(&self, 
no_earlier_than: Option<Instant>) -> Option<Instant> {
+        let earliest = self.deferred_order.first().map(|order| order.retry_at)?;
+        Some(match no_earlier_than {
+            Some(not_before) if earliest < not_before => not_before,
+            _ => earliest,
+        })
+    }
+
+    fn sync_armed_wakeup(
+        &mut self,
+        effect_handler: &mut EffectHandler<OtapPdata>,
+        no_earlier_than: Option<Instant>,
+    ) -> Result<(), WakeupError> {
+        let Some(when) = self.desired_wakeup_at(no_earlier_than) else {
+            if self.armed_wakeup.is_some() {
+                let _ = effect_handler.cancel_wakeup(RETRY_WAKEUP_SLOT);
+                self.armed_wakeup = None;
+            }
+            return Ok(());
+        };
+
+        if self
+            .armed_wakeup
+            .is_some_and(|armed_wakeup| armed_wakeup.when == when)
+        {
+            return Ok(());
+        }
+
+        let revision = effect_handler
+            .set_wakeup(RETRY_WAKEUP_SLOT, when)?
+            .revision();
+        self.armed_wakeup = Some(ArmedRetryWakeup { when, revision });
+        Ok(())
+    }
+
+    /// Schedule or re-schedule retry deferral for a bundle.
+    ///
+    /// Guarantees:
+    /// - the bundle remains deferred in local state until retry resumption
+    /// - the single engine wakeup always tracks the earliest deferred retry
+    /// - returns `false` only when the wakeup could not be armed
+    pub(super) fn schedule_at(
+        &mut self,
+        bundle_ref: BundleRef,
+        retry_count: u32,
+        retry_at: Instant,
+        effect_handler: &mut EffectHandler<OtapPdata>,
+    ) -> bool {
+        let key = retry_key(bundle_ref);
+        self.insert_deferred(bundle_ref, retry_count, retry_at);
+        match self.sync_armed_wakeup(effect_handler, None) {
+            Ok(()) => true,
+            Err(error) => {
+                let _ = self.remove_deferred(key);
+                debug_assert_ne!(
+                    error,
+                    WakeupError::Capacity,
+                    "single-slot durable-buffer wakeup should not hit capacity"
+                );
+                false
+            }
+        }
+    }
+
+    /// Accept one wakeup delivery only when it matches the currently armed
+    /// durable-buffer slot and revision. 
+    ///
+    /// Guarantees:
+    /// - unrelated slots are ignored
+    /// - stale revisions are ignored
+    /// - the matching wakeup clears the armed state exactly once
+    pub(super) fn accept_wakeup(&mut self, slot: WakeupSlot, revision: WakeupRevision) -> bool {
+        if slot != RETRY_WAKEUP_SLOT {
+            return false;
+        }
+
+        let Some(armed_wakeup) = self.armed_wakeup else {
+            return false;
+        };
+
+        if armed_wakeup.revision != revision {
+            return false;
+        }
+
+        self.armed_wakeup = None;
+        true
+    }
+
+    /// Pop the next deferred retry only when its due time has arrived.
+    ///
+    /// Guarantee: returning a retry clears all local deferred bookkeeping for
+    /// that bundle so it can be resumed exactly once.
+    pub(super) fn take_due_retry(&mut self, now: Instant) -> Option<DeferredRetry> {
+        let order = *self.deferred_order.first()?;
+        if order.retry_at > now {
+            return None;
+        }
+
+        let _ = self.deferred_order.remove(&order);
+        let retry = self.deferred.remove(&order.key)?;
+        let _ = self.retry_scheduled.remove(&order.key);
+        Some(retry)
+    }
+
+    /// Re-arm the single durable-buffer wakeup after retry processing.
+    ///
+    /// `no_earlier_than` lets the caller push the next retry attempt out when
+    /// retries are already due but resend is currently blocked by flow control.
+    pub(super) fn rearm_after_processing(
+        &mut self,
+        effect_handler: &mut EffectHandler<OtapPdata>,
+        no_earlier_than: Option<Instant>,
+    ) -> bool {
+        match self.sync_armed_wakeup(effect_handler, no_earlier_than) {
+            Ok(()) => true,
+            Err(error) => {
+                debug_assert_ne!(
+                    error,
+                    WakeupError::Capacity,
+                    "single-slot durable-buffer wakeup should not hit capacity"
+                );
+                false
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use quiver::segment::SegmentSeq;
+    use quiver::subscriber::BundleIndex;
+    use std::time::Duration;
+
+    /// Scenario: one deferred retry is currently the earliest retry, and the
+    /// processor computes the wakeup it should arm next.
+    /// Guarantees: the single durable-buffer wakeup targets that earliest retry. 
+ #[test] + fn desired_wakeup_tracks_earliest_retry() { + let mut state = DeferredRetryState::new(); + let now = Instant::now(); + + state.insert_deferred( + BundleRef { + segment_seq: SegmentSeq::new(1), + bundle_index: BundleIndex::new(1), + }, + 1, + now + Duration::from_secs(3), + ); + state.insert_deferred( + BundleRef { + segment_seq: SegmentSeq::new(2), + bundle_index: BundleIndex::new(2), + }, + 2, + now + Duration::from_secs(1), + ); + + assert_eq!( + state.desired_wakeup_at(None), + Some(now + Duration::from_secs(1)) + ); + } + + /// Scenario: retries are already due, but resend is currently blocked and + /// the processor wants to defer the next retry attempt by one poll interval. + /// Guarantees: the next wakeup is not armed earlier than the supplied floor. + #[test] + fn desired_wakeup_respects_retry_floor() { + let mut state = DeferredRetryState::new(); + let now = Instant::now(); + + state.insert_deferred( + BundleRef { + segment_seq: SegmentSeq::new(9), + bundle_index: BundleIndex::new(1), + }, + 1, + now - Duration::from_millis(1), + ); + + assert_eq!( + state.desired_wakeup_at(Some(now + Duration::from_secs(1))), + Some(now + Duration::from_secs(1)) + ); + } + + /// Scenario: the processor receives the exact wakeup revision currently + /// armed for the durable-buffer retry slot. + /// Guarantees: that wakeup is accepted and clears the armed state exactly once. + #[test] + fn accept_wakeup_clears_matching_arm() { + let mut state = DeferredRetryState::new(); + let now = Instant::now(); + state.armed_wakeup = Some(ArmedRetryWakeup { + when: now, + revision: 17, + }); + + assert!(state.accept_wakeup(RETRY_WAKEUP_SLOT, 17)); + assert!(state.armed_wakeup.is_none()); + assert!(!state.accept_wakeup(RETRY_WAKEUP_SLOT, 17)); + } + + /// Scenario: the processor receives a wakeup for the retry slot, but the + /// revision is stale relative to the currently armed wakeup. + /// Guarantees: the stale wakeup is ignored and the armed wakeup remains. 
+ #[test] + fn accept_wakeup_ignores_stale_revision() { + let mut state = DeferredRetryState::new(); + let now = Instant::now(); + state.armed_wakeup = Some(ArmedRetryWakeup { + when: now, + revision: 5, + }); + + assert!(!state.accept_wakeup(RETRY_WAKEUP_SLOT, 4)); + assert_eq!( + state.armed_wakeup, + Some(ArmedRetryWakeup { + when: now, + revision: 5, + }) + ); + } + + /// Scenario: the processor receives a wakeup for some unrelated slot. + /// Guarantees: the unrelated wakeup is ignored and armed retry state remains. + #[test] + fn accept_wakeup_ignores_unrelated_slot() { + let mut state = DeferredRetryState::new(); + let now = Instant::now(); + state.armed_wakeup = Some(ArmedRetryWakeup { + when: now, + revision: 3, + }); + + assert!(!state.accept_wakeup(WakeupSlot(999), 3)); + assert!(state.armed_wakeup.is_some()); + } + + /// Scenario: one deferred retry becomes due and is popped for retry resumption. + /// Guarantees: taking that retry clears all local deferred bookkeeping. + #[test] + fn take_due_retry_clears_tracking() { + let mut state = DeferredRetryState::new(); + let bundle_ref = BundleRef { + segment_seq: SegmentSeq::new(321), + bundle_index: BundleIndex::new(7), + }; + let key = retry_key(bundle_ref); + let retry_at = Instant::now(); + + state.insert_deferred(bundle_ref, 4, retry_at); + + assert!(state.retry_scheduled.contains(&key)); + assert!(state.deferred.contains_key(&key)); + assert_eq!(state.deferred_order.len(), 1); + + let retry = state + .take_due_retry(retry_at + Duration::from_millis(1)) + .expect("retry should be due"); + + assert_eq!(retry.bundle_ref().segment_seq.raw(), 321); + assert_eq!(retry.bundle_ref().bundle_index.raw(), 7); + assert_eq!(retry.retry_count(), 4); + assert!(!state.retry_scheduled.contains(&key)); + assert!(!state.deferred.contains_key(&key)); + assert!(state.deferred_order.is_empty()); + } + + /// Scenario: multiple retries become due at the same timestamp. 
+ /// Guarantees: equal-deadline retries are resumed in insertion order via sequence. + #[test] + fn equal_deadline_retries_follow_sequence_order() { + let mut state = DeferredRetryState::new(); + let retry_at = Instant::now(); + let first = BundleRef { + segment_seq: SegmentSeq::new(111), + bundle_index: BundleIndex::new(1), + }; + let second = BundleRef { + segment_seq: SegmentSeq::new(222), + bundle_index: BundleIndex::new(2), + }; + let third = BundleRef { + segment_seq: SegmentSeq::new(333), + bundle_index: BundleIndex::new(3), + }; + + state.insert_deferred(first, 1, retry_at); + state.insert_deferred(second, 2, retry_at); + state.insert_deferred(third, 3, retry_at); + + assert_eq!( + state + .take_due_retry(retry_at + Duration::from_millis(1)) + .expect("first retry") + .bundle_ref(), + first + ); + assert_eq!( + state + .take_due_retry(retry_at + Duration::from_millis(1)) + .expect("second retry") + .bundle_ref(), + second + ); + assert_eq!( + state + .take_due_retry(retry_at + Duration::from_millis(1)) + .expect("third retry") + .bundle_ref(), + third + ); + assert!(state.deferred.is_empty()); + assert!(state.deferred_order.is_empty()); + } +} diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index 55db50efbf..fe536de2a7 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -71,7 +71,7 @@ mod bundle_adapter; mod config; -mod retry_wakeup_state; +mod deferred_retry_state; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; @@ -98,9 +98,9 @@ use bundle_adapter::{ OtapRecordBundleAdapter, OtlpBytesAdapter, convert_bundle_to_pdata, signal_type_from_slot_id, }; pub use config::{DurableBufferConfig, OtlpHandling, SizeCapPolicy}; -use 
retry_wakeup_state::RetryWakeupState; #[cfg(test)] -use retry_wakeup_state::{retry_key, retry_wakeup_slot}; +use deferred_retry_state::RETRY_WAKEUP_SLOT; +use deferred_retry_state::DeferredRetryState; use otap_df_config::SignalType; use otap_df_config::error::Error as ConfigError; @@ -377,6 +377,16 @@ enum EngineState { Failed(String), } +/// Outcome of trying to resume one deferred retry. +enum RetryResumeOutcome { + /// The retry was re-claimed and sent downstream. + Sent, + /// The retry remains deferred because resend is blocked for now. + Deferred, + /// The retry no longer needs to be retried on this pass. + Skipped, +} + /// Cached per-segment signal classification for queued-item gauge computation. /// /// Populated once per segment (on first access after finalization) and never @@ -414,8 +424,9 @@ pub struct DurableBuffer { /// Key is the (segment_seq, bundle_index) pair encoded as a u128 for fast lookup. pending_bundles: HashMap<(u64, u32), PendingBundle>, - /// Processor-local wakeup bookkeeping for retry deferral and overflow. - retry_wakeup_state: RetryWakeupState, + /// Processor-local retry deferral state, driven by one wakeup slot plus a + /// local ordered retry queue. + deferred_retry_state: DeferredRetryState, /// Configuration. config: DurableBufferConfig, @@ -509,7 +520,7 @@ impl DurableBuffer { Ok(Self { engine_state: EngineState::Uninitialized, pending_bundles: HashMap::new(), - retry_wakeup_state: RetryWakeupState::new(), + deferred_retry_state: DeferredRetryState::new(), config, core_id, num_cores, @@ -605,17 +616,13 @@ impl DurableBuffer { self.pending_bundles.len() < self.config.max_in_flight } - /// Schedule a retry for a bundle via a processor-local wakeup. - /// - /// This is the single point of coordination between wakeup scheduling and - /// `retry_scheduled` tracking. Always use this method to keep the state in sync. 
+ /// Schedule a retry for a bundle in local deferred state and ensure the + /// single durable-buffer wakeup tracks the earliest pending retry. /// /// Guarantees: - /// - on success, the bundle remains deferred until either a wakeup or local - /// overflow retry resumes it - /// - wakeup-capacity exhaustion falls back to local overflow state instead - /// of immediate re-polling - /// - returns `false` only when the processor is already shutting down + /// - on success, the bundle remains deferred until a wakeup resumes it + /// - the engine wakeup slot is always re-armed to the earliest retry deadline + /// - returns `false` only when the wakeup could not be armed async fn schedule_retry( &mut self, bundle_ref: BundleRef, @@ -624,7 +631,7 @@ impl DurableBuffer { effect_handler: &mut EffectHandler, ) -> bool { let retry_at = Instant::now() + delay; - self.retry_wakeup_state + self.deferred_retry_state .schedule_at(bundle_ref, retry_count, retry_at, effect_handler) } @@ -633,14 +640,15 @@ impl DurableBuffer { /// /// Guarantees: /// - respects `max_in_flight` - /// - never falls back to hot polling on wakeup-capacity exhaustion - /// - preserves retry semantics for both armed wakeups and local overflow retries + /// - re-defers blocked retries with `poll_interval` + /// - returns enough outcome information for the caller to decide whether + /// the current wakeup pass should keep resuming more due retries async fn resume_retry( &mut self, bundle_ref: BundleRef, retry_count: u32, effect_handler: &mut EffectHandler, - ) -> Result<(), Error> { + ) -> Result { if !self.can_send_more() { otel_debug!( "durable_buffer.retry.deferred", @@ -661,7 +669,7 @@ impl DurableBuffer { { otel_warn!("durable_buffer.retry.reschedule_failed"); } - return Ok(()); + return Ok(RetryResumeOutcome::Deferred); } let claim_result = { @@ -682,6 +690,7 @@ impl DurableBuffer { bundle_index = bundle_ref.bundle_index.raw(), retry_count = retry_count ); + Ok(RetryResumeOutcome::Sent) } 
ProcessBundleResult::Skipped => { otel_warn!( @@ -689,6 +698,7 @@ impl DurableBuffer { segment_seq = bundle_ref.segment_seq.raw(), bundle_index = bundle_ref.bundle_index.raw() ); + Ok(RetryResumeOutcome::Skipped) } ProcessBundleResult::Backpressure => { otel_debug!( @@ -708,10 +718,9 @@ impl DurableBuffer { { otel_warn!("durable_buffer.retry.reschedule_failed"); } + Ok(RetryResumeOutcome::Deferred) } - ProcessBundleResult::Error(e) => { - return Err(e); - } + ProcessBundleResult::Error(e) => Err(e), }, Err(e) => { otel_debug!( @@ -720,39 +729,9 @@ impl DurableBuffer { bundle_index = bundle_ref.bundle_index.raw(), error = %e ); + Ok(RetryResumeOutcome::Skipped) } } - - Ok(()) - } - - /// Runs due overflowed retries directly from `TimerTick` while there is - /// time left in the tick budget. - /// - /// Guarantee: overdue retries still make forward progress even if the wakeup - /// scheduler remains full for an extended period. - async fn handle_due_retry_overflow( - &mut self, - deadline: Instant, - effect_handler: &mut EffectHandler, - ) -> Result<(), Error> { - loop { - if Instant::now() >= deadline || !self.can_send_more() { - break; - } - - let Some(retry) = self - .retry_wakeup_state - .take_due_retry_overflow(Instant::now()) - else { - break; - }; - - self.resume_retry(retry.bundle_ref(), retry.retry_count(), effect_handler) - .await?; - } - - Ok(()) } /// Lazily initialize the Quiver engine on first use. @@ -1281,16 +1260,6 @@ impl DurableBuffer { let deadline = Instant::now() + drain_budget; let mut bundles_processed = 0usize; - // First, resume any overflowed retries whose backoff has elapsed. - // This preserves the retry delay guarantee even when the shared wakeup - // scheduler is saturated and some retries had to stay local. - self.handle_due_retry_overflow(deadline, effect_handler) - .await?; - // If wakeup capacity became available while handling due overflowed - // retries, move waiting retries back to the normal wakeup path. 
- self.retry_wakeup_state - .promote_overflow_to_wakeups(effect_handler); - // Track the first skipped bundle to detect when we've cycled through all // available bundles without making progress (all are in-flight or scheduled). let mut first_skipped: Option<(u64, u32)> = None; @@ -1344,7 +1313,7 @@ impl DurableBuffer { "durable_buffer.drain.all_blocked", bundles_processed = bundles_processed, in_flight = self.pending_bundles.len(), - retry_scheduled = self.retry_wakeup_state.scheduled_len() + retry_scheduled = self.deferred_retry_state.scheduled_len() ); break; } @@ -1439,9 +1408,9 @@ impl DurableBuffer { // Skip if this bundle is scheduled for retry (waiting for backoff). // This enforces the exponential backoff - poll_next_bundle() returns // deferred bundles immediately, but we should wait for the retry delay. - if self.retry_wakeup_state.is_deferred_key(key) { + if self.deferred_retry_state.is_deferred_key(key) { // Bundle is waiting for backoff. Release the claim; it will be - // re-claimed when a retry wakeup or due overflow retry resumes it. + // re-claimed when the single durable-buffer retry wakeup resumes it. drop(handle); // Implicit defer return ProcessBundleResult::Skipped; } @@ -1581,9 +1550,8 @@ impl DurableBuffer { /// For permanent NACKs (e.g., malformed data that will never succeed), the bundle /// is rejected immediately without retry. /// - /// For transient NACKs, schedules a retry with exponential backoff using a - /// processor-local wakeup. The bundle is deferred in Quiver (releasing the - /// claim) and local retry state is retained until the wakeup fires. + /// For transient NACKs, defers the bundle locally with exponential backoff + /// and ensures the single durable-buffer wakeup tracks the earliest retry. 
async fn handle_nack( &mut self, nack: NackMsg, @@ -1689,21 +1657,40 @@ impl DurableBuffer { revision: WakeupRevision, effect_handler: &mut EffectHandler, ) -> Result<(), Error> { - let Some(retry) = self.retry_wakeup_state.take_retry_wakeup(slot, revision) else { + if !self.deferred_retry_state.accept_wakeup(slot, revision) { otel_warn!( "durable_buffer.retry.unknown_wakeup", wakeup_slot = slot.0.to_string(), wakeup_revision = revision ); - self.retry_wakeup_state - .promote_overflow_to_wakeups(effect_handler); return Ok(()); - }; + } + + let mut rearm_no_earlier_than = None; + loop { + let now = Instant::now(); + let Some(retry) = self.deferred_retry_state.take_due_retry(now) else { + break; + }; + + match self + .resume_retry(retry.bundle_ref(), retry.retry_count(), effect_handler) + .await? + { + RetryResumeOutcome::Sent | RetryResumeOutcome::Skipped => {} + RetryResumeOutcome::Deferred => { + rearm_no_earlier_than = Some(now + self.config.poll_interval); + break; + } + } + } - self.resume_retry(retry.bundle_ref(), retry.retry_count(), effect_handler) - .await?; - self.retry_wakeup_state - .promote_overflow_to_wakeups(effect_handler); + if !self + .deferred_retry_state + .rearm_after_processing(effect_handler, rearm_no_earlier_than) + { + otel_warn!("durable_buffer.retry.rearm_failed"); + } Ok(()) } @@ -2024,8 +2011,8 @@ mod tests { "retention_size_cap": "256 MiB", "poll_interval": "100ms", "max_segment_open_duration": "1s", - "initial_retry_interval": "1s", - "max_retry_interval": "30s", + "initial_retry_interval": "100ms", + "max_retry_interval": "100ms", "retry_multiplier": 2.0, "max_in_flight": 1000 }); @@ -2060,9 +2047,6 @@ mod tests { let sent = outputs.pop().expect("sent bundle"); let (_, nack) = next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); - let bundle_ref = - decode_bundle_ref(&nack.unwind.route.calldata).expect("bundle ref in nack"); - let armed_slot = retry_wakeup_slot(retry_key(bundle_ref)); 
ctx.process(Message::Control(NodeControlMsg::Nack(nack))) .await .expect("process nack"); @@ -2071,9 +2055,10 @@ mod tests { "nack should defer delivery until wakeup" ); + ctx.sleep(Duration::from_millis(200)).await; ctx.process(Message::Control(NodeControlMsg::Wakeup { - slot: armed_slot, - when: Instant::now() + Duration::from_secs(1), + slot: RETRY_WAKEUP_SLOT, + when: Instant::now(), revision: 0, })) .await @@ -2086,13 +2071,12 @@ mod tests { .validate(|_| async {}); } - /// Scenario: an unrelated wakeup arrives while durable-buffer still has one - /// armed retry and one overflowed retry pending. + /// Scenario: an unrelated wakeup arrives while durable-buffer has multiple + /// deferred retries pending behind its single retry wakeup slot. /// Guarantees: the unrelated wakeup does not cause early redelivery or lose - /// either deferred retry; the overflowed retry still resumes on `TimerTick` - /// once due, and the armed retry still resumes through its wakeup path. + /// deferred retries; the matching wakeup later resumes all due retries. #[test] - fn test_unknown_wakeup_does_not_lose_overflowed_retry() { + fn test_unknown_wakeup_does_not_lose_deferred_retries() { use otap_df_config::node::NodeUserConfig; use otap_df_engine::config::ProcessorConfig; use otap_df_engine::context::ControllerContext; @@ -2150,21 +2134,13 @@ mod tests { .expect("process timer tick"); let mut outputs = ctx.drain_pdata().await; assert_eq!(outputs.len(), 2, "timer tick should emit two bundles"); - let mut armed_slot = None; - - for (index, sent) in outputs.drain(..).enumerate() { + for sent in outputs.drain(..) 
{ let (_, nack) = next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); - if index == 0 { - let bundle_ref = decode_bundle_ref(&nack.unwind.route.calldata) - .expect("bundle ref in nack"); - armed_slot = Some(retry_wakeup_slot(retry_key(bundle_ref))); - } ctx.process(Message::Control(NodeControlMsg::Nack(nack))) .await .expect("process nack"); } - let armed_slot = armed_slot.expect("first nack should arm a wakeup"); ctx.process(Message::Control(NodeControlMsg::Wakeup { slot: WakeupSlot(999), @@ -2179,36 +2155,29 @@ mod tests { ); ctx.sleep(Duration::from_millis(200)).await; - ctx.process(Message::Control(NodeControlMsg::TimerTick {})) - .await - .expect("process due timer tick"); - let overflow_retry = ctx.drain_pdata().await; - assert_eq!( - overflow_retry.len(), - 1, - "overflowed retry should still resume on timer tick" - ); - ctx.process(Message::Control(NodeControlMsg::Wakeup { - slot: armed_slot, + slot: RETRY_WAKEUP_SLOT, when: Instant::now(), revision: 0, })) .await - .expect("process armed retry wakeup"); - let wakeup_retry = ctx.drain_pdata().await; - assert_eq!(wakeup_retry.len(), 1, "armed retry should still resume"); + .expect("process shared retry wakeup"); + let retried = ctx.drain_pdata().await; + assert_eq!( + retried.len(), + 2, + "matching wakeup should resume all due deferred retries" + ); }) .validate(|_| async {}); } - /// Scenario: two transient NACKs occur while the processor only has wakeup - /// capacity for one armed retry. - /// Guarantees: the overflowed retry remains deferred instead of hot-repolling, - /// a later `TimerTick` resumes it once due, and the originally armed wakeup - /// path still resumes its retry independently. + /// Scenario: two transient NACKs occur and both retries share the durable + /// buffer's single wakeup slot. + /// Guarantees: no retry is re-delivered before the shared wakeup fires, and + /// one matching wakeup resumes all due retries. 
#[test] - fn test_retry_capacity_overflow_uses_local_deferral() { + fn test_multiple_retries_share_single_wakeup() { use otap_df_config::node::NodeUserConfig; use otap_df_engine::config::ProcessorConfig; use otap_df_engine::context::ControllerContext; @@ -2240,9 +2209,9 @@ mod tests { let processor = create_durable_buffer( pipeline_ctx, - test_node("durable-buffer-retry-overflow"), + test_node("durable-buffer-shared-retry-wakeup"), Arc::new(node_config), - &ProcessorConfig::with_channel_capacities("durable-buffer-retry-overflow", 1, 100), + &ProcessorConfig::with_channel_capacities("durable-buffer-shared-retry-wakeup", 1, 100), ) .expect("create durable buffer"); @@ -2266,53 +2235,35 @@ mod tests { .expect("process timer tick"); let mut outputs = ctx.drain_pdata().await; assert_eq!(outputs.len(), 2, "timer tick should emit two bundles"); - let mut armed_slot = None; - - for (index, sent) in outputs.drain(..).enumerate() { + for sent in outputs.drain(..) { let (_, nack) = next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); - if index == 0 { - let bundle_ref = decode_bundle_ref(&nack.unwind.route.calldata) - .expect("bundle ref in nack"); - armed_slot = Some(retry_wakeup_slot(retry_key(bundle_ref))); - } ctx.process(Message::Control(NodeControlMsg::Nack(nack))) .await .expect("process nack"); } - let armed_slot = armed_slot.expect("first nack should arm a wakeup"); ctx.process(Message::Control(NodeControlMsg::TimerTick {})) .await .expect("process immediate timer tick"); assert!( ctx.drain_pdata().await.is_empty(), - "capacity overflow retries should stay deferred until due" + "shared wakeup retries should stay deferred until due" ); ctx.sleep(Duration::from_millis(200)).await; - ctx.process(Message::Control(NodeControlMsg::TimerTick {})) - .await - .expect("process due timer tick"); - let overflow_retry = ctx.drain_pdata().await; - assert_eq!( - overflow_retry.len(), - 1, - "a due overflow retry should resume on timer tick even without wakeup 
capacity" - ); - ctx.process(Message::Control(NodeControlMsg::Wakeup { - slot: armed_slot, + slot: RETRY_WAKEUP_SLOT, when: Instant::now(), revision: 0, })) .await - .expect("process armed retry wakeup"); + .expect("process shared retry wakeup"); let wakeup_retry = ctx.drain_pdata().await; assert_eq!( wakeup_retry.len(), - 1, - "armed wakeup should still resume retry delivery" + 2, + "shared wakeup should resume all due retry deliveries" ); }) .validate(|_| async {}); diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs deleted file mode 100644 index 5ac0e81b23..0000000000 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/retry_wakeup_state.rs +++ /dev/null @@ -1,463 +0,0 @@ -// Copyright The OpenTelemetry Authors -// SPDX-License-Identifier: Apache-2.0 - -use otap_df_engine::control::{WakeupRevision, WakeupSlot}; -use otap_df_engine::local::processor::EffectHandler; -use otap_df_engine::{WakeupError, WakeupSetOutcome}; -use otap_df_otap::pdata::OtapPdata; -use quiver::subscriber::BundleRef; -use std::collections::{BTreeSet, HashMap, HashSet}; -use std::time::Instant; - -/// Convert a Quiver bundle identity into the stable key used by retry state. -pub(super) fn retry_key(bundle_ref: BundleRef) -> (u64, u32) { - (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()) -} - -/// Encodes a durable-buffer bundle identity into a processor-local wakeup slot. -/// -/// Layout: `[segment_seq:u64 | bundle_index:u64]` -pub(super) const fn retry_wakeup_slot(key: (u64, u32)) -> WakeupSlot { - WakeupSlot(((key.0 as u128) << 64) | (key.1 as u128)) -} - -/// Retry state for a bundle that has already acquired an engine wakeup slot. 
-/// -/// This is the armed phase of retry deferral: -/// - Quiver has released the bundle claim via implicit defer -/// - the processor has successfully registered a node-local wakeup for that bundle -/// - the wakeup slot is the bundle key encoded directly via `retry_wakeup_slot(...)` -/// - `revision` is the current scheduler revision for that slot -/// -/// The struct intentionally keeps only the minimum information needed to resume -/// the retry when the matching wakeup fires. -#[derive(Clone, Copy)] -pub(super) struct RetryWakeup { - bundle_ref: BundleRef, - retry_count: u32, - revision: WakeupRevision, -} - -impl RetryWakeup { - const fn new(bundle_ref: BundleRef, retry_count: u32, revision: WakeupRevision) -> Self { - Self { - bundle_ref, - retry_count, - revision, - } - } - - pub(super) const fn bundle_ref(self) -> BundleRef { - self.bundle_ref - } - - pub(super) const fn retry_count(self) -> u32 { - self.retry_count - } -} - -/// Retry state for a bundle that could not acquire an engine wakeup slot yet. -/// -/// This is the local overflow phase of retry deferral, used when -/// `EffectHandler::set_wakeup(...)` returns `WakeupError::Capacity`. -/// -/// Guarantees supported by this representation: -/// - the bundle still remains deferred and is kept out of `poll_next_bundle()` -/// through `retry_scheduled` -/// - the intended retry deadline is preserved in `retry_at` -/// - equal deadlines are ordered deterministically by `sequence` -/// -/// `OverflowRetry` is stored in `retry_overflow` and indexed for due-order by -/// a matching `OverflowRetryOrder` entry in `retry_overflow_order`. -#[derive(Clone, Copy)] -struct OverflowRetry { - bundle_ref: BundleRef, - retry_count: u32, - retry_at: Instant, - sequence: u64, -} - -/// Ordering/index key for `retry_overflow_order`. 
-/// -/// This is kept separate from `OverflowRetry` so the processor can maintain: -/// - a keyed lookup map (`retry_overflow`) for exact replacement/removal -/// - an ordered set (`retry_overflow_order`) for "next due" selection -/// -/// Ordering is lexicographic by `(retry_at, sequence, key)`, which means: -/// - earlier deadlines are resumed first -/// - equal deadlines use insertion sequence as a deterministic tie-breaker -/// - `key` keeps the ordering total and points back to the authoritative -/// `OverflowRetry` stored in the map -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] -struct OverflowRetryOrder { - retry_at: Instant, - sequence: u64, - key: (u64, u32), -} - -/// Local wakeup bookkeeping for durable-buffer retry deferral. -/// -/// This state owns the invariants around armed retries, overflow retries, and -/// deferred bundle membership. `DurableBuffer` itself still owns retry policy, -/// Quiver interaction, and downstream resend behavior. -pub(super) struct RetryWakeupState { - /// Bundles currently held out of the normal poll loop while backoff is active. - /// - /// Invariant: every key here is deferred for retry either by an armed wakeup - /// (`retry_wakeups`) or by the local overflow queue - /// (`retry_overflow` + `retry_overflow_order`). - retry_scheduled: HashSet<(u64, u32)>, - - /// Retry state keyed by wakeup slot. - /// - /// Invariant: each slot is the encoded bundle key for the matching - /// `RetryWakeup.bundle_ref`, and `RetryWakeup.revision` is the only wakeup - /// revision for that slot that is allowed to resume the retry. - retry_wakeups: HashMap, - - /// Retry state held locally while wakeup scheduling is at capacity. - /// - /// Guarantee: overflowed retries remain deferred and keep their target due - /// time even when the engine wakeup scheduler is full. - retry_overflow: HashMap<(u64, u32), OverflowRetry>, - - /// Due-order index for locally deferred retries. 
- /// - /// Invariant: this contains exactly one ordering key for each entry in - /// `retry_overflow`, using `sequence` as a deterministic tie-breaker. - retry_overflow_order: BTreeSet, - - /// Monotonic tie-breaker for locally deferred retry ordering. - next_retry_overflow_sequence: u64, -} - -impl RetryWakeupState { - pub(super) fn new() -> Self { - Self { - retry_scheduled: HashSet::new(), - retry_wakeups: HashMap::new(), - retry_overflow: HashMap::new(), - retry_overflow_order: BTreeSet::new(), - next_retry_overflow_sequence: 0, - } - } - - pub(super) fn scheduled_len(&self) -> usize { - self.retry_scheduled.len() - } - - pub(super) fn is_deferred_key(&self, key: (u64, u32)) -> bool { - self.retry_scheduled.contains(&key) - } - - fn overflow_retry_order(key: (u64, u32), retry: OverflowRetry) -> OverflowRetryOrder { - OverflowRetryOrder { - retry_at: retry.retry_at, - sequence: retry.sequence, - key, - } - } - - /// Removes one overflowed retry from both local indexes. - /// - /// Invariant preserved: `retry_overflow` and `retry_overflow_order` stay in - /// lockstep after every insertion/removal. - fn remove_retry_overflow(&mut self, key: (u64, u32)) -> Option { - let retry = self.retry_overflow.remove(&key)?; - let _ = self - .retry_overflow_order - .remove(&Self::overflow_retry_order(key, retry)); - Some(retry) - } - - /// Defers a retry in local processor state when the engine wakeup scheduler - /// has no free slot. 
- /// - /// Guarantees: - /// - the bundle remains in `retry_scheduled`, so `poll_next_bundle()` keeps - /// skipping it - /// - the most recent `(retry_count, retry_at)` replaces any older local - /// overflow record for the same bundle - /// - equal due times are processed deterministically by `sequence` - fn insert_retry_overflow( - &mut self, - bundle_ref: BundleRef, - retry_count: u32, - retry_at: Instant, - ) { - let key = retry_key(bundle_ref); - let _ = self.remove_retry_overflow(key); - let retry = OverflowRetry { - bundle_ref, - retry_count, - retry_at, - sequence: self.next_retry_overflow_sequence, - }; - self.next_retry_overflow_sequence = self.next_retry_overflow_sequence.saturating_add(1); - let _ = self.retry_scheduled.insert(key); - let _ = self.retry_overflow.insert(key, retry); - let _ = self - .retry_overflow_order - .insert(Self::overflow_retry_order(key, retry)); - } - - /// Pops the next locally deferred retry only when its due time has arrived. - /// - /// Guarantee: returning a retry clears all local overflow bookkeeping for - /// that bundle so it can be resumed exactly once. - pub(super) fn take_due_retry_overflow(&mut self, now: Instant) -> Option { - let order = *self.retry_overflow_order.first()?; - if order.retry_at > now { - return None; - } - - let _ = self.retry_overflow_order.remove(&order); - let retry = self.retry_overflow.remove(&order.key)?; - let _ = self.retry_scheduled.remove(&order.key); - - Some(RetryWakeup::new(retry.bundle_ref, retry.retry_count, 0)) - } - - /// Opportunistically moves overflowed retries back into engine wakeup slots. 
- /// - /// Guarantees: - /// - never drops a deferred retry when slot acquisition fails - /// - preserves retry due time when promotion succeeds - /// - stops as soon as the scheduler reports `Capacity` or shutdown - pub(super) fn promote_overflow_to_wakeups( - &mut self, - effect_handler: &mut EffectHandler, - ) { - while let Some(order) = self.retry_overflow_order.first().copied() { - let Some(retry) = self.retry_overflow.get(&order.key).copied() else { - let _ = self.retry_overflow_order.remove(&order); - continue; - }; - - let slot = retry_wakeup_slot(order.key); - match effect_handler.set_wakeup(slot, retry.retry_at) { - Ok(outcome) => { - let _ = self.retry_overflow_order.remove(&order); - let _ = self.retry_overflow.remove(&order.key); - let _ = self.retry_wakeups.insert( - slot, - RetryWakeup::new(retry.bundle_ref, retry.retry_count, outcome.revision()), - ); - } - Err(WakeupError::Capacity | WakeupError::ShuttingDown) => break, - } - } - } - - /// Schedule or re-schedule retry deferral for a bundle. 
- /// - /// Guarantees: - /// - on success, the bundle remains deferred until either an armed wakeup - /// or local overflow retry resumes it - /// - wakeup-capacity exhaustion falls back to local overflow state instead - /// of immediate re-polling - /// - returns `false` only when the processor is already shutting down - pub(super) fn schedule_at( - &mut self, - bundle_ref: BundleRef, - retry_count: u32, - retry_at: Instant, - effect_handler: &mut EffectHandler, - ) -> bool { - let key = retry_key(bundle_ref); - let _ = self.remove_retry_overflow(key); - let slot = retry_wakeup_slot(key); - match effect_handler.set_wakeup(slot, retry_at) { - Ok( - WakeupSetOutcome::Inserted { revision } | WakeupSetOutcome::Replaced { revision }, - ) => { - let _ = self.retry_scheduled.insert(key); - let _ = self - .retry_wakeups - .insert(slot, RetryWakeup::new(bundle_ref, retry_count, revision)); - true - } - Err(WakeupError::Capacity) => { - self.insert_retry_overflow(bundle_ref, retry_count, retry_at); - true - } - Err(WakeupError::ShuttingDown) => false, - } - } - - /// Remove retry-wakeup tracking for a bundle now being resumed. - /// - /// Guarantee: taking a wakeup clears the armed-wakeup bookkeeping for that - /// bundle before retry resumption starts. - pub(super) fn take_retry_wakeup( - &mut self, - slot: WakeupSlot, - revision: WakeupRevision, - ) -> Option { - let wakeup = self.retry_wakeups.get(&slot).copied()?; - if wakeup.revision != revision { - return None; - } - - let wakeup = self - .retry_wakeups - .remove(&slot) - .expect("matching wakeup should still exist"); - let key = retry_key(wakeup.bundle_ref); - let _ = self.retry_scheduled.remove(&key); - Some(wakeup) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use quiver::segment::SegmentSeq; - use quiver::subscriber::BundleIndex; - use std::time::Duration; - - /// Scenario: a retry is armed in the engine wakeup scheduler for one - /// bundle, and the matching wakeup later arrives with the same revision. 
- /// Guarantees: taking that wakeup clears the armed retry bookkeeping and - /// returns the original `(bundle_ref, retry_count)` exactly once. - #[test] - fn take_retry_wakeup_clears_tracking() { - let mut state = RetryWakeupState::new(); - let bundle_ref = BundleRef { - segment_seq: SegmentSeq::new(98765), - bundle_index: BundleIndex::new(123), - }; - let key = retry_key(bundle_ref); - let slot = retry_wakeup_slot(key); - let _ = state.retry_scheduled.insert(key); - let _ = state - .retry_wakeups - .insert(slot, RetryWakeup::new(bundle_ref, 3, 17)); - - let taken = state - .take_retry_wakeup(slot, 17) - .expect("retry wakeup should exist"); - assert_eq!(taken.bundle_ref().segment_seq.raw(), 98765); - assert_eq!(taken.bundle_ref().bundle_index.raw(), 123); - assert_eq!(taken.retry_count(), 3); - assert!(!state.retry_scheduled.contains(&key)); - assert!(!state.retry_wakeups.contains_key(&slot)); - } - - /// Scenario: the processor receives a wakeup for a slot that has no armed - /// retry state. - /// Guarantees: the unknown wakeup is ignored and does not mutate retry - /// bookkeeping. - #[test] - fn take_retry_wakeup_unknown_slot_is_ignored() { - let mut state = RetryWakeupState::new(); - assert!(state.take_retry_wakeup(WakeupSlot(999), 0).is_none()); - } - - /// Scenario: a slot has been rescheduled, so the processor still has armed - /// retry state for that slot but the arriving wakeup carries an older - /// revision. - /// Guarantees: the stale wakeup is ignored, and the current armed retry - /// state remains available for the matching revision. 
- #[test] - fn take_retry_wakeup_stale_revision_is_ignored() { - let mut state = RetryWakeupState::new(); - let bundle_ref = BundleRef { - segment_seq: SegmentSeq::new(123), - bundle_index: BundleIndex::new(9), - }; - let key = retry_key(bundle_ref); - let slot = retry_wakeup_slot(key); - - let _ = state.retry_scheduled.insert(key); - let _ = state - .retry_wakeups - .insert(slot, RetryWakeup::new(bundle_ref, 2, 5)); - - assert!(state.take_retry_wakeup(slot, 4).is_none()); - assert!(state.retry_scheduled.contains(&key)); - assert!(state.retry_wakeups.contains_key(&slot)); - } - - /// Scenario: a retry was deferred in local overflow state because wakeup - /// capacity was exhausted, and its due time has now arrived. - /// Guarantees: taking that retry clears all overflow bookkeeping, removes it - /// from `retry_scheduled`, and returns the original `(bundle_ref, retry_count)`. - #[test] - fn take_due_retry_overflow_clears_tracking() { - let mut state = RetryWakeupState::new(); - let bundle_ref = BundleRef { - segment_seq: SegmentSeq::new(321), - bundle_index: BundleIndex::new(7), - }; - let key = retry_key(bundle_ref); - let retry_at = Instant::now(); - - state.insert_retry_overflow(bundle_ref, 4, retry_at); - - assert!(state.retry_scheduled.contains(&key)); - assert!(state.retry_overflow.contains_key(&key)); - assert_eq!(state.retry_overflow_order.len(), 1); - - let retry = state - .take_due_retry_overflow(retry_at + Duration::from_millis(1)) - .expect("retry should be due"); - - assert_eq!(retry.bundle_ref().segment_seq.raw(), 321); - assert_eq!(retry.bundle_ref().bundle_index.raw(), 7); - assert_eq!(retry.retry_count(), 4); - assert!(!state.retry_scheduled.contains(&key)); - assert!(!state.retry_overflow.contains_key(&key)); - assert!(state.retry_overflow_order.is_empty()); - } - - /// Scenario: multiple retries overflow the wakeup scheduler and are stored - /// locally with the same due timestamp. 
- /// Guarantees: equal-deadline overflow retries are resumed in insertion - /// order using the local sequence tie-breaker. - #[test] - fn equal_deadline_overflow_retries_follow_sequence_order() { - let mut state = RetryWakeupState::new(); - let retry_at = Instant::now(); - let first = BundleRef { - segment_seq: SegmentSeq::new(111), - bundle_index: BundleIndex::new(1), - }; - let second = BundleRef { - segment_seq: SegmentSeq::new(222), - bundle_index: BundleIndex::new(2), - }; - let third = BundleRef { - segment_seq: SegmentSeq::new(333), - bundle_index: BundleIndex::new(3), - }; - - state.insert_retry_overflow(first, 1, retry_at); - state.insert_retry_overflow(second, 2, retry_at); - state.insert_retry_overflow(third, 3, retry_at); - - assert_eq!( - state - .take_due_retry_overflow(retry_at + Duration::from_millis(1)) - .expect("first retry") - .bundle_ref(), - first - ); - assert_eq!( - state - .take_due_retry_overflow(retry_at + Duration::from_millis(1)) - .expect("second retry") - .bundle_ref(), - second - ); - assert_eq!( - state - .take_due_retry_overflow(retry_at + Duration::from_millis(1)) - .expect("third retry") - .bundle_ref(), - third - ); - assert!(state.retry_overflow.is_empty()); - assert!(state.retry_overflow_order.is_empty()); - } -} From 2f8d8d277e2c2c5be2113b9d3525b80a12b81791 Mon Sep 17 00:00:00 2001 From: lquerel Date: Fri, 3 Apr 2026 17:53:33 -0700 Subject: [PATCH 15/18] Simplify deferred retry state and inbox docs --- .../deferred_retry_state.rs | 43 +++++++----- .../durable_buffer_processor/mod.rs | 68 ++++++------------- .../crates/engine/src/message.rs | 27 ++++++-- 3 files changed, 68 insertions(+), 70 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs index f68ce5c109..f8ebe8d9eb 100644 --- 
a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs @@ -23,6 +23,9 @@ //! or explicitly re-deferred //! - due retries are resumed in deadline order, with deterministic ordering for //! equal deadlines +//! - this module does not introduce any growth path beyond the number of +//! deferred bundles: it keeps one authoritative map entry and one ordered +//! index entry per deferred bundle, plus at most one armed wakeup record //! - durable buffer retry scheduling does not depend on having one engine //! wakeup slot per deferred bundle @@ -31,8 +34,8 @@ use otap_df_engine::control::{WakeupRevision, WakeupSlot}; use otap_df_engine::local::processor::EffectHandler; use otap_df_otap::pdata::OtapPdata; use quiver::subscriber::BundleRef; -use std::collections::{BTreeSet, HashMap, HashSet}; -use std::time::Instant; +use std::collections::{BTreeSet, HashMap}; +use std::time::{Duration, Instant}; /// Durable buffer uses one processor-local wakeup slot for "the earliest retry /// currently pending in local state". @@ -107,12 +110,6 @@ struct DeferredRetryOrder { /// path on a single scheduling mechanism instead of splitting between many /// armed wakeups and a separate overflow queue. pub(super) struct DeferredRetryState { - /// Bundles currently held out of the normal poll loop while retry backoff - /// is active. - /// - /// Invariant: every key here appears exactly once in `deferred`. - retry_scheduled: HashSet<(u64, u32)>, - /// Authoritative retry state keyed by bundle identity. 
/// /// Invariant: every deferred bundle appears exactly once here and exactly @@ -136,7 +133,6 @@ pub(super) struct DeferredRetryState { impl DeferredRetryState { pub(super) fn new() -> Self { Self { - retry_scheduled: HashSet::new(), deferred: HashMap::new(), deferred_order: BTreeSet::new(), armed_wakeup: None, @@ -145,11 +141,11 @@ impl DeferredRetryState { } pub(super) fn scheduled_len(&self) -> usize { - self.retry_scheduled.len() + self.deferred.len() } pub(super) fn is_deferred_key(&self, key: (u64, u32)) -> bool { - self.retry_scheduled.contains(&key) + self.deferred.contains_key(&key) } fn deferred_order(key: (u64, u32), retry: DeferredRetry) -> DeferredRetryOrder { @@ -165,7 +161,6 @@ impl DeferredRetryState { let _ = self .deferred_order .remove(&Self::deferred_order(key, retry)); - let _ = self.retry_scheduled.remove(&key); Some(retry) } @@ -174,7 +169,6 @@ impl DeferredRetryState { let _ = self.remove_deferred(key); let retry = DeferredRetry::new(bundle_ref, retry_count, retry_at, self.next_sequence); self.next_sequence = self.next_sequence.saturating_add(1); - let _ = self.retry_scheduled.insert(key); let _ = self.deferred.insert(key, retry); let _ = self.deferred_order.insert(Self::deferred_order(key, retry)); } @@ -243,6 +237,26 @@ impl DeferredRetryState { } } + /// Schedule or re-schedule retry deferral after a relative delay. + /// + /// Guarantees: + /// - equivalent to `schedule_at(now + delay)` + /// - keeps the delay-to-deadline conversion local to deferred retry state + pub(super) fn schedule_after( + &mut self, + bundle_ref: BundleRef, + retry_count: u32, + delay: Duration, + effect_handler: &mut EffectHandler, + ) -> bool { + self.schedule_at( + bundle_ref, + retry_count, + Instant::now() + delay, + effect_handler, + ) + } + /// Accept one wakeup delivery only when it matches the currently armed /// durable-buffer slot and revision. 
/// @@ -279,7 +293,6 @@ impl DeferredRetryState { let _ = self.deferred_order.remove(&order); let retry = self.deferred.remove(&order.key)?; - let _ = self.retry_scheduled.remove(&order.key); Some(retry) } @@ -435,7 +448,6 @@ mod tests { state.insert_deferred(bundle_ref, 4, retry_at); - assert!(state.retry_scheduled.contains(&key)); assert!(state.deferred.contains_key(&key)); assert_eq!(state.deferred_order.len(), 1); @@ -446,7 +458,6 @@ mod tests { assert_eq!(retry.bundle_ref().segment_seq.raw(), 321); assert_eq!(retry.bundle_ref().bundle_index.raw(), 7); assert_eq!(retry.retry_count(), 4); - assert!(!state.retry_scheduled.contains(&key)); assert!(!state.deferred.contains_key(&key)); assert!(state.deferred_order.is_empty()); } diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index fe536de2a7..0090f45ce7 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -98,9 +98,9 @@ use bundle_adapter::{ OtapRecordBundleAdapter, OtlpBytesAdapter, convert_bundle_to_pdata, signal_type_from_slot_id, }; pub use config::{DurableBufferConfig, OtlpHandling, SizeCapPolicy}; +use deferred_retry_state::DeferredRetryState; #[cfg(test)] use deferred_retry_state::RETRY_WAKEUP_SLOT; -use deferred_retry_state::DeferredRetryState; use otap_df_config::SignalType; use otap_df_config::error::Error as ConfigError; @@ -616,25 +616,6 @@ impl DurableBuffer { self.pending_bundles.len() < self.config.max_in_flight } - /// Schedule a retry for a bundle in local deferred state and ensure the - /// single durable-buffer wakeup tracks the earliest pending retry. 
- /// - /// Guarantees: - /// - on success, the bundle remains deferred until a wakeup resumes it - /// - the engine wakeup slot is always re-armed to the earliest retry deadline - /// - returns `false` only when the wakeup could not be armed - async fn schedule_retry( - &mut self, - bundle_ref: BundleRef, - retry_count: u32, - delay: Duration, - effect_handler: &mut EffectHandler, - ) -> bool { - let retry_at = Instant::now() + delay; - self.deferred_retry_state - .schedule_at(bundle_ref, retry_count, retry_at, effect_handler) - } - /// Resumes one deferred retry, either by sending it downstream again or by /// re-deferring it if downstream/backpressure constraints still apply. /// @@ -643,7 +624,7 @@ impl DurableBuffer { /// - re-defers blocked retries with `poll_interval` /// - returns enough outcome information for the caller to decide whether /// the current wakeup pass should keep resuming more due retries - async fn resume_retry( + fn resume_retry( &mut self, bundle_ref: BundleRef, retry_count: u32, @@ -658,15 +639,12 @@ impl DurableBuffer { max_in_flight = self.config.max_in_flight ); - if !self - .schedule_retry( - bundle_ref, - retry_count, - self.config.poll_interval, - effect_handler, - ) - .await - { + if !self.deferred_retry_state.schedule_after( + bundle_ref, + retry_count, + self.config.poll_interval, + effect_handler, + ) { otel_warn!("durable_buffer.retry.reschedule_failed"); } return Ok(RetryResumeOutcome::Deferred); @@ -707,15 +685,12 @@ impl DurableBuffer { bundle_index = bundle_ref.bundle_index.raw() ); - if !self - .schedule_retry( - bundle_ref, - retry_count, - self.config.poll_interval, - effect_handler, - ) - .await - { + if !self.deferred_retry_state.schedule_after( + bundle_ref, + retry_count, + self.config.poll_interval, + effect_handler, + ) { otel_warn!("durable_buffer.retry.reschedule_failed"); } Ok(RetryResumeOutcome::Deferred) @@ -1625,10 +1600,12 @@ impl DurableBuffer { drop(pending.handle); // Schedule the retry - if self - 
.schedule_retry(bundle_ref, retry_count, backoff, effect_handler) - .await - { + if self.deferred_retry_state.schedule_after( + bundle_ref, + retry_count, + backoff, + effect_handler, + ) { self.metrics.retries_scheduled.add(1); } else { otel_warn!( @@ -1673,10 +1650,7 @@ impl DurableBuffer { break; }; - match self - .resume_retry(retry.bundle_ref(), retry.retry_count(), effect_handler) - .await? - { + match self.resume_retry(retry.bundle_ref(), retry.retry_count(), effect_handler)? { RetryResumeOutcome::Sent | RetryResumeOutcome::Skipped => {} RetryResumeOutcome::Deferred => { rearm_no_earlier_than = Some(now + self.config.poll_interval); diff --git a/rust/otap-dataflow/crates/engine/src/message.rs b/rust/otap-dataflow/crates/engine/src/message.rs index aad6c3b773..00356bb7e0 100644 --- a/rust/otap-dataflow/crates/engine/src/message.rs +++ b/rust/otap-dataflow/crates/engine/src/message.rs @@ -865,13 +865,6 @@ impl ExporterInbox { } } -/// Backward-compatible exporter inbox alias. -pub type ExporterMessageChannel< - PData, - ControlRx = Receiver>, - PDataRx = Receiver, -> = ExporterInbox; - /// Send-friendly exporter inbox type for shared exporter runtimes. pub(crate) type SharedExporterInbox = ExporterInbox>, SharedReceiver>; @@ -906,6 +899,11 @@ mod tests { (control_tx, pdata_tx, scheduler, inbox) } + /// Scenario: a processor-local wakeup is scheduled for immediate delivery + /// while the processor inbox is otherwise idle. + /// Guarantees: the inbox surfaces the due wakeup as + /// `NodeControlMsg::Wakeup` with the scheduled slot, deadline, and + /// accepted revision. #[tokio::test] async fn processor_inbox_emits_due_wakeup_as_control_message() { let (_control_tx, _pdata_tx, scheduler, mut inbox) = local_processor_inbox(4); @@ -929,6 +927,11 @@ mod tests { )); } + /// Scenario: a processor inbox has both pending pdata and a burst of due + /// processor-local wakeups. 
+ /// Guarantees: wakeups still count as ordinary control traffic for the + /// existing fairness policy, so pdata is eventually delivered instead of + /// starving behind an unbounded wakeup burst. #[tokio::test] async fn processor_inbox_wakeup_preserves_control_fairness() { let (_control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(64); @@ -1003,6 +1006,11 @@ mod tests { )); } + /// Scenario: shutdown has been latched and the processor-local scheduler + /// receives a new wakeup request while the inbox is draining buffered + /// messages. + /// Guarantees: new wakeup requests are rejected with + /// `WakeupError::ShuttingDown` once shutdown has been latched. #[tokio::test] async fn processor_inbox_rejects_wakeups_after_shutdown_latch() { let (control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(4); @@ -1038,6 +1046,11 @@ mod tests { ); } + /// Scenario: a processor-local wakeup is pending when shutdown is latched + /// and the inbox still has buffered pdata to drain. + /// Guarantees: pending wakeups are dropped immediately on shutdown latch, + /// buffered pdata still drains according to the inbox contract, and the + /// latched shutdown is delivered after draining completes. 
#[tokio::test] async fn processor_inbox_drops_pending_wakeups_on_shutdown_latch() { let (control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(4); From 5b05cff0a3bae558c2370bf9b273699ca3297e81 Mon Sep 17 00:00:00 2001 From: lquerel Date: Fri, 3 Apr 2026 18:35:49 -0700 Subject: [PATCH 16/18] Make processor wakeups opt-in --- .../src/processors/batch_processor/mod.rs | 7 +- .../durable_buffer_processor/mod.rs | 7 +- .../crates/engine/src/effect_handler.rs | 14 +-- rust/otap-dataflow/crates/engine/src/lib.rs | 1 + .../crates/engine/src/local/processor.rs | 33 ++++++- .../crates/engine/src/message.rs | 10 +- .../crates/engine/src/node_local_scheduler.rs | 2 + .../crates/engine/src/processor.rs | 92 ++++++++++++++----- .../crates/engine/src/shared/processor.rs | 9 ++ 9 files changed, 133 insertions(+), 42 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs index 41bc3c4aab..90cc303d63 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs @@ -36,7 +36,8 @@ use otap_df_config::node::NodeUserConfig; use otap_df_config::{SignalFormat, SignalType}; use otap_df_engine::MessageSourceLocalEffectHandlerExtension; use otap_df_engine::{ - ConsumerEffectHandlerExtension, Interests, ProducerEffectHandlerExtension, + ConsumerEffectHandlerExtension, Interests, ProcessorRuntimeCapabilities, + ProducerEffectHandlerExtension, config::ProcessorConfig, control::{AckMsg, CallData, NackMsg, NodeControlMsg, WakeupSlot}, error::{Error as EngineError, ProcessorErrorKind}, @@ -1117,6 +1118,10 @@ pub fn create_otap_batch_processor( #[async_trait(?Send)] impl local::Processor for BatchProcessor { + fn runtime_capabilities(&self) -> ProcessorRuntimeCapabilities { + ProcessorRuntimeCapabilities::LOCAL_WAKEUPS + } + async fn process( &mut self, msg: 
Message, diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index 0090f45ce7..d3771776c2 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -117,7 +117,8 @@ use otap_df_engine::message::Message; use otap_df_engine::node::NodeId; use otap_df_engine::processor::ProcessorWrapper; use otap_df_engine::{ - ConsumerEffectHandlerExtension, Interests, ProcessorFactory, ProducerEffectHandlerExtension, + ConsumerEffectHandlerExtension, Interests, ProcessorFactory, ProcessorRuntimeCapabilities, + ProducerEffectHandlerExtension, }; use otap_df_pdata::{OtapArrowRecords, OtapPayload}; use otap_df_telemetry::instrument::{Counter, Gauge, ObserveCounter}; @@ -1779,6 +1780,10 @@ impl DurableBuffer { #[async_trait(?Send)] impl otap_df_engine::local::processor::Processor for DurableBuffer { + fn runtime_capabilities(&self) -> ProcessorRuntimeCapabilities { + ProcessorRuntimeCapabilities::LOCAL_WAKEUPS + } + async fn process( &mut self, msg: Message, diff --git a/rust/otap-dataflow/crates/engine/src/effect_handler.rs b/rust/otap-dataflow/crates/engine/src/effect_handler.rs index 6e65ad60d6..5b714ceb15 100644 --- a/rust/otap-dataflow/crates/engine/src/effect_handler.rs +++ b/rust/otap-dataflow/crates/engine/src/effect_handler.rs @@ -420,9 +420,11 @@ impl EffectHandlerCore { /// /// # Errors /// - /// Returns [`WakeupError::ShuttingDown`] once processor shutdown has been - /// latched. Returns [`WakeupError::Capacity`] if the processor has reached - /// its configured live wakeup-slot capacity. + /// Returns [`WakeupError::Unsupported`] when the processor runtime did not + /// enable processor-local wakeups. Returns [`WakeupError::ShuttingDown`] + /// once processor shutdown has been latched. 
Returns + /// [`WakeupError::Capacity`] if the processor has reached its configured + /// live wakeup-slot capacity. pub fn set_wakeup( &self, slot: WakeupSlot, @@ -430,7 +432,7 @@ impl EffectHandlerCore { ) -> Result { self.local_scheduler .as_ref() - .expect("node-local scheduler not set for processor effect handler") + .ok_or(WakeupError::Unsupported)? .set_wakeup(slot, when) } @@ -443,8 +445,8 @@ impl EffectHandlerCore { pub fn cancel_wakeup(&self, slot: WakeupSlot) -> bool { self.local_scheduler .as_ref() - .expect("node-local scheduler not set for processor effect handler") - .cancel_wakeup(slot) + .map(|scheduler| scheduler.cancel_wakeup(slot)) + .unwrap_or(false) } /// Notifies the runtime control manager that this receiver has completed diff --git a/rust/otap-dataflow/crates/engine/src/lib.rs b/rust/otap-dataflow/crates/engine/src/lib.rs index 67c79b9cf3..1889cf38f9 100644 --- a/rust/otap-dataflow/crates/engine/src/lib.rs +++ b/rust/otap-dataflow/crates/engine/src/lib.rs @@ -80,6 +80,7 @@ pub mod testing; pub mod topic; pub mod wiring_contract; pub use node_local_scheduler::{WakeupError, WakeupSetOutcome}; +pub use processor::ProcessorRuntimeCapabilities; /// Trait for factory types that expose a name. 
/// diff --git a/rust/otap-dataflow/crates/engine/src/local/processor.rs b/rust/otap-dataflow/crates/engine/src/local/processor.rs index 7fef84bbf7..3423ce062b 100644 --- a/rust/otap-dataflow/crates/engine/src/local/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/local/processor.rs @@ -42,6 +42,7 @@ use crate::message::{Message, Sender}; use crate::node::NodeId; use crate::output_router::OutputRouter; use crate::process_duration::ComputeDuration; +use crate::processor::ProcessorRuntimeCapabilities; use crate::{WakeupError, WakeupSetOutcome}; use async_trait::async_trait; use otap_df_config::PortName; @@ -104,6 +105,14 @@ pub trait Processor { fn accept_pdata(&self) -> bool { true } + + /// Returns optional runtime features that this processor needs from the engine. + /// + /// Processors should only opt into capabilities they actually use so the + /// engine can avoid wiring unused runtime machinery onto the common path. + fn runtime_capabilities(&self) -> ProcessorRuntimeCapabilities { + ProcessorRuntimeCapabilities::empty() + } } /// A `!Send` implementation of the EffectHandler. 
@@ -339,12 +348,13 @@ mod tests { use crate::completion_emission_metrics::make_completion_emission_metrics; use crate::context::ControllerContext; use crate::control::{ - AckMsg, Frame, NackMsg, PipelineCompletionMsg, RouteData, pipeline_completion_msg_channel, + AckMsg, Frame, NackMsg, PipelineCompletionMsg, RouteData, WakeupSlot, + pipeline_completion_msg_channel, }; use crate::entity_context::NodeTelemetryHandle; use crate::local::message::LocalSender; use crate::testing::test_node; - use crate::{Interests, Unwindable}; + use crate::{Interests, Unwindable, WakeupError}; use otap_df_channel::error::SendError; use otap_df_channel::mpsc; use otap_df_config::{MetricLevel, node::NodeKind}; @@ -476,6 +486,25 @@ mod tests { ); } + /// Scenario: a processor effect handler has not been wired with the + /// processor-local wakeup runtime capability and attempts to schedule a + /// wakeup anyway. + /// Guarantees: the call fails with `WakeupError::Unsupported` instead of + /// panicking, so non-opting processors do not require the wakeup runtime + /// machinery to exist. 
+ #[test] + fn effect_handler_set_wakeup_without_runtime_support_returns_unsupported() { + let (_metrics_rx, metrics_reporter) = MetricsReporter::create_new_and_receiver(1); + let eh = + EffectHandler::::new(test_node("proc"), HashMap::new(), None, metrics_reporter); + + assert_eq!( + eh.set_wakeup(WakeupSlot(0), Instant::now()), + Err(WakeupError::Unsupported) + ); + assert!(!eh.cancel_wakeup(WakeupSlot(0))); + } + #[tokio::test] async fn effect_handler_send_message_ambiguous_without_default() { let (a_tx, a_rx) = channel::(10); diff --git a/rust/otap-dataflow/crates/engine/src/message.rs b/rust/otap-dataflow/crates/engine/src/message.rs index 00356bb7e0..9957fedabf 100644 --- a/rust/otap-dataflow/crates/engine/src/message.rs +++ b/rust/otap-dataflow/crates/engine/src/message.rs @@ -751,13 +751,9 @@ impl ProcessorInbox { node_id: usize, interests: Interests, ) -> Self { - Self::new_with_local_scheduler( - control_rx, - pdata_rx, - NodeLocalSchedulerHandle::new(32), - node_id, - interests, - ) + Self { + core: InboxCore::new(control_rx, pdata_rx, None, node_id, interests), + } } /// Creates a new processor inbox with an explicit processor-local scheduler. diff --git a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs index 019bdfa6a8..b93e0714fb 100644 --- a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs +++ b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs @@ -12,6 +12,8 @@ use tokio::sync::Notify; /// Error returned when a wakeup request cannot be accepted. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum WakeupError { + /// Processor-local wakeups were not enabled for this processor runtime. + Unsupported, /// The processor has already latched shutdown. ShuttingDown, /// The bounded live wakeup slot set is full. 
diff --git a/rust/otap-dataflow/crates/engine/src/processor.rs b/rust/otap-dataflow/crates/engine/src/processor.rs index 995b209488..c25405656a 100644 --- a/rust/otap-dataflow/crates/engine/src/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/processor.rs @@ -36,6 +36,15 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; +bitflags::bitflags! { +/// Optional runtime features that a processor can request from the engine. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub struct ProcessorRuntimeCapabilities: u8 { + /// Enable processor-local wakeup scheduling and delivery through `ProcessorInbox`. + const LOCAL_WAKEUPS = 1 << 0; +} +} + /// A wrapper for the processor that allows for both `Send` and `!Send` effect handlers. /// /// Note: This is useful for creating a single interface for the processor regardless of the effect @@ -335,20 +344,34 @@ impl ProcessorWrapper { source_tag, .. } => { - let local_scheduler = - NodeLocalSchedulerHandle::new(runtime_config.control_channel.capacity); - let inbox = ProcessorInbox::new_with_local_scheduler( - Receiver::Local(control_receiver), - pdata_receiver.ok_or_else(|| Error::ProcessorError { - processor: node_id.clone(), - kind: ProcessorErrorKind::Configuration, - error: "The pdata receiver must be defined at this stage".to_owned(), - source_detail: String::new(), - })?, - local_scheduler.clone(), - node_id.index, - node_interests, - ); + let runtime_capabilities = processor.runtime_capabilities(); + let pdata_receiver = pdata_receiver.ok_or_else(|| Error::ProcessorError { + processor: node_id.clone(), + kind: ProcessorErrorKind::Configuration, + error: "The pdata receiver must be defined at this stage".to_owned(), + source_detail: String::new(), + })?; + let maybe_local_scheduler = runtime_capabilities + .contains(ProcessorRuntimeCapabilities::LOCAL_WAKEUPS) + .then(|| { + NodeLocalSchedulerHandle::new(runtime_config.control_channel.capacity) + }); + let inbox = if let 
Some(local_scheduler) = maybe_local_scheduler.clone() { + ProcessorInbox::new_with_local_scheduler( + Receiver::Local(control_receiver), + pdata_receiver, + local_scheduler, + node_id.index, + node_interests, + ) + } else { + ProcessorInbox::new( + Receiver::Local(control_receiver), + pdata_receiver, + node_id.index, + node_interests, + ) + }; let default_port = user_config.default_output.clone(); let mut effect_handler = local::EffectHandler::new( node_id, @@ -357,7 +380,9 @@ impl ProcessorWrapper { metrics_reporter, ); effect_handler.set_source_tagging(source_tag); - effect_handler.core.set_local_scheduler(local_scheduler); + if let Some(local_scheduler) = maybe_local_scheduler { + effect_handler.core.set_local_scheduler(local_scheduler); + } Ok(ProcessorWrapperRuntime::Local { processor, effect_handler, @@ -375,20 +400,35 @@ impl ProcessorWrapper { source_tag, .. } => { - let local_scheduler = - NodeLocalSchedulerHandle::new(runtime_config.control_channel.capacity); - let inbox = ProcessorInbox::new_with_local_scheduler( - Receiver::Shared(control_receiver), + let runtime_capabilities = processor.runtime_capabilities(); + let pdata_receiver = Receiver::Shared(pdata_receiver.ok_or_else(|| Error::ProcessorError { processor: node_id.clone(), kind: ProcessorErrorKind::Configuration, error: "The pdata receiver must be defined at this stage".to_owned(), source_detail: String::new(), - })?), - local_scheduler.clone(), - node_id.index, - node_interests, - ); + })?); + let maybe_local_scheduler = runtime_capabilities + .contains(ProcessorRuntimeCapabilities::LOCAL_WAKEUPS) + .then(|| { + NodeLocalSchedulerHandle::new(runtime_config.control_channel.capacity) + }); + let inbox = if let Some(local_scheduler) = maybe_local_scheduler.clone() { + ProcessorInbox::new_with_local_scheduler( + Receiver::Shared(control_receiver), + pdata_receiver, + local_scheduler, + node_id.index, + node_interests, + ) + } else { + ProcessorInbox::new( + Receiver::Shared(control_receiver), + 
pdata_receiver, + node_id.index, + node_interests, + ) + }; let default_port = user_config.default_output.clone(); let mut effect_handler = shared::EffectHandler::new( node_id, @@ -397,7 +437,9 @@ impl ProcessorWrapper { metrics_reporter, ); effect_handler.set_source_tagging(source_tag); - effect_handler.core.set_local_scheduler(local_scheduler); + if let Some(local_scheduler) = maybe_local_scheduler { + effect_handler.core.set_local_scheduler(local_scheduler); + } Ok(ProcessorWrapperRuntime::Shared { processor, effect_handler, diff --git a/rust/otap-dataflow/crates/engine/src/shared/processor.rs b/rust/otap-dataflow/crates/engine/src/shared/processor.rs index a5647766d1..0b597655e9 100644 --- a/rust/otap-dataflow/crates/engine/src/shared/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/shared/processor.rs @@ -40,6 +40,7 @@ use crate::error::{Error, TypedError}; use crate::message::Message; use crate::node::NodeId; use crate::output_router::OutputRouter; +use crate::processor::ProcessorRuntimeCapabilities; use crate::shared::message::SharedSender; use crate::{WakeupError, WakeupSetOutcome}; use async_trait::async_trait; @@ -103,6 +104,14 @@ pub trait Processor { fn accept_pdata(&self) -> bool { true } + + /// Returns optional runtime features that this processor needs from the engine. + /// + /// Processors should only opt into capabilities they actually use so the + /// engine can avoid wiring unused runtime machinery onto the common path. + fn runtime_capabilities(&self) -> ProcessorRuntimeCapabilities { + ProcessorRuntimeCapabilities::empty() + } } /// A `Send` implementation of the EffectHandler. 
From ecb2ce5e5d5dce60ee4aaa4dff80a2ee99a1e0dd Mon Sep 17 00:00:00 2001 From: lquerel Date: Fri, 3 Apr 2026 18:53:03 -0700 Subject: [PATCH 17/18] Drain deferred retries during shutdown --- .../deferred_retry_state.rs | 42 +++++++ .../durable_buffer_processor/mod.rs | 103 +++++++++++++++++- 2 files changed, 143 insertions(+), 2 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs index f8ebe8d9eb..d1a0b61eb2 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/deferred_retry_state.rs @@ -296,6 +296,19 @@ impl DeferredRetryState { Some(retry) } + /// Drop all deferred retry gating at shutdown entry. + /// + /// Guarantees: + /// - no bundle remains blocked behind local retry backoff state + /// - the armed wakeup record is cleared + /// - previously deferred bundles can be drained through the normal Quiver + /// poll path during shutdown + pub(super) fn clear_for_shutdown(&mut self) { + self.deferred.clear(); + self.deferred_order.clear(); + self.armed_wakeup = None; + } + /// Re-arm the single durable-buffer wakeup after retry processing. /// /// `no_earlier_than` lets the caller push the next retry attempt out when @@ -509,4 +522,33 @@ mod tests { assert!(state.deferred.is_empty()); assert!(state.deferred_order.is_empty()); } + + /// Scenario: durable buffer starts shutdown while it still has deferred + /// retries tracked locally behind its single retry wakeup. + /// Guarantees: shutdown clearing removes all local retry gating and the + /// armed wakeup record so those bundles can be drained through the normal + /// poll path. 
+ #[test] + fn clear_for_shutdown_drops_deferred_tracking() { + let mut state = DeferredRetryState::new(); + let retry_at = Instant::now() + Duration::from_secs(1); + state.insert_deferred( + BundleRef { + segment_seq: SegmentSeq::new(7), + bundle_index: BundleIndex::new(1), + }, + 1, + retry_at, + ); + state.armed_wakeup = Some(ArmedRetryWakeup { + when: retry_at, + revision: 9, + }); + + state.clear_for_shutdown(); + + assert!(state.deferred.is_empty()); + assert!(state.deferred_order.is_empty()); + assert!(state.armed_wakeup.is_none()); + } } diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index d3771776c2..448d38a566 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -1674,8 +1674,9 @@ impl DurableBuffer { /// /// The shutdown sequence is: /// 1. Flush to finalize any open segment (makes data visible to subscribers) - /// 2. Drain remaining bundles to downstream (best-effort, respects deadline) - /// 3. Engine shutdown (always attempted - also finalizes open segment if flush was skipped) + /// 2. Clear deferred-retry gating so parked retry bundles become drainable + /// 3. Drain remaining bundles to downstream (best-effort, respects deadline) + /// 4. Engine shutdown (always attempted - also finalizes open segment if flush was skipped) /// /// Note: Quiver's `shutdown()` internally calls `finalize_current_segment()`, so even /// if we skip the explicit flush due to deadline pressure, the engine shutdown will @@ -1696,6 +1697,12 @@ impl DurableBuffer { return Ok(()); } + // Shutdown is terminal for this processor instance, so retry backoff no + // longer matters. 
Clear local deferred-retry gating up front so bundles + // that were parked behind backoff become drainable through the normal + // Quiver poll loop below. + self.deferred_retry_state.clear_for_shutdown(); + // Check deadline before flush/drain sequence if Instant::now() >= deadline { otel_warn!("durable_buffer.shutdown.deadline_exceeded"); @@ -2248,6 +2255,98 @@ mod tests { .validate(|_| async {}); } + /// Scenario: a bundle is transiently NACKed, becomes deferred behind the + /// durable-buffer retry wakeup, and shutdown starts before that wakeup + /// fires. + /// Guarantees: shutdown clears deferred-retry gating so the existing drain + /// loop can forward that parked bundle instead of leaving it restart-dependent. + #[test] + fn test_shutdown_drains_deferred_retry_bundle() { + use otap_df_config::node::NodeUserConfig; + use otap_df_engine::config::ProcessorConfig; + use otap_df_engine::context::ControllerContext; + use otap_df_engine::control::pipeline_completion_msg_channel; + use otap_df_engine::message::Message; + use otap_df_engine::testing::processor::TestRuntime; + use otap_df_engine::testing::test_node; + use otap_df_otap::testing::next_nack; + use otap_df_pdata::encode::encode_logs_otap_batch; + use otap_df_pdata::testing::fixtures::DataGenerator; + use serde_json::json; + + let rt = TestRuntime::new(); + let controller = ControllerContext::new(rt.metrics_registry()); + let pipeline_ctx = controller.pipeline_context_with("grp".into(), "pipe".into(), 0, 1, 0); + let temp_dir = tempfile::tempdir().expect("tempdir"); + + let mut node_config = NodeUserConfig::new_processor_config(DURABLE_BUFFER_URN); + node_config.config = json!({ + "path": temp_dir.path(), + "retention_size_cap": "256 MiB", + "poll_interval": "100ms", + "max_segment_open_duration": "1s", + "initial_retry_interval": "10s", + "max_retry_interval": "10s", + "retry_multiplier": 2.0, + "max_in_flight": 1000 + }); + + let processor = create_durable_buffer( + pipeline_ctx, + 
test_node("durable-buffer-shutdown-drain-deferred"), + Arc::new(node_config), + &ProcessorConfig::new("durable-buffer-shutdown-drain-deferred"), + ) + .expect("create durable buffer"); + + rt.set_processor(processor) + .run_test(move |mut ctx| async move { + let (pipeline_completion_tx, _pipeline_completion_rx) = + pipeline_completion_msg_channel(10); + ctx.set_pipeline_completion_sender(pipeline_completion_tx); + + let mut datagen = DataGenerator::new(1); + let input = datagen.generate_logs(); + let rec = encode_logs_otap_batch(&input).expect("encode logs"); + ctx.process(Message::PData(OtapPdata::new_default(rec.into()))) + .await + .expect("process input"); + + ctx.process(Message::Control(NodeControlMsg::TimerTick {})) + .await + .expect("process timer tick"); + let mut outputs = ctx.drain_pdata().await; + assert_eq!(outputs.len(), 1, "timer tick should emit one bundle"); + + let sent = outputs.pop().expect("sent bundle"); + let (_, nack) = + next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); + ctx.process(Message::Control(NodeControlMsg::Nack(nack))) + .await + .expect("process nack"); + assert!( + ctx.drain_pdata().await.is_empty(), + "nack should defer delivery until either wakeup or shutdown drain" + ); + + ctx.process(Message::Control(NodeControlMsg::Shutdown { + deadline: Instant::now() + Duration::from_secs(1), + reason: "shutdown".to_owned(), + })) + .await + .expect("process shutdown"); + + let drained = ctx.drain_pdata().await; + assert_eq!( + drained.len(), + 1, + "shutdown drain should forward the deferred retry bundle" + ); + assert_eq!(drained[0].signal_type(), SignalType::Logs); + }) + .validate(|_| async {}); + } + #[test] fn test_backoff_calculation() { use otap_df_engine::context::ControllerContext; From 6cb88edbe84713db7cc1b43cbecf586803d7d374 Mon Sep 17 00:00:00 2001 From: lquerel Date: Mon, 6 Apr 2026 17:17:32 -0700 Subject: [PATCH 18/18] Make wakeups requirement-driven and harden batch ack routing --- 
.../src/processors/batch_processor/mod.rs | 274 +++++++++++++----- .../durable_buffer_processor/mod.rs | 10 +- rust/otap-dataflow/crates/engine/src/lib.rs | 6 +- .../crates/engine/src/local/processor.rs | 13 +- .../crates/engine/src/processor.rs | 159 ++++++++-- .../crates/engine/src/shared/processor.rs | 13 +- 6 files changed, 360 insertions(+), 115 deletions(-) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs index 90cc303d63..4e2c5d3b48 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs @@ -36,8 +36,8 @@ use otap_df_config::node::NodeUserConfig; use otap_df_config::{SignalFormat, SignalType}; use otap_df_engine::MessageSourceLocalEffectHandlerExtension; use otap_df_engine::{ - ConsumerEffectHandlerExtension, Interests, ProcessorRuntimeCapabilities, - ProducerEffectHandlerExtension, + ConsumerEffectHandlerExtension, Interests, LocalWakeupRequirements, + ProcessorRuntimeRequirements, ProducerEffectHandlerExtension, config::ProcessorConfig, control::{AckMsg, CallData, NackMsg, NodeControlMsg, WakeupSlot}, error::{Error as EngineError, ProcessorErrorKind}, @@ -474,6 +474,12 @@ where metrics: &'a mut MetricSet, } +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum ActiveBatchProcessorFormatKind { + Otap, + Otlp, +} + /// There are three reasons to flush. 
#[derive(Copy, Clone, Debug, PartialEq, Eq)] enum FlushReason { @@ -567,6 +573,20 @@ async fn log_batching_failed( } impl BatchProcessor { + fn no_active_format_error() -> EngineError { + EngineError::InternalError { + message: "batch processor has no active format state".to_owned(), + } + } + + const fn local_wakeup_requirements(&self) -> LocalWakeupRequirements { + let live_slots = match self.config.format { + BatchingFormat::Otap | BatchingFormat::Otlp => 3, + BatchingFormat::Preserve => 6, + }; + LocalWakeupRequirements::new(live_slots) + } + /// Parse JSON config and build the processor instance with the provided metrics set. /// This function does not wrap the processor into a ProcessorWrapper so callers can /// preserve the original NodeUserConfig (including outputs/default_output). @@ -660,6 +680,27 @@ impl BatchProcessor { }) } + fn format_for_signal_format( + &self, + signal_format: SignalFormat, + ) -> Option { + match signal_format { + SignalFormat::OtapRecords if self.otap_signals.is_some() => { + Some(ActiveBatchProcessorFormatKind::Otap) + } + SignalFormat::OtapRecords if self.otlp_signals.is_some() => { + Some(ActiveBatchProcessorFormatKind::Otlp) + } + SignalFormat::OtlpBytes if self.otlp_signals.is_some() => { + Some(ActiveBatchProcessorFormatKind::Otlp) + } + SignalFormat::OtlpBytes if self.otap_signals.is_some() => { + Some(ActiveBatchProcessorFormatKind::Otap) + } + _ => None, + } + } + /// Process one incoming batch. Immediately acks empty requests. /// If this input causes pending data to exceed the lower bound, it will /// flush at least one output. @@ -698,33 +739,33 @@ impl BatchProcessor { match payload { OtapPayload::OtapArrowRecords(otap) => { - if self.otap_signals.is_some() { - self.otap_format() - .expect("some") + if let Some(mut otap_format) = self.otap_format() { + otap_format .for_signal(signal) .accept_payload(effect, ctx, otap, items) .await? 
- } else { - self.otlp_format() - .expect("some") + } else if let Some(mut otlp_format) = self.otlp_format() { + otlp_format .for_signal(signal) .accept_payload(effect, ctx, otap.try_into()?, items) .await? + } else { + return Err(Self::no_active_format_error()); } } OtapPayload::OtlpBytes(otlp) => { - if self.otlp_signals.is_some() { - self.otlp_format() - .expect("some") + if let Some(mut otlp_format) = self.otlp_format() { + otlp_format .for_signal(signal) .accept_payload(effect, ctx, otlp, items) .await? - } else { - self.otap_format() - .expect("some") + } else if let Some(mut otap_format) = self.otap_format() { + otap_format .for_signal(signal) .accept_payload(effect, ctx, otlp.try_into()?, items) .await? + } else { + return Err(Self::no_active_format_error()); } } }; @@ -1080,21 +1121,22 @@ impl BatchProcessor { } let signal = retdata.signal_type(); - match retdata.signal_format() { - SignalFormat::OtapRecords => { + match self.format_for_signal_format(retdata.signal_format()) { + Some(ActiveBatchProcessorFormatKind::Otap) => { self.otap_format() - .expect("some") + .expect("otap batch state must exist when otap format kind is selected") .for_signal(signal) .handle(signal, calldata, effect, res) .await } - SignalFormat::OtlpBytes => { + Some(ActiveBatchProcessorFormatKind::Otlp) => { self.otlp_format() - .expect("some") + .expect("otlp batch state must exist when otlp format kind is selected") .for_signal(signal) .handle(signal, calldata, effect, res) .await } + None => Err(Self::no_active_format_error()), } } } @@ -1118,8 +1160,10 @@ pub fn create_otap_batch_processor( #[async_trait(?Send)] impl local::Processor for BatchProcessor { - fn runtime_capabilities(&self) -> ProcessorRuntimeCapabilities { - ProcessorRuntimeCapabilities::LOCAL_WAKEUPS + fn runtime_requirements(&self) -> ProcessorRuntimeRequirements { + ProcessorRuntimeRequirements { + local_wakeups: Some(self.local_wakeup_requirements()), + } } async fn process( @@ -1128,72 +1172,77 @@ impl 
local::Processor for BatchProcessor { effect: &mut local::EffectHandler, ) -> Result<(), EngineError> { match msg { - Message::Control(ctrl) => match ctrl { - NodeControlMsg::Config { .. } => Ok(()), - NodeControlMsg::Shutdown { .. } => { - self.flush_shutdown(effect).await?; - Ok(()) - } - NodeControlMsg::CollectTelemetry { - mut metrics_reporter, - } => metrics_reporter.report(&mut self.metrics).map_err(|e| { - EngineError::InternalError { - message: e.to_string(), + Message::Control(ctrl) => { + match ctrl { + NodeControlMsg::Config { .. } => Ok(()), + NodeControlMsg::Shutdown { .. } => { + self.flush_shutdown(effect).await?; + Ok(()) } - }), - NodeControlMsg::Wakeup { slot, when, .. } => { - let Some((format, signal)) = signal_from_wakeup_slot(slot) else { - return Ok(()); - }; - - match format { - SignalFormat::OtapRecords => { - if let Some(mut otap_format) = self.otap_format() { - otap_format - .for_signal(signal) - .flush_signal_impl(effect, when, FlushReason::Timer) - .await?; - } + NodeControlMsg::CollectTelemetry { + mut metrics_reporter, + } => metrics_reporter.report(&mut self.metrics).map_err(|e| { + EngineError::InternalError { + message: e.to_string(), } - SignalFormat::OtlpBytes => { - if let Some(mut otlp_format) = self.otlp_format() { - otlp_format - .for_signal(signal) - .flush_signal_impl(effect, when, FlushReason::Timer) - .await?; + }), + NodeControlMsg::Wakeup { slot, when, .. 
} => { + let Some((format, signal)) = signal_from_wakeup_slot(slot) else { + return Ok(()); + }; + + match format { + SignalFormat::OtapRecords => { + if let Some(mut otap_format) = self.otap_format() { + otap_format + .for_signal(signal) + .flush_signal_impl(effect, when, FlushReason::Timer) + .await?; + } } - } - }; - - Ok(()) - } - NodeControlMsg::DelayedData { data, when } => { - let signal = data.signal_type(); + SignalFormat::OtlpBytes => { + if let Some(mut otlp_format) = self.otlp_format() { + otlp_format + .for_signal(signal) + .flush_signal_impl(effect, when, FlushReason::Timer) + .await?; + } + } + }; - match data.signal_format() { - SignalFormat::OtapRecords => { - self.otap_format() - .expect("some") + Ok(()) + } + NodeControlMsg::DelayedData { data, when } => { + let signal = data.signal_type(); + + match self.format_for_signal_format(data.signal_format()) { + Some(ActiveBatchProcessorFormatKind::Otap) => self + .otap_format() + .expect( + "otap batch state must exist when otap format kind is selected", + ) .for_signal(signal) .flush_signal_impl(effect, when, FlushReason::Timer) - .await? - } - SignalFormat::OtlpBytes => { - self.otlp_format() - .expect("some") + .await?, + Some(ActiveBatchProcessorFormatKind::Otlp) => self + .otlp_format() + .expect( + "otlp batch state must exist when otlp format kind is selected", + ) .for_signal(signal) .flush_signal_impl(effect, when, FlushReason::Timer) - .await? - } - }; + .await?, + None => return Err(Self::no_active_format_error()), + }; - Ok(()) + Ok(()) + } + NodeControlMsg::Ack(ack) => self.handle_ack(effect, ack).await, + NodeControlMsg::Nack(nack) => self.handle_nack(effect, nack).await, + NodeControlMsg::DrainIngress { .. } => Ok(()), + NodeControlMsg::TimerTick { .. } => unreachable!(), } - NodeControlMsg::Ack(ack) => self.handle_ack(effect, ack).await, - NodeControlMsg::Nack(nack) => self.handle_nack(effect, nack).await, - NodeControlMsg::DrainIngress { .. } => Ok(()), - NodeControlMsg::TimerTick { .. 
} => unreachable!(), - }, + } Message::PData(request) => self.process_signal_impl(effect, request).await, } } @@ -2242,6 +2291,77 @@ mod tests { }); } + /// Scenario: the batch processor runs in forced OTAP mode, has live + /// outbound completion state in its OTAP batch bookkeeping, and then + /// receives a downstream Ack whose returned payload format is OTLP bytes. + /// Guarantees: response handling falls back to the active OTAP batch state, + /// releases the outbound slot, and delivers the upstream Ack without + /// panicking on the returned payload format. + #[test] + fn test_ack_response_format_falls_back_to_active_batch_state() { + let (_telemetry_registry, _metrics_reporter, phase) = setup_test_runtime(json!({ + "format": "otap", + "otap": { + "min_size": 1, + "max_size": 10, + "sizer": "items", + }, + "max_batch_duration": "1s" + })); + + phase + .run_test(move |mut ctx| async move { + let (pipeline_completion_tx, mut pipeline_completion_rx) = + pipeline_completion_msg_channel(10); + ctx.set_pipeline_completion_sender(pipeline_completion_tx); + + let mut datagen = DataGenerator::new(1); + let input: OtlpProtoMessage = datagen.generate_logs().into(); + let input_bytes = otlp_message_to_bytes(&input); + + let pdata = OtapPdata::new_default(input_bytes.clone().into()).test_subscribe_to( + Interests::ACKS | Interests::NACKS, + TestCallData::default().into(), + 23, + ); + + ctx.process(Message::PData(pdata)) + .await + .expect("process input"); + + let mut outputs = ctx.drain_pdata().await; + assert_eq!(outputs.len(), 1, "size flush should emit one batch"); + + let output = outputs.remove(0); + let (output_ctx, _output_payload) = output.into_parts(); + let returned = OtapPdata::new(output_ctx, input_bytes.into()); + + let (_, ack) = + next_ack(AckMsg::new(returned)).expect("expected outbound ack subscriber"); + ctx.process(Message::Control(NodeControlMsg::Ack(ack))) + .await + .expect("process ack"); + + match next_completion( + &mut pipeline_completion_rx, + 
Duration::from_secs(1), + "batch processor upstream completion after format fallback ack", + ) + .await + { + PipelineCompletionMsg::DeliverAck { ack } => { + let (node_id, ack) = next_ack(ack).expect("expected ack subscriber"); + assert_eq!(node_id, 23); + let calldata: TestCallData = + ack.unwind.route.calldata.try_into().expect("calldata"); + assert_eq!(TestCallData::default(), calldata); + } + other => panic!("expected upstream ack after format fallback, got {other:?}"), + } + }) + .validate(|_| async move {}); + } + // A partial batch that never reached the size threshold must still flush on // Shutdown, and its downstream Ack must release the upstream completion state // rather than leaving correlated requests stuck. diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index 448d38a566..19ce01c2e7 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -117,8 +117,8 @@ use otap_df_engine::message::Message; use otap_df_engine::node::NodeId; use otap_df_engine::processor::ProcessorWrapper; use otap_df_engine::{ - ConsumerEffectHandlerExtension, Interests, ProcessorFactory, ProcessorRuntimeCapabilities, - ProducerEffectHandlerExtension, + ConsumerEffectHandlerExtension, Interests, LocalWakeupRequirements, ProcessorFactory, + ProcessorRuntimeRequirements, ProducerEffectHandlerExtension, }; use otap_df_pdata::{OtapArrowRecords, OtapPayload}; use otap_df_telemetry::instrument::{Counter, Gauge, ObserveCounter}; @@ -1787,8 +1787,10 @@ impl DurableBuffer { #[async_trait(?Send)] impl otap_df_engine::local::processor::Processor for DurableBuffer { - fn runtime_capabilities(&self) -> ProcessorRuntimeCapabilities { - ProcessorRuntimeCapabilities::LOCAL_WAKEUPS + fn runtime_requirements(&self) -> 
ProcessorRuntimeRequirements { + ProcessorRuntimeRequirements { + local_wakeups: Some(LocalWakeupRequirements::new(1)), + } } async fn process( diff --git a/rust/otap-dataflow/crates/engine/src/lib.rs b/rust/otap-dataflow/crates/engine/src/lib.rs index 1889cf38f9..68f5fb9aad 100644 --- a/rust/otap-dataflow/crates/engine/src/lib.rs +++ b/rust/otap-dataflow/crates/engine/src/lib.rs @@ -18,7 +18,7 @@ use crate::{ local::message::{LocalReceiver, LocalSender}, message::{Receiver, Sender}, node::{Node, NodeDefs, NodeId, NodeName, NodeType}, - processor::ProcessorWrapper, + processor::{ProcessorWrapper, validate_local_wakeup_requirements}, receiver::ReceiverWrapper, runtime_pipeline::{PipeNode, RuntimePipeline}, shared::message::{SharedReceiver, SharedSender}, @@ -80,7 +80,7 @@ pub mod testing; pub mod topic; pub mod wiring_contract; pub use node_local_scheduler::{WakeupError, WakeupSetOutcome}; -pub use processor::ProcessorRuntimeCapabilities; +pub use processor::{LocalWakeupRequirements, ProcessorRuntimeRequirements}; /// Trait for factory types that expose a name. 
/// @@ -1455,6 +1455,8 @@ impl PipelineFactory { ) .map_err(|e| Error::ConfigError(Box::new(e)))?; + validate_local_wakeup_requirements(&node_id, processor.runtime_requirements())?; + otel_debug!( "processor.create.complete", pipeline_group_id = pipeline_group_id.as_ref(), diff --git a/rust/otap-dataflow/crates/engine/src/local/processor.rs b/rust/otap-dataflow/crates/engine/src/local/processor.rs index 3423ce062b..a1dc10233c 100644 --- a/rust/otap-dataflow/crates/engine/src/local/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/local/processor.rs @@ -42,7 +42,7 @@ use crate::message::{Message, Sender}; use crate::node::NodeId; use crate::output_router::OutputRouter; use crate::process_duration::ComputeDuration; -use crate::processor::ProcessorRuntimeCapabilities; +use crate::processor::ProcessorRuntimeRequirements; use crate::{WakeupError, WakeupSetOutcome}; use async_trait::async_trait; use otap_df_config::PortName; @@ -106,12 +106,13 @@ pub trait Processor { true } - /// Returns optional runtime features that this processor needs from the engine. + /// Returns optional runtime services that this processor needs from the engine. /// - /// Processors should only opt into capabilities they actually use so the - /// engine can avoid wiring unused runtime machinery onto the common path. - fn runtime_capabilities(&self) -> ProcessorRuntimeCapabilities { - ProcessorRuntimeCapabilities::empty() + /// This is the single source of truth for runtime wiring. For example, + /// `local_wakeups: Some(...)` both enables processor-local wakeups and + /// declares the live slot count the engine must provision. 
+    fn runtime_requirements(&self) -> ProcessorRuntimeRequirements {
+        ProcessorRuntimeRequirements::none()
+    }
 }
diff --git a/rust/otap-dataflow/crates/engine/src/processor.rs b/rust/otap-dataflow/crates/engine/src/processor.rs
index c25405656a..55d514765e 100644
--- a/rust/otap-dataflow/crates/engine/src/processor.rs
+++ b/rust/otap-dataflow/crates/engine/src/processor.rs
@@ -36,13 +36,53 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::Duration;
 
-bitflags::bitflags! {
-/// Optional runtime features that a processor can request from the engine.
+/// Processor-local wakeup requirements declared by a processor implementation.
+///
+/// `live_slots` is the maximum number of distinct wakeup slots that can be
+/// live at the same time for one processor instance.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct LocalWakeupRequirements {
+    /// Maximum number of concurrently live wakeup slots.
+    pub live_slots: usize,
+}
+
+impl LocalWakeupRequirements {
+    /// Create local wakeup requirements for a processor.
+    #[must_use]
+    pub const fn new(live_slots: usize) -> Self {
+        Self { live_slots }
+    }
+}
+
+/// Optional runtime services requested by a processor implementation.
+///
+/// This is the single source of truth for processor runtime wiring. For
+/// example, `local_wakeups: Some(...)` both enables processor-local wakeups and
+/// declares the live slot count that the runtime must provision.
 #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
-pub struct ProcessorRuntimeCapabilities: u8 {
-    /// Enable processor-local wakeup scheduling and delivery through `ProcessorInbox`.
-    const LOCAL_WAKEUPS = 1 << 0;
+pub struct ProcessorRuntimeRequirements {
+    /// Processor-local wakeup requirements, if the processor uses the local
+    /// wakeup API.
+    pub local_wakeups: Option<LocalWakeupRequirements>,
 }
+
+impl ProcessorRuntimeRequirements {
+    /// Runtime requirements for a processor that does not need any optional
+    /// engine services.
+ #[must_use] + pub const fn none() -> Self { + Self { + local_wakeups: None, + } + } + + /// Runtime requirements for a processor that uses local wakeups. + #[must_use] + pub const fn with_local_wakeups(live_slots: usize) -> Self { + Self { + local_wakeups: Some(LocalWakeupRequirements::new(live_slots)), + } + } } /// A wrapper for the processor that allows for both `Send` and `!Send` effect handlers. @@ -243,6 +283,13 @@ impl ProcessorWrapper { } } + pub(crate) fn runtime_requirements(&self) -> ProcessorRuntimeRequirements { + match self { + ProcessorWrapper::Local { processor, .. } => processor.runtime_requirements(), + ProcessorWrapper::Shared { processor, .. } => processor.runtime_requirements(), + } + } + pub(crate) fn with_control_channel_metrics( self, pipeline_ctx: &PipelineContext, @@ -335,7 +382,7 @@ impl ProcessorWrapper { match self { ProcessorWrapper::Local { node_id, - runtime_config, + runtime_config: _, processor, control_receiver, pdata_senders, @@ -344,18 +391,17 @@ impl ProcessorWrapper { source_tag, .. 
} => { - let runtime_capabilities = processor.runtime_capabilities(); + let runtime_requirements = processor.runtime_requirements(); let pdata_receiver = pdata_receiver.ok_or_else(|| Error::ProcessorError { processor: node_id.clone(), kind: ProcessorErrorKind::Configuration, error: "The pdata receiver must be defined at this stage".to_owned(), source_detail: String::new(), })?; - let maybe_local_scheduler = runtime_capabilities - .contains(ProcessorRuntimeCapabilities::LOCAL_WAKEUPS) - .then(|| { - NodeLocalSchedulerHandle::new(runtime_config.control_channel.capacity) - }); + validate_local_wakeup_requirements(&node_id, runtime_requirements)?; + let maybe_local_scheduler = runtime_requirements + .local_wakeups + .map(|requirements| NodeLocalSchedulerHandle::new(requirements.live_slots)); let inbox = if let Some(local_scheduler) = maybe_local_scheduler.clone() { ProcessorInbox::new_with_local_scheduler( Receiver::Local(control_receiver), @@ -391,7 +437,7 @@ impl ProcessorWrapper { } ProcessorWrapper::Shared { node_id, - runtime_config, + runtime_config: _, processor, control_receiver, pdata_senders, @@ -400,7 +446,7 @@ impl ProcessorWrapper { source_tag, .. 
} => { - let runtime_capabilities = processor.runtime_capabilities(); + let runtime_requirements = processor.runtime_requirements(); let pdata_receiver = Receiver::Shared(pdata_receiver.ok_or_else(|| Error::ProcessorError { processor: node_id.clone(), @@ -408,11 +454,10 @@ impl ProcessorWrapper { error: "The pdata receiver must be defined at this stage".to_owned(), source_detail: String::new(), })?); - let maybe_local_scheduler = runtime_capabilities - .contains(ProcessorRuntimeCapabilities::LOCAL_WAKEUPS) - .then(|| { - NodeLocalSchedulerHandle::new(runtime_config.control_channel.capacity) - }); + validate_local_wakeup_requirements(&node_id, runtime_requirements)?; + let maybe_local_scheduler = runtime_requirements + .local_wakeups + .map(|requirements| NodeLocalSchedulerHandle::new(requirements.live_slots)); let inbox = if let Some(local_scheduler) = maybe_local_scheduler.clone() { ProcessorInbox::new_with_local_scheduler( Receiver::Shared(control_receiver), @@ -612,6 +657,27 @@ impl Node for ProcessorWrapper { } } +pub(crate) fn validate_local_wakeup_requirements( + node_id: &NodeId, + requirements: ProcessorRuntimeRequirements, +) -> Result<(), Error> { + let Some(local_wakeups) = requirements.local_wakeups else { + return Ok(()); + }; + + if local_wakeups.live_slots == 0 { + return Err(Error::ProcessorError { + processor: node_id.clone(), + kind: ProcessorErrorKind::Configuration, + error: "processor-local wakeup requirement must declare at least one live slot" + .to_owned(), + source_detail: String::new(), + }); + } + + Ok(()) +} + #[async_trait::async_trait(?Send)] impl Controllable for ProcessorWrapper { /// Returns the control message sender for the processor. 
@@ -688,7 +754,9 @@ mod tests { use crate::control::NodeControlMsg::{Config, Shutdown, TimerTick}; use crate::local::processor as local; use crate::message::Message; - use crate::processor::{Error, ProcessorWrapper}; + use crate::processor::{ + Error, ProcessorRuntimeRequirements, ProcessorWrapper, validate_local_wakeup_requirements, + }; use crate::shared::processor as shared; use crate::testing::processor::TestRuntime; use crate::testing::processor::{TestContext, ValidateContext}; @@ -860,4 +928,55 @@ mod tests { .run_test(scenario()) .validate(validation_procedure()); } + + /// Scenario: a processor does not request any processor-local wakeup + /// service from the runtime. + /// Guarantees: validation succeeds without requiring any local wakeup + /// capacity, so processors that do not use wakeups do not pay configuration + /// or startup costs for that service. + #[test] + fn validate_local_wakeup_requirements_accepts_processors_without_wakeups() { + assert!( + validate_local_wakeup_requirements( + &test_node("test_processor"), + ProcessorRuntimeRequirements::none(), + ) + .is_ok() + ); + } + + /// Scenario: a processor declares local wakeups but reports an invalid live + /// slot requirement of zero. + /// Guarantees: validation rejects the configuration before startup, so the + /// runtime never provisions an unusable local wakeup service. + #[test] + fn validate_local_wakeup_requirements_rejects_zero_live_slots() { + let err = validate_local_wakeup_requirements( + &test_node("test_processor"), + ProcessorRuntimeRequirements::with_local_wakeups(0), + ) + .expect_err("zero live slots must be rejected"); + + let Error::ProcessorError { error, .. } = err else { + panic!("expected processor configuration error"); + }; + assert_eq!( + error, + "processor-local wakeup requirement must declare at least one live slot" + ); + } + + /// Scenario: a processor declares a positive local wakeup live slot count. 
+ /// Guarantees: validation succeeds so the declared slot count can act as + /// the single source of truth for local wakeup runtime provisioning. + #[test] + fn validate_local_wakeup_requirements_accepts_positive_live_slots() { + assert!( + validate_local_wakeup_requirements( + &test_node("test_processor"), + ProcessorRuntimeRequirements::with_local_wakeups(6), + ) + .is_ok() + ); + } } diff --git a/rust/otap-dataflow/crates/engine/src/shared/processor.rs b/rust/otap-dataflow/crates/engine/src/shared/processor.rs index 0b597655e9..ae3a873aab 100644 --- a/rust/otap-dataflow/crates/engine/src/shared/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/shared/processor.rs @@ -40,7 +40,7 @@ use crate::error::{Error, TypedError}; use crate::message::Message; use crate::node::NodeId; use crate::output_router::OutputRouter; -use crate::processor::ProcessorRuntimeCapabilities; +use crate::processor::ProcessorRuntimeRequirements; use crate::shared::message::SharedSender; use crate::{WakeupError, WakeupSetOutcome}; use async_trait::async_trait; @@ -105,12 +105,13 @@ pub trait Processor { true } - /// Returns optional runtime features that this processor needs from the engine. + /// Returns optional runtime services that this processor needs from the engine. /// - /// Processors should only opt into capabilities they actually use so the - /// engine can avoid wiring unused runtime machinery onto the common path. - fn runtime_capabilities(&self) -> ProcessorRuntimeCapabilities { - ProcessorRuntimeCapabilities::empty() + /// This is the single source of truth for runtime wiring. For example, + /// `local_wakeups: Some(...)` both enables processor-local wakeups and + /// declares the live slot count the engine must provision. + fn runtime_requirements(&self) -> ProcessorRuntimeRequirements { + ProcessorRuntimeRequirements::none() } }