diff --git a/rust/otap-dataflow/Cargo.toml b/rust/otap-dataflow/Cargo.toml index ff80d4c619..80dbf59f14 100644 --- a/rust/otap-dataflow/Cargo.toml +++ b/rust/otap-dataflow/Cargo.toml @@ -50,6 +50,7 @@ otap-df-pdata-otlp-macros = { path = "./crates/pdata/src/otlp/macros"} otap-df-pdata-otlp-model = { path = "./crates/pdata/src/otlp/model"} otap-df-config = { path = "crates/config" } otap-df-contrib-nodes = { path = "crates/contrib-nodes" } +otap-df-control-channel = { path = "crates/control-channel" } otap-df-controller = { path = "crates/controller" } otap-df-core-nodes = { path = "crates/core-nodes" } otap-df-engine = { path = "crates/engine" } diff --git a/rust/otap-dataflow/benchmarks/Cargo.toml b/rust/otap-dataflow/benchmarks/Cargo.toml index a9c3d8c0cb..8e0016c38d 100644 --- a/rust/otap-dataflow/benchmarks/Cargo.toml +++ b/rust/otap-dataflow/benchmarks/Cargo.toml @@ -21,6 +21,7 @@ tonic = { workspace = true } tonic-prost = { workspace = true } prost = { workspace = true } +otap-df-control-channel = { workspace = true } otap-df-config = { workspace = true } otap-df-channel = { workspace = true } otap-df-engine = { workspace = true } @@ -56,6 +57,10 @@ workspace = true name = "channel" harness = false +[[bench]] +name = "control_channel" +harness = false + [[bench]] name = "exporter" harness = false diff --git a/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs b/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs new file mode 100644 index 0000000000..9f800f52ad --- /dev/null +++ b/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs @@ -0,0 +1,418 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Compare the current engine control-channel path against the standalone +//! control-aware channel under heavy Ack/Nack traffic. 
+
+#![allow(missing_docs)]
+
+use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use otap_df_channel::mpsc;
+use otap_df_control_channel::{
+    AckMsg as ControlAwareAckMsg, CompletionMsg as ControlAwareCompletionMsg, ControlChannelConfig,
+    ControlCmd, NackMsg as ControlAwareNackMsg, NodeControlEvent, NodeControlReceiver,
+    NodeControlSender, node_channel,
+};
+use otap_df_engine::control::{AckMsg, NackMsg, NodeControlMsg};
+use otap_df_engine::local::message::{LocalReceiver as CurrentLocalReceiver, LocalSender};
+use otap_df_engine::shared::message::{
+    SharedReceiver as CurrentSharedReceiver, SharedSender as CurrentSharedSender,
+};
+use otap_df_telemetry::reporter::MetricsReporter;
+use std::future::{Future, poll_fn};
+use std::hint::black_box;
+use std::pin::Pin;
+use std::task::Poll;
+use tokio::runtime::Builder;
+use tokio::task::LocalSet;
+
+#[cfg(not(windows))]
+use tikv_jemallocator::Jemalloc;
+
+#[cfg(not(windows))]
+#[global_allocator]
+static GLOBAL: Jemalloc = Jemalloc;
+
+/// Number of Ack/Nack completions produced per workload iteration.
+const COMPLETION_COUNT: usize = 100_000;
+/// Capacity given to every channel flavor built by the workload helpers.
+const CHANNEL_CAPACITY: usize = 1_024;
+/// Used for both the batch size and the burst limit of the control-aware channel.
+const COMPLETION_BATCH_MAX: usize = 32;
+/// Every `NACK_EVERY`-th completion is a Nack; all others are Acks.
+const NACK_EVERY: usize = 8;
+/// With control noise enabled, a timer tick follows every `TIMER_EVERY`-th completion.
+const TIMER_EVERY: usize = 64;
+/// With control noise enabled, a telemetry request follows every `TELEMETRY_EVERY`-th completion.
+const TELEMETRY_EVERY: usize = 256;
+/// With control noise enabled, a config update follows every `CONFIG_EVERY`-th completion.
+const CONFIG_EVERY: usize = 1_024;
+/// Number of sends that the churn benchmark polls once and then drops.
+const CANCELED_BLOCKED_SENDS: usize = 100_000;
+
+/// Traffic mix exercised by one benchmark run.
+#[derive(Clone, Copy, Debug)]
+enum Scenario {
+    /// Only Ack/Nack completions are produced.
+    AckNackOnly,
+    /// Completions interleaved with periodic timer, telemetry, and config messages.
+    AckNackWithControlNoise,
+}
+
+impl Scenario {
+    /// Name used as the Criterion benchmark parameter for this scenario.
+    const fn bench_name(self) -> &'static str {
+        match self {
+            Self::AckNackOnly => "ack_nack_only",
+            Self::AckNackWithControlNoise => "ack_nack_with_control_noise",
+        }
+    }
+
+    /// Whether producers should also emit timer, telemetry, and config messages.
+    const fn has_control_noise(self) -> bool {
+        matches!(self, Self::AckNackWithControlNoise)
+    }
+}
+
+/// Tallies of what a consumer observed during one workload run.
+#[derive(Debug, Default)]
+struct ObservedWork {
+    completions: usize,
+    // Only incremented by the control-aware consumer; the current channels
+    // deliver completions one message at a time.
+    completion_batches: usize,
+    timer_ticks: usize,
+    telemetry_ticks: usize,
+    configs: usize,
+}
+
+/// Builds the control-aware channel configuration used by the workload benchmarks.
+fn control_aware_channel_config() -> ControlChannelConfig {
+    ControlChannelConfig {
+        completion_msg_capacity: CHANNEL_CAPACITY,
+        completion_batch_max: COMPLETION_BATCH_MAX,
+        completion_burst_limit: COMPLETION_BATCH_MAX,
+    }
+}
+
+/// Polls `future` exactly once and returns `true` if that poll was `Poll::Pending`.
+///
+/// If the future is ready, its output is discarded.
+async fn is_pending_once<F>(mut future: Pin<&mut F>) -> bool
+where
+    F: Future,
+{
+    poll_fn(|cx| Poll::Ready(future.as_mut().poll(cx).is_pending())).await
+}
+
+/// Returns the `MetricsReporter` element of `create_new_and_receiver(16)`; the
+/// other tuple element is discarded.
+fn build_metrics_reporter() -> MetricsReporter {
+    MetricsReporter::create_new_and_receiver(16).1
+}
+
+/// Sends `COMPLETION_COUNT` Ack/Nack messages (plus optional control noise)
+/// through the current local control channel.
+///
+/// # Panics
+///
+/// Panics if any send fails.
+async fn produce_current_local(tx: LocalSender<NodeControlMsg<usize>>, scenario: Scenario) {
+    let metrics_reporter = build_metrics_reporter();
+
+    for idx in 0..COMPLETION_COUNT {
+        if idx.is_multiple_of(NACK_EVERY) {
+            tx.send(NodeControlMsg::Nack(NackMsg::new("temporary", idx)))
+                .await
+                .expect("nack should enqueue");
+        } else {
+            tx.send(NodeControlMsg::Ack(AckMsg::new(idx)))
+                .await
+                .expect("ack should enqueue");
+        }
+
+        if scenario.has_control_noise() {
+            if idx.is_multiple_of(TIMER_EVERY) {
+                tx.send(NodeControlMsg::TimerTick {})
+                    .await
+                    .expect("timer tick should enqueue");
+            }
+            if idx.is_multiple_of(TELEMETRY_EVERY) {
+                tx.send(NodeControlMsg::CollectTelemetry {
+                    metrics_reporter: metrics_reporter.clone(),
+                })
+                .await
+                .expect("telemetry collection should enqueue");
+            }
+            if idx.is_multiple_of(CONFIG_EVERY) {
+                tx.send(NodeControlMsg::Config {
+                    config: serde_json::json!({ "seq": idx }),
+                })
+                .await
+                .expect("config should enqueue");
+            }
+        }
+    }
+}
+
+/// Receives from the current local control channel until `recv` returns an
+/// error, counting each message kind.
+///
+/// # Panics
+///
+/// Panics on any variant the benchmark producers never send.
+async fn consume_current_local(
+    mut rx: CurrentLocalReceiver<NodeControlMsg<usize>>,
+) -> ObservedWork {
+    let mut observed = ObservedWork::default();
+
+    while let Ok(msg) = rx.recv().await {
+        match msg {
+            NodeControlMsg::Ack(_) | NodeControlMsg::Nack(_) => observed.completions += 1,
+            NodeControlMsg::TimerTick {} => observed.timer_ticks += 1,
+            NodeControlMsg::CollectTelemetry { .. } => observed.telemetry_ticks += 1,
+            NodeControlMsg::Config { .. } => observed.configs += 1,
+            NodeControlMsg::DrainIngress { .. }
+            | NodeControlMsg::Shutdown { .. }
+            | NodeControlMsg::Wakeup { .. }
+            | NodeControlMsg::DelayedData { .. } => {
+                panic!("unexpected message in benchmark current local receiver");
+            }
+        }
+    }
+
+    observed
+}
+
+/// Sends `COMPLETION_COUNT` Ack/Nack messages (plus optional control noise)
+/// through the current shared control channel.
+///
+/// # Panics
+///
+/// Panics if any send fails.
+async fn produce_current_shared(
+    tx: CurrentSharedSender<NodeControlMsg<usize>>,
+    scenario: Scenario,
+) {
+    let metrics_reporter = build_metrics_reporter();
+
+    for idx in 0..COMPLETION_COUNT {
+        if idx.is_multiple_of(NACK_EVERY) {
+            tx.send(NodeControlMsg::Nack(NackMsg::new("temporary", idx)))
+                .await
+                .expect("nack should enqueue");
+        } else {
+            tx.send(NodeControlMsg::Ack(AckMsg::new(idx)))
+                .await
+                .expect("ack should enqueue");
+        }
+
+        if scenario.has_control_noise() {
+            if idx.is_multiple_of(TIMER_EVERY) {
+                tx.send(NodeControlMsg::TimerTick {})
+                    .await
+                    .expect("timer tick should enqueue");
+            }
+            if idx.is_multiple_of(TELEMETRY_EVERY) {
+                tx.send(NodeControlMsg::CollectTelemetry {
+                    metrics_reporter: metrics_reporter.clone(),
+                })
+                .await
+                .expect("telemetry collection should enqueue");
+            }
+            if idx.is_multiple_of(CONFIG_EVERY) {
+                tx.send(NodeControlMsg::Config {
+                    config: serde_json::json!({ "seq": idx }),
+                })
+                .await
+                .expect("config should enqueue");
+            }
+        }
+    }
+}
+
+/// Receives from the current shared control channel until `recv` returns an
+/// error, counting each message kind.
+///
+/// # Panics
+///
+/// Panics on any variant the benchmark producers never send.
+async fn consume_current_shared(
+    mut rx: CurrentSharedReceiver<NodeControlMsg<usize>>,
+) -> ObservedWork {
+    let mut observed = ObservedWork::default();
+
+    while let Ok(msg) = rx.recv().await {
+        match msg {
+            NodeControlMsg::Ack(_) | NodeControlMsg::Nack(_) => observed.completions += 1,
+            NodeControlMsg::TimerTick {} => observed.timer_ticks += 1,
+            NodeControlMsg::CollectTelemetry { .. } => observed.telemetry_ticks += 1,
+            NodeControlMsg::Config { .. } => observed.configs += 1,
+            NodeControlMsg::DrainIngress { .. }
+            | NodeControlMsg::Shutdown { .. }
+            | NodeControlMsg::Wakeup { .. }
+            | NodeControlMsg::DelayedData { .. } => {
+                panic!("unexpected message in benchmark current shared receiver");
+            }
+        }
+    }
+
+    observed
+}
+
+/// Sends completion number `idx` through the control-aware channel: a Nack on
+/// every `NACK_EVERY`-th index, an Ack otherwise.
+///
+/// # Panics
+///
+/// Panics if the send returns an error.
+async fn send_control_aware_completion(tx: &NodeControlSender<usize>, idx: usize) {
+    let result = if idx.is_multiple_of(NACK_EVERY) {
+        tx.send(ControlCmd::Nack(ControlAwareNackMsg::new("temporary", idx)))
+            .await
+    } else {
+        tx.send(ControlCmd::Ack(ControlAwareAckMsg::new(idx))).await
+    };
+    let _ = result.expect("control-aware completion send should succeed");
+}
+
+/// Sends `COMPLETION_COUNT` completions (plus optional control noise) through
+/// the control-aware channel.
+///
+/// Completions use the awaiting `send`; noise uses `try_send` and asserts that
+/// it returned `Ok`.
+async fn produce_control_aware(tx: NodeControlSender<usize>, scenario: Scenario) {
+    for idx in 0..COMPLETION_COUNT {
+        send_control_aware_completion(&tx, idx).await;
+
+        if scenario.has_control_noise() {
+            if idx.is_multiple_of(TIMER_EVERY) {
+                let result = tx.try_send(ControlCmd::TimerTick);
+                assert!(result.is_ok(), "timer tick should not fail");
+            }
+            if idx.is_multiple_of(TELEMETRY_EVERY) {
+                let result = tx.try_send(ControlCmd::CollectTelemetry);
+                assert!(result.is_ok(), "telemetry tick should not fail");
+            }
+            if idx.is_multiple_of(CONFIG_EVERY) {
+                let result = tx.try_send(ControlCmd::Config {
+                    config: serde_json::json!({ "seq": idx }),
+                });
+                assert!(result.is_ok(), "config should not fail");
+            }
+        }
+    }
+}
+
+/// Receives from the control-aware channel until `recv` returns `None`,
+/// counting batches, completions, and each normal control event.
+///
+/// # Panics
+///
+/// Panics if a `Shutdown` event is delivered; the benchmark never requests one.
+async fn consume_control_aware(mut rx: NodeControlReceiver<usize>) -> ObservedWork {
+    let mut observed = ObservedWork::default();
+
+    while let Some(event) = rx.recv().await {
+        match event {
+            NodeControlEvent::CompletionBatch(batch) => {
+                observed.completion_batches += 1;
+                observed.completions += batch.len();
+                // Consume the batch and match on every completion in it.
+                for completion in batch {
+                    match completion {
+                        ControlAwareCompletionMsg::Ack(_) | ControlAwareCompletionMsg::Nack(_) => {}
+                    }
+                }
+            }
+            NodeControlEvent::TimerTick => observed.timer_ticks += 1,
+            NodeControlEvent::CollectTelemetry => observed.telemetry_ticks += 1,
+            NodeControlEvent::Config { .. 
} => observed.configs += 1,
+            NodeControlEvent::Shutdown(_) => {
+                panic!("unexpected event in control-aware benchmark receiver");
+            }
+        }
+    }
+
+    observed
+}
+
+/// Runs one producer/consumer pass over the current local control channel and
+/// asserts that every completion was observed.
+async fn run_current_local_workload(scenario: Scenario) -> ObservedWork {
+    let (tx_raw, rx_raw) = mpsc::Channel::new(CHANNEL_CAPACITY);
+    let tx = LocalSender::mpsc(tx_raw);
+    let rx = CurrentLocalReceiver::mpsc(rx_raw);
+
+    let ((), observed) = tokio::join!(
+        produce_current_local(tx, scenario),
+        consume_current_local(rx)
+    );
+    assert_eq!(observed.completions, COMPLETION_COUNT);
+    observed
+}
+
+/// Runs one producer/consumer pass over the current shared control channel and
+/// asserts that every completion was observed.
+async fn run_current_shared_workload(scenario: Scenario) -> ObservedWork {
+    let (tx_raw, rx_raw) = tokio::sync::mpsc::channel(CHANNEL_CAPACITY);
+    let tx = CurrentSharedSender::mpsc(tx_raw);
+    let rx = CurrentSharedReceiver::mpsc(rx_raw);
+
+    let ((), observed) = tokio::join!(
+        produce_current_shared(tx, scenario),
+        consume_current_shared(rx)
+    );
+    assert_eq!(observed.completions, COMPLETION_COUNT);
+    observed
+}
+
+/// Runs one producer/consumer pass over the control-aware channel and asserts
+/// that every completion was observed.
+async fn run_control_aware_workload(scenario: Scenario) -> ObservedWork {
+    let (tx, rx) =
+        node_channel(control_aware_channel_config()).expect("control-aware channel config valid");
+
+    let ((), observed) = tokio::join!(
+        produce_control_aware(tx, scenario),
+        consume_control_aware(rx)
+    );
+    assert_eq!(observed.completions, COMPLETION_COUNT);
+    observed
+}
+
+/// Measures blocked-sender churn: `CANCELED_BLOCKED_SENDS` sends are each
+/// polled once while the channel is full and then dropped, after which one
+/// live blocked send must still complete once the receiver frees capacity.
+///
+/// Returns the number of canceled sends.
+///
+/// # Panics
+///
+/// Panics if a send expected to block completes immediately, or if the
+/// receiver does not deliver a completion batch.
+async fn run_control_aware_canceled_sender_churn() -> usize {
+    // Use a single completion slot so the seeded Ack leaves no free capacity.
+    // With the shared `CHANNEL_CAPACITY` configuration the first `send` below
+    // would find room, and the "must be pending" assertion would fail.
+    let config = ControlChannelConfig {
+        completion_msg_capacity: 1,
+        completion_batch_max: 1,
+        completion_burst_limit: 1,
+    };
+    let (tx, mut rx) = node_channel(config).expect("control-aware channel config valid");
+
+    let _ = tx
+        .try_send(ControlCmd::Ack(ControlAwareAckMsg::new(0)))
+        .expect("seed completion should enqueue");
+
+    // Each future is polled once and then dropped at the end of the iteration.
+    for idx in 0..CANCELED_BLOCKED_SENDS {
+        let mut blocked =
+            std::pin::pin!(tx.send(ControlCmd::Ack(ControlAwareAckMsg::new(idx + 1))));
+        assert!(
+            is_pending_once(blocked.as_mut()).await,
+            "send on a full channel should be pending"
+        );
+    }
+
+    let mut live = std::pin::pin!(tx.send(ControlCmd::Ack(ControlAwareAckMsg::new(
+        CANCELED_BLOCKED_SENDS + 1,
+    ))));
+    assert!(
+        is_pending_once(live.as_mut()).await,
+        "live send on a full channel should be pending"
+    );
+
+    // Receiving the seeded completion frees the slot the live sender waits for.
+    let first_batch = rx.recv().await;
+    assert!(matches!(
+        first_batch,
+        Some(NodeControlEvent::CompletionBatch(_))
+    ));
+    let _ = live
+        .await
+        .expect("live blocked send should complete after capacity is freed");
+    let second_batch = rx.recv().await;
+    assert!(matches!(
+        second_batch,
+        Some(NodeControlEvent::CompletionBatch(_))
+    ));
+
+    CANCELED_BLOCKED_SENDS
+}
+
+/// Criterion entry point: compares the current local/shared control channels
+/// with the control-aware channel, then measures blocked-sender churn.
+fn bench_control_channels(c: &mut Criterion) {
+    let rt = Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .expect("failed to build tokio runtime");
+
+    // Pin the benchmark thread to the last reported core; a pinning failure is ignored.
+    let cores = core_affinity::get_core_ids().expect("couldn't get core IDs");
+    let core = cores.last().expect("no cores found");
+    _ = core_affinity::set_for_current(*core);
+
+    let mut group = c.benchmark_group("control_channel_ack_nack");
+    _ = group.throughput(Throughput::Elements(COMPLETION_COUNT as u64));
+
+    for scenario in [Scenario::AckNackOnly, Scenario::AckNackWithControlNoise] {
+        let _ = group.bench_function(
+            BenchmarkId::new("current_local", scenario.bench_name()),
+            |b| {
+                b.to_async(&rt).iter(|| async {
+                    let local = LocalSet::new();
+                    let observed = local
+                        .run_until(async { run_current_local_workload(scenario).await })
+                        .await;
+                    let _ = black_box(observed);
+                });
+            },
+        );
+
+        let _ = group.bench_function(
+            BenchmarkId::new("control_aware", scenario.bench_name()),
+            |b| {
+                b.to_async(&rt).iter(|| async {
+                    let local = LocalSet::new();
+                    let observed = local
+                        .run_until(async { run_control_aware_workload(scenario).await })
+                        .await;
+                    let _ = black_box(observed);
+                });
+            },
+        );
+
+        let _ = group.bench_function(
+            BenchmarkId::new("current_shared", scenario.bench_name()),
+            |b| {
+                b.to_async(&rt).iter(|| async {
+                    let observed = run_current_shared_workload(scenario).await;
+                    let _ = black_box(observed);
+                });
+            },
+        );
+    }
+
+    group.finish();
+
+    let mut churn_group = c.benchmark_group("control_channel_blocked_sender_churn");
+    let _ = 
churn_group.throughput(Throughput::Elements(CANCELED_BLOCKED_SENDS as u64)); + let _ = churn_group.bench_function("control_aware", |b| { + b.to_async(&rt).iter(|| async { + let local = LocalSet::new(); + let churned = local + .run_until(async { run_control_aware_canceled_sender_churn().await }) + .await; + let _ = black_box(churned); + }); + }); + churn_group.finish(); +} + +criterion_group!(benches, bench_control_channels); +criterion_main!(benches); diff --git a/rust/otap-dataflow/crates/contrib-nodes/src/exporters/azure_monitor_exporter/exporter.rs b/rust/otap-dataflow/crates/contrib-nodes/src/exporters/azure_monitor_exporter/exporter.rs index c78cb2956a..fc2bcef955 100644 --- a/rust/otap-dataflow/crates/contrib-nodes/src/exporters/azure_monitor_exporter/exporter.rs +++ b/rust/otap-dataflow/crates/contrib-nodes/src/exporters/azure_monitor_exporter/exporter.rs @@ -10,7 +10,7 @@ use otap_df_engine::context::PipelineContext; use otap_df_engine::control::{AckMsg, NackMsg, NodeControlMsg}; use otap_df_engine::error::Error as EngineError; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message, Receiver}; use otap_df_engine::terminal_state::TerminalState; use otap_df_pdata::otlp::OtlpProtoBytes; use otap_df_pdata::views::otap::OtapLogsView; @@ -462,7 +462,7 @@ impl AzureMonitorExporter { impl Exporter for AzureMonitorExporter { async fn start( mut self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { otel_info!( @@ -709,14 +709,14 @@ mod tests { ) -> ( mpsc::Sender>, mpsc::Sender, - ExporterMessageChannel, + ExporterInbox, ) { let (control_tx, control_rx) = mpsc::Channel::>::new(capacity); let (pdata_tx, pdata_rx) = mpsc::Channel::::new(capacity); ( control_tx, pdata_tx, - ExporterMessageChannel::new( + ExporterInbox::new( Receiver::Local(LocalReceiver::mpsc(control_rx)), 
Receiver::Local(LocalReceiver::mpsc(pdata_rx)), 0, diff --git a/rust/otap-dataflow/crates/contrib-nodes/src/exporters/geneva_exporter/mod.rs b/rust/otap-dataflow/crates/contrib-nodes/src/exporters/geneva_exporter/mod.rs index f0e6666a1a..1d1430d71a 100644 --- a/rust/otap-dataflow/crates/contrib-nodes/src/exporters/geneva_exporter/mod.rs +++ b/rust/otap-dataflow/crates/contrib-nodes/src/exporters/geneva_exporter/mod.rs @@ -37,7 +37,7 @@ use otap_df_engine::control::{AckMsg, NackMsg}; use otap_df_engine::error::Error; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_pdata::otlp::OtlpProtoBytes; @@ -611,7 +611,7 @@ pub static GENEVA_EXPORTER: ExporterFactory = ExporterFactory { impl Exporter for GenevaExporter { async fn start( mut self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { otel_info!( diff --git a/rust/otap-dataflow/crates/control-channel/Cargo.toml b/rust/otap-dataflow/crates/control-channel/Cargo.toml new file mode 100644 index 0000000000..63404a121c --- /dev/null +++ b/rust/otap-dataflow/crates/control-channel/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "otap-df-control-channel" +description = "Standalone control-aware bounded channel for OTAP dataflow" +version.workspace = true +authors.workspace = true +repository.workspace = true +license.workspace = true +publish.workspace = true +edition.workspace = true +rust-version.workspace = true + +[lints] +workspace = true + +[dependencies] +serde_json = { workspace = true } +thiserror = { workspace = true } +tokio = { workspace = true, features = ["sync", "time"] } diff --git a/rust/otap-dataflow/crates/control-channel/README.md 
b/rust/otap-dataflow/crates/control-channel/README.md new file mode 100644 index 0000000000..be393968f1 --- /dev/null +++ b/rust/otap-dataflow/crates/control-channel/README.md @@ -0,0 +1,349 @@ +# Control-Aware Bounded Channel + +This crate defines a bounded control channel for OTAP dataflow node-control +traffic. + +Its purpose is to provide a control-plane primitive with semantics that are +stronger and more explicit than a generic FIFO MPSC queue: + +- lifecycle control such as `DrainIngress` and `Shutdown` must still be + accepted and delivered even when ordinary control traffic is backlogged +- high-frequency completion traffic must remain efficient through bounded batching +- low-value control noise should be coalesced instead of competing with + correctness-critical work +- shutdown progress must remain bounded and explicit +- the public API should make invalid lifecycle operations hard to express + +Here, "lifecycle control" means the per-node shutdown transitions carried by +`DrainIngress` and `Shutdown`, as opposed to ordinary control traffic such as +`Ack`, `Nack`, `Config`, `TimerTick`, and `CollectTelemetry`. + +## Goal + +The control channel is meant for node-control traffic, not for pdata transport. 
+ +It targets the specific needs of the OTAP engine: + +- thread-per-core execution +- single-threaded async runtimes on the hot path +- frequent `Ack` and `Nack` traffic when `wait_for_result` is enabled, + with batching to amortize receive-side overhead +- node-local lifecycle transitions such as `DrainIngress` and `Shutdown` +- a need for bounded memory, bounded-fairness, and explicit terminal progress + +The design separates policy classes that behave differently: + +- retained, backpressured, and batched completion traffic +- best-effort coalesced control work, where duplicate signals collapse into one + pending token +- latest-wins configuration updates, where the newest pending value replaces + the older one +- reserved lifecycle tokens + +## Design + +The channel uses one internal queue implementation with role-specific public +APIs. + +### Operational overview + +```text +receiver sender receiver-side delivery +--------------- ---------------------- +accept_drain_ingress() -----------------------> DrainIngress +accept_shutdown(deadline) --------------------> Shutdown +try_send/send(Ack | Nack) --------------------> CompletionBatch(...) 
+try_send/send(Config) ------------------------> Config +try_send/send(TimerTick) ---------------------> TimerTick +try_send/send(CollectTelemetry) --------------> CollectTelemetry + + +------------------------------------------------+ + | Control channel core | + |------------------------------------------------| + | lifecycle slots: drain_ingress, shutdown | + | retained queue: completion deque | + | latest-wins slot: config | + | coalesced flags: timer_tick, collect_telemetry | + |------------------------------------------------| + | phases: Normal | + | IngressDrainRecorded | + | ShutdownRecorded | + |------------------------------------------------| + | stats / metrics surface: | + | completion_len | + | completion_batch_emitted_total | + | completion_message_emitted_total | + | config_replaced_total | + | timer_tick_coalesced_total | + | collect_telemetry_coalesced_total | + | normal_event_dropped_during_drain_total | + | shutdown_forced | + +------------------------------------------------+ +``` + +The sender side submits lifecycle operations and ordinary control commands into +separate internal classes. The receiver side observes normalized control events +whose ordering is governed by the channel's fairness and shutdown policy. + +### Role-specific APIs + +There are two channel families: + +- `receiver_channel(...)` + - sender type: `ReceiverControlSender` + - receiver event type: `ReceiverControlEvent` + - supports both `accept_drain_ingress(...)` and `accept_shutdown(...)` +- `node_channel(...)` + - sender type: `NodeControlSender` + - receiver event type: `NodeControlEvent` + - supports `accept_shutdown(...)` only + +This split is intentional. `DrainIngress` is receiver-specific lifecycle +control, so non-receiver nodes cannot express it through the public API. + +The crate intentionally exposes one single-owner implementation. 
That matches the +thread-per-core execution model the engine is targeting and keeps the control +state machine behind one receiver-owned queue core. + +### Traffic classes + +The channel currently supports: + +- lifecycle: + - `DrainIngress` + - `Shutdown` +- retained completion traffic: + - `Ack` + - `Nack` +- latest-wins normal control, where a new pending value replaces the previous + one: + - `Config` +- coalesced best-effort normal control, where duplicate signals merge into one + pending token: + - `TimerTick` + - `CollectTelemetry` + +### Internal model + +Internally, the queue implementation stores control classes in separate +slots/queues instead of forcing everything through one FIFO: + +- reserved lifecycle slots for `DrainIngress` and `Shutdown` +- a bounded completion deque for `Ack` and `Nack` +- one replaceable `Config` slot +- one coalesced `TimerTick` flag +- one coalesced `CollectTelemetry` flag + +This lets the channel apply class-specific admission and receive-side policy +without requiring unbounded buffering or sender-side queue surgery. + +## Behaviors And Guarantees + +### Bounded memory + +Ordinary retained traffic is bounded by configuration: + +- `completion_msg_capacity` +- `completion_batch_max` +- `completion_burst_limit` + +Lifecycle tokens do not consume ordinary bounded completion capacity. + +### Backpressure vs non-blocking send + +For non-lifecycle traffic, the sender exposes two modes: + +- `try_send(...)` + - non-blocking + - returns the original command on `Full` or `Closed` +- `send(...).await` + - waits for bounded capacity when needed + - returns only when the command is accepted or the channel closes + - wakes blocked completion senders in FIFO order as completion capacity is released + +This supports both explicit backpressure and opportunistic best-effort usage, +depending on the caller. + +### Completion batching + +`Ack` and `Nack` are retained in a bounded FIFO and emitted as +`CompletionBatch(Vec<CompletionMsg<...>>)`. 
+ +Properties: + +- arrival order is preserved within completion traffic +- batching reduces receive-side overhead under heavy completion load +- protocols that support completion signals such as `Ack` and `Nack` can take + advantage of this batching to reduce control-plane churn +- completion messages can optionally carry explicit metadata through + `AckMsg` / `NackMsg`; standalone usage defaults to + `Meta = ()`, while future engine integration can preserve unwind state there +- `completion_batch_max` bounds the size of a single emitted batch +- completion traffic remains eligible after `DrainIngress` +- completion traffic remains eligible after `Shutdown` until terminal progress + occurs + +### Latest-wins config + +`Config` is not queued as an unbounded sequence. + +Properties: + +- only the most recent pending config is kept +- a new config replaces the previously pending one +- config is dropped once drain or shutdown begins + +### Best-effort coalescing + +`TimerTick` and `CollectTelemetry` are coalesced per channel. + +Properties: + +- only one pending tick of each kind is retained +- duplicate sends are reported as coalesced +- these events are dropped once drain or shutdown begins + +### Bounded fairness + +The receive side is not a pure priority queue and not a pure FIFO. + +It enforces bounded fairness between completion traffic and normal control +traffic: + +- completions are emitted in batches +- `completion_burst_limit` bounds how many completion messages can be delivered + consecutively before one pending normal event must be surfaced +- `Config`, `TimerTick`, and `CollectTelemetry` are rotated fairly when + multiple normal events are pending + +This prevents long completion runs from starving all other control activity in +normal operation. + +### Drain and shutdown semantics + +`DrainIngress` and `Shutdown` are distinct lifecycle states. 
+ +Properties: + +- `DrainIngress` and `Shutdown` are accepted through reserved lifecycle slots +- if both are present, `DrainIngress` is delivered first +- once drain begins, ordinary non-completion control work such as `Config`, + `TimerTick`, and `CollectTelemetry` is no longer accepted +- the `DrainIngress` deadline is carried to the receiver event loop so the + receiver can bound its own ingress-drain phase; it does not make the + control-channel queue itself deadline-driven +- once shutdown begins, ordinary non-completion control work such as `Config`, + `TimerTick`, and `CollectTelemetry` is no longer accepted +- pending ordinary non-completion control state is cleared when drain or + shutdown is accepted +- completion traffic may continue draining after shutdown is accepted + +### Deadline-bounded terminal progress + +Only `Shutdown` carries an active queue-level deadline. `DrainIngress` may also +carry a deadline field, but that field is for receiver-local ingress-drain +behavior after delivery rather than for forced progress inside the control +channel itself. + +`Shutdown` carries a deadline and the receiver wait path is deadline-aware. + +Properties: + +- if retained completion traffic drains before the deadline, `Shutdown` is + emitted after retained work +- if the deadline expires first, terminal progress is forced +- when forced shutdown fires, remaining retained completions are abandoned and + the channel closes +- after final shutdown delivery, new sends are rejected + +This is the key liveness property needed to avoid shutdown being postponed +indefinitely by continued completion traffic. 
+ +### Single-owner execution model + +The channel is implemented as a single-owner state machine: + +- the queue core is mutated only by the channel owner +- sender clones share that owner through local single-threaded handles +- blocked completion senders wait in a keyed FIFO waiter queue so + capacity release can wake only the senders that can now make progress +- receiver waiting remains deadline-aware so forced shutdown does not + depend on a later producer wakeup + +## Observability + +The channel exposes a `stats()` snapshot on both senders and receivers. + +The snapshot includes: + +- lifecycle state: + - `phase` + - `drain_ingress_recorded` + - `shutdown_recorded` + - `shutdown_forced` + - `closed` +- current occupancy / pending state: + - `completion_len` + - `has_pending_drain_ingress` + - `has_pending_shutdown` + - `has_pending_config` + - `has_pending_timer_tick` + - `has_pending_collect_telemetry` + - `completion_burst_len` +- cumulative counters: + - `completion_batch_emitted_total` + - `completion_message_emitted_total` + - `config_replaced_total` + - `timer_tick_coalesced_total` + - `collect_telemetry_coalesced_total` + - `normal_event_dropped_during_drain_total` + - `completion_abandoned_on_forced_shutdown_total` + +These snapshots are intended to map cleanly to a future engine metric set such +as `channel.control`, attached to the existing control-channel entity rather +than to pipeline-global runtime-control telemetry. + +## Future Work + +### Engine integration + +The intended engine integration is: + +- one control channel per node +- receiver nodes use the receiver-role API +- non-receiver nodes use the node-role API +- control-channel telemetry is reported as `channel.control` from the existing + `stats()` surface + +Pipeline-wide shutdown orchestration should remain in the engine, not in the +channel. 
In particular, an engine-side helper such as +`begin_receiver_shutdown(deadline, reason)` should: + +- accept `DrainIngress` on receiver channels +- wait for `ReceiverDrained` +- accept downstream `Shutdown` only after receiver drain completes + +After integration, behavior and performance should be revalidated under +realistic engine workloads. + +### Admin UI + +Once the control channel is integrated into the engine and starts emitting a +dedicated `channel.control` metric set, the admin UI should be updated in a +separate change. + +The intended UI changes are: + +- graph model: + - associate `channel.control` metric sets with existing control-channel edges + using `channel.id` +- selection details: + - render a control-specific metrics block for control channels + - use `completion.queued` as the queue-depth fallback when generic + `queue.depth` is not available +- optional charting: + - add selected-channel views for control-specific gauges and counters only + after the emitted metric names are stable + +Those updates are intentionally deferred until integration time so this branch +stays focused on the standalone channel design rather than on dormant UI code. diff --git a/rust/otap-dataflow/crates/control-channel/src/channel.rs b/rust/otap-dataflow/crates/control-channel/src/channel.rs new file mode 100644 index 0000000000..48de39b7ee --- /dev/null +++ b/rust/otap-dataflow/crates/control-channel/src/channel.rs @@ -0,0 +1,1036 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Standalone control-aware channel with a single channel-receiver-owned queue +//! core. +//! +//! "Single-owner" means one channel receiver task owns and mutates the queue +//! state machine. Cloned senders can submit work and wait for capacity, but +//! they do not share mutable queue state. +//! +//! The design is split into three layers: +//! +//! - [`Inner`] owns the control-specific semantics: lifecycle recording, +//! 
bounded completion admission, completion batching, latest-wins config +//! replacement, coalesced timer/telemetry signals, bounded fairness, and +//! deadline-bounded forced shutdown progress. +//! - [`State`] wraps that core with single-threaded ownership, sender liveness, +//! wake routing for the channel receiver, and blocked-sender waiter +//! management. +//! - [`ControlSenderCore`] and [`ControlReceiverCore`] provide the operational +//! sender/receiver behavior over that shared state. +//! +//! The generic parameter `Meta` is the completion-side metadata carried by +//! `Ack`/`Nack` messages. Use `Meta = ()` when completions only need to return +//! the payload, or supply a richer type when the integration needs explicit +//! unwind/routing context alongside that payload. +//! +//! The hot path is intentionally asymmetric: +//! +//! - senders only mutate the queue through short `RefCell` borrows and never +//! own the queue core directly +//! - the channel receiver is the only side that pops events and advances drain +//! or forced-shutdown progress +//! - only completion traffic can become capacity-bound; lifecycle signals are +//! recorded separately and config/timer/telemetry use replacement or +//! coalescing rules instead of ordinary FIFO queuing +//! +//! Under contention, blocked completion senders wait in a keyed FIFO waiter +//! structure. That keeps wakeups bounded when completion slots are released, +//! while still allowing terminal transitions such as shutdown or close to wake +//! everyone so they can observe the new state. 
+ +use crate::core::Inner; +use crate::types::CoreControlEvent; +use crate::{ + ConfigError, ControlChannelConfig, ControlChannelStats, ControlCmd, DrainIngressMsg, + LifecycleSendResult, NodeControlEvent, ReceiverControlEvent, SendError, SendOutcome, + ShutdownMsg, +}; +use std::cell::{Cell, RefCell}; +use std::collections::VecDeque; +use std::future::Future; +use std::pin::Pin; +use std::rc::Rc; +use std::task::{Context, Poll, Waker}; +use tokio::sync::Notify; +use tokio::time::{Instant as TokioInstant, Sleep, sleep_until}; + +// TODO: Consider deduplicating the keyed blocked-sender waiter logic with +// `otap-df-channel` by extracting a shared `sender_waiters.rs` there. +// The current `mpsc` and `mpmc` waiters carry the same tombstone pattern; +// this crate fixes it locally first to keep this PR isolated. +// +// The reusable part is the waiter mechanism itself: stable waiter keys, slot +// reuse, cancellation-safe unregister, and FIFO wake order. The control +// channel would still keep its own `Inner` state machine because control +// admission, fairness, batching, and lifecycle handling are more specialized +// than the generic channel crate. +// +// This PR keeps a local copy on purpose to stay simple and isolated while the +// standalone control-channel design is being reviewed. Any deduplication should +// happen in a second phase and must be benchmarked to confirm that extracting +// the waiter code preserves the current performance characteristics. + +#[derive(Clone, Copy, PartialEq, Eq)] +struct SenderWaiterKey { + // Stable key into the waiter slot array; generation prevents ABA when a + // slot index is reused after cancellation/completion. 
+ index: usize, + generation: u64, +} + +struct SenderWaiterSlot { + generation: u64, + waker: Option, + in_use: bool, + queued: bool, +} + +impl SenderWaiterSlot { + const fn vacant() -> Self { + Self { + generation: 0, + waker: None, + in_use: false, + queued: false, + } + } +} + +const SENDER_WAITER_COMPACT_MIN_QUEUE_LEN: usize = 64; + +struct SenderWaiters { + // FIFO queue of waiter keys used to preserve blocked-sender wake order. + queue: VecDeque, + // Slot storage allows O(1) refresh/unregister by key without scanning the queue. + slots: Vec, + // Reuse released slots to avoid per-contention allocations. + free_slots: Vec, + // Number of live queued waiters still represented in `queue`. + queued_live: usize, + // Number of stale queued entries left behind by cancellation and awaiting + // wake-time cleanup or periodic compaction. + queued_stale: usize, + next_generation: u64, +} + +impl SenderWaiters { + fn new() -> Self { + Self { + queue: VecDeque::new(), + slots: Vec::new(), + free_slots: Vec::new(), + queued_live: 0, + queued_stale: 0, + next_generation: 0, + } + } + + fn wake_n(&mut self, mut count: usize) { + while count > 0 { + let Some(key) = self.queue.pop_front() else { + break; + }; + let Some(slot) = self.slots.get_mut(key.index) else { + self.queued_stale -= 1; + continue; + }; + // Stale queue entries are expected when futures are canceled or + // re-queued; skip until we find a live queued waiter. 
+ if !slot.in_use || slot.generation != key.generation || !slot.queued { + self.queued_stale -= 1; + continue; + } + slot.queued = false; + self.queued_live -= 1; + if let Some(waker) = slot.waker.take() { + waker.wake(); + count -= 1; + } + } + } + + fn wake_all(&mut self) { + while let Some(key) = self.queue.pop_front() { + let Some(slot) = self.slots.get_mut(key.index) else { + self.queued_stale -= 1; + continue; + }; + if !slot.in_use || slot.generation != key.generation || !slot.queued { + self.queued_stale -= 1; + continue; + } + slot.queued = false; + self.queued_live -= 1; + if let Some(waker) = slot.waker.take() { + waker.wake(); + } + } + } + + fn register_or_refresh(&mut self, waiter_key: &mut Option, waker: &Waker) { + if let Some(existing_key) = *waiter_key { + if let Some(slot) = self.slots.get_mut(existing_key.index) { + if slot.in_use && slot.generation == existing_key.generation { + if slot + .waker + .as_ref() + .is_none_or(|existing| !existing.will_wake(waker)) + { + slot.waker = Some(waker.clone()); + } + if !slot.queued { + slot.queued = true; + self.queue.push_back(existing_key); + self.queued_live += 1; + } + return; + } + } + } + + let index = if let Some(index) = self.free_slots.pop() { + index + } else { + self.slots.push(SenderWaiterSlot::vacant()); + self.slots.len() - 1 + }; + + let generation = self.next_generation; + self.next_generation = self.next_generation.wrapping_add(1); + + let slot = &mut self.slots[index]; + slot.generation = generation; + slot.waker = Some(waker.clone()); + slot.in_use = true; + slot.queued = true; + + let key = SenderWaiterKey { index, generation }; + self.queue.push_back(key); + self.queued_live += 1; + *waiter_key = Some(key); + } + + fn unregister(&mut self, waiter_key: SenderWaiterKey) { + let Some(slot) = self.slots.get_mut(waiter_key.index) else { + return; + }; + if !slot.in_use || slot.generation != waiter_key.generation { + return; + } + slot.in_use = false; + if slot.queued { + self.queued_live -= 
1; + self.queued_stale += 1; + } + slot.queued = false; + slot.waker = None; + self.free_slots.push(waiter_key.index); + self.maybe_compact_queue(); + } + + fn maybe_compact_queue(&mut self) { + if self.queue.len() < SENDER_WAITER_COMPACT_MIN_QUEUE_LEN { + return; + } + if self.queued_stale * 2 < self.queue.len() { + return; + } + self.compact_queue(); + } + + fn compact_queue(&mut self) { + self.queue.retain(|key| { + self.slots + .get(key.index) + .is_some_and(|slot| slot.in_use && slot.queued && slot.generation == key.generation) + }); + self.queued_live = self.queue.len(); + self.queued_stale = 0; + } +} + +struct State { + /// Single-threaded ownership of the queue core plus the wake state layered + /// around it. `Inner` owns all admission, batching, fairness, and + /// lifecycle semantics; this wrapper only handles channel-receiver waiting, + /// sender liveness, and wake routing. + inner: RefCell>, + notify: Notify, + /// Number of live sender handles. When the last sender drops, the channel + /// transitions to closed so the channel receiver can finish after buffered + /// work is drained. + sender_count: Cell, + /// Contended completion sends register here so capacity release can wake a + /// bounded FIFO subset of blocked senders without waking every waiter. + /// + /// The keyed waiter-slot structure is adapted from the local MPSC channel + /// in `otap-df-channel`. The reuse is intentionally narrow: only the + /// blocked-sender waiting mechanism is borrowed here, while control + /// admission and delivery remain specific to `Inner`. 
+ sender_waiters: RefCell>, +} + +impl State { + fn register_or_refresh_sender_waiter( + &self, + waiter_key: &mut Option, + waker: &Waker, + ) { + self.sender_waiters + .borrow_mut() + .get_or_insert_with(SenderWaiters::new) + .register_or_refresh(waiter_key, waker); + } + + fn unregister_sender_waiter(&self, waiter_key: SenderWaiterKey) { + if let Some(waiters) = self.sender_waiters.borrow_mut().as_mut() { + waiters.unregister(waiter_key); + } + } + + fn wake_completion_waiters(&self, slots_freed: usize) { + if slots_freed == 0 { + return; + } + if let Some(waiters) = self.sender_waiters.borrow_mut().as_mut() { + waiters.wake_n(slots_freed); + } + } + + fn wake_all_sender_waiters(&self) { + if let Some(waiters) = self.sender_waiters.borrow_mut().as_mut() { + waiters.wake_all(); + } + } +} + +struct ControlSenderCore { + state: Rc>, +} + +struct ControlReceiverCore { + state: Rc>, +} + +/// Sender for receiver nodes, including `DrainIngress`. +pub struct ReceiverControlSender { + inner: ControlSenderCore, +} + +/// Sender for non-receiver nodes. +pub struct NodeControlSender { + inner: ControlSenderCore, +} + +/// Receiver for receiver-role control events. +pub struct ReceiverControlReceiver { + inner: ControlReceiverCore, +} + +/// Receiver for non-receiver node control events. +pub struct NodeControlReceiver { + inner: ControlReceiverCore, +} + +struct SendFuture<'a, PData, Meta = ()> { + sender: &'a ControlSenderCore, + // Owned command being retried across polls. `poll()` takes it out before + // each send attempt so ownership can move into `try_send()`. If the queue + // is still full, the returned command is stored back here before the + // future parks, which keeps the original command available across wakeups, + // cancellation, or close. 
+ pending_cmd: Option>, + waiter_key: Option, + // Forced-shutdown deadlines are re-checked while blocked so a completion + // sender does not wait forever for capacity that shutdown semantics will + // eventually abandon. + deadline_sleep: Option>>, +} + +impl Clone for ControlSenderCore { + fn clone(&self) -> Self { + self.state + .sender_count + .set(self.state.sender_count.get().saturating_add(1)); + Self { + state: Rc::clone(&self.state), + } + } +} + +impl Drop for ControlSenderCore { + fn drop(&mut self) { + let next = self.state.sender_count.get().saturating_sub(1); + self.state.sender_count.set(next); + if next == 0 { + let closed = self.state.inner.borrow_mut().close(); + if closed { + self.state.notify.notify_waiters(); + } + } + } +} + +impl ControlSenderCore { + /// Records `DrainIngress` outside the bounded completion capacity. + /// + /// Guarantee: once accepted, the lifecycle token is visible to the channel + /// receiver even if ordinary completion traffic is saturated. Acceptance + /// wakes the channel receiver promptly so it can observe the drain request, + /// but it does not wake blocked completion senders because drain does not + /// change completion admissibility or free completion capacity. The carried + /// deadline is for the channel receiver's own ingress-drain logic; unlike + /// `Shutdown`, it does not make the control-channel queue itself + /// deadline-driven. + fn accept_drain_ingress(&self, msg: DrainIngressMsg) -> LifecycleSendResult { + let result = self.state.inner.borrow_mut().record_drain_ingress(msg); + if matches!(result, LifecycleSendResult::Accepted) { + self.state.notify.notify_waiters(); + } + result + } + + /// Records `Shutdown` outside the bounded completion capacity. + /// + /// Guarantee: once accepted, shutdown is remembered even if the completion + /// queue is full. 
Acceptance wakes both blocked senders and the channel + /// receiver so forced-shutdown deadlines and abandonment rules do not + /// depend on later producer activity. + fn accept_shutdown(&self, msg: ShutdownMsg) -> LifecycleSendResult { + let result = self.state.inner.borrow_mut().record_shutdown(msg); + if matches!(result, LifecycleSendResult::Accepted) { + self.state.wake_all_sender_waiters(); + self.state.notify.notify_waiters(); + } + result + } + + /// Attempts one immediate non-lifecycle send without waiting. + /// + /// Guarantee: this never parks the caller. It either applies the command + /// immediately, returns `Full` with the original command, or returns + /// `Closed` with the original command. Successful admission wakes the + /// channel receiver once so newly available work can be observed. + fn try_send( + &self, + cmd: ControlCmd, + ) -> Result> { + let result = self.state.inner.borrow_mut().try_send(cmd); + if matches!(result, Ok(SendOutcome::Accepted | SendOutcome::Replaced)) { + self.state.notify.notify_one(); + } + result + } + + /// Sends one non-lifecycle command, waiting asynchronously only when + /// completion capacity is temporarily exhausted. + /// + /// Guarantee: the future preserves the original command until it is either + /// accepted/replaced or returned as `Closed`. While blocked, the future is + /// cancellation-safe and re-checks forced-shutdown deadlines so it does + /// not wait forever for capacity that terminal state will eventually + /// abandon. + async fn send( + &self, + cmd: ControlCmd, + ) -> Result> { + SendFuture { + sender: self, + pending_cmd: Some(cmd), + waiter_key: None, + deadline_sleep: None, + } + .await + } + + /// Closes the channel for new sends. + /// + /// Guarantee: close wakes both blocked senders and the channel receiver so + /// all parties can observe terminal state without needing any further + /// producer activity. 
+ fn close(&self) { + let closed = self.state.inner.borrow_mut().close(); + if closed { + self.state.wake_all_sender_waiters(); + self.state.notify.notify_waiters(); + } + } + + /// Returns a snapshot of the current queue occupancy and lifecycle state. + /// + /// Guarantee: this is observational only; it does not change wake state or + /// queue contents. + fn stats(&self) -> ControlChannelStats { + self.state.inner.borrow().stats() + } +} + +impl<'a, PData, Meta> Unpin for SendFuture<'a, PData, Meta> {} + +impl Drop for SendFuture<'_, PData, Meta> { + fn drop(&mut self) { + let Some(waiter_key) = self.waiter_key.take() else { + return; + }; + self.sender.state.unregister_sender_waiter(waiter_key); + } +} + +impl Future for SendFuture<'_, PData, Meta> { + type Output = Result>; + + // `poll()` implements a small retry state machine around `try_send()`: + // + // - it takes ownership of the current command from `pending_cmd` + // - it attempts immediate admission through `try_send()` + // - on `Full`, it stores the command back, refreshes waiter registration, + // arms or refreshes the forced-shutdown deadline sleep, and returns + // `Pending` + // - on `Accepted`/`Replaced` or `Closed`, it clears waiter/deadline state + // and returns `Ready` + // + // Guarantees: + // - the original command is never lost while the future is pending + // - cancellation is safe because `Drop` unregisters any outstanding waiter + // - blocked sends do not depend on a future producer wakeup to observe + // forced shutdown progress, because the deadline path loops back into + // admission checks on its own + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.as_mut().get_mut(); + + loop { + // `pending_cmd` is populated when the future is created and is + // restored before every `Poll::Pending` return on the full-queue + // path. 
Ready returns exit immediately, so hitting this `expect` + // would mean the future was polled again after completion or that + // this function broke its own local invariant. + let cmd = this + .pending_cmd + .take() + .expect("SendFuture missing pending_cmd invariant"); + + match this.sender.try_send(cmd) { + Ok(outcome) => { + if let Some(waiter_key) = this.waiter_key.take() { + this.sender.state.unregister_sender_waiter(waiter_key); + } + this.deadline_sleep = None; + return Poll::Ready(Ok(outcome)); + } + Err(crate::TrySendError::Closed(cmd)) => { + if let Some(waiter_key) = this.waiter_key.take() { + this.sender.state.unregister_sender_waiter(waiter_key); + } + this.deadline_sleep = None; + return Poll::Ready(Err(SendError::Closed(cmd))); + } + Err(crate::TrySendError::Full { cmd, .. }) => { + this.pending_cmd = Some(cmd); + } + } + + let mut waiter_key = this.waiter_key; + this.sender + .state + .register_or_refresh_sender_waiter(&mut waiter_key, cx.waker()); + this.waiter_key = waiter_key; + + let deadline = this.sender.state.inner.borrow().next_deadline(); + if let Some(deadline) = deadline { + let deadline = TokioInstant::from_std(deadline); + let needs_reset = this + .deadline_sleep + .as_ref() + .is_none_or(|sleep| sleep.deadline() != deadline); + if needs_reset { + this.deadline_sleep = Some(Box::pin(sleep_until(deadline))); + } + if let Some(sleep) = this.deadline_sleep.as_mut() { + if sleep.as_mut().poll(cx).is_ready() { + this.deadline_sleep = None; + if let Some(waiter_key) = this.waiter_key.take() { + this.sender.state.unregister_sender_waiter(waiter_key); + } + continue; + } + } + } else { + this.deadline_sleep = None; + } + + return Poll::Pending; + } + } +} + +impl Clone for ReceiverControlSender { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + } + } +} + +impl Clone for NodeControlSender { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + } + } +} + +impl ReceiverControlSender { + /// Accepts a 
receiver-drain lifecycle token without consuming bounded + /// control capacity. + /// + /// Guarantee: once accepted, the drain token remains observable to the + /// channel receiver even if ordinary completion traffic is saturated. + /// Acceptance wakes the channel receiver promptly, but it does not wake + /// blocked completion senders because drain does not free completion + /// capacity. The embedded deadline is intended for receiver-local ingress- + /// drain behavior, not for queue-level forced shutdown in the control + /// channel itself. + #[must_use] + pub fn accept_drain_ingress(&self, msg: DrainIngressMsg) -> LifecycleSendResult { + self.inner.accept_drain_ingress(msg) + } + + /// Accepts a shutdown lifecycle token without consuming bounded control + /// capacity. + /// + /// Guarantee: once accepted, shutdown is remembered even if the completion + /// queue is full. Acceptance wakes both blocked senders and the channel + /// receiver so forced-shutdown deadlines and abandonment rules do not + /// depend on later producer activity. + #[must_use] + pub fn accept_shutdown(&self, msg: ShutdownMsg) -> LifecycleSendResult { + self.inner.accept_shutdown(msg) + } + + /// Attempts to send a non-lifecycle control command without waiting for + /// capacity. + /// + /// Guarantee: this never parks the caller. It either applies the command + /// immediately, returns `Full` with the original command, or returns + /// `Closed` with the original command. Successful admission wakes the + /// channel receiver once so newly available work can be observed. + pub fn try_send( + &self, + cmd: ControlCmd, + ) -> Result> { + self.inner.try_send(cmd) + } + + /// Sends a non-lifecycle control command, waiting asynchronously for + /// bounded completion capacity when needed. + /// + /// Guarantee: this only waits when completion capacity is exhausted. 
The + /// original command is preserved until it is accepted/replaced or returned + /// as `Closed`, and the wait path is cancellation-safe. + pub async fn send( + &self, + cmd: ControlCmd, + ) -> Result> { + self.inner.send(cmd).await + } + + /// Closes the channel for new sends. + /// + /// Guarantee: close wakes both blocked senders and the channel receiver so + /// all parties can observe terminal state without needing any further + /// producer activity. + pub fn close(&self) { + self.inner.close(); + } + + /// Returns a snapshot of channel occupancy and lifecycle state. + /// + /// Guarantee: this is observational only; it does not change wake state or + /// queue contents. + #[must_use] + pub fn stats(&self) -> ControlChannelStats { + self.inner.stats() + } +} + +impl NodeControlSender { + /// Accepts a shutdown lifecycle token without consuming bounded control + /// capacity. + /// + /// Guarantee: once accepted, shutdown is remembered even if the completion + /// queue is full. Acceptance wakes both blocked senders and the channel + /// receiver so forced-shutdown deadlines and abandonment rules do not + /// depend on later producer activity. + #[must_use] + pub fn accept_shutdown(&self, msg: ShutdownMsg) -> LifecycleSendResult { + self.inner.accept_shutdown(msg) + } + + /// Attempts to send a non-lifecycle control command without waiting for + /// capacity. + /// + /// Guarantee: this never parks the caller. It either applies the command + /// immediately, returns `Full` with the original command, or returns + /// `Closed` with the original command. Successful admission wakes the + /// channel receiver once so newly available work can be observed. + pub fn try_send( + &self, + cmd: ControlCmd, + ) -> Result> { + self.inner.try_send(cmd) + } + + /// Sends a non-lifecycle control command, waiting asynchronously for + /// bounded completion capacity when needed. + /// + /// Guarantee: this only waits when completion capacity is exhausted. 
The + /// original command is preserved until it is accepted/replaced or returned + /// as `Closed`, and the wait path is cancellation-safe. + pub async fn send( + &self, + cmd: ControlCmd, + ) -> Result> { + self.inner.send(cmd).await + } + + /// Closes the channel for new sends. + /// + /// Guarantee: close wakes both blocked senders and the channel receiver so + /// all parties can observe terminal state without needing any further + /// producer activity. + pub fn close(&self) { + self.inner.close(); + } + + /// Returns a snapshot of channel occupancy and lifecycle state. + /// + /// Guarantee: this is observational only; it does not change wake state or + /// queue contents. + #[must_use] + pub fn stats(&self) -> ControlChannelStats { + self.inner.stats() + } +} + +impl ControlReceiverCore { + fn notify_after_pop(&self, event: &CoreControlEvent) { + match event { + CoreControlEvent::CompletionBatch(batch) => { + // One freed completion slot should wake one blocked sender, so + // batch drains wake a bounded FIFO subset instead of a herd. + self.state.wake_completion_waiters(batch.len()); + } + CoreControlEvent::Shutdown(_) => { + // Shutdown is terminal state, not ordinary capacity release, so + // all blocked senders must wake and observe the new state. + self.state.wake_all_sender_waiters(); + self.state.notify.notify_waiters(); + } + CoreControlEvent::DrainIngress(_) + | CoreControlEvent::Config { .. } + | CoreControlEvent::TimerTick + | CoreControlEvent::CollectTelemetry => {} + } + } + + async fn recv_internal(&mut self) -> Option> { + loop { + { + let mut inner = self.state.inner.borrow_mut(); + if let Some(event) = inner.pop_event() { + drop(inner); + self.notify_after_pop(&event); + return Some(event); + } + if inner.closed { + return None; + } + } + + // Avoid waiting if the queue changed after the channel receiver + // decided it was currently empty. 
Shutdown deadlines are part of + // the wait condition so forced shutdown does not depend on a + // producer arriving later to wake the channel receiver. + let notified = self.state.notify.notified(); + let (version, deadline) = { + let inner = self.state.inner.borrow(); + (inner.version, inner.next_deadline()) + }; + + if self.state.inner.borrow().version != version { + continue; + } + + if let Some(deadline) = deadline { + tokio::select! { + _ = notified => {} + _ = sleep_until(TokioInstant::from_std(deadline)) => {} + } + } else { + notified.await; + } + } + } + + fn try_recv_internal(&mut self) -> Option> { + let event = self.state.inner.borrow_mut().pop_event(); + if let Some(event_ref) = &event { + self.notify_after_pop(event_ref); + } + event + } + + fn stats(&self) -> ControlChannelStats { + self.state.inner.borrow().stats() + } +} + +impl ReceiverControlReceiver { + /// Receives the next available control event, or `None` if the channel is + /// closed and fully drained. + pub async fn recv(&mut self) -> Option> { + self.inner + .recv_internal() + .await + .map(ReceiverControlEvent::::from_core) + } + + /// Attempts to receive one control event without waiting. + pub fn try_recv(&mut self) -> Option> { + self.inner + .try_recv_internal() + .map(ReceiverControlEvent::::from_core) + } + + /// Returns a snapshot of channel occupancy and lifecycle state. + #[must_use] + pub fn stats(&self) -> ControlChannelStats { + self.inner.stats() + } +} + +impl NodeControlReceiver { + /// Receives the next available control event, or `None` if the channel is + /// closed and fully drained. + pub async fn recv(&mut self) -> Option> { + self.inner + .recv_internal() + .await + .map(NodeControlEvent::::from_core) + } + + /// Attempts to receive one control event without waiting. + pub fn try_recv(&mut self) -> Option> { + self.inner + .try_recv_internal() + .map(NodeControlEvent::::from_core) + } + + /// Returns a snapshot of channel occupancy and lifecycle state. 
+ #[must_use] + pub fn stats(&self) -> ControlChannelStats { + self.inner.stats() + } +} + +fn channel_state( + config: ControlChannelConfig, +) -> Result< + ( + ControlSenderCore, + ControlReceiverCore, + ), + ConfigError, +> { + config.validate()?; + let state = Rc::new(State { + inner: RefCell::new(Inner::new(config)), + notify: Notify::new(), + sender_count: Cell::new(1), + sender_waiters: RefCell::new(None), + }); + + Ok(( + ControlSenderCore { + state: Rc::clone(&state), + }, + ControlReceiverCore { state }, + )) +} + +/// Creates a new control-aware sender/channel-receiver pair for receiver nodes. +pub fn receiver_channel( + config: ControlChannelConfig, +) -> Result<(ReceiverControlSender, ReceiverControlReceiver), ConfigError> { + receiver_channel_with_meta(config) +} + +/// Creates a new control-aware sender/channel-receiver pair for receiver nodes +/// with explicit completion metadata carried by `Ack`/`Nack`. +pub fn receiver_channel_with_meta( + config: ControlChannelConfig, +) -> Result< + ( + ReceiverControlSender, + ReceiverControlReceiver, + ), + ConfigError, +> { + let (sender, receiver) = channel_state(config)?; + Ok(( + ReceiverControlSender { inner: sender }, + ReceiverControlReceiver { inner: receiver }, + )) +} + +/// Creates a new control-aware sender/channel-receiver pair for non-receiver +/// nodes. +pub fn node_channel( + config: ControlChannelConfig, +) -> Result<(NodeControlSender, NodeControlReceiver), ConfigError> { + node_channel_with_meta(config) +} + +/// Creates a new control-aware sender/channel-receiver pair for non-receiver +/// nodes with explicit completion metadata carried by `Ack`/`Nack`. 
+pub fn node_channel_with_meta( + config: ControlChannelConfig, +) -> Result< + ( + NodeControlSender, + NodeControlReceiver, + ), + ConfigError, +> { + let (sender, receiver) = channel_state(config)?; + Ok(( + NodeControlSender { inner: sender }, + NodeControlReceiver { inner: receiver }, + )) +} + +#[cfg(test)] +mod sender_waiters_tests { + use super::{SENDER_WAITER_COMPACT_MIN_QUEUE_LEN, SenderWaiters}; + use std::sync::{Arc, Mutex}; + use std::task::{Wake, Waker}; + + struct NoopWake; + + impl Wake for NoopWake { + fn wake(self: Arc) {} + + fn wake_by_ref(self: &Arc) {} + } + + struct RecordingWake { + id: usize, + wake_log: Arc>>, + } + + impl RecordingWake { + fn new(id: usize, wake_log: Arc>>) -> Self { + Self { id, wake_log } + } + } + + impl Wake for RecordingWake { + fn wake(self: Arc) { + self.wake_log.lock().unwrap().push(self.id); + } + + fn wake_by_ref(self: &Arc) { + self.wake_log.lock().unwrap().push(self.id); + } + } + + fn noop_waker() -> Waker { + Waker::from(Arc::new(NoopWake)) + } + + #[test] + fn repeated_unregister_keeps_queue_length_bounded_without_wakes() { + // Scenario: blocked send futures are repeatedly canceled before any + // completion capacity is released, so no wake path drains tombstones. + // Guarantees: periodic compaction keeps the waiter queue bounded even + // under pure cancellation churn with no intervening wakeups. 
+ let mut waiters = SenderWaiters::new(); + let waker = noop_waker(); + + for _ in 0..(SENDER_WAITER_COMPACT_MIN_QUEUE_LEN * 3) { + let mut waiter_key = None; + waiters.register_or_refresh(&mut waiter_key, &waker); + waiters.unregister(waiter_key.unwrap()); + + assert!(waiters.queue.len() < SENDER_WAITER_COMPACT_MIN_QUEUE_LEN); + assert_eq!(waiters.queued_live, 0); + assert_eq!(waiters.queue.len(), waiters.queued_stale); + } + } + + #[test] + fn compaction_preserves_fifo_order_for_live_waiters() { + // Scenario: compaction runs after many canceled waiters while a few + // live waiters are still queued in FIFO order. + // Guarantees: compaction removes stale tombstones without changing + // the wake order of the remaining live waiters. + let mut waiters = SenderWaiters::new(); + let wake_log = Arc::new(Mutex::new(Vec::new())); + + for id in 1..=3 { + let mut waiter_key = None; + let waker = Waker::from(Arc::new(RecordingWake::new(id, Arc::clone(&wake_log)))); + waiters.register_or_refresh(&mut waiter_key, &waker); + } + + let stale_needed_for_compaction = SENDER_WAITER_COMPACT_MIN_QUEUE_LEN - waiters.queued_live; + let noop_waker = noop_waker(); + for _ in 0..stale_needed_for_compaction { + let mut waiter_key = None; + waiters.register_or_refresh(&mut waiter_key, &noop_waker); + waiters.unregister(waiter_key.unwrap()); + } + + assert_eq!(waiters.queued_stale, 0); + assert_eq!(waiters.queued_live, 3); + assert_eq!(waiters.queue.len(), 3); + + waiters.wake_all(); + + assert_eq!(*wake_log.lock().unwrap(), vec![1, 2, 3]); + assert!(waiters.queue.is_empty()); + assert_eq!(waiters.queued_live, 0); + assert_eq!(waiters.queued_stale, 0); + } + + #[test] + fn compaction_removes_generation_mismatched_stale_entries() { + // Scenario: a slot is canceled, then reused for a later waiter with a + // new generation while the old queue entry is still present. 
+ // Guarantees: compaction drops the stale generation-mismatched entry + // and retains only the live waiter for the reused slot. + let mut waiters = SenderWaiters::new(); + let waker = noop_waker(); + + let mut first = None; + waiters.register_or_refresh(&mut first, &waker); + let first_key = first.unwrap(); + waiters.unregister(first_key); + + let mut second = None; + waiters.register_or_refresh(&mut second, &waker); + let second_key = second.unwrap(); + + waiters.compact_queue(); + + assert_eq!(waiters.queue.len(), 1); + let retained = waiters.queue.front().copied().unwrap(); + assert_eq!(retained.index, second_key.index); + assert_eq!(retained.generation, second_key.generation); + assert_ne!(retained.generation, first_key.generation); + assert_eq!(waiters.queued_live, 1); + assert_eq!(waiters.queued_stale, 0); + } +} diff --git a/rust/otap-dataflow/crates/control-channel/src/core.rs b/rust/otap-dataflow/crates/control-channel/src/core.rs new file mode 100644 index 0000000000..a034f8a9af --- /dev/null +++ b/rust/otap-dataflow/crates/control-channel/src/core.rs @@ -0,0 +1,584 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Queue core for the standalone control-aware channel. +//! +//! In this crate, "queue core" means the pure in-memory state machine that +//! owns all pending control state and decides both admission and delivery +//! order. It does not do any async waiting, waking, or handle management; the +//! outer channel layer in `channel.rs` adds those behaviors around this core. +//! +//! The queue core is responsible for: +//! +//! - reserved-capacity lifecycle recording for `DrainIngress` and `Shutdown` +//! - bounded admission of completion traffic (`Ack`/`Nack`) +//! - latest-wins/coalesced handling for normal control work +//! - bounded fairness between completion batches and normal control events +//! - deadline-bounded forced shutdown progress +//! +//! [`Inner`] is the concrete queue-core state machine. 
Its generic parameter +//! `Meta` is carried only by completion messages, so callers can attach unwind +//! or routing context while the queue core remains agnostic about its meaning. + +use crate::types::CoreControlEvent; +use crate::{ + AdmissionClass, CompletionMsg, ControlChannelConfig, ControlChannelStats, ControlCmd, + DrainIngressMsg, LifecycleSendResult, Phase, SendOutcome, ShutdownMsg, TrySendError, +}; +use std::collections::VecDeque; +use std::time::Instant; + +/// Delivery class used by the queue core when it round-robins pending normal +/// non-completion control work. +#[derive(Clone, Copy)] +enum NormalEventClass { + /// Latest-wins config updates. Only the newest pending config matters. + Config, + /// Coalesced timer wakeup emitted at most once while pending. + TimerTick, + /// Coalesced telemetry collection token emitted at most once while pending. + CollectTelemetry, +} + +impl NormalEventClass { + /// Advances the round-robin cursor used to provide bounded fairness across + /// normal non-completion control work. + fn next(self) -> Self { + match self { + Self::Config => Self::TimerTick, + Self::TimerTick => Self::CollectTelemetry, + Self::CollectTelemetry => Self::Config, + } + } +} + +/// Queue-core state machine for one control-channel instance. +/// +/// In this crate, a "queue core" is the pure in-memory policy engine that +/// owns all pending control state and decides admission and delivery order. +/// It does not perform async waiting, wakeups, or handle management; the outer +/// channel layer in `channel.rs` adds those behaviors around this state. +/// +/// `Inner` stores pending lifecycle, completion, and normal +/// control work and emits [`CoreControlEvent`] values in policy order. The +/// generic parameter `Meta` is carried only inside completion messages +/// (`Ack`/`Nack`), so the queue core stays agnostic about any unwind or +/// routing meaning attached to that metadata. 
pub(crate) struct Inner<Meta> {
    /// Immutable admission and fairness policy for this channel instance.
    pub(crate) config: ControlChannelConfig,
    /// Current lifecycle phase. This drives which classes can still be
    /// admitted and which events may be delivered next.
    pub(crate) phase: Phase,
    /// Terminal close flag. Once set, no new sends are accepted and the
    /// receiver returns `None` after buffered work is drained.
    pub(crate) closed: bool,
    /// Generation counter used with `Notify` to avoid check-then-sleep races
    /// in sender/receiver wait loops. Any state transition that can unblock a
    /// waiter must bump this value before notifications are observed.
    pub(crate) version: u64,
    /// Reserved lifecycle token for `DrainIngress`. It bypasses bounded
    /// completion capacity and is delivered ahead of normal control work.
    drain_ingress: Option<DrainIngressMsg>,
    /// Reserved lifecycle token for `Shutdown`. It bypasses bounded
    /// completion capacity and is delivered ahead of normal control work.
    shutdown: Option<ShutdownMsg>,
    /// Deadline after which shutdown stops waiting for completion backlog and
    /// forces terminal progress.
    shutdown_deadline: Option<Instant>,
    /// Sticky lifecycle observability flag kept even after the `DrainIngress`
    /// token has been delivered.
    drain_ingress_recorded: bool,
    /// Sticky lifecycle observability flag kept even after the `Shutdown`
    /// token has been delivered.
    shutdown_recorded: bool,
    /// Set once the shutdown deadline has been observed as elapsed.
    shutdown_forced: bool,
    /// Lossless backpressured completion backlog (`Ack`/`Nack`).
    completion: VecDeque<CompletionMsg<Meta>>,
    /// Latest-wins normal control state.
    latest_config: Option<serde_json::Value>,
    /// Coalesced best-effort timer token.
    pending_timer_tick: bool,
    /// Coalesced best-effort telemetry collection token.
    pending_collect_telemetry: bool,
    /// Number of completion messages emitted since the last normal control
    /// event. This enforces bounded fairness between completion traffic and
    /// normal control work.
    completion_burst_len: usize,
    // Monotonic counters used for observability and tests.
    completion_batch_emitted_total: u64,
    completion_message_emitted_total: u64,
    config_replaced_total: u64,
    timer_tick_coalesced_total: u64,
    collect_telemetry_coalesced_total: u64,
    normal_event_dropped_during_drain_total: u64,
    completion_abandoned_on_forced_shutdown_total: u64,
    /// Round-robin cursor for normal control events. This avoids fixed-priority
    /// starvation among config, timer, and telemetry work.
    next_normal_event: NormalEventClass,
}

impl<Meta> Inner<Meta> {
    /// Creates an empty queue core. The outer channel wrapper adds waiting
    /// behavior around this state machine.
    ///
    /// Guarantee: the returned core starts in `Phase::Normal` with no pending
    /// lifecycle, completion, or normal control work.
    pub(crate) fn new(config: ControlChannelConfig) -> Self {
        Self {
            config,
            phase: Phase::Normal,
            closed: false,
            version: 0,
            drain_ingress: None,
            shutdown: None,
            shutdown_deadline: None,
            drain_ingress_recorded: false,
            shutdown_recorded: false,
            shutdown_forced: false,
            completion: VecDeque::new(),
            latest_config: None,
            pending_timer_tick: false,
            pending_collect_telemetry: false,
            completion_burst_len: 0,
            completion_batch_emitted_total: 0,
            completion_message_emitted_total: 0,
            config_replaced_total: 0,
            timer_tick_coalesced_total: 0,
            collect_telemetry_coalesced_total: 0,
            normal_event_dropped_during_drain_total: 0,
            completion_abandoned_on_forced_shutdown_total: 0,
            next_normal_event: NormalEventClass::Config,
        }
    }

    /// Returns a point-in-time snapshot of queue occupancy, lifecycle flags,
    /// and cumulative counters.
    ///
    /// Guarantee: this is observational only; it does not mutate queue state,
    /// fairness counters, or lifecycle progress.
+ pub(crate) fn stats(&self) -> ControlChannelStats { + ControlChannelStats { + phase: self.phase, + drain_ingress_recorded: self.drain_ingress_recorded, + shutdown_recorded: self.shutdown_recorded, + has_pending_drain_ingress: self.drain_ingress.is_some(), + has_pending_shutdown: self.shutdown.is_some(), + completion_len: self.completion.len(), + has_pending_config: self.latest_config.is_some(), + has_pending_timer_tick: self.pending_timer_tick, + has_pending_collect_telemetry: self.pending_collect_telemetry, + completion_burst_len: self.completion_burst_len, + completion_batch_emitted_total: self.completion_batch_emitted_total, + completion_message_emitted_total: self.completion_message_emitted_total, + config_replaced_total: self.config_replaced_total, + timer_tick_coalesced_total: self.timer_tick_coalesced_total, + collect_telemetry_coalesced_total: self.collect_telemetry_coalesced_total, + normal_event_dropped_during_drain_total: self.normal_event_dropped_during_drain_total, + completion_abandoned_on_forced_shutdown_total: self + .completion_abandoned_on_forced_shutdown_total, + shutdown_forced: self.shutdown_forced, + closed: self.closed, + } + } + + /// Closes the queue core. This is triggered either explicitly by a + /// sender or implicitly when the last sender handle drops. + /// + /// Guarantee: close is idempotent. After the first successful close, no new + /// sends are accepted and the outer channel receiver will eventually observe + /// `None` after buffered work is drained. + pub(crate) fn close(&mut self) -> bool { + if self.closed { + return false; + } + self.closed = true; + self.bump_version(); + true + } + + /// Returns the next deadline that senders/receivers must wait on in + /// addition to notifications. Only shutdown currently arms an internal + /// deadline. + /// + /// Guarantee: `Some(deadline)` means waiting until that instant may change + /// what `pop_event()` would return. 
`None` means notifications alone are + /// sufficient because the queue core is either closed, already forced, or + /// not currently deadline-driven. + pub(crate) fn next_deadline(&self) -> Option { + if self.closed || self.shutdown_forced { + return None; + } + if self.phase == Phase::ShutdownRecorded { + return self.shutdown_deadline; + } + None + } + + /// Attempts to admit a non-lifecycle control command using the current + /// phase and bounded-capacity policy. + /// + /// Guarantee: this never blocks. It either updates the queue core + /// immediately or returns `Full`/`Closed` with the original command so the + /// caller retains ownership. Lifecycle messages are intentionally excluded + /// from this path and must use the reserved-capacity record methods. + pub(crate) fn try_send( + &mut self, + cmd: ControlCmd, + ) -> Result> { + if self.phase == Phase::ShutdownRecorded { + self.refresh_shutdown_force(Instant::now()); + } + if self.closed || self.shutdown_forced { + return Err(TrySendError::Closed(cmd)); + } + + match cmd { + ControlCmd::Ack(ack) => { + if self.completion.len() >= self.config.completion_msg_capacity { + return Err(TrySendError::Full { + admission_class: AdmissionClass::Backpressured, + cmd: ControlCmd::Ack(ack), + }); + } + Ok(self.push_completion(CompletionMsg::Ack(ack))) + } + ControlCmd::Nack(nack) => { + if self.completion.len() >= self.config.completion_msg_capacity { + return Err(TrySendError::Full { + admission_class: AdmissionClass::Backpressured, + cmd: ControlCmd::Nack(nack), + }); + } + Ok(self.push_completion(CompletionMsg::Nack(nack))) + } + ControlCmd::Config { config } => Ok(self.send_config(config)), + ControlCmd::TimerTick => Ok(self.send_timer_tick()), + ControlCmd::CollectTelemetry => Ok(self.send_telemetry_tick()), + } + } + + /// Records the receiver-only drain lifecycle token. 
The token is reserved + /// capacity, delivered ahead of bounded control traffic, and clears pending + /// normal control work that no longer matters once drain has started. + /// + /// Guarantee: once accepted, `DrainIngress` becomes observable to the queue + /// consumer even if the completion backlog is saturated. Accepting drain + /// also drops normal control work because config/timer/telemetry no longer + /// matter once ingress shutdown has begun. The `DrainIngressMsg::deadline` + /// is carried through for the receiver event loop to interpret; unlike + /// `Shutdown`, it does not arm queue-core forced-progress behavior here. + pub(crate) fn record_drain_ingress(&mut self, msg: DrainIngressMsg) -> LifecycleSendResult { + if self.closed { + return LifecycleSendResult::Closed; + } + if self.drain_ingress_recorded { + return LifecycleSendResult::AlreadyAccepted; + } + + self.clear_normal_pending(); + self.drain_ingress = Some(msg); + self.drain_ingress_recorded = true; + if self.phase == Phase::Normal { + self.phase = Phase::IngressDrainRecorded; + } + self.bump_version(); + LifecycleSendResult::Accepted + } + + /// Records shutdown with its terminal deadline. Shutdown admission is + /// reserved-capacity and flips the queue into terminal-progress mode. + /// + /// Guarantee: once accepted, shutdown is remembered even if completion + /// capacity is full. Pending normal control is discarded immediately, + /// future normal control is rejected, and delivery thereafter is governed + /// by completion draining plus the forced-shutdown deadline. 
+ pub(crate) fn record_shutdown(&mut self, msg: ShutdownMsg) -> LifecycleSendResult { + if self.closed { + return LifecycleSendResult::Closed; + } + if self.shutdown_recorded { + return LifecycleSendResult::AlreadyAccepted; + } + + self.clear_normal_pending(); + self.shutdown_deadline = Some(msg.deadline); + self.shutdown = Some(msg); + self.shutdown_recorded = true; + self.phase = Phase::ShutdownRecorded; + self.refresh_shutdown_force(Instant::now()); + self.bump_version(); + LifecycleSendResult::Accepted + } + + /// Pops the next deliverable event according to lifecycle precedence, + /// bounded fairness, and deadline-bounded shutdown rules. + /// + /// Guarantee: this is the only place where the queue core converts pending + /// state into a deliverable [`CoreControlEvent`]. It never invents new + /// work, and it returns `None` only when no event is currently deliverable. + /// When shutdown has reached its force deadline, the returned shutdown path + /// abandons any remaining completion backlog and makes terminal progress. + /// + /// Delivery order is: + /// 1. `DrainIngress` if pending + /// 2. shutdown-mode completion draining, subject to deadline forcing + /// 3. in normal phase, bounded alternation between completion batches and + /// round-robin normal control events + pub(crate) fn pop_event(&mut self) -> Option> { + if self.phase == Phase::ShutdownRecorded { + self.refresh_shutdown_force(Instant::now()); + } + + // Reserved lifecycle delivery always wins. `DrainIngress` is emitted + // once before any remaining completion backlog so receivers stop + // admitting new external work as early as possible. + if let Some(msg) = self.drain_ingress.take() { + self.completion_burst_len = 0; + self.bump_version(); + return Some(CoreControlEvent::DrainIngress(msg)); + } + + // Drain/shutdown phases override normal fairness rules. 
Once either + // lifecycle has been recorded, normal control work has already been + // discarded and only lifecycle/completion delivery remains relevant. + match self.phase { + Phase::ShutdownRecorded => { + if self.shutdown_forced { + return self.finalize_shutdown(true); + } + if !self.completion.is_empty() { + return Some(self.take_completion_batch(None)); + } + return self.finalize_shutdown(false); + } + Phase::IngressDrainRecorded => { + if !self.completion.is_empty() { + return Some(self.take_completion_batch(None)); + } + return None; + } + Phase::Normal => {} + } + + let has_pending_normal_event = self.has_pending_normal_event(); + + // If only completion work remains, emit it greedily; there is no + // normal traffic left that needs fairness protection. + if !has_pending_normal_event { + if !self.completion.is_empty() { + return Some(self.take_completion_batch(None)); + } + return None; + } + + // Once the completion burst budget is exhausted, force one pending + // normal event before emitting more completion traffic. + if self.completion_burst_len >= self.config.completion_burst_limit { + if let Some(event) = self.take_next_normal_event() { + return Some(event); + } + } + + // Otherwise, prefer completion traffic until the burst limit says one + // normal event must run, but cap the batch so we do not overshoot that + // fairness budget. + if !self.completion.is_empty() { + return Some( + self.take_completion_batch(Some( + self.config + .completion_burst_limit + .saturating_sub(self.completion_burst_len), + )), + ); + } + + if let Some(event) = self.take_next_normal_event() { + return Some(event); + } + + None + } + + /// Checks whether shutdown has crossed its force deadline and, if so, + /// flips the queue into forced terminal progress. 
+ fn refresh_shutdown_force(&mut self, now: Instant) { + if self.shutdown_forced || self.phase != Phase::ShutdownRecorded { + return; + } + + if let Some(deadline) = self.shutdown_deadline { + if now >= deadline { + self.shutdown_forced = true; + self.bump_version(); + } + } + } + + /// Appends one completion message to the lossless backpressured backlog. + fn push_completion(&mut self, msg: CompletionMsg) -> SendOutcome { + self.completion.push_back(msg); + self.bump_version(); + SendOutcome::Accepted + } + + /// Accepts or replaces the latest pending config while the queue remains in + /// normal phase. Config is dropped once drain or shutdown has started. + fn send_config(&mut self, config: serde_json::Value) -> SendOutcome { + if self.phase != Phase::Normal { + self.normal_event_dropped_during_drain_total = self + .normal_event_dropped_during_drain_total + .saturating_add(1); + return SendOutcome::DroppedDuringDrain; + } + + let outcome = if self.latest_config.is_some() { + self.config_replaced_total = self.config_replaced_total.saturating_add(1); + SendOutcome::Replaced + } else { + SendOutcome::Accepted + }; + self.latest_config = Some(config); + self.bump_version(); + outcome + } + + /// Accepts one pending timer tick token. Repeated offers coalesce until + /// the pending token is delivered. + fn send_timer_tick(&mut self) -> SendOutcome { + if self.phase != Phase::Normal { + self.normal_event_dropped_during_drain_total = self + .normal_event_dropped_during_drain_total + .saturating_add(1); + return SendOutcome::DroppedDuringDrain; + } + + if self.pending_timer_tick { + self.timer_tick_coalesced_total = self.timer_tick_coalesced_total.saturating_add(1); + return SendOutcome::Coalesced; + } + self.pending_timer_tick = true; + self.bump_version(); + SendOutcome::Accepted + } + + /// Accepts one pending telemetry collection token. Repeated offers + /// coalesce until the pending token is delivered. 
+ fn send_telemetry_tick(&mut self) -> SendOutcome { + if self.phase != Phase::Normal { + self.normal_event_dropped_during_drain_total = self + .normal_event_dropped_during_drain_total + .saturating_add(1); + return SendOutcome::DroppedDuringDrain; + } + + if self.pending_collect_telemetry { + self.collect_telemetry_coalesced_total = + self.collect_telemetry_coalesced_total.saturating_add(1); + return SendOutcome::Coalesced; + } + + self.pending_collect_telemetry = true; + self.bump_version(); + SendOutcome::Accepted + } + + /// Drops pending normal control state when drain/shutdown begins or after + /// terminal shutdown delivery. + fn clear_normal_pending(&mut self) { + self.latest_config = None; + self.pending_timer_tick = false; + self.pending_collect_telemetry = false; + self.completion_burst_len = 0; + } + + fn has_pending_normal_event(&self) -> bool { + self.latest_config.is_some() || self.pending_timer_tick || self.pending_collect_telemetry + } + + /// Picks the next pending normal control event using the round-robin + /// cursor. Delivering any normal event resets the completion burst counter. + fn take_next_normal_event(&mut self) -> Option> { + for _ in 0..3 { + let candidate = self.next_normal_event; + self.next_normal_event = candidate.next(); + + let event = match candidate { + NormalEventClass::Config => self + .latest_config + .take() + .map(|config| CoreControlEvent::Config { config }), + NormalEventClass::TimerTick => self.pending_timer_tick.then(|| { + self.pending_timer_tick = false; + CoreControlEvent::TimerTick + }), + NormalEventClass::CollectTelemetry => self.pending_collect_telemetry.then(|| { + self.pending_collect_telemetry = false; + CoreControlEvent::CollectTelemetry + }), + }; + + if let Some(event) = event { + self.completion_burst_len = 0; + self.bump_version(); + return Some(event); + } + } + + None + } + + /// Emits one bounded completion batch. 
When fairness is active, the batch + /// size is further capped so at least one normal event can run before more + /// completion traffic is emitted. + fn take_completion_batch( + &mut self, + fairness_budget: Option, + ) -> CoreControlEvent { + let mut batch_len = self.completion.len().min(self.config.completion_batch_max); + if let Some(limit) = fairness_budget { + batch_len = batch_len.min(limit.max(1)); + } + + let mut batch = Vec::with_capacity(batch_len); + for _ in 0..batch_len { + // `batch_len` is derived from `self.completion.len()` and nothing + // mutates `self.completion` between that check and this loop, so + // this `expect` can only fire if this function breaks its own + // local accounting invariant. + let msg = self + .completion + .pop_front() + .expect("completion batch length/accounting invariant"); + batch.push(msg); + } + self.completion_burst_len = self.completion_burst_len.saturating_add(batch_len); + self.completion_batch_emitted_total = self.completion_batch_emitted_total.saturating_add(1); + self.completion_message_emitted_total = self + .completion_message_emitted_total + .saturating_add(batch_len as u64); + self.bump_version(); + CoreControlEvent::CompletionBatch(batch) + } + + /// Emits the terminal shutdown event. Forced shutdown abandons any + /// remaining completion backlog; graceful shutdown emits only after that + /// backlog has drained. + fn finalize_shutdown(&mut self, forced: bool) -> Option> { + let msg = self.shutdown.take()?; + if forced { + self.completion_abandoned_on_forced_shutdown_total = self + .completion_abandoned_on_forced_shutdown_total + .saturating_add(self.completion.len() as u64); + self.completion.clear(); + } + + self.clear_normal_pending(); + self.shutdown_deadline = None; + self.shutdown_forced = false; + self.closed = true; + self.bump_version(); + Some(CoreControlEvent::Shutdown(msg)) + } + + /// Bumps the waiter generation after any transition that may unblock a + /// sender or receiver. 
+ fn bump_version(&mut self) { + self.version = self.version.wrapping_add(1); + } +} diff --git a/rust/otap-dataflow/crates/control-channel/src/lib.rs b/rust/otap-dataflow/crates/control-channel/src/lib.rs new file mode 100644 index 0000000000..c60867438f --- /dev/null +++ b/rust/otap-dataflow/crates/control-channel/src/lib.rs @@ -0,0 +1,26 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Standalone control-aware bounded channel primitives. +//! +//! This crate is intentionally not integrated into the engine runtime yet. It +//! provides a bounded, policy-aware control channel that can +//! reserve lifecycle delivery, batch completion traffic, and coalesce +//! best-effort control work. + +mod channel; +mod core; +mod types; + +pub use channel::{ + NodeControlReceiver, NodeControlSender, ReceiverControlReceiver, ReceiverControlSender, + node_channel, node_channel_with_meta, receiver_channel, receiver_channel_with_meta, +}; +pub use types::{ + AckMsg, AdmissionClass, CompletionMsg, ConfigError, ControlChannelConfig, ControlChannelStats, + ControlCmd, DrainIngressMsg, LifecycleSendResult, NackMsg, NodeControlEvent, Phase, + ReceiverControlEvent, SendError, SendOutcome, ShutdownMsg, TrySendError, +}; + +#[cfg(test)] +mod tests; diff --git a/rust/otap-dataflow/crates/control-channel/src/tests.rs b/rust/otap-dataflow/crates/control-channel/src/tests.rs new file mode 100644 index 0000000000..fe05bfbbf9 --- /dev/null +++ b/rust/otap-dataflow/crates/control-channel/src/tests.rs @@ -0,0 +1,832 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +use crate::{ + AckMsg, AdmissionClass, CompletionMsg, ConfigError, ControlChannelConfig, ControlCmd, + DrainIngressMsg, LifecycleSendResult, NodeControlEvent, ReceiverControlEvent, SendError, + SendOutcome, ShutdownMsg, TrySendError, node_channel, node_channel_with_meta, receiver_channel, +}; +use std::future::{Future, poll_fn}; +use std::pin::Pin; +use 
std::sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, +}; +use std::task::{Context, Poll, Wake, Waker}; +use std::time::{Duration, Instant}; + +fn test_config() -> ControlChannelConfig { + ControlChannelConfig { + completion_msg_capacity: 4, + completion_batch_max: 2, + completion_burst_limit: 2, + } +} + +fn ack(value: &str) -> ControlCmd { + ControlCmd::Ack(AckMsg::new(value.to_owned())) +} + +fn nack(value: &str) -> ControlCmd { + ControlCmd::Nack(crate::NackMsg::new("nack", value.to_owned())) +} + +#[derive(Clone, Debug, PartialEq, Eq)] +struct TestMeta { + route: &'static str, + seq: u64, +} + +fn shutdown(deadline: Instant) -> ShutdownMsg { + ShutdownMsg { + deadline, + reason: "shutdown".to_owned(), + } +} + +fn drain(deadline: Instant) -> DrainIngressMsg { + DrainIngressMsg { + deadline, + reason: "drain".to_owned(), + } +} + +async fn is_pending_once(mut future: Pin<&mut F>) -> bool +where + F: Future, +{ + poll_fn(|cx| Poll::Ready(future.as_mut().poll(cx).is_pending())).await +} + +#[derive(Default)] +struct CountingWake { + wake_count: AtomicUsize, +} + +impl CountingWake { + fn count(&self) -> usize { + self.wake_count.load(Ordering::SeqCst) + } +} + +impl Wake for CountingWake { + fn wake(self: Arc) { + let _ = self.wake_count.fetch_add(1, Ordering::SeqCst); + } + + fn wake_by_ref(self: &Arc) { + let _ = self.wake_count.fetch_add(1, Ordering::SeqCst); + } +} + +#[test] +fn zero_completion_capacity_is_rejected_at_validation_and_construction() { + // Scenario: retained completion traffic needs at least one slot, so zero + // completion capacity must be rejected before constructing the channel. + // Guarantees: config validation and both channel constructors reject this + // impossible configuration before any send path can block forever. 
+ let config = ControlChannelConfig { + completion_msg_capacity: 0, + completion_batch_max: 1, + completion_burst_limit: 1, + }; + + assert_eq!( + config.validate(), + Err(ConfigError::ZeroCompletionMsgCapacity) + ); + assert!(matches!( + node_channel::(config.clone()), + Err(ConfigError::ZeroCompletionMsgCapacity) + )); + assert!(matches!( + receiver_channel::(config), + Err(ConfigError::ZeroCompletionMsgCapacity) + )); +} + +#[tokio::test(flavor = "current_thread")] +async fn lifecycle_tokens_remain_deliverable_under_backlog() { + // Scenario: completion backlog is saturated, but reserved-capacity + // lifecycle delivery still gets shutdown through to the receiver. + // Guarantees: lifecycle tokens bypass completion-capacity pressure and + // remain deliverable even when regular completion admission is full. + let deadline = Instant::now() + Duration::from_secs(1); + let (tx, mut rx) = node_channel(ControlChannelConfig { + completion_msg_capacity: 2, + completion_batch_max: 2, + completion_burst_limit: 2, + }) + .unwrap(); + + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + assert_eq!(tx.try_send(ack("ack-2")).unwrap(), SendOutcome::Accepted); + assert_eq!( + tx.accept_shutdown(shutdown(deadline)), + LifecycleSendResult::Accepted + ); + + let first = rx.recv().await.expect("completion batch should arrive"); + let second = rx.recv().await.expect("shutdown should arrive"); + + assert_eq!( + first, + NodeControlEvent::CompletionBatch(vec![ + CompletionMsg::Ack(AckMsg::new("ack-1".to_owned())), + CompletionMsg::Ack(AckMsg::new("ack-2".to_owned())), + ]) + ); + assert_eq!( + second, + NodeControlEvent::Shutdown(ShutdownMsg { + deadline, + reason: "shutdown".to_owned(), + }) + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn drain_ingress_precedes_shutdown_even_if_shutdown_arrives_first() { + // Scenario: the engine records shutdown before receiver drain, but the + // receiver must still observe `DrainIngress` before `Shutdown`. 
+ // Guarantees: delivery order gives receiver drain precedence over shutdown + // regardless of lifecycle recording order. + let deadline = Instant::now() + Duration::from_secs(1); + let (tx, mut rx) = receiver_channel::(test_config()).unwrap(); + + assert_eq!( + tx.accept_shutdown(shutdown(deadline)), + LifecycleSendResult::Accepted + ); + assert_eq!( + tx.accept_drain_ingress(drain(deadline)), + LifecycleSendResult::Accepted + ); + + assert_eq!( + rx.recv().await, + Some(ReceiverControlEvent::DrainIngress(DrainIngressMsg { + deadline, + reason: "drain".to_owned(), + })) + ); + assert_eq!( + rx.recv().await, + Some(ReceiverControlEvent::Shutdown(ShutdownMsg { + deadline, + reason: "shutdown".to_owned(), + })) + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn config_and_best_effort_work_are_rejected_after_drain_ingress() { + // Scenario: once receiver drain starts, normal control work becomes stale + // and must be dropped while completion traffic continues to drain. + // Guarantees: drain clears normal control semantics, preserves completion + // draining, and records dropped-normal counters for observability. 
+ let deadline = Instant::now() + Duration::from_secs(1); + let (tx, mut rx) = receiver_channel::(test_config()).unwrap(); + + assert_eq!( + tx.accept_drain_ingress(drain(deadline)), + LifecycleSendResult::Accepted + ); + assert_eq!( + tx.try_send(ControlCmd::Config { + config: serde_json::json!({"ignored": true}), + }) + .unwrap(), + SendOutcome::DroppedDuringDrain + ); + assert_eq!( + tx.try_send(ControlCmd::TimerTick).unwrap(), + SendOutcome::DroppedDuringDrain + ); + assert_eq!( + tx.try_send(ControlCmd::CollectTelemetry).unwrap(), + SendOutcome::DroppedDuringDrain + ); + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + + let stats = tx.stats(); + assert!(stats.drain_ingress_recorded); + assert!(!stats.shutdown_recorded); + assert_eq!(stats.normal_event_dropped_during_drain_total, 3); + + assert_eq!( + rx.recv().await, + Some(ReceiverControlEvent::DrainIngress(DrainIngressMsg { + deadline, + reason: "drain".to_owned(), + })) + ); + assert_eq!( + rx.recv().await, + Some(ReceiverControlEvent::CompletionBatch(vec![ + CompletionMsg::Ack(AckMsg::new("ack-1".to_owned())) + ])) + ); + assert_eq!(rx.try_recv(), None); +} + +#[tokio::test(flavor = "current_thread")] +async fn completion_batching_preserves_arrival_order() { + // Scenario: mixed `Ack` and `Nack` messages are batched without reordering. + // Guarantees: completion batching preserves FIFO arrival order and updates + // the emitted-batch and emitted-message counters consistently. 
+ let (tx, mut rx) = node_channel::(test_config()).unwrap(); + + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + assert_eq!(tx.try_send(nack("nack-1")).unwrap(), SendOutcome::Accepted); + assert_eq!(tx.try_send(ack("ack-2")).unwrap(), SendOutcome::Accepted); + + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![ + CompletionMsg::Ack(AckMsg::new("ack-1".to_owned())), + CompletionMsg::Nack(crate::NackMsg::new("nack", "nack-1".to_owned())), + ])) + ); + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![CompletionMsg::Ack( + AckMsg::new("ack-2".to_owned()) + )])) + ); + + let stats = tx.stats(); + assert_eq!(stats.completion_batch_emitted_total, 2); + assert_eq!(stats.completion_message_emitted_total, 3); +} + +#[tokio::test(flavor = "current_thread")] +async fn completion_burst_limit_forces_pending_normal_work() { + // Scenario: pending normal control work must break up long completion + // bursts once the configured burst limit is reached. + // Guarantees: the completion burst limit forces a normal event before more + // completion traffic can continue once the configured limit is reached. 
+ let (tx, mut rx) = node_channel::(ControlChannelConfig { + completion_msg_capacity: 8, + completion_batch_max: 2, + completion_burst_limit: 2, + }) + .unwrap(); + + for value in ["ack-1", "ack-2", "ack-3", "ack-4"] { + assert_eq!(tx.try_send(ack(value)).unwrap(), SendOutcome::Accepted); + } + assert_eq!( + tx.try_send(ControlCmd::Config { + config: serde_json::json!({"version": 1}), + }) + .unwrap(), + SendOutcome::Accepted + ); + + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![ + CompletionMsg::Ack(AckMsg::new("ack-1".to_owned())), + CompletionMsg::Ack(AckMsg::new("ack-2".to_owned())), + ])) + ); + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::Config { + config: serde_json::json!({"version": 1}), + }) + ); + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![ + CompletionMsg::Ack(AckMsg::new("ack-3".to_owned())), + CompletionMsg::Ack(AckMsg::new("ack-4".to_owned())), + ])) + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn pending_normal_work_is_cleared_when_shutdown_is_accepted() { + // Scenario: stale normal control work is discarded when shutdown is + // accepted so terminal progress is not delayed by obsolete tokens. + // Guarantees: shutdown admission clears pending normal work immediately and + // allows terminal progress without delivering stale config or best-effort tokens. 
+ let deadline = Instant::now() + Duration::from_secs(1); + let (tx, mut rx) = node_channel::(test_config()).unwrap(); + + assert_eq!( + tx.try_send(ControlCmd::TimerTick).unwrap(), + SendOutcome::Accepted + ); + assert_eq!( + tx.try_send(ControlCmd::CollectTelemetry).unwrap(), + SendOutcome::Accepted + ); + assert_eq!( + tx.try_send(ControlCmd::Config { + config: serde_json::json!({"ignored": true}), + }) + .unwrap(), + SendOutcome::Accepted + ); + assert_eq!( + tx.accept_shutdown(shutdown(deadline)), + LifecycleSendResult::Accepted + ); + + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::Shutdown(ShutdownMsg { + deadline, + reason: "shutdown".to_owned(), + })) + ); + assert_eq!(rx.try_recv(), None); +} + +#[tokio::test(flavor = "current_thread")] +async fn best_effort_work_is_suppressed_once_shutdown_is_recorded() { + // Scenario: after shutdown is latched, completion traffic may still drain + // but normal control work must be rejected. + // Guarantees: shutdown preserves completion draining while rejecting new + // normal control work and accounting for those drops. 
+ let deadline = Instant::now() + Duration::from_secs(1); + let (tx, mut rx) = node_channel::(test_config()).unwrap(); + + assert_eq!( + tx.accept_shutdown(shutdown(deadline)), + LifecycleSendResult::Accepted + ); + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + assert_eq!( + tx.try_send(ControlCmd::TimerTick).unwrap(), + SendOutcome::DroppedDuringDrain + ); + assert_eq!( + tx.try_send(ControlCmd::CollectTelemetry).unwrap(), + SendOutcome::DroppedDuringDrain + ); + assert_eq!( + tx.try_send(ControlCmd::Config { + config: serde_json::json!({"ignored": true}), + }) + .unwrap(), + SendOutcome::DroppedDuringDrain + ); + + let stats = tx.stats(); + assert!(stats.shutdown_recorded); + assert_eq!(stats.normal_event_dropped_during_drain_total, 3); + + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![CompletionMsg::Ack( + AckMsg::new("ack-1".to_owned()) + )])) + ); + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::Shutdown(ShutdownMsg { + deadline, + reason: "shutdown".to_owned(), + })) + ); + assert_eq!(rx.try_recv(), None); +} + +#[tokio::test(flavor = "current_thread")] +async fn shutdown_deadline_forces_terminal_progress() { + // Scenario: shutdown reaches its deadline while completion backlog still + // exists, so the queue must force terminal progress and abandon the rest. + // Guarantees: the force deadline bounds shutdown latency, abandons any + // remaining completion backlog, and closes the channel afterward. 
+ let deadline = Instant::now() + Duration::from_millis(20); + let (tx, mut rx) = node_channel::(ControlChannelConfig { + completion_msg_capacity: 4, + completion_batch_max: 2, + completion_burst_limit: 2, + }) + .unwrap(); + + assert_eq!( + tx.accept_shutdown(shutdown(deadline)), + LifecycleSendResult::Accepted + ); + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + assert_eq!(tx.try_send(ack("ack-2")).unwrap(), SendOutcome::Accepted); + + tokio::time::sleep(Duration::from_millis(40)).await; + + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::Shutdown(ShutdownMsg { + deadline, + reason: "shutdown".to_owned(), + })) + ); + assert_eq!(rx.try_recv(), None); + + let stats = tx.stats(); + assert!(stats.shutdown_recorded); + assert_eq!(stats.completion_abandoned_on_forced_shutdown_total, 2); + assert!(stats.closed); + + match tx.try_send(ack("ack-3")) { + Err(TrySendError::Closed(ControlCmd::Ack(ack))) => { + assert_eq!(*ack.accepted, "ack-3".to_owned()); + } + other => panic!("expected closed after forced shutdown, got {other:?}"), + } +} + +#[tokio::test(flavor = "current_thread")] +async fn receiver_returns_none_once_last_sender_drops_and_queue_is_empty() { + // Scenario: dropping the last sender closes the queue and lets the + // receiver terminate once there is no buffered work left. + // Guarantees: last-sender drop transitions the channel to closed and the + // receiver eventually observes `None` after draining buffered state. + let (tx, mut rx) = node_channel::(test_config()).unwrap(); + drop(tx); + + let result = tokio::time::timeout(Duration::from_millis(50), rx.recv()) + .await + .expect("receiver should wake when the last sender drops"); + assert_eq!(result, None); +} + +#[tokio::test(flavor = "current_thread")] +async fn blocking_send_waits_for_capacity_then_completes() { + // Scenario: a blocking send waits for bounded completion capacity and then + // succeeds after the receiver drains one batch. 
+ // Guarantees: blocking completion sends park only until capacity is freed + // and then complete with the original send outcome. + let (tx, mut rx) = node_channel::(ControlChannelConfig { + completion_msg_capacity: 1, + completion_batch_max: 1, + completion_burst_limit: 1, + }) + .unwrap(); + + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + + let mut blocked = std::pin::pin!(tx.send(ack("ack-2"))); + assert!(is_pending_once(blocked.as_mut()).await); + + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![CompletionMsg::Ack( + AckMsg::new("ack-1".to_owned()) + )])) + ); + assert_eq!(blocked.await.unwrap(), SendOutcome::Accepted); + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![CompletionMsg::Ack( + AckMsg::new("ack-2".to_owned()) + )])) + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn drain_ingress_does_not_wake_blocked_completion_senders() { + // Scenario: receiver drain begins while completion capacity is full and + // a completion sender is blocked waiting for one retained completion slot. + // Guarantees: `DrainIngress` wakes the channel receiver so lifecycle + // precedence is preserved, but blocked completion senders stay asleep + // until a later completion batch actually frees capacity. 
+ let deadline = Instant::now() + Duration::from_secs(1); + let (tx, mut rx) = receiver_channel::(ControlChannelConfig { + completion_msg_capacity: 1, + completion_batch_max: 1, + completion_burst_limit: 1, + }) + .unwrap(); + + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + + let wake_counter = Arc::new(CountingWake::default()); + let waker = Waker::from(wake_counter.clone()); + let mut cx = Context::from_waker(&waker); + let mut blocked = std::pin::pin!(tx.send(ack("ack-2"))); + assert!(matches!(blocked.as_mut().poll(&mut cx), Poll::Pending)); + assert_eq!(wake_counter.count(), 0); + + assert_eq!( + tx.accept_drain_ingress(drain(deadline)), + LifecycleSendResult::Accepted + ); + assert_eq!(wake_counter.count(), 0); + assert_eq!( + rx.recv().await, + Some(ReceiverControlEvent::DrainIngress(DrainIngressMsg { + deadline, + reason: "drain".to_owned(), + })) + ); + assert_eq!(wake_counter.count(), 0); + + assert_eq!( + rx.recv().await, + Some(ReceiverControlEvent::CompletionBatch(vec![ + CompletionMsg::Ack(AckMsg::new("ack-1".to_owned())) + ])) + ); + assert_eq!(wake_counter.count(), 1); + + assert_eq!(blocked.await.unwrap(), SendOutcome::Accepted); + assert_eq!( + rx.recv().await, + Some(ReceiverControlEvent::CompletionBatch(vec![ + CompletionMsg::Ack(AckMsg::new("ack-2".to_owned())) + ])) + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn completion_batch_wakes_as_many_blocked_senders_as_slots_freed() { + // Scenario: draining a completion batch should wake one blocked sender per + // freed completion slot, not just one sender for the whole batch. + // Guarantees: blocked-sender wakeups scale with the number of released + // completion slots rather than degenerating into one-wakeup-per-batch behavior. 
+ let (tx, mut rx) = node_channel::(ControlChannelConfig { + completion_msg_capacity: 2, + completion_batch_max: 2, + completion_burst_limit: 2, + }) + .unwrap(); + + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + assert_eq!(tx.try_send(ack("ack-2")).unwrap(), SendOutcome::Accepted); + + let tx_clone = tx.clone(); + let mut blocked_one = std::pin::pin!(tx.send(ack("ack-3"))); + let mut blocked_two = std::pin::pin!(tx_clone.send(ack("ack-4"))); + assert!(is_pending_once(blocked_one.as_mut()).await); + assert!(is_pending_once(blocked_two.as_mut()).await); + + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![ + CompletionMsg::Ack(AckMsg::new("ack-1".to_owned())), + CompletionMsg::Ack(AckMsg::new("ack-2".to_owned())), + ])) + ); + + assert_eq!(blocked_one.await.unwrap(), SendOutcome::Accepted); + assert_eq!(blocked_two.await.unwrap(), SendOutcome::Accepted); + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![ + CompletionMsg::Ack(AckMsg::new("ack-3".to_owned())), + CompletionMsg::Ack(AckMsg::new("ack-4".to_owned())), + ])) + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn canceled_blocked_sender_does_not_steal_next_capacity_wakeup() { + // Scenario: canceling a blocked sender should unregister its waiter so the + // next capacity release wakes a live blocked sender instead of a stale one. + // Guarantees: canceled blocked sends do not leave stale waiters that can + // consume a future capacity wakeup meant for a live sender. 
+ let (tx, mut rx) = node_channel::(ControlChannelConfig { + completion_msg_capacity: 1, + completion_batch_max: 1, + completion_burst_limit: 1, + }) + .unwrap(); + + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + + { + let mut canceled = std::pin::pin!(tx.send(ack("ack-2"))); + assert!(is_pending_once(canceled.as_mut()).await); + } + + let tx_clone = tx.clone(); + let mut live = std::pin::pin!(tx_clone.send(ack("ack-3"))); + assert!(is_pending_once(live.as_mut()).await); + + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![CompletionMsg::Ack( + AckMsg::new("ack-1".to_owned()) + )])) + ); + assert_eq!(live.await.unwrap(), SendOutcome::Accepted); + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![CompletionMsg::Ack( + AckMsg::new("ack-3".to_owned()) + )])) + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn close_wakes_blocked_senders_with_closed() { + // Scenario: closing the channel must wake blocked senders so they return + // `Closed` with the original command instead of waiting forever. + // Guarantees: terminal close wakes parked senders immediately and preserves + // ownership of the original command in the returned error. 
+ let (tx, _rx) = node_channel::(ControlChannelConfig { + completion_msg_capacity: 1, + completion_batch_max: 1, + completion_burst_limit: 1, + }) + .unwrap(); + + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + + let mut blocked = std::pin::pin!(tx.send(ack("ack-2"))); + assert!(is_pending_once(blocked.as_mut()).await); + + tx.close(); + + match blocked.await { + Err(SendError::Closed(ControlCmd::Ack(ack))) => { + assert_eq!(*ack.accepted, "ack-2".to_owned()); + } + other => panic!("expected closed send after close(), got {other:?}"), + } +} + +#[tokio::test(flavor = "current_thread")] +async fn try_send_returns_full_with_the_original_command() { + // Scenario: `try_send` preserves the original command when bounded + // backpressured completion admission is full. + // Guarantees: a full error reports the backpressured admission class and + // returns the original command unchanged to the caller. + let (tx, _rx) = node_channel::(ControlChannelConfig { + completion_msg_capacity: 1, + completion_batch_max: 1, + completion_burst_limit: 1, + }) + .unwrap(); + + assert_eq!(tx.try_send(ack("ack-1")).unwrap(), SendOutcome::Accepted); + + match tx.try_send(nack("nack-1")) { + Err(TrySendError::Full { + admission_class: AdmissionClass::Backpressured, + cmd: ControlCmd::Nack(nack), + }) => { + assert_eq!(nack.reason, "nack"); + assert_eq!(*nack.refused, "nack-1".to_owned()); + } + other => panic!("expected full completion error, got {other:?}"), + } +} + +#[tokio::test(flavor = "current_thread")] +async fn completion_metadata_survives_batching_and_full_errors() { + // Scenario: explicit completion metadata must survive batching and full + // backpressure errors so future engine integration can preserve unwind state. + // Guarantees: completion metadata is preserved both when a completion is + // delivered in a batch and when `try_send` returns the original command. 
+ let (tx, mut rx) = node_channel_with_meta::(ControlChannelConfig { + completion_msg_capacity: 1, + completion_batch_max: 1, + completion_burst_limit: 1, + }) + .unwrap(); + + let first_meta = TestMeta { + route: "batch", + seq: 1, + }; + let second_meta = TestMeta { + route: "full", + seq: 2, + }; + + assert_eq!( + tx.try_send(ControlCmd::Ack(AckMsg::with_meta( + "ack-1".to_owned(), + first_meta.clone(), + ))) + .unwrap(), + SendOutcome::Accepted + ); + + match tx.try_send(ControlCmd::Nack(crate::NackMsg::with_meta( + "nack", + "nack-1".to_owned(), + second_meta.clone(), + ))) { + Err(TrySendError::Full { + admission_class: AdmissionClass::Backpressured, + cmd: ControlCmd::Nack(nack), + }) => { + assert_eq!(nack.reason, "nack"); + assert_eq!(*nack.refused, "nack-1".to_owned()); + assert_eq!(nack.meta, second_meta); + } + other => panic!("expected full completion error with preserved metadata, got {other:?}"), + } + + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::CompletionBatch(vec![CompletionMsg::Ack( + AckMsg::with_meta("ack-1".to_owned(), first_meta) + )])) + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn lifecycle_duplicate_is_rejected_before_delivery_and_closed_after_terminal_shutdown() { + // Scenario: duplicate lifecycle offers are rejected while pending, and the + // lifecycle path closes after terminal shutdown has been delivered. + // Guarantees: lifecycle recording is idempotent while pending and becomes + // permanently closed once terminal shutdown delivery completes. 
+ let deadline = Instant::now() + Duration::from_secs(1); + let (tx, mut rx) = node_channel::(test_config()).unwrap(); + + assert_eq!( + tx.accept_shutdown(shutdown(deadline)), + LifecycleSendResult::Accepted + ); + assert_eq!( + tx.accept_shutdown(shutdown(deadline + Duration::from_secs(1))), + LifecycleSendResult::AlreadyAccepted + ); + assert_eq!( + rx.recv().await, + Some(NodeControlEvent::Shutdown(ShutdownMsg { + deadline, + reason: "shutdown".to_owned(), + })) + ); + + assert_eq!( + tx.accept_shutdown(shutdown(deadline + Duration::from_secs(1))), + LifecycleSendResult::Closed + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn lifecycle_accept_returns_closed_after_sender_close() { + // Scenario: once senders close the channel, reserved lifecycle acceptance + // also reports the closed state. + // Guarantees: lifecycle admission shares the same terminal closed state as + // regular send paths after sender-driven close. + let (tx, _rx) = receiver_channel::(test_config()).unwrap(); + tx.close(); + + assert_eq!( + tx.accept_drain_ingress(drain(Instant::now() + Duration::from_secs(1))), + LifecycleSendResult::Closed + ); + assert_eq!( + tx.accept_shutdown(shutdown(Instant::now() + Duration::from_secs(1))), + LifecycleSendResult::Closed + ); +} + +#[tokio::test(flavor = "current_thread")] +async fn stats_track_config_replacement_and_best_effort_coalescing() { + // Scenario: stats expose replacement and coalescing counters while normal + // control events remain pending. + // Guarantees: stats reflect latest-wins config replacement, best-effort + // coalescing, and the presence of still-pending normal control work. 
+ let (tx, _rx) = node_channel::(test_config()).unwrap(); + + assert_eq!( + tx.try_send(ControlCmd::Config { + config: serde_json::json!({"version": 1}), + }) + .unwrap(), + SendOutcome::Accepted + ); + assert_eq!( + tx.try_send(ControlCmd::Config { + config: serde_json::json!({"version": 2}), + }) + .unwrap(), + SendOutcome::Replaced + ); + assert_eq!( + tx.try_send(ControlCmd::TimerTick).unwrap(), + SendOutcome::Accepted + ); + assert_eq!( + tx.try_send(ControlCmd::TimerTick).unwrap(), + SendOutcome::Coalesced + ); + assert_eq!( + tx.try_send(ControlCmd::CollectTelemetry).unwrap(), + SendOutcome::Accepted + ); + assert_eq!( + tx.try_send(ControlCmd::CollectTelemetry).unwrap(), + SendOutcome::Coalesced + ); + + let stats = tx.stats(); + assert_eq!(stats.config_replaced_total, 1); + assert_eq!(stats.timer_tick_coalesced_total, 1); + assert_eq!(stats.collect_telemetry_coalesced_total, 1); + assert!(stats.has_pending_config); + assert!(stats.has_pending_timer_tick); + assert!(stats.has_pending_collect_telemetry); +} diff --git a/rust/otap-dataflow/crates/control-channel/src/types.rs b/rust/otap-dataflow/crates/control-channel/src/types.rs new file mode 100644 index 0000000000..2bdb5041fc --- /dev/null +++ b/rust/otap-dataflow/crates/control-channel/src/types.rs @@ -0,0 +1,377 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Public types for the standalone control-aware channel. + +use std::time::Instant; + +use thiserror::Error; + +/// Phase of the control channel lifecycle. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub enum Phase { + /// Ordinary control delivery is active. + #[default] + Normal, + /// `DrainIngress` has been recorded and best-effort control is suppressed. + IngressDrainRecorded, + /// `Shutdown` has been recorded and the channel is draining retained work. + ShutdownRecorded, +} + +/// Admission class used by queue policy and backpressure reporting. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum AdmissionClass { + /// Backpressured retained traffic such as completion messages (`Ack`/`Nack`). + Backpressured, + /// Coalesced best-effort work such as timer and telemetry ticks. + BestEffort, +} + +/// Configuration for a control-optimized channel instance. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ControlChannelConfig { + /// Maximum number of completion messages retained in the queue. + pub completion_msg_capacity: usize, + /// Maximum number of completion messages returned in a single batch. + pub completion_batch_max: usize, + /// Maximum number of completion messages delivered consecutively before the + /// scheduler must surface one pending non-completion event. + pub completion_burst_limit: usize, +} + +impl Default for ControlChannelConfig { + fn default() -> Self { + Self { + completion_msg_capacity: 256, + completion_batch_max: 32, + completion_burst_limit: 32, + } + } +} + +impl ControlChannelConfig { + /// Validates channel configuration. + pub fn validate(&self) -> Result<(), ConfigError> { + if self.completion_msg_capacity == 0 { + return Err(ConfigError::ZeroCompletionMsgCapacity); + } + if self.completion_batch_max == 0 { + return Err(ConfigError::ZeroCompletionBatchMax); + } + if self.completion_burst_limit == 0 { + return Err(ConfigError::ZeroCompletionBurstLimit); + } + Ok(()) + } +} + +/// Configuration validation errors. +#[derive(Clone, Debug, Error, PartialEq, Eq)] +pub enum ConfigError { + /// `completion_msg_capacity` must be strictly positive. + #[error("completion_msg_capacity must be greater than zero")] + ZeroCompletionMsgCapacity, + /// `completion_batch_max` must be strictly positive. + #[error("completion_batch_max must be greater than zero")] + ZeroCompletionBatchMax, + /// `completion_burst_limit` must be strictly positive. 
+ #[error("completion_burst_limit must be greater than zero")] + ZeroCompletionBurstLimit, +} + +/// Shutdown-drain lifecycle message for receivers. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct DrainIngressMsg { + /// Deadline for receiver-local ingress drain work. The standalone control + /// channel carries this through unchanged, but only `Shutdown` deadlines + /// drive queue-level forced progress. + pub deadline: Instant, + /// Human-readable reason for the drain request. + pub reason: String, +} + +/// Terminal lifecycle message. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ShutdownMsg { + /// Deadline after which shutdown is considered forced. + pub deadline: Instant, + /// Human-readable reason for the shutdown request. + pub reason: String, +} + +/// Completion success message. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct AckMsg { + /// Accepted payload being returned upstream. + pub accepted: Box, + /// Explicit completion metadata carried with the returned payload. + /// Future engine integration can use this for unwind state such as + /// `UnwindData`. + pub meta: Meta, +} + +impl AckMsg { + /// Creates a new acknowledgment wrapper without additional metadata. + pub fn new(accepted: PData) -> Self { + Self::with_meta(accepted, ()) + } +} + +impl AckMsg { + /// Creates a new acknowledgment wrapper with explicit metadata. + pub fn with_meta(accepted: PData, meta: Meta) -> Self { + Self { + accepted: Box::new(accepted), + meta, + } + } +} + +/// Completion failure message. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct NackMsg { + /// Human-readable failure reason. + pub reason: String, + /// Refused payload being returned upstream. + pub refused: Box, + /// Explicit completion metadata carried with the returned payload. + /// Future engine integration can use this for unwind state such as + /// `UnwindData`. + pub meta: Meta, + /// Whether the failure is permanent. 
+ pub permanent: bool, +} + +impl NackMsg { + /// Creates a new non-permanent negative acknowledgment without additional metadata. + pub fn new>(reason: T, refused: PData) -> Self { + Self::with_meta(reason, refused, ()) + } + + /// Creates a new permanent negative acknowledgment without additional metadata. + pub fn new_permanent>(reason: T, refused: PData) -> Self { + Self::with_meta_permanent(reason, refused, ()) + } +} + +impl NackMsg { + /// Creates a new non-permanent negative acknowledgment with explicit metadata. + pub fn with_meta>(reason: T, refused: PData, meta: Meta) -> Self { + Self::new_internal(reason, refused, meta, false) + } + + /// Creates a new permanent negative acknowledgment with explicit metadata. + pub fn with_meta_permanent>(reason: T, refused: PData, meta: Meta) -> Self { + Self::new_internal(reason, refused, meta, true) + } + + fn new_internal>( + reason: T, + refused: PData, + meta: Meta, + permanent: bool, + ) -> Self { + Self { + reason: reason.into(), + refused: Box::new(refused), + meta, + permanent, + } + } +} + +/// Completion message retained inside a batched completion queue. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum CompletionMsg { + /// Positive completion. + Ack(AckMsg), + /// Negative completion. + Nack(NackMsg), +} + +/// Command submitted to the control-aware channel. +#[derive(Clone, Debug, PartialEq)] +pub enum ControlCmd { + /// Completion success. + Ack(AckMsg), + /// Completion failure. + Nack(NackMsg), + /// Latest configuration update. + Config { + /// Resolved configuration payload. + config: serde_json::Value, + }, + /// Timer-driven control work. + TimerTick, + /// Telemetry-driven control work. + CollectTelemetry, +} + +/// Event surfaced by receiver-role control channels. +#[derive(Clone, Debug, PartialEq)] +pub enum ReceiverControlEvent { + /// Lifecycle drain token. + DrainIngress(DrainIngressMsg), + /// Batch of completions in arrival order. + CompletionBatch(Vec>), + /// Latest configuration update. 
+ Config { + /// Resolved configuration payload. + config: serde_json::Value, + }, + /// Timer-driven control work. + TimerTick, + /// Telemetry-driven control work. + CollectTelemetry, + /// Terminal lifecycle token. + Shutdown(ShutdownMsg), +} + +/// Event surfaced by non-receiver node control channels. +#[derive(Clone, Debug, PartialEq)] +pub enum NodeControlEvent { + /// Batch of completions in arrival order. + CompletionBatch(Vec>), + /// Latest configuration update. + Config { + /// Resolved configuration payload. + config: serde_json::Value, + }, + /// Timer-driven control work. + TimerTick, + /// Telemetry-driven control work. + CollectTelemetry, + /// Terminal lifecycle token. + Shutdown(ShutdownMsg), +} + +#[derive(Clone, Debug, PartialEq)] +pub(crate) enum CoreControlEvent { + DrainIngress(DrainIngressMsg), + CompletionBatch(Vec>), + Config { config: serde_json::Value }, + TimerTick, + CollectTelemetry, + Shutdown(ShutdownMsg), +} + +impl ReceiverControlEvent { + pub(crate) fn from_core(event: CoreControlEvent) -> Self { + match event { + CoreControlEvent::DrainIngress(msg) => Self::DrainIngress(msg), + CoreControlEvent::CompletionBatch(batch) => Self::CompletionBatch(batch), + CoreControlEvent::Config { config } => Self::Config { config }, + CoreControlEvent::TimerTick => Self::TimerTick, + CoreControlEvent::CollectTelemetry => Self::CollectTelemetry, + CoreControlEvent::Shutdown(msg) => Self::Shutdown(msg), + } + } +} + +impl NodeControlEvent { + pub(crate) fn from_core(event: CoreControlEvent) -> Self { + match event { + CoreControlEvent::DrainIngress(_) => { + panic!("DrainIngress must not be delivered on node control channels") + } + CoreControlEvent::CompletionBatch(batch) => Self::CompletionBatch(batch), + CoreControlEvent::Config { config } => Self::Config { config }, + CoreControlEvent::TimerTick => Self::TimerTick, + CoreControlEvent::CollectTelemetry => Self::CollectTelemetry, + CoreControlEvent::Shutdown(msg) => Self::Shutdown(msg), + } + } 
+} + +/// Result of a successful send attempt. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum SendOutcome { + /// The command mutated channel state and is now retained or pending delivery. + Accepted, + /// The command was coalesced with an already pending equivalent item. + Coalesced, + /// The command replaced an older pending item of the same class. + Replaced, + /// The command was intentionally dropped because the channel is draining. + DroppedDuringDrain, +} + +/// Result of submitting a lifecycle token to the sender API. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum LifecycleSendResult { + /// The lifecycle token was accepted for the first time. + Accepted, + /// The lifecycle token had already been accepted earlier in the channel lifetime. + AlreadyAccepted, + /// The channel is closed. + Closed, +} + +/// Non-blocking send errors for the control-aware sender. +#[derive(Clone, Debug, Error, PartialEq)] +pub enum TrySendError { + /// The channel has been closed. + #[error("control channel is closed")] + Closed(ControlCmd), + /// The bounded class-specific capacity has been reached. + #[error("control channel capacity reached for {admission_class:?}")] + Full { + /// The admission class whose bounded capacity is saturated. + admission_class: AdmissionClass, + /// The command that could not be enqueued. + cmd: ControlCmd, + }, +} + +/// Blocking send errors for the control-aware sender. +#[derive(Clone, Debug, Error, PartialEq)] +pub enum SendError { + /// The channel closed before the command could be enqueued. + #[error("control channel is closed")] + Closed(ControlCmd), +} + +/// Snapshot of queue occupancy and lifecycle state. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ControlChannelStats { + /// Current lifecycle phase. + pub phase: Phase, + /// Whether `DrainIngress` has been accepted at least once during the channel lifetime. 
+ pub drain_ingress_recorded: bool, + /// Whether `Shutdown` has been accepted at least once during the channel lifetime. + pub shutdown_recorded: bool, + /// Whether a drain-ingress token is still pending delivery. + pub has_pending_drain_ingress: bool, + /// Whether a shutdown token is still pending delivery. + pub has_pending_shutdown: bool, + /// Number of retained completion messages. + pub completion_len: usize, + /// Whether a configuration update is pending. + pub has_pending_config: bool, + /// Whether a timer tick is pending. + pub has_pending_timer_tick: bool, + /// Whether a telemetry-collection request is pending. + pub has_pending_collect_telemetry: bool, + /// Number of completion messages delivered since the last non-completion + /// event. This is the fairness budget currently consumed. + pub completion_burst_len: usize, + /// Total number of completion batches emitted by the receiver side. + pub completion_batch_emitted_total: u64, + /// Total number of completion messages emitted across all batches. + pub completion_message_emitted_total: u64, + /// Total number of pending config replacements. + pub config_replaced_total: u64, + /// Total number of timer ticks coalesced into an already pending tick. + pub timer_tick_coalesced_total: u64, + /// Total number of telemetry-collection requests coalesced into an already pending request. + pub collect_telemetry_coalesced_total: u64, + /// Total number of normal control events dropped during drain or shutdown. + pub normal_event_dropped_during_drain_total: u64, + /// Total number of retained completions abandoned when forced shutdown fires. + pub completion_abandoned_on_forced_shutdown_total: u64, + /// Whether the shutdown deadline has already forced terminal progress. + pub shutdown_forced: bool, + /// Whether the channel is closed for new sends. 
+ pub closed: bool, +} diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/console_exporter/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/exporters/console_exporter/mod.rs index 173380efb9..941c474044 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/console_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/console_exporter/mod.rs @@ -14,7 +14,7 @@ use otap_df_engine::control::{AckMsg, NodeControlMsg}; use otap_df_engine::error::Error; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_engine::{ConsumerEffectHandlerExtension, ExporterFactory}; @@ -99,7 +99,7 @@ pub static CONSOLE_EXPORTER: ExporterFactory = ExporterFactory { impl Exporter for ConsoleExporter { async fn start( self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { loop { diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/error_exporter/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/exporters/error_exporter/mod.rs index 1772411bd7..bd4b9c08f2 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/error_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/error_exporter/mod.rs @@ -10,7 +10,7 @@ use otap_df_engine::control::{NackMsg, NodeControlMsg}; use otap_df_engine::error::Error; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_engine::{ConsumerEffectHandlerExtension, 
ExporterFactory}; @@ -78,7 +78,7 @@ impl ErrorExporter { impl Exporter for ErrorExporter { async fn start( self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { loop { diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/noop_exporter/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/exporters/noop_exporter/mod.rs index ee141caeab..4364e34160 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/noop_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/noop_exporter/mod.rs @@ -10,7 +10,7 @@ use otap_df_engine::control::{AckMsg, NodeControlMsg}; use otap_df_engine::error::Error; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_engine::{ConsumerEffectHandlerExtension, ExporterFactory}; @@ -48,7 +48,7 @@ pub static NOOP_EXPORTER: ExporterFactory = ExporterFactory { impl Exporter for NoopExporter { async fn start( self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { loop { diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/otap_exporter/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/exporters/otap_exporter/mod.rs index 2d20b84b53..190467917b 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/otap_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/otap_exporter/mod.rs @@ -19,7 +19,7 @@ use otap_df_engine::control::{AckMsg, NackMsg, NodeControlMsg}; use otap_df_engine::error::{Error, ExporterErrorKind, format_error_sources}; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter as local; -use 
otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_otap::OTAP_EXPORTER_FACTORIES; @@ -108,7 +108,7 @@ impl OTAPExporter { impl local::Exporter for OTAPExporter { async fn start( mut self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: local::EffectHandler, ) -> Result { otel_info!( diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_grpc_exporter/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_grpc_exporter/mod.rs index ee7aaa6f56..6ddb3a94cf 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_grpc_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_grpc_exporter/mod.rs @@ -24,7 +24,7 @@ use otap_df_engine::control::{AckMsg, NackMsg, NodeControlMsg}; use otap_df_engine::error::{Error, ExporterErrorKind, format_error_sources}; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_otap::OTAP_EXPORTER_FACTORIES; @@ -119,7 +119,7 @@ impl OTLPExporter { impl Exporter for OTLPExporter { async fn start( mut self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { otel_info!( diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_http_exporter/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_http_exporter/mod.rs index ce085a1ea4..2887cef50b 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_http_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_http_exporter/mod.rs @@ -32,7 +32,7 
@@ use otap_df_engine::control::{AckMsg, NackMsg, NodeControlMsg}; use otap_df_engine::error::{Error as EngineError, ExporterErrorKind}; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_engine::wiring_contract::WiringContract; @@ -202,7 +202,7 @@ struct CompletedExport { impl Exporter for OtlpHttpExporter { async fn start( mut self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { let logs_endpoint = Rc::new( diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/parquet_exporter/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/exporters/parquet_exporter/mod.rs index 8c5957d70f..b0c25297ec 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/parquet_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/parquet_exporter/mod.rs @@ -48,7 +48,7 @@ use otap_df_engine::control::NodeControlMsg; use otap_df_engine::error::{Error, ExporterErrorKind, format_error_sources}; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_otap::OTAP_EXPORTER_FACTORIES; @@ -154,7 +154,7 @@ impl ParquetExporter { impl Exporter for ParquetExporter { async fn start( mut self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { let exporter_id = effect_handler.exporter_id(); diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/perf_exporter/mod.rs 
b/rust/otap-dataflow/crates/core-nodes/src/exporters/perf_exporter/mod.rs index 9add64a74b..2503116ee6 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/perf_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/perf_exporter/mod.rs @@ -35,7 +35,7 @@ use otap_df_engine::control::{AckMsg, NodeControlMsg}; use otap_df_engine::error::{Error, ExporterErrorKind}; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter as local; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_otap::OTAP_EXPORTER_FACTORIES; @@ -130,7 +130,7 @@ impl PerfExporter { impl local::Exporter for PerfExporter { async fn start( mut self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: local::EffectHandler, ) -> Result { // init variables for tracking diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/topic_exporter/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/exporters/topic_exporter/mod.rs index 12672a7683..e665faa759 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/topic_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/topic_exporter/mod.rs @@ -16,7 +16,7 @@ use otap_df_engine::control::{AckMsg, NackMsg, NodeControlMsg}; use otap_df_engine::error::Error; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_engine::topic::{ @@ -158,7 +158,7 @@ impl TopicExporter { impl Exporter for TopicExporter { async fn start( self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, 
effect_handler: EffectHandler, ) -> Result { let TopicExporter { diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs index 5c9b76d542..84634b103f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs @@ -38,7 +38,7 @@ use otap_df_engine::MessageSourceLocalEffectHandlerExtension; use otap_df_engine::{ ConsumerEffectHandlerExtension, Interests, ProducerEffectHandlerExtension, config::ProcessorConfig, - control::{AckMsg, CallData, NackMsg, NodeControlMsg}, + control::{AckMsg, CallData, NackMsg, NodeControlMsg, WakeupSlot}, error::{Error as EngineError, ProcessorErrorKind}, local::processor as local, message::Message, @@ -78,6 +78,25 @@ pub const DEFAULT_MAX_BATCH_DURATION_MS: u64 = 200; const LOG_MSG_BATCHING_FAILED_PREFIX: &str = "OTAP batch processor: low-level batching failed for"; const LOG_MSG_BATCHING_FAILED_SUFFIX: &str = "; dropping"; +const WAKEUP_SLOT_OTAP_LOGS: WakeupSlot = WakeupSlot(0); +const WAKEUP_SLOT_OTAP_METRICS: WakeupSlot = WakeupSlot(1); +const WAKEUP_SLOT_OTAP_TRACES: WakeupSlot = WakeupSlot(2); +const WAKEUP_SLOT_OTLP_LOGS: WakeupSlot = WakeupSlot(3); +const WAKEUP_SLOT_OTLP_METRICS: WakeupSlot = WakeupSlot(4); +const WAKEUP_SLOT_OTLP_TRACES: WakeupSlot = WakeupSlot(5); + +const fn signal_from_wakeup_slot(slot: WakeupSlot) -> Option<(SignalFormat, SignalType)> { + match slot { + WAKEUP_SLOT_OTAP_LOGS => Some((SignalFormat::OtapRecords, SignalType::Logs)), + WAKEUP_SLOT_OTAP_METRICS => Some((SignalFormat::OtapRecords, SignalType::Metrics)), + WAKEUP_SLOT_OTAP_TRACES => Some((SignalFormat::OtapRecords, SignalType::Traces)), + WAKEUP_SLOT_OTLP_LOGS => Some((SignalFormat::OtlpBytes, SignalType::Logs)), + WAKEUP_SLOT_OTLP_METRICS => Some((SignalFormat::OtlpBytes, SignalType::Metrics)), + WAKEUP_SLOT_OTLP_TRACES => 
Some((SignalFormat::OtlpBytes, SignalType::Traces)), + _ => None, + } +} + /// How to size a batch. /// /// Note: these are not always supported. In the present code, the only @@ -149,9 +168,9 @@ trait Batcher { records: Vec, ) -> Result, PDataError>; - /// We are using an empty DelayData request as a one-shot - /// timer. This returns the appropriate empty request. - /// TODO: Add proper one-shot timer and cancellation, see #1472. + fn wakeup_slot(signal: SignalType) -> WakeupSlot; + + /// Returns the appropriate empty request payload for this signal. fn empty(signal: SignalType) -> T; } @@ -743,6 +762,14 @@ impl Batcher for SignalBuffer { SignalType::Traces => OtapArrowRecords::Traces(otap_df_pdata::otap::Traces::default()), } } + + fn wakeup_slot(signal: SignalType) -> WakeupSlot { + match signal { + SignalType::Logs => WAKEUP_SLOT_OTAP_LOGS, + SignalType::Metrics => WAKEUP_SLOT_OTAP_METRICS, + SignalType::Traces => WAKEUP_SLOT_OTAP_TRACES, + } + } } impl Batcher for SignalBuffer { @@ -763,6 +790,14 @@ impl Batcher for SignalBuffer { SignalType::Traces => OtlpProtoBytes::ExportTracesRequest(Bytes::new()), } } + + fn wakeup_slot(signal: SignalType) -> WakeupSlot { + match signal { + SignalType::Logs => WAKEUP_SLOT_OTLP_LOGS, + SignalType::Metrics => WAKEUP_SLOT_OTLP_METRICS, + SignalType::Traces => WAKEUP_SLOT_OTLP_TRACES, + } + } } impl<'a, T: OtapPayloadHelpers> BatchProcessorSignal<'a, T> @@ -848,6 +883,8 @@ where return Ok(()); } + let _ = effect.cancel_wakeup(SignalBuffer::::wakeup_slot(self.signal)); + // If this is a timer-based flush and we were called too soon, // skip. this may happen if the batch for which the timer was set // flushes for size before the timer. 
@@ -1099,6 +1136,32 @@ impl local::Processor for BatchProcessor { message: e.to_string(), } }), + NodeControlMsg::Wakeup { slot, when } => { + let Some((format, signal)) = signal_from_wakeup_slot(slot) else { + return Ok(()); + }; + + match format { + SignalFormat::OtapRecords => { + if let Some(mut otap_format) = self.otap_format() { + otap_format + .for_signal(signal) + .flush_signal_impl(effect, when, FlushReason::Timer) + .await?; + } + } + SignalFormat::OtlpBytes => { + if let Some(mut otlp_format) = self.otlp_format() { + otlp_format + .for_signal(signal) + .flush_signal_impl(effect, when, FlushReason::Timer) + .await?; + } + } + }; + + Ok(()) + } NodeControlMsg::DelayedData { data, when } => { let signal = data.signal_type(); @@ -1326,18 +1389,11 @@ where self.arrival = Some(now); effect - .delay_data( - now + timeout, - Box::new(OtapPdata::new( - Context::default(), - Self::empty(signal).into(), - )), - ) - .await + .set_wakeup(Self::wakeup_slot(signal), now + timeout) .map_err(|_| EngineError::ProcessorError { processor: effect.processor_id(), kind: ProcessorErrorKind::Other, - error: "could not set one-shot timer".into(), + error: "could not set wakeup".into(), source_detail: "".into(), }) } @@ -1367,12 +1423,12 @@ mod tests { use otap_df_engine::config::ProcessorConfig; use otap_df_engine::context::ControllerContext; use otap_df_engine::control::{ - NodeControlMsg, PipelineCompletionMsg, RuntimeControlMsg, pipeline_completion_msg_channel, + NodeControlMsg, PipelineCompletionMsg, pipeline_completion_msg_channel, runtime_ctrl_msg_channel, }; use otap_df_engine::message::Message; use otap_df_engine::node::Node; - use otap_df_engine::testing::liveness::{next_completion, next_runtime_control}; + use otap_df_engine::testing::liveness::next_completion; use otap_df_engine::testing::processor::TestRuntime; use otap_df_engine::testing::test_node; use otap_df_otap::pdata::OtapPdata; @@ -1620,7 +1676,7 @@ mod tests { #[derive(Clone)] enum TestEvent { 
Input(OtlpProtoMessage), - Elapsed, // Signal to deliver all pending DelayedData messages + Elapsed, // Signal to deliver due wakeups } /// Policy for acking or nacking an output @@ -1657,6 +1713,17 @@ mod tests { otap_to_otlp(&rec) } + const fn all_wakeup_slots() -> [WakeupSlot; 6] { + [ + WAKEUP_SLOT_OTAP_LOGS, + WAKEUP_SLOT_OTAP_METRICS, + WAKEUP_SLOT_OTAP_TRACES, + WAKEUP_SLOT_OTLP_LOGS, + WAKEUP_SLOT_OTLP_METRICS, + WAKEUP_SLOT_OTLP_TRACES, + ] + } + fn run_batch_processor_test( events: impl Iterator, subscribe: bool, @@ -1686,10 +1753,8 @@ mod tests { phase .run_test(move |mut ctx| async move { - let (runtime_ctrl_tx, mut runtime_ctrl_rx) = runtime_ctrl_msg_channel(10); let (pipeline_completion_tx, mut pipeline_completion_rx) = pipeline_completion_msg_channel(10); - ctx.set_runtime_ctrl_sender(runtime_ctrl_tx); ctx.set_pipeline_completion_sender(pipeline_completion_tx); // Track outputs by event position @@ -1702,16 +1767,11 @@ mod tests { let mut received_acks: Vec = Vec::new(); let mut received_nacks: Vec = Vec::new(); - // Track latest DelayedData message - let mut pending_delay: Option<(Instant, Box)> = None; let mut input_idx = 0; let mut total_outputs = 0; // Process each event in sequence for (event_idx, event) in events.into_iter().enumerate() { - // Determine if this is an elapsed event - let is_elapsed = matches!(event, TestEvent::Elapsed); - // Process the event match event { TestEvent::Input(input_otlp) => { @@ -1744,20 +1804,15 @@ mod tests { input_idx += 1; } TestEvent::Elapsed => { - // Elapsed event - no input to process - } - } - - // If this is an Elapsed event, deliver the pending DelayedData if present - if is_elapsed { - if let Some((when, data)) = pending_delay.take() { - // Note we deliver "when" exactly as the DelayData requested, - // which is a future timestamp; however it's the deadline requested, - // and since "when" passes through, the comparison is succesful using - // the expected instant. 
- let delayed_msg = - Message::Control(NodeControlMsg::DelayedData { when, data }); - ctx.process(delayed_msg).await.expect("process delayed"); + let when = Instant::now() + Duration::from_secs(1); + for slot in all_wakeup_slots() { + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot, + when, + })) + .await + .expect("process wakeup"); + } } } @@ -1799,22 +1854,6 @@ mod tests { } } - // Drain control channel for DelayData requests and acks/nacks - loop { - match runtime_ctrl_rx.try_recv() { - Ok(RuntimeControlMsg::DelayData { when, data, .. }) => { - looped += 1; - pending_delay = Some((when, data)); - } - Ok(_) => { - panic!("unexpected case"); - } - Err(_) => { - break; - } - } - } - loop { match pipeline_completion_rx.try_recv() { Ok(PipelineCompletionMsg::DeliverAck { ack }) => { @@ -2010,11 +2049,11 @@ mod tests { test_timer_flush(datagen.generate_logs().into(), true); } - // The processor schedules one-shot DelayedData wakeups without cancelling older - // ones. This test proves that a stale wakeup is ignored and that the current - // wakeup still flushes the buffered input later. + // The processor replaces wakeups per slot. This test proves that an early + // wakeup is ignored and that the current wakeup still flushes the buffered + // input later. #[test] - fn test_timer_flush_ignores_stale_delayed_wakeup() { + fn test_timer_flush_ignores_stale_wakeup() { let (telemetry_registry, metrics_reporter, phase) = setup_test_runtime(json!({ "otap": { "min_size": 5, @@ -2026,9 +2065,6 @@ mod tests { phase .run_test(move |mut ctx| async move { - let (runtime_ctrl_tx, mut runtime_ctrl_rx) = runtime_ctrl_msg_channel(10); - ctx.set_runtime_ctrl_sender(runtime_ctrl_tx); - let mut datagen = DataGenerator::new(1); let first = datagen.generate_logs(); let second = datagen.generate_logs(); @@ -2043,20 +2079,6 @@ mod tests { "first input should remain buffered" ); - let RuntimeControlMsg::DelayData { - when: stale_when, - data: stale_data, - .. 
- } = next_runtime_control( - &mut runtime_ctrl_rx, - Duration::from_secs(1), - "initial batch timer wakeup", - ) - .await - else { - panic!("expected initial DelayData"); - }; - // The second input takes the buffer over the min size, so the processor flushes // before the original timer fires. let rec = encode_logs_otap_batch(&second).expect("encode logs"); @@ -2075,37 +2097,25 @@ mod tests { "new post-flush batch should remain buffered" ); - let RuntimeControlMsg::DelayData { - when: current_when, - data: current_data, - .. - } = next_runtime_control( - &mut runtime_ctrl_rx, - Duration::from_secs(1), - "replacement batch timer wakeup", - ) - .await - else { - panic!("expected replacement DelayData"); - }; - - ctx.process(Message::Control(NodeControlMsg::DelayedData { + let stale_when = Instant::now(); + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: WAKEUP_SLOT_OTAP_LOGS, when: stale_when, - data: stale_data, })) .await - .expect("process stale delayed data"); + .expect("process stale wakeup"); assert!( ctx.drain_pdata().await.is_empty(), "stale wakeup should be ignored" ); - ctx.process(Message::Control(NodeControlMsg::DelayedData { + let current_when = Instant::now() + Duration::from_secs(1); + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: WAKEUP_SLOT_OTAP_LOGS, when: current_when, - data: current_data, })) .await - .expect("process current delayed data"); + .expect("process current wakeup"); let final_flush = ctx.drain_pdata().await; assert_eq!( final_flush.len(), @@ -2691,9 +2701,6 @@ mod tests { phase .run_test(move |mut ctx| async move { - let (pipeline_tx, mut pipeline_rx) = runtime_ctrl_msg_channel(10); - ctx.set_runtime_ctrl_sender(pipeline_tx); - // Create test data let mut datagen = DataGenerator::new(1); let logs1: OtlpProtoMessage = datagen.generate_logs().into(); @@ -2704,8 +2711,6 @@ mod tests { let otap_message2 = otlp_to_otap(&logs2); let mut outputs = Vec::new(); - let mut pending_delays: Vec<(Instant, Box)> = 
Vec::new(); - // Send both ctx.process(Message::PData(OtapPdata::new_default(otlp_message1.into()))) .await @@ -2715,23 +2720,17 @@ mod tests { .await .expect("process otlp"); - // Drain control channel for DelayData - while let Ok(RuntimeControlMsg::DelayData { when, data, .. }) = - pipeline_rx.try_recv() - { - pending_delays.push((when, data)); - } - assert!( ctx.drain_pdata().await.is_empty(), "no outputs before timeout" ); - // Trigger timeout - for (when, data) in pending_delays { - ctx.process(Message::Control(NodeControlMsg::DelayedData { when, data })) + // Trigger timeout for both active batching slots. + let when = Instant::now() + Duration::from_secs(1); + for slot in [WAKEUP_SLOT_OTLP_LOGS, WAKEUP_SLOT_OTAP_LOGS] { + ctx.process(Message::Control(NodeControlMsg::Wakeup { slot, when })) .await - .expect("process delayed"); + .expect("process wakeup"); } // Drain outputs after timeout diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index 17689eeeca..549a9fe3a3 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -43,7 +43,7 @@ //! - `TimerTick`: Poll storage for bundles, send downstream //! - `Ack`: Extract BundleRef from calldata, call handle.ack() //! - `Nack (permanent)`: Call handle.reject() — no retry -//! - `Nack (transient)`: Call handle.defer() and schedule retry via delay_data() +//! - `Nack (transient)`: Call handle.defer() and schedule retry via a wakeup //! - `Shutdown`: Flush storage engine //! //! 
# Retry Behavior and Error Handling @@ -104,7 +104,7 @@ use otap_df_config::node::NodeUserConfig; use otap_df_engine::config::ProcessorConfig; use otap_df_engine::context::PipelineContext; use otap_df_engine::control::Context8u8; -use otap_df_engine::control::{AckMsg, CallData, NackMsg, NodeControlMsg}; +use otap_df_engine::control::{AckMsg, CallData, NackMsg, NodeControlMsg, WakeupSlot}; use otap_df_engine::error::Error; use otap_df_engine::local::processor::EffectHandler; use otap_df_engine::message::Message; @@ -318,42 +318,6 @@ fn decode_bundle_ref(calldata: &CallData) -> Option { }) } -/// Encode a retry ticket into CallData for DelayedData scheduling. -/// -/// Layout: [segment_seq (u64), bundle_index (u32), retry_count (u32) packed into u64] -fn encode_retry_ticket(bundle_ref: BundleRef, retry_count: u32) -> CallData { - // Pack bundle_index (low 32 bits) and retry_count (high 32 bits) into one u64 - let packed = (bundle_ref.bundle_index.raw() as u64) | ((retry_count as u64) << 32); - smallvec![ - Context8u8::from(bundle_ref.segment_seq.raw()), - Context8u8::from(packed), - ] -} - -/// Decode a retry ticket from CallData. -/// -/// Returns (BundleRef, retry_count) if valid. -fn decode_retry_ticket(calldata: &CallData) -> Option<(BundleRef, u32)> { - if calldata.len() < 2 { - return None; - } - let segment_seq = SegmentSeq::new(u64::from(calldata[0])); - let packed = u64::from(calldata[1]); - let bundle_index = BundleIndex::new((packed & 0xFFFF_FFFF) as u32); - let retry_count = (packed >> 32) as u32; - Some(( - BundleRef { - segment_seq, - bundle_index, - }, - retry_count, - )) -} - -// ───────────────────────────────────────────────────────────────────────────── -// Pending Bundle Tracking -// ───────────────────────────────────────────────────────────────────────────── - /// State for tracking a pending downstream delivery. /// /// Holds the Quiver bundle handle to keep the bundle claimed while in-flight. 
@@ -370,6 +334,13 @@ struct PendingBundle { signal_type: SignalType, } +/// Local retry state held between wakeup scheduling and wakeup delivery. +#[derive(Clone, Copy)] +struct RetryWakeup { + bundle_ref: BundleRef, + retry_count: u32, +} + /// Result of attempting to process a bundle with non-blocking send. enum ProcessBundleResult { /// Bundle was successfully sent downstream. @@ -440,11 +411,19 @@ pub struct DurableBuffer { /// Key is the (segment_seq, bundle_index) pair encoded as a u128 for fast lookup. pending_bundles: HashMap<(u64, u32), PendingBundle>, - /// Bundles scheduled for retry via delay_data. + /// Bundles scheduled for retry via node-local wakeups. /// These are skipped by poll_next_bundle to enforce backoff. - /// Removed when the delay fires and claim_bundle is called. retry_scheduled: HashSet<(u64, u32)>, + /// Wakeup slot assigned to each bundle currently waiting for retry. + retry_wakeup_slots: HashMap<(u64, u32), WakeupSlot>, + + /// Retry state keyed by wakeup slot. + retry_wakeups: HashMap, + + /// Monotonic slot allocator for retry wakeups. + next_retry_wakeup_slot: u64, + /// Configuration. config: DurableBufferConfig, @@ -538,6 +517,9 @@ impl DurableBuffer { engine_state: EngineState::Uninitialized, pending_bundles: HashMap::new(), retry_scheduled: HashSet::new(), + retry_wakeup_slots: HashMap::new(), + retry_wakeups: HashMap::new(), + next_retry_wakeup_slot: 0, config, core_id, num_cores, @@ -633,14 +615,13 @@ impl DurableBuffer { self.pending_bundles.len() < self.config.max_in_flight } - /// Schedule a retry for a bundle via delay_data. + /// Schedule a retry for a bundle via a processor-local wakeup. /// - /// This is the single point of coordination between `delay_data` scheduling - /// and `retry_scheduled` tracking. Always use this method instead of calling - /// `delay_data` directly to ensure the two stay in sync. + /// This is the single point of coordination between wakeup scheduling and + /// `retry_scheduled` tracking. 
Always use this method to keep the two in sync. /// /// Returns true if scheduling succeeded, false if it failed (caller should - /// let poll_next_bundle pick up the bundle instead). + /// let `poll_next_bundle` pick up the bundle instead). async fn schedule_retry( &mut self, bundle_ref: BundleRef, @@ -649,40 +630,45 @@ impl DurableBuffer { effect_handler: &mut EffectHandler, ) -> bool { let key = (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()); - - // Create a lightweight retry ticket - // TODO(#1472): Replace with proper timer support when available. - // Currently we abuse delay_data() with an empty payload as a workaround - // for the lack of a native "schedule callback" primitive. - let retry_ticket = OtapPdata::new( - Default::default(), - OtapPayload::empty(SignalType::Traces), // Signal type doesn't matter for empty payload - ); - let calldata = encode_retry_ticket(bundle_ref, retry_count); - let mut retry_ticket = Box::new(retry_ticket); - effect_handler.subscribe_to(Interests::empty(), calldata, &mut retry_ticket); - + let (slot, is_new_slot) = match self.retry_wakeup_slots.entry(key) { + Entry::Occupied(entry) => (*entry.get(), false), + Entry::Vacant(entry) => { + let slot = WakeupSlot(self.next_retry_wakeup_slot); + self.next_retry_wakeup_slot = self.next_retry_wakeup_slot.saturating_add(1); + let _ = entry.insert(slot); + (slot, true) + } + }; let retry_at = Instant::now() + delay; - if effect_handler - .delay_data(retry_at, retry_ticket) - .await - .is_ok() - { + if effect_handler.set_wakeup(slot, retry_at).is_ok() { // Track that this bundle is scheduled - poll_next_bundle will skip it let _ = self.retry_scheduled.insert(key); + let _ = self.retry_wakeups.insert( + slot, + RetryWakeup { + bundle_ref, + retry_count, + }, + ); true } else { - // Failed to schedule - don't add to retry_scheduled, poll will pick it up + if is_new_slot { + let _ = self.retry_wakeup_slots.remove(&key); + } false } } - /// Remove a bundle from 
retry_scheduled tracking. - /// - /// Call this when the delay has fired and we're about to process the retry. - fn unschedule_retry(&mut self, bundle_ref: BundleRef) { - let key = (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()); + /// Remove retry-wakeup tracking for a bundle now being resumed. + fn take_retry_wakeup(&mut self, slot: WakeupSlot) -> Option { + let wakeup = self.retry_wakeups.remove(&slot)?; + let key = ( + wakeup.bundle_ref.segment_seq.raw(), + wakeup.bundle_ref.bundle_index.raw(), + ); let _ = self.retry_scheduled.remove(&key); + let _ = self.retry_wakeup_slots.remove(&key); + Some(wakeup) } /// Lazily initialize the Quiver engine on first use. @@ -1501,10 +1487,9 @@ impl DurableBuffer { /// For permanent NACKs (e.g., malformed data that will never succeed), the bundle /// is rejected immediately without retry. /// - /// For transient NACKs, schedules a retry with exponential backoff using `delay_data()`. - /// The bundle is deferred in Quiver (releasing the claim) and a lightweight - /// retry ticket is scheduled. When the delay expires, `handle_delayed_retry` - /// will re-claim the bundle and attempt redelivery. + /// For transient NACKs, schedules a retry with exponential backoff using a + /// processor-local wakeup. The bundle is deferred in Quiver (releasing the + /// claim) and local retry state is retained until the wakeup fires. async fn handle_nack( &mut self, nack: NackMsg, @@ -1601,29 +1586,26 @@ impl DurableBuffer { Ok(()) } - /// Handle a delayed retry ticket. + /// Handle a retry wakeup. /// /// Re-claims the bundle from Quiver and attempts redelivery downstream. 
- async fn handle_delayed_retry( + async fn handle_retry_wakeup( &mut self, - retry_ticket: Box, + slot: WakeupSlot, effect_handler: &mut EffectHandler, ) -> Result<(), Error> { - // Decode the retry ticket - let Some(calldata) = retry_ticket.source_route() else { - otel_warn!("durable_buffer.retry.missing_calldata"); - return Ok(()); - }; - - let Some((bundle_ref, retry_count)) = decode_retry_ticket(&calldata.calldata) else { - otel_warn!("durable_buffer.retry.invalid_calldata"); + let Some(RetryWakeup { + bundle_ref, + retry_count, + }) = self.take_retry_wakeup(slot) + else { + otel_warn!("durable_buffer.retry.unknown_wakeup", wakeup_slot = slot.0); return Ok(()); }; // Check max_in_flight limit if !self.can_send_more() { // At capacity - re-schedule with a short delay. - // Bundle stays in retry_scheduled (wasn't removed yet). otel_debug!( "durable_buffer.retry.deferred", segment_seq = bundle_ref.segment_seq.raw(), @@ -1632,8 +1614,6 @@ impl DurableBuffer { max_in_flight = self.config.max_in_flight ); - // Re-schedule - note: bundle is still in retry_scheduled, schedule_retry - // will just update it (insert is idempotent for HashSet) if !self .schedule_retry( bundle_ref, @@ -1643,17 +1623,11 @@ impl DurableBuffer { ) .await { - // Failed to re-schedule - remove from retry_scheduled so poll can pick it up - self.unschedule_retry(bundle_ref); otel_warn!("durable_buffer.retry.reschedule_failed"); } return Ok(()); } - // Backoff period has elapsed and we have capacity - remove from retry_scheduled. - // This allows poll_next_bundle to see it again if claim_bundle fails. - self.unschedule_retry(bundle_ref); - // Re-claim the bundle from Quiver let claim_result = { let (engine, subscriber_id) = self.engine()?; @@ -1924,15 +1898,10 @@ impl otap_df_engine::local::processor::Processor for DurableBuffer { Ok(()) } NodeControlMsg::DrainIngress { .. } => Ok(()), - NodeControlMsg::DelayedData { data, .. 
} => { - // Check if this is a retry ticket (has BundleRef + retry_count in calldata) - if let Some(route) = data.source_route() { - if decode_retry_ticket(&route.calldata).is_some() { - // This is a retry ticket - handle retry - return self.handle_delayed_retry(data, effect_handler).await; - } - } - // Not a retry ticket - shouldn't happen, but handle gracefully + NodeControlMsg::Wakeup { slot, .. } => { + self.handle_retry_wakeup(slot, effect_handler).await + } + NodeControlMsg::DelayedData { .. } => { otel_warn!("durable_buffer.delayed_data.unexpected"); Ok(()) } @@ -2015,45 +1984,166 @@ mod tests { } #[test] - fn test_retry_ticket_encoding_roundtrip() { + fn test_take_retry_wakeup_clears_tracking() { + use otap_df_engine::context::ControllerContext; + use otap_df_telemetry::registry::TelemetryRegistryHandle; + + let registry = TelemetryRegistryHandle::default(); + let controller_ctx = ControllerContext::new(registry); + let pipeline_ctx = + controller_ctx.pipeline_context_with("test".into(), "test".into(), 0, 1, 0); + + let config = DurableBufferConfig { + path: std::path::PathBuf::from("/tmp/test-retry-wakeup"), + retention_size_cap: byte_unit::Byte::from_u64(256 * 1024 * 1024), + max_age: None, + size_cap_policy: SizeCapPolicy::Backpressure, + poll_interval: Duration::from_millis(100), + otlp_handling: OtlpHandling::PassThrough, + max_segment_open_duration: Duration::from_secs(1), + initial_retry_interval: Duration::from_secs(1), + max_retry_interval: Duration::from_secs(30), + retry_multiplier: 2.0, + max_in_flight: 1000, + }; + + let mut processor = DurableBuffer::new(config, &pipeline_ctx).unwrap(); let bundle_ref = BundleRef { segment_seq: SegmentSeq::new(98765), bundle_index: BundleIndex::new(123), }; - let retry_count = 7u32; - - let calldata = encode_retry_ticket(bundle_ref, retry_count); - let decoded = decode_retry_ticket(&calldata); + let key = (bundle_ref.segment_seq.raw(), bundle_ref.bundle_index.raw()); + let slot = WakeupSlot(7); + let _ = 
processor.retry_scheduled.insert(key); + let _ = processor.retry_wakeup_slots.insert(key, slot); + let _ = processor.retry_wakeups.insert( + slot, + RetryWakeup { + bundle_ref, + retry_count: 3, + }, + ); - assert!(decoded.is_some()); - let (decoded_ref, decoded_count) = decoded.unwrap(); - assert_eq!(decoded_ref.segment_seq.raw(), 98765); - assert_eq!(decoded_ref.bundle_index.raw(), 123); - assert_eq!(decoded_count, 7); + let taken = processor + .take_retry_wakeup(slot) + .expect("retry wakeup should exist"); + assert_eq!(taken.bundle_ref.segment_seq.raw(), 98765); + assert_eq!(taken.bundle_ref.bundle_index.raw(), 123); + assert_eq!(taken.retry_count, 3); + assert!(!processor.retry_scheduled.contains(&key)); + assert!(!processor.retry_wakeup_slots.contains_key(&key)); + assert!(!processor.retry_wakeups.contains_key(&slot)); } #[test] - fn test_retry_ticket_encoding_max_values() { - let bundle_ref = BundleRef { - segment_seq: SegmentSeq::new(u64::MAX), - bundle_index: BundleIndex::new(u32::MAX), - }; - let retry_count = u32::MAX; + fn test_take_retry_wakeup_unknown_slot_is_ignored() { + use otap_df_engine::context::ControllerContext; + use otap_df_telemetry::registry::TelemetryRegistryHandle; + + let registry = TelemetryRegistryHandle::default(); + let controller_ctx = ControllerContext::new(registry); + let pipeline_ctx = + controller_ctx.pipeline_context_with("test".into(), "test".into(), 0, 1, 0); - let calldata = encode_retry_ticket(bundle_ref, retry_count); - let decoded = decode_retry_ticket(&calldata); + let config = DurableBufferConfig { + path: std::path::PathBuf::from("/tmp/test-retry-wakeup-miss"), + retention_size_cap: byte_unit::Byte::from_u64(256 * 1024 * 1024), + max_age: None, + size_cap_policy: SizeCapPolicy::Backpressure, + poll_interval: Duration::from_millis(100), + otlp_handling: OtlpHandling::PassThrough, + max_segment_open_duration: Duration::from_secs(1), + initial_retry_interval: Duration::from_secs(1), + max_retry_interval: 
Duration::from_secs(30), + retry_multiplier: 2.0, + max_in_flight: 1000, + }; - assert!(decoded.is_some()); - let (decoded_ref, decoded_count) = decoded.unwrap(); - assert_eq!(decoded_ref.segment_seq.raw(), u64::MAX); - assert_eq!(decoded_ref.bundle_index.raw(), u32::MAX); - assert_eq!(decoded_count, u32::MAX); + let mut processor = DurableBuffer::new(config, &pipeline_ctx).unwrap(); + assert!(processor.take_retry_wakeup(WakeupSlot(999)).is_none()); } #[test] - fn test_decode_retry_ticket_empty_calldata() { - let calldata: CallData = smallvec![]; - assert!(decode_retry_ticket(&calldata).is_none()); + fn test_retry_wakeup_resumes_retry_logic() { + use otap_df_config::node::NodeUserConfig; + use otap_df_engine::config::ProcessorConfig; + use otap_df_engine::context::ControllerContext; + use otap_df_engine::control::pipeline_completion_msg_channel; + use otap_df_engine::message::Message; + use otap_df_engine::testing::processor::TestRuntime; + use otap_df_engine::testing::test_node; + use otap_df_otap::testing::next_nack; + use otap_df_pdata::encode::encode_logs_otap_batch; + use otap_df_pdata::testing::fixtures::DataGenerator; + use serde_json::json; + + let rt = TestRuntime::new(); + let controller = ControllerContext::new(rt.metrics_registry()); + let pipeline_ctx = controller.pipeline_context_with("grp".into(), "pipe".into(), 0, 1, 0); + let temp_dir = tempfile::tempdir().expect("tempdir"); + + let mut node_config = NodeUserConfig::new_processor_config(DURABLE_BUFFER_URN); + node_config.config = json!({ + "path": temp_dir.path(), + "retention_size_cap": "256 MiB", + "poll_interval": "100ms", + "max_segment_open_duration": "1s", + "initial_retry_interval": "1s", + "max_retry_interval": "30s", + "retry_multiplier": 2.0, + "max_in_flight": 1000 + }); + + let processor = create_durable_buffer( + pipeline_ctx, + test_node("durable-buffer-retry-wakeup"), + Arc::new(node_config), + &ProcessorConfig::new("durable-buffer-retry-wakeup"), + ) + .expect("create durable 
buffer"); + + rt.set_processor(processor) + .run_test(move |mut ctx| async move { + let (pipeline_completion_tx, _pipeline_completion_rx) = + pipeline_completion_msg_channel(10); + ctx.set_pipeline_completion_sender(pipeline_completion_tx); + + let mut datagen = DataGenerator::new(1); + let input = datagen.generate_logs(); + let rec = encode_logs_otap_batch(&input).expect("encode logs"); + ctx.process(Message::PData(OtapPdata::new_default(rec.into()))) + .await + .expect("process input"); + + ctx.process(Message::Control(NodeControlMsg::TimerTick {})) + .await + .expect("process timer tick"); + let mut outputs = ctx.drain_pdata().await; + assert_eq!(outputs.len(), 1, "timer tick should emit one bundle"); + + let sent = outputs.pop().expect("sent bundle"); + let (_, nack) = + next_nack(NackMsg::new("retry", sent)).expect("expected nack subscriber"); + ctx.process(Message::Control(NodeControlMsg::Nack(nack))) + .await + .expect("process nack"); + assert!( + ctx.drain_pdata().await.is_empty(), + "nack should defer delivery until wakeup" + ); + + ctx.process(Message::Control(NodeControlMsg::Wakeup { + slot: WakeupSlot(0), + when: Instant::now() + Duration::from_secs(1), + })) + .await + .expect("process retry wakeup"); + + let retried = ctx.drain_pdata().await; + assert_eq!(retried.len(), 1, "wakeup should resume retry delivery"); + assert_eq!(retried[0].signal_type(), SignalType::Logs); + }) + .validate(|_| async {}); } #[test] diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs index 7409b2ac9d..45f1dd162f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs @@ -199,6 +199,7 @@ impl local::Processor for LogSamplingProcessor { | NodeControlMsg::Ack(_) | NodeControlMsg::Nack(_) | NodeControlMsg::DrainIngress { .. 
} + | NodeControlMsg::Wakeup { .. } | NodeControlMsg::DelayedData { .. } => Ok(()), }, } diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs index 83dd314126..43ca40d7c9 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs @@ -546,12 +546,12 @@ impl RetryProcessor { self.metrics.increment_retry_attempts(signal); - // Delay the data, we'll continue in the DelayedData branch next. - match effect_handler.delay_data(next_retry_time_i, rereq).await { + // Requeue the data onto this node, we'll continue in the DelayedData branch next. + match effect_handler.requeue_later(next_retry_time_i, rereq) { Ok(_) => Ok(()), Err(refused) => { effect_handler - .notify_nack(NackMsg::new("cannot delay", refused)) + .notify_nack(NackMsg::new("cannot requeue", refused)) .await?; // This component failed. self.metrics.add_consumed_failure(signal, num_items); @@ -651,6 +651,7 @@ impl Processor for RetryProcessor { NodeControlMsg::TimerTick { .. } => { unreachable!("unused"); } + NodeControlMsg::Wakeup { .. } => Ok(()), NodeControlMsg::DrainIngress { .. } => Ok(()), NodeControlMsg::Shutdown { .. } => Ok(()), }, @@ -683,8 +684,7 @@ mod test { use otap_df_config::node::NodeUserConfig; use otap_df_engine::context::{ControllerContext, PipelineContext}; use otap_df_engine::control::{ - AckMsg, NackMsg, NodeControlMsg, PipelineCompletionMsg, RuntimeControlMsg, - pipeline_completion_msg_channel, runtime_ctrl_msg_channel, + AckMsg, NackMsg, NodeControlMsg, PipelineCompletionMsg, pipeline_completion_msg_channel, }; use otap_df_engine::testing::liveness::next_completion; use otap_df_engine::testing::node::test_node; @@ -913,14 +913,13 @@ mod test { }); } - // Retry scheduling depends on the runtime-control DelayData path. 
If that path is - // unavailable, the processor must convert the request to a terminal Nack instead - // of leaving retry state stranded forever. + // If the local delayed-resume queue is full, the processor must convert the + // request to a terminal Nack instead of leaving retry state stranded forever. #[test] - fn test_retry_processor_cannot_delay_becomes_terminal_nack() { + fn test_retry_processor_cannot_requeue_becomes_terminal_nack() { let pipeline_ctx = create_test_pipeline_context(); let node = test_node("retry-processor-cannot-delay"); - let rt: TestRuntime = TestRuntime::new(); + let rt: TestRuntime = TestRuntime::with_channel_capacities(10, 1); let mut node_config = NodeUserConfig::new_processor_config(RETRY_PROCESSOR_URN); node_config.config = create_test_config(); @@ -935,11 +934,8 @@ mod test { rt.set_processor(proc) .run_test(move |mut ctx| async move { - let (runtime_ctrl_tx, runtime_ctrl_rx) = runtime_ctrl_msg_channel(1); - drop(runtime_ctrl_rx); let (pipeline_completion_tx, mut pipeline_completion_rx) = pipeline_completion_msg_channel(10); - ctx.set_runtime_ctrl_sender(runtime_ctrl_tx); ctx.set_pipeline_completion_sender(pipeline_completion_tx); let pdata_in = create_test_pdata().test_subscribe_to( @@ -963,17 +959,42 @@ mod test { .await .expect("process nack"); + assert!( + ctx.next_local_control_deadline().is_some(), + "first retry should occupy the only local requeue slot" + ); + + let second_input = create_test_pdata().test_subscribe_to( + Interests::ACKS | Interests::NACKS | Interests::RETURN_DATA, + TestCallData::default().into(), + 4444, + ); + ctx.process(Message::PData(second_input)) + .await + .expect("process second input"); + + let mut output = ctx.drain_pdata().await; + assert_eq!(output.len(), 1); + let second_attempt = output.remove(0); + + let (_, nack_msg) = + next_nack(NackMsg::new("simulated downstream failure", second_attempt)) + .expect("expected nack subscriber"); + ctx.process(Message::nack_ctrl_msg(nack_msg)) + .await + 
.expect("process second nack"); + match next_completion( &mut pipeline_completion_rx, Duration::from_secs(1), - "retry processor terminal nack when delay_data fails", + "retry processor terminal nack when requeue_later fails", ) .await { PipelineCompletionMsg::DeliverNack { nack } => { let (_node_id, nack) = next_nack(nack).expect("expected nack subscriber"); assert!( - nack.reason.contains("cannot delay"), + nack.reason.contains("cannot requeue"), "unexpected reason: {}", nack.reason ); @@ -1012,11 +1033,8 @@ mod test { phase .run_test(move |mut ctx| async move { - // Set up test runtime control channel - let (runtime_ctrl_tx, mut runtime_ctrl_rx) = runtime_ctrl_msg_channel(10); let (pipeline_completion_tx, mut pipeline_completion_rx) = pipeline_completion_msg_channel(10); - ctx.set_runtime_ctrl_sender(runtime_ctrl_tx); ctx.set_pipeline_completion_sender(pipeline_completion_tx); let mut retry_count: usize = 0; @@ -1038,7 +1056,7 @@ mod test { // Simulate downstream failures and retry let mut current_data = first_attempt; - // have_pmsg is the first non-DelayData message + // have_pmsg is the first non-requeue completion message // received in the loop, this will happen when // number_of_nacks is 4, i.e., the nack before the // final retry attempt. @@ -1056,38 +1074,37 @@ mod test { ctx.process(Message::nack_ctrl_msg(nack_msg)).await.unwrap(); nacks_delivered += 1; - // The processor should schedule a delayed retry via DelayData - let resp = tokio::select! { - recv = runtime_ctrl_rx.recv() => match recv { - Ok(RuntimeControlMsg::DelayData { when, data, .. 
}) => { - retry_count += 1; + let resp = if let Some(when) = ctx.next_local_control_deadline() { + retry_count += 1; - if working_clock { - ctx.sleep(when.duration_since(Instant::now())).await; - } + if working_clock { + ctx.sleep( + when.checked_duration_since(Instant::now()) + .unwrap_or_default(), + ) + .await; + } - ctx.process(Message::Control(NodeControlMsg::DelayedData { - when, - data, - })) + let control = ctx + .take_due_local_control(when) + .expect("scheduled local control"); + assert!( + matches!(control, NodeControlMsg::DelayedData { .. }), + "retry should requeue retained pdata as DelayedData" + ); + ctx.process(Message::Control(control)).await.unwrap(); + + let mut retry_output = ctx.drain_pdata().await; + assert_eq!(retry_output.len(), 1); + current_data = retry_output.remove(0); + None + } else { + Some( + pipeline_completion_rx + .recv() .await - .unwrap(); - - let mut retry_output = ctx.drain_pdata().await; - assert_eq!(retry_output.len(), 1); - current_data = retry_output.remove(0); - None - } - Ok(msg) => { - panic!("unexpected runtime control message: {:?}", msg); - } - Err(err) => { - panic!("unexpected runtime-control receive error: {:?}", err); - } - }, - recv = pipeline_completion_rx.recv() => Some( - recv.expect("pipeline-completion channel closed unexpectedly") - ), + .expect("pipeline-completion channel closed unexpectedly"), + ) }; have_pmsg = have_pmsg.or(resp); } diff --git a/rust/otap-dataflow/crates/engine/README.md b/rust/otap-dataflow/crates/engine/README.md index d58e7f4422..b07eb6beb9 100644 --- a/rust/otap-dataflow/crates/engine/README.md +++ b/rust/otap-dataflow/crates/engine/README.md @@ -200,7 +200,7 @@ A realistic example looks like this: # - one path being saturated no longer prevents the other from draining ``` -`ProcessorMessageChannel` and `ExporterMessageChannel` both prefer control over +`ProcessorInbox` and `ExporterInbox` both prefer control over `pdata`, but neither gives control absolute priority. 
After a bounded burst of control messages, the channel forces one `pdata` receive attempt when node-level admission allows it, so control storms do not starve the forward data path. @@ -213,18 +213,18 @@ backpressure upstream. Processors and exporters use that mechanism differently. Processors do not usually call `recv_when(...)` themselves because the engine owns their receive loop. Instead, a processor exposes `accept_pdata()`, and the engine feeds that -policy into `ProcessorMessageChannel::recv_when(...)` on the processor's +policy into `ProcessorInbox::recv_when(...)` on the processor's behalf. Exporters own their run loops directly, so they call -`ExporterMessageChannel::recv()` or `ExporterMessageChannel::recv_when(...)` +`ExporterInbox::recv()` or `ExporterInbox::recv_when(...)` themselves. The two mechanisms therefore serve the same admission-control goal at different layers: `accept_pdata()` is the processor-side readiness hook, while `recv_when(...)` is the channel primitive used by self-driven exporter loops. -The shutdown contract is also role-specific. `ProcessorMessageChannel` +The shutdown contract is also role-specific. `ProcessorInbox` continues to honor closed admission during shutdown, so a processor that keeps `accept_pdata() == false` until the deadline may still strand buffered `pdata`. -`ExporterMessageChannel` is different: once shutdown is latched, it still +`ExporterInbox` is different: once shutdown is latched, it still force-drains already buffered channel data even if the exporter has temporarily closed normal admission. @@ -245,7 +245,7 @@ behavior: 1. **Forward `pdata` flow** Receivers admit external work and emit `OtapPdata` on `pdata` channels. Processors and exporters then consume that `pdata` through - `ProcessorMessageChannel` and `ExporterMessageChannel`. + `ProcessorInbox` and `ExporterInbox`. 2. **Node-control delivery** Receivers consume node control in competition with external ingress. 
By @@ -285,7 +285,7 @@ The runtime is organized around a small set of guarantees: progress under sustained control traffic. - **Explicit node-level admission control:** processors can temporarily pause `pdata` delivery through `accept_pdata()`, and exporters can apply the same - pattern in their run loops with `ExporterMessageChannel::recv_when(false)`. + pattern in their run loops with `ExporterInbox::recv_when(false)`. The engine uses two entry points because processor receive loops are engine-owned while exporter receive loops are node-owned. During shutdown, exporters still drain already buffered channel data, while processors keep @@ -497,9 +497,9 @@ Once ingress is closed and receiver-local drain work is complete, the receiver reports `RuntimeControlMsg::ReceiverDrained`. After all receivers have reported `ReceiverDrained`, the control manager sends -`NodeControlMsg::Shutdown` to processors and exporters. `ProcessorMessageChannel` +`NodeControlMsg::Shutdown` to processors and exporters. `ProcessorInbox` continues delivering control messages while only draining `pdata` when the -processor reopens admission. `ExporterMessageChannel` also continues delivering +processor reopens admission. `ExporterInbox` also continues delivering control messages, but it force-drains already buffered input-channel `pdata` even if the exporter has temporarily closed normal admission. In both cases, the channel returns final shutdown once inputs are drained or the shutdown @@ -557,8 +557,8 @@ The DST approach combines: This is important for the control plane because many of the interesting failure classes are about ordering and bounded progress, not only about local business -logic. The DST harness therefore runs real `ProcessorMessageChannel`, -`ExporterMessageChannel`, `RuntimeCtrlMsgManager`, and +logic. 
The DST harness therefore runs real `ProcessorInbox`, +`ExporterInbox`, `RuntimeCtrlMsgManager`, and `PipelineCompletionMsgDispatcher` logic inside the engine's single-threaded runtime model. @@ -661,6 +661,7 @@ questions: ### Predefined Attributes + | Scope | Attribute | Type | Description | |----------|---------------------|---------|--------------------------------------------------------------| | Resource | process_instance_id | string | Unique process instance identifier (base32-encoded UUID v7). | @@ -671,6 +672,7 @@ questions: | Pipeline | pipeline_id | string | Pipeline identifier. | | Node | node_id | string | Node unique identifier (in scope of the pipeline). | | Node | node_type | string | Node type (e.g. "receiver", "processor", "exporter"). | + ### Drain Lifecycle Events diff --git a/rust/otap-dataflow/crates/engine/src/control.rs b/rust/otap-dataflow/crates/engine/src/control.rs index 09a30065ad..8135fbfa0b 100644 --- a/rust/otap-dataflow/crates/engine/src/control.rs +++ b/rust/otap-dataflow/crates/engine/src/control.rs @@ -75,6 +75,11 @@ impl From for f64 { /// numbers, deadline, num_items, etc. pub type CallData = SmallVec<[Context8u8; 3]>; +/// Opaque key used to identify a node-local scheduled wakeup. +#[repr(transparent)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct WakeupSlot(pub u64); + /// Engine-managed call data envelope. Wraps the CallData with an envelope /// containing timestamp. Lives on the forward path (in context stack frames). #[derive(Clone, Debug, Default, PartialEq)] @@ -222,6 +227,14 @@ pub enum NodeControlMsg { metrics_reporter: MetricsReporter, }, + /// A processor-local wakeup scheduled by the processor effect handler. + Wakeup { + /// Scheduled wakeup slot. + slot: WakeupSlot, + /// Original scheduled wakeup instant. + when: Instant, + }, + /// Delayed data returning to the node which delayed it. 
DelayedData { /// When resumed diff --git a/rust/otap-dataflow/crates/engine/src/effect_handler.rs b/rust/otap-dataflow/crates/engine/src/effect_handler.rs index daf8b60c12..1f48818409 100644 --- a/rust/otap-dataflow/crates/engine/src/effect_handler.rs +++ b/rust/otap-dataflow/crates/engine/src/effect_handler.rs @@ -4,13 +4,15 @@ //! Common foundation of all effect handlers. use crate::Interests; +use crate::WakeupError; use crate::completion_emission_metrics::CompletionEmissionMetricsHandle; use crate::control::{ AckMsg, NackMsg, PipelineCompletionMsg, PipelineCompletionMsgSender, RuntimeControlMsg, - RuntimeCtrlMsgSender, + RuntimeCtrlMsgSender, WakeupSlot, }; use crate::error::Error; use crate::node::NodeId; +use crate::node_local_scheduler::NodeLocalSchedulerHandle; use otap_df_channel::error::SendError; use otap_df_telemetry::error::Error as TelemetryError; use otap_df_telemetry::metrics::{MetricSet, MetricSetHandler}; @@ -58,6 +60,8 @@ pub(crate) struct EffectHandlerCore { pub(crate) source_tag: SourceTagging, /// Precomputed node interests derived from metric level. node_interests: Interests, + /// Optional processor-local delayed-resume and wakeup scheduler. + pub(crate) local_scheduler: Option>, } impl EffectHandlerCore { @@ -71,6 +75,7 @@ impl EffectHandlerCore { completion_emission_metrics: None, source_tag: SourceTagging::Disabled, node_interests: Interests::empty(), + local_scheduler: None, } } @@ -103,6 +108,11 @@ impl EffectHandlerCore { self.completion_emission_metrics = completion_emission_metrics; } + /// Sets the processor-local wakeup scheduler for this effect handler. + pub(crate) fn set_local_scheduler(&mut self, local_scheduler: NodeLocalSchedulerHandle) { + self.local_scheduler = Some(local_scheduler); + } + /// Returns outgoing messages source tagging mode. #[must_use] pub const fn source_tagging(&self) -> SourceTagging { @@ -394,6 +404,32 @@ impl EffectHandlerCore { }) } + /// Requeue retained pdata onto this node later. 
+ pub fn requeue_later(&self, when: Instant, data: Box) -> Result<(), PData> { + self.local_scheduler + .as_ref() + .expect("node-local scheduler not set for processor effect handler") + .requeue_later(when, data) + .map_err(|data| *data) + } + + /// Set or replace a processor-local wakeup. + pub fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + self.local_scheduler + .as_ref() + .expect("node-local scheduler not set for processor effect handler") + .set_wakeup(slot, when) + } + + /// Cancel a previously scheduled processor-local wakeup. + #[must_use] + pub fn cancel_wakeup(&self, slot: WakeupSlot) -> bool { + self.local_scheduler + .as_ref() + .expect("node-local scheduler not set for processor effect handler") + .cancel_wakeup(slot) + } + /// Notifies the runtime control manager that this receiver has completed /// ingress drain. pub async fn notify_receiver_drained(&self) -> Result<(), Error> { diff --git a/rust/otap-dataflow/crates/engine/src/exporter.rs b/rust/otap-dataflow/crates/engine/src/exporter.rs index a6f9fad0b8..bd3e9592a6 100644 --- a/rust/otap-dataflow/crates/engine/src/exporter.rs +++ b/rust/otap-dataflow/crates/engine/src/exporter.rs @@ -20,7 +20,7 @@ use crate::entity_context::NodeTelemetryGuard; use crate::error::{Error, ExporterErrorKind}; use crate::local::exporter as local; use crate::local::message::{LocalReceiver, LocalSender}; -use crate::message::{ExporterMessageChannel, Receiver, Sender}; +use crate::message::{ExporterInbox, Receiver, Sender}; use crate::node::{Node, NodeId, NodeWithPDataReceiver}; use crate::shared::exporter as shared; use crate::shared::message::{SharedReceiver, SharedSender}; @@ -323,13 +323,13 @@ impl ExporterWrapper { effect_handler .core .set_completion_emission_metrics(completion_emission_metrics.clone()); - let message_channel = ExporterMessageChannel::new( + let inbox = ExporterInbox::new( Receiver::Local(control_receiver), pdata_rx, node_id.index, node_interests, ); - 
exporter.start(message_channel, effect_handler).await + exporter.start(inbox, effect_handler).await } ( ExporterWrapper::Shared { @@ -359,13 +359,13 @@ impl ExporterWrapper { effect_handler .core .set_completion_emission_metrics(completion_emission_metrics); - let message_channel = shared::ExporterMessageChannel::new( + let inbox = shared::ExporterInbox::new( control_receiver, pdata_rx, node_id.index, node_interests, ); - exporter.start(message_channel, effect_handler).await + exporter.start(inbox, effect_handler).await } } } @@ -445,7 +445,7 @@ mod tests { use crate::exporter::{Error, ExporterWrapper}; use crate::local::exporter as local; use crate::local::message::LocalReceiver; - use crate::message::{ExporterMessageChannel, Message, ProcessorMessageChannel, Receiver}; + use crate::message::{ExporterInbox, Message, ProcessorInbox, Receiver}; use crate::shared::exporter as shared; use crate::shared::message::SharedReceiver; use crate::terminal_state::TerminalState; @@ -481,7 +481,7 @@ mod tests { impl local::Exporter for TestExporter { async fn start( self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: local::EffectHandler, ) -> Result { // Loop until a Shutdown event is received. @@ -518,7 +518,7 @@ mod tests { impl shared::Exporter for TestExporter { async fn start( self: Box, - mut msg_chan: shared::ExporterMessageChannel, + mut msg_chan: shared::ExporterInbox, effect_handler: shared::EffectHandler, ) -> Result { // Loop until a Shutdown event is received. 
@@ -642,14 +642,14 @@ mod tests { ) -> ( mpsc::Sender>, mpsc::Sender, - ExporterMessageChannel, + ExporterInbox, ) { let (control_tx, control_rx) = mpsc::Channel::>::new(capacity); let (pdata_tx, pdata_rx) = mpsc::Channel::::new(capacity); ( control_tx, pdata_tx, - ExporterMessageChannel::new( + ExporterInbox::new( Receiver::Local(LocalReceiver::mpsc(control_rx)), Receiver::Local(LocalReceiver::mpsc(pdata_rx)), 0, @@ -661,7 +661,7 @@ mod tests { fn make_chan() -> ( mpsc::Sender>, mpsc::Sender, - ExporterMessageChannel, + ExporterInbox, ) { make_chan_with_capacity(10) } @@ -671,14 +671,14 @@ mod tests { ) -> ( mpsc::Sender>, mpsc::Sender, - ProcessorMessageChannel, + ProcessorInbox, ) { let (control_tx, control_rx) = mpsc::Channel::>::new(capacity); let (pdata_tx, pdata_rx) = mpsc::Channel::::new(capacity); ( control_tx, pdata_tx, - ProcessorMessageChannel::new( + ProcessorInbox::new( Receiver::Local(LocalReceiver::mpsc(control_rx)), Receiver::Local(LocalReceiver::mpsc(pdata_rx)), 0, @@ -690,7 +690,7 @@ mod tests { fn make_processor_chan() -> ( mpsc::Sender>, mpsc::Sender, - ProcessorMessageChannel, + ProcessorInbox, ) { make_processor_chan_with_capacity(10) } @@ -1274,14 +1274,14 @@ mod tests { fn make_shared_chan() -> ( tokio::sync::mpsc::Sender>, tokio::sync::mpsc::Sender, - ProcessorMessageChannel, + ProcessorInbox, ) { let (control_tx, control_rx) = tokio::sync::mpsc::channel::>(10); let (pdata_tx, pdata_rx) = tokio::sync::mpsc::channel::(10); ( control_tx, pdata_tx, - ProcessorMessageChannel::new( + ProcessorInbox::new( Receiver::Shared(SharedReceiver::mpsc(control_rx)), Receiver::Shared(SharedReceiver::mpsc(pdata_rx)), 0, diff --git a/rust/otap-dataflow/crates/engine/src/lib.rs b/rust/otap-dataflow/crates/engine/src/lib.rs index 7041caac4d..4cc457a550 100644 --- a/rust/otap-dataflow/crates/engine/src/lib.rs +++ b/rust/otap-dataflow/crates/engine/src/lib.rs @@ -68,6 +68,7 @@ pub mod engine_metrics; pub mod entity_context; pub mod local; pub mod node; +mod 
node_local_scheduler; pub mod output_router; pub mod pipeline_ctrl; mod pipeline_metrics; @@ -78,6 +79,7 @@ pub mod terminal_state; pub mod testing; pub mod topic; pub mod wiring_contract; +pub use node_local_scheduler::WakeupError; /// Trait for factory types that expose a name. /// diff --git a/rust/otap-dataflow/crates/engine/src/local/exporter.rs b/rust/otap-dataflow/crates/engine/src/local/exporter.rs index 6320145634..9d3e9f4e6a 100644 --- a/rust/otap-dataflow/crates/engine/src/local/exporter.rs +++ b/rust/otap-dataflow/crates/engine/src/local/exporter.rs @@ -37,7 +37,7 @@ use crate::Interests; use crate::control::{AckMsg, NackMsg}; use crate::effect_handler::{EffectHandlerCore, TelemetryTimerCancelHandle, TimerCancelHandle}; use crate::error::Error; -use crate::message::ExporterMessageChannel; +use crate::message::ExporterInbox; use crate::node::NodeId; use crate::terminal_state::TerminalState; use async_trait::async_trait; @@ -69,12 +69,12 @@ pub trait Exporter { /// /// Exporters are expected to process both internal control messages and pipeline data messages, /// prioritizing control messages over data messages. This prioritization guarantee is ensured - /// by the `ExporterMessageChannel` implementation. + /// by the `ExporterInbox` implementation. /// /// # Parameters /// - /// - `msg_chan`: A channel to receive pdata or control messages. Control messages are - /// prioritized over pdata messages. + /// - `inbox`: An inbox that receives pdata or control messages. Control + /// messages are prioritized over pdata messages. /// - `effect_handler`: A handler to perform side effects such as network operations. /// /// # Errors @@ -86,7 +86,7 @@ pub trait Exporter { /// This method should be cancellation safe and clean up any resources when dropped. 
async fn start( self: Box, - msg_chan: ExporterMessageChannel, + inbox: ExporterInbox, effect_handler: EffectHandler, ) -> Result; } diff --git a/rust/otap-dataflow/crates/engine/src/local/processor.rs b/rust/otap-dataflow/crates/engine/src/local/processor.rs index 844603c85a..cb4db4b3f1 100644 --- a/rust/otap-dataflow/crates/engine/src/local/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/local/processor.rs @@ -33,7 +33,8 @@ //! in parallel on different cores, each with its own processor instance. use crate::Interests; -use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender}; +use crate::WakeupError; +use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender, WakeupSlot}; use crate::effect_handler::{ EffectHandlerCore, SourceTagging, TelemetryTimerCancelHandle, TimerCancelHandle, }; @@ -278,6 +279,22 @@ impl EffectHandler { self.core.delay_data(when, data).await } + /// Requeue retained pdata onto this node later. + pub fn requeue_later(&self, when: Instant, data: Box) -> Result<(), PData> { + self.core.requeue_later(when, data) + } + + /// Set or replace a processor-local wakeup. + pub fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + self.core.set_wakeup(slot, when) + } + + /// Cancel a previously scheduled processor-local wakeup. + #[must_use] + pub fn cancel_wakeup(&self, slot: WakeupSlot) -> bool { + self.core.cancel_wakeup(slot) + } + /// Reports metrics collected by the processor. #[allow(dead_code)] // Will be used in the future. ToDo report metrics from channel and messages. 
pub(crate) fn report_metrics( diff --git a/rust/otap-dataflow/crates/engine/src/message.rs b/rust/otap-dataflow/crates/engine/src/message.rs index 06a2d7c4f6..982aeed415 100644 --- a/rust/otap-dataflow/crates/engine/src/message.rs +++ b/rust/otap-dataflow/crates/engine/src/message.rs @@ -6,6 +6,7 @@ use crate::clock; use crate::control::{AckMsg, NackMsg, NodeControlMsg}; use crate::local::message::{LocalReceiver, LocalSender}; +use crate::node_local_scheduler::NodeLocalSchedulerHandle; use crate::shared::message::{SharedReceiver, SharedSender}; use crate::{Interests, ReceivedAtNode}; use otap_df_channel::error::{RecvError, SendError}; @@ -183,7 +184,7 @@ impl Receiver { } } -/// Small private adapter trait used by [`MessageChannelCore`]. +/// Small private adapter trait used by [`InboxCore`]. /// /// The core receive state machine is shared by: /// @@ -230,7 +231,7 @@ impl ChannelReceiver for SharedReceiver { } } -/// Shutdown-drain policy for [`MessageChannelCore::recv_with_policy`]. +/// Shutdown-drain policy for [`InboxCore::recv_with_policy`]. /// /// Both processor and exporter channels share the same multiplexing and /// shutdown machinery, but they intentionally diverge once shutdown has been @@ -244,6 +245,7 @@ impl ChannelReceiver for SharedReceiver { /// /// This enum lets the shared core express that difference explicitly without /// forking the whole receive loop. +#[derive(Clone, Copy)] enum DrainPolicy { /// Respect the caller's admission flag even after shutdown has been /// latched. @@ -254,9 +256,10 @@ enum DrainPolicy { ForceDrainDuringShutdown, } -struct MessageChannelCore { +struct InboxCore { control_rx: Option, pdata_rx: Option, + local_scheduler: Option>, /// Once a Shutdown is seen, this is set to `Some(instant)` representing the drain deadline. shutting_down_deadline: Option, /// Holds the ControlMsg::Shutdown until after we’ve drained pdata. 
@@ -269,11 +272,18 @@ struct MessageChannelCore { consecutive_control: usize, } -impl MessageChannelCore { - fn new(control_rx: ControlRx, pdata_rx: PDataRx, node_id: usize, interests: Interests) -> Self { +impl InboxCore { + fn new( + control_rx: ControlRx, + pdata_rx: PDataRx, + local_scheduler: Option>, + node_id: usize, + interests: Interests, + ) -> Self { Self { control_rx: Some(control_rx), pdata_rx: Some(pdata_rx), + local_scheduler, shutting_down_deadline: None, pending_shutdown: None, node_id, @@ -285,12 +295,15 @@ impl MessageChannelCore { fn shutdown(&mut self) { self.shutting_down_deadline = None; self.consecutive_control = 0; + if let Some(local_scheduler) = &self.local_scheduler { + local_scheduler.begin_shutdown(clock::now()); + } drop(self.control_rx.take().expect("control_rx must exist")); drop(self.pdata_rx.take().expect("pdata_rx must exist")); } } -impl MessageChannelCore +impl InboxCore where PData: ReceivedAtNode, ControlRx: ChannelReceiver>, @@ -330,6 +343,33 @@ where accept_pdata || matches!(policy, DrainPolicy::ForceDrainDuringShutdown) } + fn shutdown_drain_complete(&self) -> bool { + self.pdata_rx + .as_ref() + .expect("pdata_rx must exist") + .is_empty() + && self + .local_scheduler + .as_ref() + .map(NodeLocalSchedulerHandle::is_drained) + .unwrap_or(true) + } + + fn pop_local_due(&mut self, now: Instant) -> Option> { + self.local_scheduler + .as_ref() + .and_then(|scheduler| scheduler.pop_due(now)) + .map(|msg| self.control_message(msg)) + } + + fn next_local_expiry_sleep(&self, now: Instant) -> Option { + self.local_scheduler + .as_ref() + .and_then(NodeLocalSchedulerHandle::next_expiry) + .filter(|when| *when > now) + .map(clock::sleep_until) + } + async fn recv_with_policy( &mut self, accept_pdata: bool, @@ -339,7 +379,7 @@ where loop { if self.control_rx.is_none() || self.pdata_rx.is_none() { - // MessageChannel has been shutdown + // Inbox has been shutdown return Err(RecvError::Closed); } @@ -373,12 +413,7 @@ where // only after 
the bounded pdata backlog is empty. This keeps the // channel-level drain contract explicit: upstream work that was // already accepted into the channel gets a chance to run first. - if self - .pdata_rx - .as_ref() - .expect("pdata_rx must exist") - .is_empty() - { + if self.shutdown_drain_complete() { let shutdown = self .pending_shutdown .take() @@ -392,6 +427,9 @@ where sleep_until_deadline = Some(clock::sleep_until(dl)); } + let now = clock::now(); + let mut sleep_until_local = self.next_local_expiry_sleep(now); + // Even while draining we cap control preference. This prevents a // sustained Ack/Nack or shutdown-control burst from starving the // already buffered pdata that shutdown is trying to drain. @@ -415,6 +453,28 @@ where } } + if !self + .control_rx + .as_ref() + .expect("control_rx must exist") + .is_empty() + { + match self + .control_rx + .as_mut() + .expect("control_rx must exist") + .try_recv() + { + Ok(msg) => return Ok(self.control_message(msg)), + Err(RecvError::Empty) => {} + Err(e) => return Err(e), + } + } + + if let Some(msg) = self.pop_local_due(now) { + return Ok(msg); + } + // Drain pdata (gated by accept_pdata) and deliver control messages. // Honoring accept_pdata during draining lets stateful processors // receive Ack/Nack to reduce in-flight state and reopen capacity. @@ -445,6 +505,22 @@ where Ok(msg) => return Ok(self.control_message(msg)), Err(e) => return Err(e), }, + + _ = async { + if let Some(delay) = sleep_until_local.as_mut() { + delay.await; + } + }, if sleep_until_local.is_some() => { + continue; + }, + + _ = async { + if let Some(local_scheduler) = self.local_scheduler.as_ref() { + local_scheduler.wait_for_change().await; + } + }, if self.local_scheduler.is_some() => { + continue; + }, } } else { tokio::select! 
{ @@ -473,11 +549,30 @@ where return Ok(Message::Control(shutdown)); } }, + + _ = async { + if let Some(delay) = sleep_until_local.as_mut() { + delay.await; + } + }, if sleep_until_local.is_some() => { + continue; + }, + + _ = async { + if let Some(local_scheduler) = self.local_scheduler.as_ref() { + local_scheduler.wait_for_change().await; + } + }, if self.local_scheduler.is_some() => { + continue; + }, } } } // Normal mode: no shutdown yet + let now = clock::now(); + let mut sleep_until_local = self.next_local_expiry_sleep(now); + if accept_pdata && self.consecutive_control >= CONTROL_BURST_LIMIT { match self .pdata_rx @@ -491,6 +586,43 @@ where } } + if !self + .control_rx + .as_ref() + .expect("control_rx must exist") + .is_empty() + { + match self + .control_rx + .as_mut() + .expect("control_rx must exist") + .try_recv() + { + Ok(NodeControlMsg::Shutdown { deadline, reason }) => { + if deadline <= clock::now() { + self.shutdown(); + return Ok(Message::Control(NodeControlMsg::Shutdown { + deadline, + reason, + })); + } + if let Some(local_scheduler) = &self.local_scheduler { + local_scheduler.begin_shutdown(clock::now()); + } + self.shutting_down_deadline = Some(deadline); + self.pending_shutdown = Some(NodeControlMsg::Shutdown { deadline, reason }); + continue; + } + Ok(msg) => return Ok(self.control_message(msg)), + Err(RecvError::Empty) => {} + Err(e) => return Err(e), + } + } + + if let Some(msg) = self.pop_local_due(now) { + return Ok(msg); + } + if accept_pdata && self.consecutive_control >= CONTROL_BURST_LIMIT { tokio::select! 
{ biased; @@ -514,6 +646,9 @@ where self.shutdown(); return Ok(Message::Control(NodeControlMsg::Shutdown { deadline, reason })); } + if let Some(local_scheduler) = &self.local_scheduler { + local_scheduler.begin_shutdown(clock::now()); + } self.shutting_down_deadline = Some(deadline); self.pending_shutdown = Some(NodeControlMsg::Shutdown { deadline, reason }); continue; @@ -521,6 +656,22 @@ where Ok(msg) => return Ok(self.control_message(msg)), Err(e) => return Err(e), }, + + _ = async { + if let Some(delay) = sleep_until_local.as_mut() { + delay.await; + } + }, if sleep_until_local.is_some() => { + continue; + }, + + _ = async { + if let Some(local_scheduler) = self.local_scheduler.as_ref() { + local_scheduler.wait_for_change().await; + } + }, if self.local_scheduler.is_some() => { + continue; + }, } } else { tokio::select! { @@ -536,6 +687,9 @@ where self.shutdown(); return Ok(Message::Control(NodeControlMsg::Shutdown { deadline, reason })); } + if let Some(local_scheduler) = &self.local_scheduler { + local_scheduler.begin_shutdown(clock::now()); + } self.shutting_down_deadline = Some(deadline); self.pending_shutdown = Some(NodeControlMsg::Shutdown { deadline, reason }); continue; @@ -550,6 +704,22 @@ where Err(RecvError::Closed) => return Ok(self.closed_pdata_shutdown()), Err(e) => return Err(e), } + }, + + _ = async { + if let Some(delay) = sleep_until_local.as_mut() { + delay.await; + } + }, if sleep_until_local.is_some() => { + continue; + }, + + _ = async { + if let Some(local_scheduler) = self.local_scheduler.as_ref() { + local_scheduler.wait_for_change().await; + } + }, if self.local_scheduler.is_some() => { + continue; } } } @@ -562,26 +732,50 @@ where /// This preserves the existing processor contract: pdata admission is /// controlled by the engine via `accept_pdata()`, and the admission guard /// remains authoritative during shutdown draining. 
-pub struct ProcessorMessageChannel { - core: MessageChannelCore>, Receiver>, +pub struct ProcessorInbox { + core: InboxCore>, Receiver>, } -impl ProcessorMessageChannel { - /// Creates a new processor message channel. +impl ProcessorInbox { + /// Creates a new processor inbox. #[must_use] pub fn new( control_rx: Receiver>, pdata_rx: Receiver, node_id: usize, interests: Interests, + ) -> Self { + Self::new_with_local_scheduler( + control_rx, + pdata_rx, + NodeLocalSchedulerHandle::new(32, 32), + node_id, + interests, + ) + } + + /// Creates a new processor inbox with an explicit processor-local scheduler. + #[must_use] + pub(crate) fn new_with_local_scheduler( + control_rx: Receiver>, + pdata_rx: Receiver, + local_scheduler: NodeLocalSchedulerHandle, + node_id: usize, + interests: Interests, ) -> Self { Self { - core: MessageChannelCore::new(control_rx, pdata_rx, node_id, interests), + core: InboxCore::new( + control_rx, + pdata_rx, + Some(local_scheduler), + node_id, + interests, + ), } } } -impl ProcessorMessageChannel { +impl ProcessorInbox { /// Receives the next message while honoring the current processor /// admission state, including during shutdown draining. pub async fn recv_when(&mut self, accept_pdata: bool) -> Result, RecvError> { @@ -596,15 +790,15 @@ impl ProcessorMessageChannel { /// Exporters own their receive loop directly. During shutdown draining, /// buffered pdata is force-drained even when the exporter has temporarily /// closed normal pdata admission. 
-pub struct ExporterMessageChannel< +pub struct ExporterInbox< PData, ControlRx = Receiver>, PDataRx = Receiver, > { - core: MessageChannelCore, + core: InboxCore, } -impl ExporterMessageChannel { +impl ExporterInbox { #[must_use] pub(crate) fn new_internal( control_rx: ControlRx, @@ -613,13 +807,13 @@ impl ExporterMessageChannel Self { Self { - core: MessageChannelCore::new(control_rx, pdata_rx, node_id, interests), + core: InboxCore::new(control_rx, pdata_rx, None, node_id, interests), } } } #[allow(private_bounds)] -impl ExporterMessageChannel +impl ExporterInbox where PData: ReceivedAtNode, ControlRx: ChannelReceiver>, @@ -639,8 +833,8 @@ where } } -impl ExporterMessageChannel { - /// Creates a new exporter message channel. +impl ExporterInbox { + /// Creates a new exporter inbox. #[must_use] pub fn new( control_rx: Receiver>, @@ -652,7 +846,7 @@ impl ExporterMessageChannel { } } -impl ExporterMessageChannel { +impl ExporterInbox { /// Receives the next message with pdata admission enabled. pub async fn recv(&mut self) -> Result, RecvError> { self.recv_internal().await @@ -665,6 +859,327 @@ impl ExporterMessageChannel { } } -/// Send-friendly exporter channel type for shared exporter runtimes. -pub(crate) type SharedExporterMessageChannel = - ExporterMessageChannel>, SharedReceiver>; +/// Backward-compatible exporter inbox alias. +pub type ExporterMessageChannel< + PData, + ControlRx = Receiver>, + PDataRx = Receiver, +> = ExporterInbox; + +/// Send-friendly exporter inbox type for shared exporter runtimes. 
+pub(crate) type SharedExporterInbox = + ExporterInbox>, SharedReceiver>; + +#[cfg(test)] +mod tests { + use super::*; + use crate::WakeupError; + use crate::local::message::LocalReceiver; + use crate::testing::TestMsg; + use otap_df_channel::mpsc; + use std::time::Duration; + + fn local_processor_inbox( + delayed_resume_capacity: usize, + wakeup_capacity: usize, + ) -> ( + mpsc::Sender>, + mpsc::Sender, + NodeLocalSchedulerHandle, + ProcessorInbox, + ) { + let (control_tx, control_rx) = mpsc::Channel::>::new(64); + let (pdata_tx, pdata_rx) = mpsc::Channel::::new(64); + let scheduler = NodeLocalSchedulerHandle::new(delayed_resume_capacity, wakeup_capacity); + let inbox = ProcessorInbox::new_with_local_scheduler( + Receiver::Local(LocalReceiver::mpsc(control_rx)), + Receiver::Local(LocalReceiver::mpsc(pdata_rx)), + scheduler.clone(), + 7, + Interests::empty(), + ); + (control_tx, pdata_tx, scheduler, inbox) + } + + #[tokio::test] + async fn processor_inbox_emits_due_delayed_resume_as_control_message() { + let (_control_tx, _pdata_tx, scheduler, mut inbox) = local_processor_inbox(4, 4); + let when = Instant::now(); + scheduler + .requeue_later(when, Box::new(TestMsg::new("delayed"))) + .expect("delayed resume should schedule"); + + let message = tokio::time::timeout(Duration::from_millis(50), inbox.recv_when(true)) + .await + .expect("inbox should wake") + .expect("message should arrive"); + assert!(matches!( + message, + Message::Control(NodeControlMsg::DelayedData { when: observed, data }) + if observed == when && *data == TestMsg::new("delayed") + )); + } + + #[tokio::test] + async fn processor_inbox_emits_due_wakeup_as_control_message() { + let (_control_tx, _pdata_tx, scheduler, mut inbox) = local_processor_inbox(4, 4); + let when = Instant::now(); + scheduler + .set_wakeup(crate::control::WakeupSlot(0), when) + .expect("wakeup should schedule"); + + let message = tokio::time::timeout(Duration::from_millis(50), inbox.recv_when(true)) + .await + .expect("inbox 
should wake") + .expect("message should arrive"); + assert!(matches!( + message, + Message::Control(NodeControlMsg::Wakeup { + slot: crate::control::WakeupSlot(0), + when: observed, + }) if observed == when + )); + } + + #[tokio::test] + async fn processor_inbox_delayed_resume_preserves_control_fairness() { + let (_control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(64, 4); + pdata_tx + .send_async(TestMsg::new("pdata")) + .await + .expect("pdata should enqueue"); + let when = Instant::now(); + for idx in 0..40 { + scheduler + .requeue_later(when, Box::new(TestMsg::new(format!("delayed-{idx}")))) + .expect("delayed resume should schedule"); + } + + let mut delayed = 0usize; + let mut saw_pdata = false; + while delayed <= CONTROL_BURST_LIMIT { + match inbox.recv_when(true).await.expect("message should arrive") { + Message::PData(TestMsg(value)) => { + assert_eq!(value, "pdata"); + saw_pdata = true; + break; + } + Message::Control(NodeControlMsg::DelayedData { .. }) => { + delayed += 1; + } + other => panic!("unexpected message {other:?}"), + } + } + + assert!( + saw_pdata, + "pdata should not starve behind processor-local delayed resumes" + ); + } + + #[tokio::test] + async fn processor_inbox_wakeup_preserves_control_fairness() { + let (_control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(4, 64); + pdata_tx + .send_async(TestMsg::new("pdata")) + .await + .expect("pdata should enqueue"); + let when = Instant::now(); + for slot in 0..40 { + scheduler + .set_wakeup(crate::control::WakeupSlot(slot), when) + .expect("wakeup should schedule"); + } + + let mut wakeups = 0usize; + let mut saw_pdata = false; + while wakeups <= CONTROL_BURST_LIMIT { + match inbox.recv_when(true).await.expect("message should arrive") { + Message::PData(TestMsg(value)) => { + assert_eq!(value, "pdata"); + saw_pdata = true; + break; + } + Message::Control(NodeControlMsg::Wakeup { .. 
}) => { + wakeups += 1; + } + other => panic!("unexpected message {other:?}"), + } + } + + assert!( + saw_pdata, + "pdata should not starve behind processor-local wakeups" + ); + } + + #[tokio::test] + async fn processor_inbox_rejects_delayed_resumes_after_shutdown_latch() { + let (control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(4, 4); + pdata_tx + .send_async(TestMsg::new("buffered")) + .await + .expect("pdata should enqueue"); + control_tx + .send_async(NodeControlMsg::Shutdown { + deadline: Instant::now() + Duration::from_secs(1), + reason: "shutdown".to_owned(), + }) + .await + .expect("shutdown should enqueue"); + control_tx + .send_async(NodeControlMsg::Config { + config: serde_json::json!({"mode": "draining"}), + }) + .await + .expect("config should enqueue"); + + let first = inbox + .recv_when(false) + .await + .expect("control should arrive after shutdown latch"); + assert!(matches!( + first, + Message::Control(NodeControlMsg::Config { .. }) + )); + + let rejected = scheduler + .requeue_later(Instant::now(), Box::new(TestMsg::new("rejected"))) + .expect_err("shutdown should reject new delayed resumes"); + assert_eq!(*rejected, TestMsg::new("rejected")); + } + + #[tokio::test] + async fn processor_inbox_rejects_wakeups_after_shutdown_latch() { + let (control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(4, 4); + pdata_tx + .send_async(TestMsg::new("buffered")) + .await + .expect("pdata should enqueue"); + control_tx + .send_async(NodeControlMsg::Shutdown { + deadline: Instant::now() + Duration::from_secs(1), + reason: "shutdown".to_owned(), + }) + .await + .expect("shutdown should enqueue"); + control_tx + .send_async(NodeControlMsg::Config { + config: serde_json::json!({"mode": "draining"}), + }) + .await + .expect("config should enqueue"); + + let first = inbox + .recv_when(false) + .await + .expect("control should arrive after shutdown latch"); + assert!(matches!( + first, + Message::Control(NodeControlMsg::Config { .. 
}) + )); + assert_eq!( + scheduler.set_wakeup(crate::control::WakeupSlot(1), Instant::now()), + Err(WakeupError::ShuttingDown) + ); + } + + #[tokio::test] + async fn processor_inbox_returns_pending_delayed_resumes_on_shutdown_latch() { + let (control_tx, _pdata_tx, scheduler, mut inbox) = local_processor_inbox(4, 4); + let original_when = Instant::now() + Duration::from_secs(60); + scheduler + .requeue_later(original_when, Box::new(TestMsg::new("delayed"))) + .expect("delayed resume should schedule"); + control_tx + .send_async(NodeControlMsg::Shutdown { + deadline: Instant::now() + Duration::from_secs(1), + reason: "shutdown".to_owned(), + }) + .await + .expect("shutdown should enqueue"); + control_tx + .send_async(NodeControlMsg::Config { + config: serde_json::json!({"drain": true}), + }) + .await + .expect("config should enqueue"); + + let first = inbox + .recv_when(false) + .await + .expect("control should arrive after shutdown latch"); + assert!(matches!( + first, + Message::Control(NodeControlMsg::Config { .. }) + )); + + let resumed = inbox + .recv_when(false) + .await + .expect("delayed resume should return immediately during shutdown"); + assert!(matches!( + resumed, + Message::Control(NodeControlMsg::DelayedData { when, data }) + if when < original_when && *data == TestMsg::new("delayed") + )); + + let shutdown = inbox + .recv_when(false) + .await + .expect("shutdown should follow once the delayed resume drains"); + assert!(matches!( + shutdown, + Message::Control(NodeControlMsg::Shutdown { .. 
}) + )); + } + + #[tokio::test] + async fn processor_inbox_drops_pending_wakeups_on_shutdown_latch() { + let (control_tx, pdata_tx, scheduler, mut inbox) = local_processor_inbox(4, 4); + pdata_tx + .send_async(TestMsg::new("buffered")) + .await + .expect("pdata should enqueue"); + scheduler + .set_wakeup(crate::control::WakeupSlot(2), Instant::now()) + .expect("wakeup should schedule"); + control_tx + .send_async(NodeControlMsg::Shutdown { + deadline: Instant::now() + Duration::from_secs(1), + reason: "shutdown".to_owned(), + }) + .await + .expect("shutdown should enqueue"); + control_tx + .send_async(NodeControlMsg::Config { + config: serde_json::json!({"drop": true}), + }) + .await + .expect("config should enqueue"); + + let first = inbox + .recv_when(false) + .await + .expect("control should arrive after shutdown latch"); + assert!(matches!( + first, + Message::Control(NodeControlMsg::Config { .. }) + )); + + let drained = inbox + .recv_when(true) + .await + .expect("buffered pdata should drain"); + assert!(matches!(drained, Message::PData(TestMsg(ref value)) if value == "buffered")); + + let shutdown = inbox + .recv_when(true) + .await + .expect("shutdown should follow drain"); + assert!(matches!( + shutdown, + Message::Control(NodeControlMsg::Shutdown { .. }) + )); + } +} diff --git a/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs new file mode 100644 index 0000000000..5596b679bb --- /dev/null +++ b/rust/otap-dataflow/crates/engine/src/node_local_scheduler.rs @@ -0,0 +1,545 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Node-local delayed resume and wakeup scheduling for processor inboxes. 
+ +use crate::clock; +use crate::control::{NodeControlMsg, WakeupSlot}; +use std::cmp::Ordering; +use std::collections::{BinaryHeap, HashMap, VecDeque}; +use std::sync::{Arc, Mutex}; +use std::time::Instant; +use tokio::sync::Notify; + +/// Error returned when a wakeup request cannot be accepted. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum WakeupError { + /// The processor has already latched shutdown. + ShuttingDown, + /// The bounded live wakeup slot set is full. + Capacity, +} + +#[derive(Debug)] +struct ScheduledResume { + when: Instant, + sequence: u64, + data: Box, +} + +impl Ord for ScheduledResume { + fn cmp(&self, other: &Self) -> Ordering { + other + .when + .cmp(&self.when) + .then_with(|| other.sequence.cmp(&self.sequence)) + } +} + +impl PartialOrd for ScheduledResume { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for ScheduledResume { + fn eq(&self, other: &Self) -> bool { + self.when == other.when && self.sequence == other.sequence + } +} + +impl Eq for ScheduledResume {} + +#[derive(Clone, Copy, Debug)] +struct WakeupState { + when: Instant, + generation: u64, + sequence: u64, +} + +#[derive(Debug)] +struct ScheduledWakeup { + slot: WakeupSlot, + when: Instant, + generation: u64, + sequence: u64, +} + +impl Ord for ScheduledWakeup { + fn cmp(&self, other: &Self) -> Ordering { + other + .when + .cmp(&self.when) + .then_with(|| other.sequence.cmp(&self.sequence)) + } +} + +impl PartialOrd for ScheduledWakeup { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for ScheduledWakeup { + fn eq(&self, other: &Self) -> bool { + self.slot == other.slot + && self.when == other.when + && self.generation == other.generation + && self.sequence == other.sequence + } +} + +impl Eq for ScheduledWakeup {} + +struct NodeLocalScheduler { + delayed_resume_capacity: usize, + wakeup_capacity: usize, + next_sequence: u64, + delayed_resumes: BinaryHeap>, + 
wakeups: BinaryHeap, + wakeup_state: HashMap, + due_now: VecDeque>, + shutting_down: bool, +} + +impl NodeLocalScheduler { + fn new(delayed_resume_capacity: usize, wakeup_capacity: usize) -> Self { + Self { + delayed_resume_capacity, + wakeup_capacity, + next_sequence: 0, + delayed_resumes: BinaryHeap::new(), + wakeups: BinaryHeap::new(), + wakeup_state: HashMap::new(), + due_now: VecDeque::new(), + shutting_down: false, + } + } + + fn next_sequence(&mut self) -> u64 { + let next = self.next_sequence; + self.next_sequence = self.next_sequence.saturating_add(1); + next + } + + fn requeue_later(&mut self, when: Instant, data: Box) -> Result<(), Box> { + if self.shutting_down || self.delayed_resumes.len() >= self.delayed_resume_capacity { + return Err(data); + } + + let sequence = self.next_sequence(); + self.delayed_resumes.push(ScheduledResume { + when, + sequence, + data, + }); + Ok(()) + } + + fn set_wakeup(&mut self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + if self.shutting_down { + return Err(WakeupError::ShuttingDown); + } + + let sequence = self.next_sequence(); + let generation = if let Some(state) = self.wakeup_state.get_mut(&slot) { + state.when = when; + state.generation = state.generation.saturating_add(1); + state.sequence = sequence; + state.generation + } else { + if self.wakeup_state.len() >= self.wakeup_capacity { + return Err(WakeupError::Capacity); + } + let _ = self.wakeup_state.insert( + slot, + WakeupState { + when, + generation: 0, + sequence, + }, + ); + 0 + }; + + self.wakeups.push(ScheduledWakeup { + slot, + when, + generation, + sequence, + }); + Ok(()) + } + + fn cancel_wakeup(&mut self, slot: WakeupSlot) -> bool { + if self.shutting_down { + return false; + } + self.wakeup_state.remove(&slot).is_some() + } + + fn discard_stale_wakeup_head(&mut self) { + while let Some(head) = self.wakeups.peek() { + let Some(state) = self.wakeup_state.get(&head.slot) else { + let _ = self.wakeups.pop(); + continue; + }; + if 
state.generation != head.generation || state.when != head.when { + let _ = self.wakeups.pop(); + continue; + } + break; + } + } + + fn next_expiry(&mut self) -> Option { + if !self.due_now.is_empty() { + return Some(clock::now()); + } + + self.discard_stale_wakeup_head(); + match ( + self.delayed_resumes.peek().map(|resume| resume.when), + self.wakeups.peek().map(|wakeup| wakeup.when), + ) { + (Some(a), Some(b)) => Some(a.min(b)), + (Some(a), None) => Some(a), + (None, Some(b)) => Some(b), + (None, None) => None, + } + } + + fn pop_due(&mut self, now: Instant) -> Option> { + if let Some(msg) = self.due_now.pop_front() { + return Some(msg); + } + + self.discard_stale_wakeup_head(); + + let next_resume = self.delayed_resumes.peek().map(|resume| resume.when); + let next_wakeup = self.wakeups.peek().map(|wakeup| wakeup.when); + let take_resume = match (next_resume, next_wakeup) { + (Some(resume_when), Some(wakeup_when)) => { + resume_when <= now && (wakeup_when > now || resume_when <= wakeup_when) + } + (Some(resume_when), None) => resume_when <= now, + (None, Some(_)) => false, + (None, None) => return None, + }; + + if take_resume { + let resume = self.delayed_resumes.pop().expect("resume must exist"); + return Some(NodeControlMsg::DelayedData { + when: resume.when, + data: resume.data, + }); + } + + if next_wakeup + .map(|wakeup_when| wakeup_when <= now) + .unwrap_or(false) + { + let wakeup = self.wakeups.pop().expect("wakeup must exist"); + let _ = self.wakeup_state.remove(&wakeup.slot); + return Some(NodeControlMsg::Wakeup { + slot: wakeup.slot, + when: wakeup.when, + }); + } + + None + } + + fn begin_shutdown(&mut self, now: Instant) { + if self.shutting_down { + return; + } + + self.shutting_down = true; + + while let Some(resume) = self.delayed_resumes.pop() { + self.due_now.push_back(NodeControlMsg::DelayedData { + when: now, + data: resume.data, + }); + } + + self.wakeup_state.clear(); + self.wakeups.clear(); + } + + fn is_drained(&self) -> bool { + 
self.due_now.is_empty() && self.delayed_resumes.is_empty() && self.wakeup_state.is_empty() + } +} + +/// Shared handle used by the processor inbox and the processor effect handler. +pub(crate) struct NodeLocalSchedulerHandle { + inner: Arc>>, + notify: Arc, +} + +impl Clone for NodeLocalSchedulerHandle { + fn clone(&self) -> Self { + Self { + inner: Arc::clone(&self.inner), + notify: Arc::clone(&self.notify), + } + } +} + +impl NodeLocalSchedulerHandle { + pub(crate) fn new(delayed_resume_capacity: usize, wakeup_capacity: usize) -> Self { + Self { + inner: Arc::new(Mutex::new(NodeLocalScheduler::new( + delayed_resume_capacity, + wakeup_capacity, + ))), + notify: Arc::new(Notify::new()), + } + } + + fn with_scheduler(&self, f: impl FnOnce(&mut NodeLocalScheduler) -> R) -> R { + let mut guard = self + .inner + .lock() + .unwrap_or_else(|poisoned| poisoned.into_inner()); + f(&mut guard) + } + + pub(crate) fn requeue_later(&self, when: Instant, data: Box) -> Result<(), Box> { + let result = self.with_scheduler(|scheduler| scheduler.requeue_later(when, data)); + if result.is_ok() { + self.notify.notify_one(); + } + result + } + + pub(crate) fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + let result = self.with_scheduler(|scheduler| scheduler.set_wakeup(slot, when)); + if result.is_ok() { + self.notify.notify_one(); + } + result + } + + #[must_use] + pub(crate) fn cancel_wakeup(&self, slot: WakeupSlot) -> bool { + let changed = self.with_scheduler(|scheduler| scheduler.cancel_wakeup(slot)); + if changed { + self.notify.notify_one(); + } + changed + } + + pub(crate) fn next_expiry(&self) -> Option { + self.with_scheduler(NodeLocalScheduler::next_expiry) + } + + pub(crate) fn pop_due(&self, now: Instant) -> Option> { + self.with_scheduler(|scheduler| scheduler.pop_due(now)) + } + + pub(crate) fn begin_shutdown(&self, now: Instant) { + self.with_scheduler(|scheduler| scheduler.begin_shutdown(now)); + self.notify.notify_waiters(); + } + + 
pub(crate) fn is_drained(&self) -> bool { + self.with_scheduler(|scheduler| scheduler.is_drained()) + } + + pub(crate) async fn wait_for_change(&self) { + self.notify.notified().await; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::Duration; + + #[test] + fn requeue_later_emits_the_stored_payload() { + let mut scheduler = NodeLocalScheduler::::new(2, 2); + let when = Instant::now() + Duration::from_secs(1); + + assert_eq!(scheduler.requeue_later(when, Box::new(17)), Ok(())); + assert!(matches!( + scheduler.pop_due(when), + Some(NodeControlMsg::DelayedData { when: observed, data }) + if observed == when && *data == 17 + )); + assert_eq!(scheduler.next_expiry(), None); + } + + #[test] + fn delayed_resumes_preserve_due_time_ordering() { + let mut scheduler = NodeLocalScheduler::new(4, 2); + let now = Instant::now(); + let later = now + Duration::from_secs(3); + let sooner = now + Duration::from_secs(1); + let same_time_a = now + Duration::from_secs(2); + let same_time_b = same_time_a; + + assert_eq!(scheduler.requeue_later(later, Box::new(3)), Ok(())); + assert_eq!(scheduler.requeue_later(same_time_a, Box::new(1)), Ok(())); + assert_eq!(scheduler.requeue_later(same_time_b, Box::new(2)), Ok(())); + assert_eq!(scheduler.requeue_later(sooner, Box::new(0)), Ok(())); + + assert!(matches!( + scheduler.pop_due(sooner), + Some(NodeControlMsg::DelayedData { data, .. }) if *data == 0 + )); + assert!(matches!( + scheduler.pop_due(same_time_a), + Some(NodeControlMsg::DelayedData { data, .. }) if *data == 1 + )); + assert!(matches!( + scheduler.pop_due(same_time_b), + Some(NodeControlMsg::DelayedData { data, .. }) if *data == 2 + )); + assert!(matches!( + scheduler.pop_due(later), + Some(NodeControlMsg::DelayedData { data, .. 
}) if *data == 3 + )); + } + + #[test] + fn delayed_resume_capacity_is_enforced() { + let mut scheduler = NodeLocalScheduler::new(1, 1); + let when = Instant::now() + Duration::from_secs(1); + + assert_eq!(scheduler.requeue_later(when, Box::new(1)), Ok(())); + let rejected = scheduler + .requeue_later(when, Box::new(2)) + .expect_err("capacity should reject"); + assert_eq!(*rejected, 2); + } + + #[test] + fn rejected_requeue_returns_the_original_payload() { + let mut scheduler = NodeLocalScheduler::new(2, 1); + let now = Instant::now(); + + scheduler.begin_shutdown(now); + let rejected = scheduler + .requeue_later(now + Duration::from_secs(1), Box::new(99)) + .expect_err("shutdown should reject"); + assert_eq!(*rejected, 99); + } + + #[test] + fn shutdown_makes_pending_delayed_resumes_due_immediately() { + let mut scheduler = NodeLocalScheduler::new(4, 2); + let now = Instant::now(); + let later = now + Duration::from_secs(30); + + assert_eq!(scheduler.requeue_later(later, Box::new(11)), Ok(())); + assert_eq!( + scheduler.requeue_later(later + Duration::from_secs(1), Box::new(12)), + Ok(()) + ); + + scheduler.begin_shutdown(now); + + assert!(matches!( + scheduler.pop_due(now), + Some(NodeControlMsg::DelayedData { when: observed, data }) + if observed == now && *data == 11 + )); + assert!(matches!( + scheduler.pop_due(now), + Some(NodeControlMsg::DelayedData { when: observed, data }) + if observed == now && *data == 12 + )); + assert!(scheduler.pop_due(now).is_none()); + } + + #[test] + fn set_wakeup_schedules_a_wakeup() { + let mut scheduler = NodeLocalScheduler::::new(2, 2); + let now = Instant::now(); + let when = now + Duration::from_secs(1); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(7), when), Ok(())); + assert_eq!(scheduler.next_expiry(), Some(when)); + assert!(scheduler.pop_due(now).is_none()); + assert!(matches!( + scheduler.pop_due(when), + Some(NodeControlMsg::Wakeup { + slot: WakeupSlot(7), + when: observed, + }) if observed == when + )); + 
assert_eq!(scheduler.next_expiry(), None); + } + + #[test] + fn setting_same_slot_replaces_previous_due_time() { + let mut scheduler = NodeLocalScheduler::::new(2, 2); + let now = Instant::now(); + let later = now + Duration::from_secs(10); + let sooner = now + Duration::from_secs(1); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(3), later), Ok(())); + assert_eq!(scheduler.set_wakeup(WakeupSlot(3), sooner), Ok(())); + assert_eq!(scheduler.next_expiry(), Some(sooner)); + assert!(matches!( + scheduler.pop_due(sooner), + Some(NodeControlMsg::Wakeup { + slot: WakeupSlot(3), + when: observed, + }) if observed == sooner + )); + assert!(scheduler.pop_due(later).is_none()); + } + + #[test] + fn cancel_wakeup_removes_pending_wakeup() { + let mut scheduler = NodeLocalScheduler::::new(2, 2); + let when = Instant::now() + Duration::from_secs(1); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(5), when), Ok(())); + assert!(scheduler.cancel_wakeup(WakeupSlot(5))); + assert!(!scheduler.cancel_wakeup(WakeupSlot(5))); + assert_eq!(scheduler.next_expiry(), None); + assert!(scheduler.pop_due(when).is_none()); + } + + #[test] + fn wakeup_capacity_is_enforced_on_distinct_live_slots() { + let mut scheduler = NodeLocalScheduler::::new(1, 1); + let when = Instant::now() + Duration::from_secs(1); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(0), when), Ok(())); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(1), when), + Err(WakeupError::Capacity) + ); + assert_eq!( + scheduler.set_wakeup(WakeupSlot(0), when + Duration::from_secs(1)), + Ok(()) + ); + } + + #[test] + fn stale_heap_entries_are_ignored() { + let mut scheduler = NodeLocalScheduler::::new(2, 2); + let now = Instant::now(); + let first = now + Duration::from_secs(5); + let replacement = now + Duration::from_secs(1); + + assert_eq!(scheduler.set_wakeup(WakeupSlot(9), first), Ok(())); + assert_eq!(scheduler.set_wakeup(WakeupSlot(9), replacement), Ok(())); + assert!(matches!( + scheduler.pop_due(replacement), + 
Some(NodeControlMsg::Wakeup { + slot: WakeupSlot(9), + when: observed, + }) if observed == replacement + )); + assert!(scheduler.pop_due(first).is_none()); + assert_eq!(scheduler.next_expiry(), None); + } +} diff --git a/rust/otap-dataflow/crates/engine/src/processor.rs b/rust/otap-dataflow/crates/engine/src/processor.rs index 6ff3e51867..aceebf4826 100644 --- a/rust/otap-dataflow/crates/engine/src/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/processor.rs @@ -22,8 +22,9 @@ use crate::entity_context::NodeTelemetryGuard; use crate::error::{Error, ProcessorErrorKind}; use crate::local::message::{LocalReceiver, LocalSender}; use crate::local::processor as local; -use crate::message::{Message, ProcessorMessageChannel, Receiver, Sender}; +use crate::message::{Message, ProcessorInbox, Receiver, Sender}; use crate::node::{Node, NodeId, NodeWithPDataReceiver, NodeWithPDataSender}; +use crate::node_local_scheduler::NodeLocalSchedulerHandle; use crate::shared::message::{SharedReceiver, SharedSender}; use crate::shared::processor as shared; use otap_df_channel::error::SendError; @@ -102,8 +103,8 @@ pub enum ProcessorWrapperRuntime { Local { /// The processor instance. processor: Box>, - /// The message channel - message_channel: ProcessorMessageChannel, + /// The processor inbox + inbox: ProcessorInbox, /// The local effect handler effect_handler: local::EffectHandler, }, @@ -111,8 +112,8 @@ pub enum ProcessorWrapperRuntime { Shared { /// The processor instance. processor: Box>, - /// Message channel - message_channel: ProcessorMessageChannel, + /// Processor inbox + inbox: ProcessorInbox, /// The shared effect handler effect_handler: shared::EffectHandler, }, @@ -325,6 +326,7 @@ impl ProcessorWrapper { match self { ProcessorWrapper::Local { node_id, + runtime_config, processor, control_receiver, pdata_senders, @@ -333,7 +335,11 @@ impl ProcessorWrapper { source_tag, .. 
} => { - let message_channel = ProcessorMessageChannel::new( + let local_scheduler = NodeLocalSchedulerHandle::new( + runtime_config.input_pdata_channel.capacity, + runtime_config.control_channel.capacity, + ); + let inbox = ProcessorInbox::new_with_local_scheduler( Receiver::Local(control_receiver), pdata_receiver.ok_or_else(|| Error::ProcessorError { processor: node_id.clone(), @@ -341,6 +347,7 @@ impl ProcessorWrapper { error: "The pdata receiver must be defined at this stage".to_owned(), source_detail: String::new(), })?, + local_scheduler.clone(), node_id.index, node_interests, ); @@ -352,14 +359,16 @@ impl ProcessorWrapper { metrics_reporter, ); effect_handler.set_source_tagging(source_tag); + effect_handler.core.set_local_scheduler(local_scheduler); Ok(ProcessorWrapperRuntime::Local { processor, effect_handler, - message_channel, + inbox, }) } ProcessorWrapper::Shared { node_id, + runtime_config, processor, control_receiver, pdata_senders, @@ -368,7 +377,11 @@ impl ProcessorWrapper { source_tag, .. 
} => { - let message_channel = ProcessorMessageChannel::new( + let local_scheduler = NodeLocalSchedulerHandle::new( + runtime_config.input_pdata_channel.capacity, + runtime_config.control_channel.capacity, + ); + let inbox = ProcessorInbox::new_with_local_scheduler( Receiver::Shared(control_receiver), Receiver::Shared(pdata_receiver.ok_or_else(|| Error::ProcessorError { processor: node_id.clone(), @@ -376,6 +389,7 @@ impl ProcessorWrapper { error: "The pdata receiver must be defined at this stage".to_owned(), source_detail: String::new(), })?), + local_scheduler.clone(), node_id.index, node_interests, ); @@ -387,10 +401,11 @@ impl ProcessorWrapper { metrics_reporter, ); effect_handler.set_source_tagging(source_tag); + effect_handler.core.set_local_scheduler(local_scheduler); Ok(ProcessorWrapperRuntime::Shared { processor, effect_handler, - message_channel, + inbox, }) } } @@ -435,7 +450,7 @@ impl ProcessorWrapper { match runtime { ProcessorWrapperRuntime::Local { mut processor, - mut message_channel, + mut inbox, mut effect_handler, } => { effect_handler @@ -454,7 +469,7 @@ impl ProcessorWrapper { .start_periodic_telemetry(Duration::from_secs(1)) .await?; - while let Ok(msg) = message_channel.recv_when(processor.accept_pdata()).await { + while let Ok(msg) = inbox.recv_when(processor.accept_pdata()).await { processor.process(msg, &mut effect_handler).await?; } // Cancel periodic collection @@ -469,7 +484,7 @@ impl ProcessorWrapper { } ProcessorWrapperRuntime::Shared { mut processor, - mut message_channel, + mut inbox, mut effect_handler, } => { effect_handler @@ -488,7 +503,7 @@ impl ProcessorWrapper { .start_periodic_telemetry(Duration::from_secs(1)) .await?; - while let Ok(msg) = message_channel.recv_when(processor.accept_pdata()).await { + while let Ok(msg) = inbox.recv_when(processor.accept_pdata()).await { processor.process(msg, &mut effect_handler).await?; } // Cancel periodic collection diff --git a/rust/otap-dataflow/crates/engine/src/shared/exporter.rs 
b/rust/otap-dataflow/crates/engine/src/shared/exporter.rs index b2d9fd0b26..fb27569d72 100644 --- a/rust/otap-dataflow/crates/engine/src/shared/exporter.rs +++ b/rust/otap-dataflow/crates/engine/src/shared/exporter.rs @@ -35,7 +35,7 @@ use crate::control::{AckMsg, NackMsg, NodeControlMsg}; use crate::effect_handler::{EffectHandlerCore, TelemetryTimerCancelHandle, TimerCancelHandle}; use crate::error::Error; -use crate::message::{Message, SharedExporterMessageChannel}; +use crate::message::{Message, SharedExporterInbox}; use crate::node::NodeId; use crate::shared::message::SharedReceiver; use crate::terminal_state::TerminalState; @@ -48,13 +48,13 @@ use otap_df_telemetry::reporter::MetricsReporter; use std::marker::PhantomData; use std::time::Duration; -/// Send-friendly exporter message channel for shared exporter runtimes. -pub struct ExporterMessageChannel { - inner: SharedExporterMessageChannel, +/// Send-friendly exporter inbox for shared exporter runtimes. +pub struct ExporterInbox { + inner: SharedExporterInbox, } -impl ExporterMessageChannel { - /// Creates a new shared exporter message channel. +impl ExporterInbox { + /// Creates a new shared exporter inbox. #[must_use] pub(crate) fn new( control_rx: SharedReceiver>, @@ -63,14 +63,12 @@ impl ExporterMessageChannel { interests: Interests, ) -> Self { Self { - inner: SharedExporterMessageChannel::new_internal( - control_rx, pdata_rx, node_id, interests, - ), + inner: SharedExporterInbox::new_internal(control_rx, pdata_rx, node_id, interests), } } } -impl ExporterMessageChannel { +impl ExporterInbox { /// Receives the next message with pdata admission enabled. pub async fn recv(&mut self) -> Result, RecvError> { self.inner.recv_internal().await @@ -89,7 +87,7 @@ pub trait Exporter { /// Similar to local::exporter::Exporter::start, but operates in a Send context. 
async fn start( self: Box, - msg_chan: ExporterMessageChannel, + inbox: ExporterInbox, effect_handler: EffectHandler, ) -> Result; } diff --git a/rust/otap-dataflow/crates/engine/src/shared/processor.rs b/rust/otap-dataflow/crates/engine/src/shared/processor.rs index 27e5f3fd50..49e49e6232 100644 --- a/rust/otap-dataflow/crates/engine/src/shared/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/shared/processor.rs @@ -32,7 +32,8 @@ //! in parallel on different cores, each with its own processor instance. use crate::Interests; -use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender}; +use crate::WakeupError; +use crate::control::{AckMsg, NackMsg, RuntimeCtrlMsgSender, WakeupSlot}; use crate::effect_handler::{ EffectHandlerCore, SourceTagging, TelemetryTimerCancelHandle, TimerCancelHandle, }; @@ -251,6 +252,22 @@ impl EffectHandler { self.core.delay_data(when, data).await } + /// Requeue retained pdata onto this node later. + pub fn requeue_later(&self, when: Instant, data: Box) -> Result<(), PData> { + self.core.requeue_later(when, data) + } + + /// Set or replace a processor-local wakeup. + pub fn set_wakeup(&self, slot: WakeupSlot, when: Instant) -> Result<(), WakeupError> { + self.core.set_wakeup(slot, when) + } + + /// Cancel a previously scheduled processor-local wakeup. + #[must_use] + pub fn cancel_wakeup(&self, slot: WakeupSlot) -> bool { + self.core.cancel_wakeup(slot) + } + /// Reports metrics collected by the processor. #[allow(dead_code)] // Will be used in the future. ToDo report metrics from channel and messages. pub(crate) fn report_metrics( diff --git a/rust/otap-dataflow/crates/engine/src/testing/dst/README.md b/rust/otap-dataflow/crates/engine/src/testing/dst/README.md index f221bbb7d5..65c1e8c7bf 100644 --- a/rust/otap-dataflow/crates/engine/src/testing/dst/README.md +++ b/rust/otap-dataflow/crates/engine/src/testing/dst/README.md @@ -38,8 +38,8 @@ interleavings. 
The DST suite intentionally uses production components such as: -- `ProcessorMessageChannel` -- `ExporterMessageChannel` +- `ProcessorInbox` +- `ExporterInbox` - `RuntimeCtrlMsgManager` - `PipelineCompletionMsgDispatcher` diff --git a/rust/otap-dataflow/crates/engine/src/testing/dst/closed_admission.rs b/rust/otap-dataflow/crates/engine/src/testing/dst/closed_admission.rs index 98138b63d6..51c2e82e76 100644 --- a/rust/otap-dataflow/crates/engine/src/testing/dst/closed_admission.rs +++ b/rust/otap-dataflow/crates/engine/src/testing/dst/closed_admission.rs @@ -4,7 +4,7 @@ use super::{DstRng, SimClock, dst_seeds}; use crate::Interests; use crate::control::NodeControlMsg; -use crate::message::{Message, ProcessorMessageChannel, Receiver}; +use crate::message::{Message, ProcessorInbox, Receiver}; use crate::testing::dst::common::{setup_dst_runtime, yield_cycles}; use otap_df_channel::mpsc; use std::time::Duration; @@ -21,7 +21,7 @@ async fn run_closed_admission_deadline_seed(seed: u64) { let mut rng = DstRng::new(seed); let (control_tx, control_rx) = mpsc::Channel::>::new(32); let (pdata_tx, pdata_rx) = mpsc::Channel::::new(32); - let mut channel = ProcessorMessageChannel::new( + let mut channel = ProcessorInbox::new( Receiver::Local(crate::local::message::LocalReceiver::mpsc(control_rx)), Receiver::Local(crate::local::message::LocalReceiver::mpsc(pdata_rx)), 9, diff --git a/rust/otap-dataflow/crates/engine/src/testing/dst/heavy_ingress.rs b/rust/otap-dataflow/crates/engine/src/testing/dst/heavy_ingress.rs index 9bcb88562d..ced1c32cd2 100644 --- a/rust/otap-dataflow/crates/engine/src/testing/dst/heavy_ingress.rs +++ b/rust/otap-dataflow/crates/engine/src/testing/dst/heavy_ingress.rs @@ -8,7 +8,7 @@ use crate::control::{ AckMsg, ControlSenders, NackMsg, NodeControlMsg, PipelineCompletionMsg, RuntimeControlMsg, pipeline_completion_msg_channel, }; -use crate::message::{ExporterMessageChannel, Message, ProcessorMessageChannel, Receiver}; +use crate::message::{ExporterInbox, 
Message, ProcessorInbox, Receiver}; use crate::node::NodeType; use crate::pipeline_ctrl::PipelineCompletionMsgDispatcher; use crate::testing::dst::common::{ @@ -213,7 +213,7 @@ async fn run_backpressure_interblock_seed(seed: u64) { let processor_handle = tokio::task::spawn_local(async move { let mut inflight = 0usize; let mut was_paused = false; - let mut msg_channel = ProcessorMessageChannel::new( + let mut msg_channel = ProcessorInbox::new( processor_control_rx.expect("processor control receiver"), Receiver::Local(crate::local::message::LocalReceiver::mpsc(recv_to_proc_rx)), processor_id.index, @@ -277,7 +277,7 @@ async fn run_backpressure_interblock_seed(seed: u64) { nack: bool, } - let mut msg_channel = ExporterMessageChannel::new( + let mut msg_channel = ExporterInbox::new( exporter_control_rx.expect("exporter control receiver"), Receiver::Local(crate::local::message::LocalReceiver::mpsc(proc_to_export_rx)), exporter_id.index, diff --git a/rust/otap-dataflow/crates/engine/src/testing/dst/message_channel.rs b/rust/otap-dataflow/crates/engine/src/testing/dst/message_channel.rs index a1e6cf2bb4..c3c836af83 100644 --- a/rust/otap-dataflow/crates/engine/src/testing/dst/message_channel.rs +++ b/rust/otap-dataflow/crates/engine/src/testing/dst/message_channel.rs @@ -4,7 +4,7 @@ use super::{DstRng, SimClock, dst_seeds}; use crate::Interests; use crate::control::NodeControlMsg; -use crate::message::{ExporterMessageChannel, Message, ProcessorMessageChannel, Receiver}; +use crate::message::{ExporterInbox, Message, ProcessorInbox, Receiver}; use crate::testing::dst::common::setup_dst_runtime; use otap_df_channel::mpsc; use std::time::Duration; @@ -23,7 +23,7 @@ async fn run_message_channel_seed(seed: u64) { // Fairness phase. 
let (control_tx, control_rx) = mpsc::Channel::>::new(128); let (pdata_tx, pdata_rx) = mpsc::Channel::::new(128); - let mut channel = ProcessorMessageChannel::new( + let mut channel = ProcessorInbox::new( Receiver::Local(crate::local::message::LocalReceiver::mpsc(control_rx)), Receiver::Local(crate::local::message::LocalReceiver::mpsc(pdata_rx)), 1, @@ -72,7 +72,7 @@ async fn run_message_channel_seed(seed: u64) { // Draining phase with explicit deadline expiry. let (control_tx, control_rx) = mpsc::Channel::>::new(128); let (pdata_tx, pdata_rx) = mpsc::Channel::::new(128); - let mut channel = ProcessorMessageChannel::new( + let mut channel = ProcessorInbox::new( Receiver::Local(crate::local::message::LocalReceiver::mpsc(control_rx)), Receiver::Local(crate::local::message::LocalReceiver::mpsc(pdata_rx)), 2, @@ -124,7 +124,7 @@ async fn run_message_channel_seed(seed: u64) { // Exporter draining while admission stays closed. let (control_tx, control_rx) = mpsc::Channel::>::new(16); let (pdata_tx, pdata_rx) = mpsc::Channel::::new(16); - let mut channel = ExporterMessageChannel::new( + let mut channel = ExporterInbox::new( Receiver::Local(crate::local::message::LocalReceiver::mpsc(control_rx)), Receiver::Local(crate::local::message::LocalReceiver::mpsc(pdata_rx)), 3, diff --git a/rust/otap-dataflow/crates/engine/src/testing/dst/mod.rs b/rust/otap-dataflow/crates/engine/src/testing/dst/mod.rs index d85d6aa720..79c4947e1f 100644 --- a/rust/otap-dataflow/crates/engine/src/testing/dst/mod.rs +++ b/rust/otap-dataflow/crates/engine/src/testing/dst/mod.rs @@ -8,8 +8,8 @@ //! deterministic time and deterministic interleavings. Rather than //! reimplementing shutdown, timers, or Ack/Nack unwinding in a separate //! simulator, the harness runs the real -//! [`crate::message::ProcessorMessageChannel`], -//! [`crate::message::ExporterMessageChannel`], +//! [`crate::message::ProcessorInbox`], +//! [`crate::message::ExporterInbox`], //! 
[`RuntimeCtrlMsgManager`], and [`PipelineCompletionMsgDispatcher`] on the //! same kind of single-threaded runtime used by local engine components. //! diff --git a/rust/otap-dataflow/crates/engine/src/testing/processor.rs b/rust/otap-dataflow/crates/engine/src/testing/processor.rs index 6ab136905c..105ce66d1f 100644 --- a/rust/otap-dataflow/crates/engine/src/testing/processor.rs +++ b/rust/otap-dataflow/crates/engine/src/testing/processor.rs @@ -8,7 +8,7 @@ use crate::Interests; use crate::config::ProcessorConfig; -use crate::control::runtime_ctrl_msg_channel; +use crate::control::{NodeControlMsg, runtime_ctrl_msg_channel}; use crate::effect_handler::SourceTagging; use crate::error::Error; use crate::local::message::{LocalReceiver, LocalSender}; @@ -23,7 +23,7 @@ use otap_df_telemetry::reporter::MetricsReporter; use std::fmt::Debug; use std::future::Future; use std::marker::PhantomData; -use std::time::Duration; +use std::time::{Duration, Instant}; use tokio::task::{JoinHandle, LocalSet}; use tokio::time::sleep; @@ -91,6 +91,40 @@ impl TestContext { sleep(duration).await; } + /// Returns the next scheduled local-control deadline, if any. + #[must_use] + pub fn next_local_control_deadline(&self) -> Option { + match &self.runtime { + ProcessorWrapperRuntime::Local { effect_handler, .. } => effect_handler + .core + .local_scheduler + .as_ref() + .and_then(|scheduler| scheduler.next_expiry()), + ProcessorWrapperRuntime::Shared { effect_handler, .. } => effect_handler + .core + .local_scheduler + .as_ref() + .and_then(|scheduler| scheduler.next_expiry()), + } + } + + /// Pops the next due local control message using the provided logical time. + #[must_use] + pub fn take_due_local_control(&mut self, now: Instant) -> Option> { + match &mut self.runtime { + ProcessorWrapperRuntime::Local { effect_handler, .. } => effect_handler + .core + .local_scheduler + .as_ref() + .and_then(|scheduler| scheduler.pop_due(now)), + ProcessorWrapperRuntime::Shared { effect_handler, .. 
} => effect_handler + .core + .local_scheduler + .as_ref() + .and_then(|scheduler| scheduler.pop_due(now)), + } + } + /// Sets whether outgoing messages need source node tagging on the effect handler. pub fn set_source_tagging(&mut self, value: SourceTagging) { match &mut self.runtime { @@ -219,6 +253,30 @@ impl TestRuntime { } } + /// Creates a new test runtime with explicit channel capacities. + #[must_use] + pub fn with_channel_capacities( + control_channel_capacity: usize, + pdata_channel_capacity: usize, + ) -> Self { + let metrics_system = InternalTelemetrySystem::default(); + let config = ProcessorConfig::with_channel_capacities( + "test_processor", + control_channel_capacity, + pdata_channel_capacity, + ); + let (rt, local_tasks) = setup_test_runtime(); + + Self { + config, + rt, + local_tasks, + counter: CtrlMsgCounters::new(), + metrics_system, + _pd: PhantomData, + } + } + /// Returns the current receiver configuration. pub const fn config(&self) -> &ProcessorConfig { &self.config diff --git a/rust/otap-dataflow/crates/otap/tests/common/counting_exporter.rs b/rust/otap-dataflow/crates/otap/tests/common/counting_exporter.rs index 3e6c7e7eea..22d21fedaa 100644 --- a/rust/otap-dataflow/crates/otap/tests/common/counting_exporter.rs +++ b/rust/otap-dataflow/crates/otap/tests/common/counting_exporter.rs @@ -18,7 +18,7 @@ use otap_df_engine::control::{AckMsg, NodeControlMsg}; use otap_df_engine::error::Error; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_engine::{ConsumerEffectHandlerExtension, ExporterFactory}; @@ -88,7 +88,7 @@ static COUNTING_EXPORTER: ExporterFactory = ExporterFactory { impl Exporter for CountingExporter { async fn start( self: Box, - mut msg_chan: 
ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { loop { diff --git a/rust/otap-dataflow/crates/otap/tests/common/flaky_exporter.rs b/rust/otap-dataflow/crates/otap/tests/common/flaky_exporter.rs index 77c29e248f..9277c6bd74 100644 --- a/rust/otap-dataflow/crates/otap/tests/common/flaky_exporter.rs +++ b/rust/otap-dataflow/crates/otap/tests/common/flaky_exporter.rs @@ -23,7 +23,7 @@ use otap_df_engine::control::{AckMsg, NackMsg, NodeControlMsg}; use otap_df_engine::error::Error; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_engine::{ConsumerEffectHandlerExtension, ExporterFactory}; @@ -171,7 +171,7 @@ static FLAKY_EXPORTER: ExporterFactory = ExporterFactory { impl Exporter for FlakyExporter { async fn start( self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { loop { diff --git a/rust/otap-dataflow/crates/pdata/src/validation/collector.rs b/rust/otap-dataflow/crates/pdata/src/validation/collector.rs index 308ddca3cb..810fcf30f0 100644 --- a/rust/otap-dataflow/crates/pdata/src/validation/collector.rs +++ b/rust/otap-dataflow/crates/pdata/src/validation/collector.rs @@ -40,6 +40,10 @@ pub static COLLECTOR_PATH: LazyLock = LazyLock::new(|| { path }); +pub(super) fn collector_available() -> bool { + Path::new(COLLECTOR_PATH.as_str()).exists() +} + /// Helper function to spawn an async task that reads lines from a buffer and logs them with a prefix. /// Optionally checks for a message substring and sends a signal when it matches. 
async fn spawn_line_reader( diff --git a/rust/otap-dataflow/crates/pdata/src/validation/scenarios.rs b/rust/otap-dataflow/crates/pdata/src/validation/scenarios.rs index 2b6404cf66..018b4a66a9 100644 --- a/rust/otap-dataflow/crates/pdata/src/validation/scenarios.rs +++ b/rust/otap-dataflow/crates/pdata/src/validation/scenarios.rs @@ -28,6 +28,14 @@ pub async fn run_single_round_trip_test( I::Response: std::fmt::Debug + PartialEq + Default, F: FnOnce() -> I::Request + 'static, { + if !super::collector::collector_available() { + eprintln!( + "Skipping validation test because collector binary is unavailable at '{}'.", + super::collector::COLLECTOR_PATH.as_str() + ); + return; + } + match run_single_round_trip::(create_request, expected_error).await { Ok(_) => {} Err(err) => { diff --git a/rust/otap-dataflow/crates/validation/src/validation_exporter.rs b/rust/otap-dataflow/crates/validation/src/validation_exporter.rs index 3e8d23ee65..8b227645cf 100644 --- a/rust/otap-dataflow/crates/validation/src/validation_exporter.rs +++ b/rust/otap-dataflow/crates/validation/src/validation_exporter.rs @@ -17,7 +17,7 @@ use otap_df_engine::control::NodeControlMsg; use otap_df_engine::error::Error as EngineError; use otap_df_engine::exporter::ExporterWrapper; use otap_df_engine::local::exporter::{EffectHandler, Exporter}; -use otap_df_engine::message::{ExporterMessageChannel, Message}; +use otap_df_engine::message::{ExporterInbox, Message}; use otap_df_engine::node::NodeId; use otap_df_engine::terminal_state::TerminalState; use otap_df_otap::OTAP_EXPORTER_FACTORIES; @@ -170,7 +170,7 @@ impl ValidationExporter { impl Exporter for ValidationExporter { async fn start( mut self: Box, - mut msg_chan: ExporterMessageChannel, + mut msg_chan: ExporterInbox, effect_handler: EffectHandler, ) -> Result { let _ = effect_handler