diff --git a/rust/otap-dataflow/Cargo.toml b/rust/otap-dataflow/Cargo.toml index dbfa9b61e5..d813bb23d2 100644 --- a/rust/otap-dataflow/Cargo.toml +++ b/rust/otap-dataflow/Cargo.toml @@ -109,7 +109,8 @@ miette = { version="7.6.0", features = ["fancy"] } mimalloc = { version = "0.1.48", features = ["extended", "v3", "debug"] } libmimalloc-sys = { version = "0.1.44", features = ["extended", "v3"] } tikv-jemallocator = { version = "0.6.1" } -tikv-jemalloc-ctl = { version = "0.6.1" } +tikv-jemalloc-ctl = { version = "0.6.1", features = ["stats"] } +tikv-jemalloc-sys = "0.6.1" memchr = "2.8.0" memmap2 = "0.9" memory-stats = "1" diff --git a/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs b/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs index a1641640ae..87e273c2c0 100644 --- a/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs +++ b/rust/otap-dataflow/benchmarks/benches/control_channel/main.rs @@ -139,6 +139,7 @@ async fn consume_current_local( NodeControlMsg::CollectTelemetry { .. } => observed.telemetry_ticks += 1, NodeControlMsg::Config { .. } => observed.configs += 1, NodeControlMsg::DrainIngress { .. } + | NodeControlMsg::MemoryPressureChanged { .. } | NodeControlMsg::Shutdown { .. } | NodeControlMsg::DelayedData { .. } => { panic!("unexpected message in benchmark current local receiver"); @@ -202,6 +203,7 @@ async fn consume_current_shared( NodeControlMsg::CollectTelemetry { .. } => observed.telemetry_ticks += 1, NodeControlMsg::Config { .. } => observed.configs += 1, NodeControlMsg::DrainIngress { .. } + | NodeControlMsg::MemoryPressureChanged { .. } | NodeControlMsg::Shutdown { .. } | NodeControlMsg::DelayedData { .. 
} => { panic!("unexpected message in benchmark current shared receiver"); diff --git a/rust/otap-dataflow/crates/admin/src/health.rs b/rust/otap-dataflow/crates/admin/src/health.rs index 0ef3beee59..46b6078d44 100644 --- a/rust/otap-dataflow/crates/admin/src/health.rs +++ b/rust/otap-dataflow/crates/admin/src/health.rs @@ -1,7 +1,7 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 -//! Global health and status endpoints. +//! Process-wide health and status endpoints. //! //! - GET `/api/v1/status` - list all pipelines and their status //! - GET `/api/v1/livez` - liveness probe @@ -43,6 +43,8 @@ pub(crate) struct ProbeResponse { probe: &'static str, status: &'static str, generated_at: String, + #[serde(skip_serializing_if = "Option::is_none")] + message: Option<String>, #[serde(skip_serializing_if = "Vec::is_empty")] failing: Vec, } @@ -83,6 +85,16 @@ pub(crate) async fn livez(State(state): State<AppState>) -> (StatusCode, Json<P pub(crate) async fn readyz(State(state): State<AppState>) -> (StatusCode, Json<ProbeResponse>) { + if state.memory_pressure_state.should_fail_readiness() { + return ( + StatusCode::SERVICE_UNAVAILABLE, + Json(ProbeResponse::with_message( + "readyz", + "process memory pressure at hard limit", + )), + ); + } + let snapshot = state.observed_state_store.snapshot(); let failing = collect_condition_failures( &snapshot, @@ -160,6 +172,7 @@ impl ProbeResponse { probe, status: "ok", generated_at: Utc::now().to_rfc3339(), + message: None, failing: Vec::new(), } } @@ -169,9 +182,20 @@ impl ProbeResponse { probe, status: "failed", generated_at: Utc::now().to_rfc3339(), + message: None, failing, } } + + fn with_message(probe: &'static str, message: impl Into<String>) -> Self { + Self { + probe, + status: "failed", + generated_at: Utc::now().to_rfc3339(), + message: Some(message.into()), + failing: Vec::new(), + } + } } #[cfg(test)] mod tests { diff --git a/rust/otap-dataflow/crates/admin/src/lib.rs b/rust/otap-dataflow/crates/admin/src/lib.rs index 387674ed81..dad61cedf8 100644 --- a/rust/otap-dataflow/crates/admin/src/lib.rs +++ 
b/rust/otap-dataflow/crates/admin/src/lib.rs @@ -21,6 +21,7 @@ use tower::ServiceBuilder; use crate::error::Error; use otap_df_config::engine::HttpAdminSettings; use otap_df_engine::control::PipelineAdminSender; +use otap_df_engine::memory_limiter::MemoryPressureState; use otap_df_state::store::ObservedStateHandle; use otap_df_telemetry::log_tap::InternalLogTapHandle; use otap_df_telemetry::registry::TelemetryRegistryHandle; @@ -40,6 +41,9 @@ struct AppState { /// The control message senders for controlling pipelines. ctrl_msg_senders: Arc>>>, + + /// Shared process-wide memory pressure state. + memory_pressure_state: MemoryPressureState, } /// Run the admin HTTP server until shutdown is requested. @@ -48,6 +52,7 @@ pub async fn run( observed_store: ObservedStateHandle, ctrl_msg_senders: Vec>, metrics_registry: TelemetryRegistryHandle, + memory_pressure_state: MemoryPressureState, log_tap: Option, cancel: CancellationToken, ) -> Result<(), Error> { @@ -56,6 +61,7 @@ pub async fn run( metrics_registry, log_tap, ctrl_msg_senders: Arc::new(Mutex::new(ctrl_msg_senders)), + memory_pressure_state, }; let api_routes = Router::new() diff --git a/rust/otap-dataflow/crates/config/README.md b/rust/otap-dataflow/crates/config/README.md index 4ae8ff40cc..47c074ab0b 100644 --- a/rust/otap-dataflow/crates/config/README.md +++ b/rust/otap-dataflow/crates/config/README.md @@ -110,6 +110,7 @@ Policy families: - `policies.telemetry.tokio_metrics` - `policies.telemetry.runtime_metrics` - `policies.resources.core_allocation` +- `policies.resources.memory_limiter` Defaults: @@ -148,6 +149,17 @@ Observability note: - `engine.observability.pipeline.policies.resources` is intentionally unsupported and rejected. +Memory limiter policy: + +- `policies.resources.memory_limiter` is an optional process-wide policy. +- If configured, `mode` must be explicitly set to either `enforce` or + `observe_only`. +- This policy is supported only at top-level `policies.resources`. 
+ Group/pipeline placements are rejected. +- In Phase 1, `Soft` is informational only; `Hard` is the enforcement threshold. +- For full runtime behavior, metrics, and operational guidance, see + [`docs/memory-limiter-phase1.md`](../../docs/memory-limiter-phase1.md). + Resolution semantics: - precedence applies per policy family (`channel_capacity`, `health`, diff --git a/rust/otap-dataflow/crates/config/src/byte_units.rs b/rust/otap-dataflow/crates/config/src/byte_units.rs index c4bed89c7f..191a067faf 100644 --- a/rust/otap-dataflow/crates/config/src/byte_units.rs +++ b/rust/otap-dataflow/crates/config/src/byte_units.rs @@ -45,9 +45,30 @@ where Ok(Some(bytes as u32)) } +/// Deserialize an optional byte size as `u64`. +pub fn deserialize_u64<'de, D>(deserializer: D) -> Result<Option<u64>, D::Error> +where + D: Deserializer<'de>, +{ + let value = Option::<Value>::deserialize(deserializer)?; + let Some(value) = value else { + return Ok(None); + }; + + let bytes = match value { + Value::Number(value) => value, + Value::String(text) => { + let parsed: Byte = text.parse().map_err(DeError::custom)?; + parsed.as_u64() + } + }; + + Ok(Some(bytes)) +} + #[cfg(test)] mod tests { - use super::deserialize; + use super::{deserialize, deserialize_u64}; use serde::Deserialize; #[derive(Debug, Deserialize)] @@ -124,4 +145,17 @@ mod tests { let cfg = de_yaml("value: '0.5 KiB'").expect("should parse 0.5 KiB to 512 bytes"); assert_eq!(cfg.value, Some(512)); } + + #[derive(Debug, Deserialize)] + struct Holder64 { + #[serde(default, deserialize_with = "deserialize_u64")] + value: Option<u64>, + } + + #[test] + fn deserialize_u64_supports_large_values() { + let cfg = serde_yaml::from_str::<Holder64>("value: 6 GiB") + .expect("should parse large byte values"); + assert_eq!(cfg.value, Some(6 * 1024 * 1024 * 1024)); + } } diff --git a/rust/otap-dataflow/crates/config/src/engine.rs b/rust/otap-dataflow/crates/config/src/engine.rs index c0022dc64f..b37b84a138 100644 --- a/rust/otap-dataflow/crates/config/src/engine.rs +++ 
b/rust/otap-dataflow/crates/config/src/engine.rs @@ -318,6 +318,42 @@ groups: assert!(config.engine.observability.pipeline.is_some()); } + #[test] + fn from_yaml_requires_explicit_memory_limiter_mode() { + let yaml = r#" +version: otel_dataflow/v1 +policies: + resources: + memory_limiter: + source: auto + soft_limit: 1 GiB + hard_limit: 2 GiB +engine: {} +groups: + default: + pipelines: + main: + nodes: + receiver: + type: "urn:test:receiver:example" + config: null + exporter: + type: "urn:test:exporter:example" + config: null + connections: + - from: receiver + to: exporter +"#; + + let err = OtelDataflowSpec::from_yaml(yaml).expect_err("should reject missing mode"); + match err { + Error::DeserializationError { details, .. } => { + assert!(details.contains("missing field `mode`")); + } + other => panic!("expected deserialization error, got: {other:?}"), + } + } + #[test] fn from_yaml_rejects_reserved_system_group() { let yaml = r#" diff --git a/rust/otap-dataflow/crates/config/src/engine/validate.rs b/rust/otap-dataflow/crates/config/src/engine/validate.rs index c8704fe170..9330e85dee 100644 --- a/rust/otap-dataflow/crates/config/src/engine/validate.rs +++ b/rust/otap-dataflow/crates/config/src/engine/validate.rs @@ -78,6 +78,33 @@ impl OtelDataflowSpec { if let Err(e) = pipeline_group.validate(pipeline_group_id) { errors.push(e); } + if pipeline_group + .policies + .as_ref() + .and_then(|policies| policies.resources.as_ref()) + .and_then(|resources| resources.memory_limiter.as_ref()) + .is_some() + { + errors.push(Error::InvalidUserConfig { + error: format!( + "groups.{pipeline_group_id}.policies.resources.memory_limiter is not supported; configure the process-wide limiter only at top-level policies.resources.memory_limiter" + ), + }); + } + for (pipeline_id, pipeline) in &pipeline_group.pipelines { + if pipeline + .policies() + .and_then(|policies| policies.resources.as_ref()) + .and_then(|resources| resources.memory_limiter.as_ref()) + .is_some() + { + 
errors.push(Error::InvalidUserConfig { + error: format!( + "groups.{pipeline_group_id}.pipelines.{pipeline_id}.policies.resources.memory_limiter is not supported; configure the process-wide limiter only at top-level policies.resources.memory_limiter" + ), + }); + } + } } if !errors.is_empty() { diff --git a/rust/otap-dataflow/crates/config/src/policy.rs b/rust/otap-dataflow/crates/config/src/policy.rs index bfb9c8238b..5f6e8d0ed4 100644 --- a/rust/otap-dataflow/crates/config/src/policy.rs +++ b/rust/otap-dataflow/crates/config/src/policy.rs @@ -3,11 +3,13 @@ //! Engine and pipeline policy declarations. +use crate::byte_units; use crate::health::HealthPolicy; use crate::transport_headers_policy::TransportHeadersPolicy; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::fmt::Display; +use std::time::Duration; /// Top-level policy set. #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Default)] @@ -52,6 +54,12 @@ impl Policies { self.resources = Some(resources); } + /// Returns the explicitly configured resources policy, if any. + #[must_use] + pub fn resources(&self) -> Option<&ResourcesPolicy> { + self.resources.as_ref() + } + /// Resolves a fully-populated policy set from scopes ordered by precedence. 
#[must_use] pub fn resolve<'a>(scopes: impl IntoIterator) -> ResolvedPolicies { @@ -112,6 +120,59 @@ impl Policies { )); } } + if let Some(memory_limiter) = self + .resources + .as_ref() + .and_then(|resources| resources.memory_limiter.as_ref()) + { + let limiter_path = format!("{path_prefix}.resources.memory_limiter"); + if memory_limiter.check_interval < Duration::from_millis(100) { + errors.push(format!( + "{limiter_path}.check_interval must be at least 100ms" + )); + } + if memory_limiter.retry_after_secs == 0 { + errors.push(format!( + "{limiter_path}.retry_after_secs must be greater than 0" + )); + } + if memory_limiter.purge_on_hard && memory_limiter.purge_min_interval.is_zero() { + errors.push(format!( + "{limiter_path}.purge_min_interval must be greater than 0" + )); + } + match (memory_limiter.soft_limit, memory_limiter.hard_limit) { + (Some(soft_limit), Some(hard_limit)) => { + if soft_limit == 0 { + errors.push(format!( + "{limiter_path}.soft_limit must be greater than 0" + )); + } + if hard_limit <= soft_limit { + errors.push(format!( + "{limiter_path}.hard_limit must be greater than {limiter_path}.soft_limit" + )); + } + if let Some(hysteresis) = memory_limiter.hysteresis + && hysteresis >= soft_limit + { + errors.push(format!( + "{limiter_path}.hysteresis must be smaller than {limiter_path}.soft_limit" + )); + } + } + (None, None) => { + if memory_limiter.source != MemoryLimiterSource::Auto { + errors.push(format!( + "{limiter_path}.soft_limit and {limiter_path}.hard_limit must be set when {limiter_path}.source is not auto" + )); + } + } + _ => errors.push(format!( + "{limiter_path}.soft_limit and {limiter_path}.hard_limit must either both be set or both be omitted" + )), + } + } errors } } @@ -185,6 +246,10 @@ const fn default_true() -> bool { true } +const fn default_false() -> bool { + false +} + /// Resource-related policy declarations. 
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq, Default)] #[serde(deny_unknown_fields)] @@ -192,6 +257,100 @@ pub struct ResourcesPolicy { /// CPU core allocation strategy for this pipeline. #[serde(default)] pub core_allocation: CoreAllocation, + /// Optional process-wide memory limiter configuration. + /// + /// This is currently supported only at the top-level `policies.resources` + /// scope. Group and pipeline overrides are rejected during engine validation. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub memory_limiter: Option<MemoryLimiterPolicy>, +} + +/// Process-wide memory limiter declarations. +#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct MemoryLimiterPolicy { + /// Runtime behavior applied when the limiter classifies `Hard` pressure. + pub mode: MemoryLimiterMode, + /// Preferred memory source used by the limiter. + #[serde(default)] + pub source: MemoryLimiterSource, + /// Period between memory samples. + #[serde( + default = "default_memory_limiter_check_interval", + with = "humantime_serde" + )] + #[schemars(with = "String")] + pub check_interval: Duration, + /// Soft limit in bytes. When omitted with `source: auto`, the runtime derives a value + /// from the detected cgroup memory limit. + #[serde(default, deserialize_with = "byte_units::deserialize_u64")] + #[schemars(with = "Option<String>")] + pub soft_limit: Option<u64>, + /// Hard limit in bytes. When omitted with `source: auto`, the runtime derives a value + /// from the detected cgroup memory limit. + #[serde(default, deserialize_with = "byte_units::deserialize_u64")] + #[schemars(with = "Option<String>")] + pub hard_limit: Option<u64>, + /// Bytes below the soft limit required to leave `Soft` pressure. 
+ #[serde(default, deserialize_with = "byte_units::deserialize_u64")] + #[schemars(with = "Option<String>")] + pub hysteresis: Option<u64>, + /// Retry-After header value returned by HTTP receivers while shedding ingress in + /// `enforce` mode. + #[serde(default = "default_memory_limiter_retry_after_secs")] + pub retry_after_secs: u32, + /// Whether the admin readiness endpoint should fail while in `Hard` pressure in + /// `enforce` mode. + #[serde(default = "default_true")] + pub fail_readiness_on_hard: bool, + /// Whether the limiter should force a jemalloc purge when a tick's pre-purge sample + /// classifies as `Hard`. + #[serde(default = "default_false")] + pub purge_on_hard: bool, + /// Minimum interval between forced jemalloc purges. + #[serde( + default = "default_memory_limiter_purge_min_interval", + with = "humantime_serde" + )] + #[schemars(with = "String")] + pub purge_min_interval: Duration, +} + +/// Enforcement behavior for the process-wide limiter. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, JsonSchema, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum MemoryLimiterMode { + /// Update metrics/logs and reject ingress at `Hard`. + Enforce, + /// Update metrics/logs only; `Hard` remains advisory. + ObserveOnly, +} + +const fn default_memory_limiter_check_interval() -> Duration { + Duration::from_secs(1) +} + +const fn default_memory_limiter_retry_after_secs() -> u32 { + 1 +} + +const fn default_memory_limiter_purge_min_interval() -> Duration { + Duration::from_secs(5) +} + +/// Preferred memory source for the process-wide limiter. +#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize, JsonSchema, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum MemoryLimiterSource { + /// Prefer cgroup memory if available, otherwise fall back to RSS and then jemalloc resident. + #[default] + Auto, + /// Use cgroup memory accounting only. + Cgroup, + /// Use process RSS only. + Rss, + /// Use jemalloc resident bytes only. 
+ JemallocResident, } /// Defines how CPU cores should be allocated for pipeline execution. @@ -318,7 +477,8 @@ const fn default_pdata_channel_capacity() -> usize { #[cfg(test)] mod tests { - use super::Policies; + use super::{MemoryLimiterMode, MemoryLimiterPolicy, MemoryLimiterSource, Policies}; + use std::time::Duration; #[test] fn defaults_match_expected_values() { @@ -446,4 +606,162 @@ mod tests { let policy: super::TelemetryPolicy = serde_yaml::from_str(yaml).expect("parse"); assert_eq!(policy.runtime_metrics, super::MetricLevel::Basic); } + + #[test] + fn validates_memory_limiter_settings() { + let policies = Policies { + resources: Some(super::ResourcesPolicy { + core_allocation: super::CoreAllocation::AllCores, + memory_limiter: Some(MemoryLimiterPolicy { + mode: MemoryLimiterMode::Enforce, + source: MemoryLimiterSource::Auto, + check_interval: Duration::from_millis(50), + soft_limit: Some(200), + hard_limit: Some(100), + hysteresis: Some(200), + retry_after_secs: 1, + fail_readiness_on_hard: true, + purge_on_hard: false, + purge_min_interval: Duration::from_secs(5), + }), + }), + ..Policies::default() + }; + + let errors = policies.validation_errors("policies"); + assert_eq!(errors.len(), 3); + assert!(errors.iter().any(|error| error.contains("check_interval"))); + assert!(errors.iter().any(|error| error.contains("hard_limit"))); + assert!(errors.iter().any(|error| error.contains("hysteresis"))); + } + + #[test] + fn validates_memory_limiter_requires_both_limits_when_explicit() { + let policies = Policies { + resources: Some(super::ResourcesPolicy { + core_allocation: super::CoreAllocation::AllCores, + memory_limiter: Some(MemoryLimiterPolicy { + mode: MemoryLimiterMode::Enforce, + source: MemoryLimiterSource::Rss, + check_interval: Duration::from_secs(1), + soft_limit: Some(100), + hard_limit: None, + hysteresis: None, + retry_after_secs: 1, + fail_readiness_on_hard: true, + purge_on_hard: false, + purge_min_interval: Duration::from_secs(5), + }), + }), 
+ ..Policies::default() + }; + + let errors = policies.validation_errors("policies"); + assert_eq!(errors.len(), 1); + assert!(errors[0].contains("must either both be set or both be omitted")); + } + + #[test] + fn validates_memory_limiter_rejects_zero_soft_limit() { + let policies = Policies { + resources: Some(super::ResourcesPolicy { + core_allocation: super::CoreAllocation::AllCores, + memory_limiter: Some(MemoryLimiterPolicy { + mode: MemoryLimiterMode::Enforce, + source: MemoryLimiterSource::Rss, + check_interval: Duration::from_secs(1), + soft_limit: Some(0), + hard_limit: Some(100), + hysteresis: None, + retry_after_secs: 1, + fail_readiness_on_hard: true, + purge_on_hard: false, + purge_min_interval: Duration::from_secs(5), + }), + }), + ..Policies::default() + }; + + let errors = policies.validation_errors("policies"); + assert_eq!(errors.len(), 1); + assert!(errors[0].contains("soft_limit must be greater than 0")); + } + + #[test] + fn validates_memory_limiter_requires_limits_for_non_auto_sources() { + let policies = Policies { + resources: Some(super::ResourcesPolicy { + core_allocation: super::CoreAllocation::AllCores, + memory_limiter: Some(MemoryLimiterPolicy { + mode: MemoryLimiterMode::Enforce, + source: MemoryLimiterSource::Rss, + check_interval: Duration::from_secs(1), + soft_limit: None, + hard_limit: None, + hysteresis: None, + retry_after_secs: 1, + fail_readiness_on_hard: true, + purge_on_hard: false, + purge_min_interval: Duration::from_secs(5), + }), + }), + ..Policies::default() + }; + + let errors = policies.validation_errors("policies"); + assert_eq!(errors.len(), 1); + assert!(errors[0].contains("source is not auto")); + } + + #[test] + fn validates_memory_limiter_rejects_zero_retry_after_secs() { + let policies = Policies { + resources: Some(super::ResourcesPolicy { + core_allocation: super::CoreAllocation::AllCores, + memory_limiter: Some(MemoryLimiterPolicy { + mode: MemoryLimiterMode::Enforce, + source: MemoryLimiterSource::Auto, + 
check_interval: Duration::from_secs(1), + soft_limit: Some(100), + hard_limit: Some(200), + hysteresis: None, + retry_after_secs: 0, + fail_readiness_on_hard: true, + purge_on_hard: false, + purge_min_interval: Duration::from_secs(5), + }), + }), + ..Policies::default() + }; + + let errors = policies.validation_errors("policies"); + assert_eq!(errors.len(), 1); + assert!(errors[0].contains("retry_after_secs must be greater than 0")); + } + + #[test] + fn validates_memory_limiter_rejects_zero_purge_min_interval() { + let policies = Policies { + resources: Some(super::ResourcesPolicy { + core_allocation: super::CoreAllocation::AllCores, + memory_limiter: Some(MemoryLimiterPolicy { + mode: MemoryLimiterMode::Enforce, + source: MemoryLimiterSource::Auto, + check_interval: Duration::from_secs(1), + soft_limit: Some(100), + hard_limit: Some(200), + hysteresis: None, + retry_after_secs: 1, + fail_readiness_on_hard: true, + purge_on_hard: true, + purge_min_interval: Duration::ZERO, + }), + }), + ..Policies::default() + }; + + let errors = policies.validation_errors("policies"); + assert_eq!(errors.len(), 1); + assert!(errors[0].contains("purge_min_interval must be greater than 0")); + } } diff --git a/rust/otap-dataflow/crates/controller/src/error.rs b/rust/otap-dataflow/crates/controller/src/error.rs index 33a74ecf80..f9e520123c 100644 --- a/rust/otap-dataflow/crates/controller/src/error.rs +++ b/rust/otap-dataflow/crates/controller/src/error.rs @@ -31,6 +31,13 @@ pub enum Error { #[error("Telemetry error: {0}")] TelemetryError(#[from] otap_df_telemetry::error::Error), + /// Memory limiter configuration or runtime initialization error. + #[error("Memory limiter error: {message}")] + MemoryLimiterError { + /// Human-readable error message. + message: String, + }, + /// Pipeline runtime error. 
#[error("Pipeline runtime error: {source}")] PipelineRuntimeError { diff --git a/rust/otap-dataflow/crates/controller/src/lib.rs b/rust/otap-dataflow/crates/controller/src/lib.rs index 082895a3ab..971ac98018 100644 --- a/rust/otap-dataflow/crates/controller/src/lib.rs +++ b/rust/otap-dataflow/crates/controller/src/lib.rs @@ -50,6 +50,7 @@ use otap_df_config::engine::{ SYSTEM_OBSERVABILITY_PIPELINE_ID, SYSTEM_PIPELINE_GROUP_ID, }; use otap_df_config::node::{NodeKind, NodeUserConfig}; +use otap_df_config::policy::MemoryLimiterMode; use otap_df_config::policy::{ChannelCapacityPolicy, CoreAllocation, TelemetryPolicy}; use otap_df_config::topic::{ TopicAckPropagationMode, TopicBackendKind, TopicBroadcastOnLagPolicy, TopicImplSelectionPolicy, @@ -72,6 +73,10 @@ use otap_df_engine::entity_context::{ node_entity_key, pipeline_entity_key, set_pipeline_entity_key, }; use otap_df_engine::error::{Error as EngineError, error_summary_from}; +use otap_df_engine::memory_limiter::{ + EffectiveMemoryLimiter, MemoryLimiterTick, MemoryPressureBehaviorConfig, MemoryPressureChanged, + MemoryPressureLevel, +}; use otap_df_engine::topic::{ InMemoryBackend, PipelineTopicBinding, TopicBroker, TopicOptions, TopicPublishOutcomeConfig, TopicSet, @@ -1069,6 +1074,97 @@ impl { + return Ok::<(), otap_df_telemetry::error::Error>(()); + } + _ = ticker.tick() => { + match limiter.tick(&limiter_state) { + Ok(tick) => { + if tick.transitioned() { + transition_generation += 1; + let _ = limiter_updates.send(MemoryPressureChanged { + generation: transition_generation, + level: tick.current_level, + retry_after_secs: limiter_state.retry_after_secs(), + usage_bytes: tick.sample.usage_bytes, + }); + } + Self::log_memory_limiter_tick(tick) + } + Err(err) => { + otel_warn!( + "process_memory_limiter.sample_failed", + error = err.as_str() + ); + } + } + } + } + } + }, + )?); + } // Declare all topics up front before any pipeline thread starts. 
let declared_topics = Self::declare_topics(&engine_config)?; @@ -1117,6 +1213,7 @@ impl { + otel_warn!( + "process_memory_limiter.transition", + previous = format!("{:?}", tick.previous_level), + current = format!("{:?}", tick.current_level), + source = source, + usage_bytes = usage_bytes, + soft_limit_bytes = tick.soft_limit_bytes, + hard_limit_bytes = tick.hard_limit_bytes + ); + } + MemoryPressureLevel::Soft => { + otel_info!( + "process_memory_limiter.transition", + previous = format!("{:?}", tick.previous_level), + current = "Soft", + source = source, + usage_bytes = usage_bytes, + soft_limit_bytes = tick.soft_limit_bytes, + hard_limit_bytes = tick.hard_limit_bytes + ); + } + MemoryPressureLevel::Normal => { + otel_info!( + "process_memory_limiter.transition", + previous = format!("{:?}", tick.previous_level), + current = "Normal", + source = source, + usage_bytes = usage_bytes, + soft_limit_bytes = tick.soft_limit_bytes, + hard_limit_bytes = tick.hard_limit_bytes + ); + } + } + } + /// Selects which CPU cores to use based on the given allocation. 
fn select_cores_for_allocation( mut available_core_ids: Vec, @@ -1567,6 +1744,7 @@ impl, tracing_setup: TracingSetup, ) -> Result, Error>>)>, Error> { let (internal_config, channel_capacity_policy, telemetry_policy): ( @@ -1630,6 +1808,7 @@ impl, pipeline_completion_msg_tx: PipelineCompletionMsgSender, pipeline_completion_msg_rx: PipelineCompletionMsgReceiver, + memory_pressure_rx: tokio::sync::watch::Receiver, tracing_setup: TracingSetup, internal_telemetry: Option<( InternalTelemetrySettings, @@ -1784,6 +1965,7 @@ impl, ) { if let Some(core_allocation) = core_allocation_override(num_cores, core_id_range) { - engine_cfg + let mut resources = engine_cfg .policies - .set_resources(ResourcesPolicy { core_allocation }); + .resources() + .cloned() + .unwrap_or_else(ResourcesPolicy::default); + resources.core_allocation = core_allocation; + engine_cfg.policies.set_resources(resources); } if let Some(http_admin) = http_admin_bind_override(http_admin_bind) { engine_cfg.engine.http_admin = Some(http_admin); diff --git a/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_http_exporter/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_http_exporter/mod.rs index 8b6a3c5948..07048399a7 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_http_exporter/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/exporters/otlp_http_exporter/mod.rs @@ -811,6 +811,7 @@ mod test { server_settings, ack_registry, Arc::new(Mutex::new(server_metrics)), + otap_df_engine::memory_limiter::SharedReceiverAdmissionState::default(), None, server_cancellation_token, ) @@ -969,6 +970,7 @@ mod test { server_settings, ack_registry, Arc::new(Mutex::new(server_metrics)), + otap_df_engine::memory_limiter::SharedReceiverAdmissionState::default(), None, server_cancellation_token, ) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs index 5c9b76d542..f180cfcffd 
100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/batch_processor/mod.rs @@ -1123,6 +1123,7 @@ impl local::Processor for BatchProcessor { } NodeControlMsg::Ack(ack) => self.handle_ack(effect, ack).await, NodeControlMsg::Nack(nack) => self.handle_nack(effect, nack).await, + NodeControlMsg::MemoryPressureChanged { .. } => Ok(()), NodeControlMsg::DrainIngress { .. } => Ok(()), NodeControlMsg::TimerTick { .. } => unreachable!(), }, diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs index 2988346d60..4a7496c44d 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/durable_buffer_processor/mod.rs @@ -1932,6 +1932,7 @@ impl otap_df_engine::local::processor::Processor for DurableBuffer { otel_debug!("durable_buffer.config.update", config = ?config); Ok(()) } + NodeControlMsg::MemoryPressureChanged { .. } => Ok(()), NodeControlMsg::DrainIngress { .. } => Ok(()), NodeControlMsg::DelayedData { data, .. } => { // Check if this is a retry ticket (has BundleRef + retry_count in calldata) diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs index 7409b2ac9d..526a02721c 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/log_sampling_processor/mod.rs @@ -198,6 +198,7 @@ impl local::Processor for LogSamplingProcessor { | NodeControlMsg::Config { .. } | NodeControlMsg::Ack(_) | NodeControlMsg::Nack(_) + | NodeControlMsg::MemoryPressureChanged { .. } | NodeControlMsg::DrainIngress { .. } | NodeControlMsg::DelayedData { .. 
} => Ok(()), }, diff --git a/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs index 83dd314126..5a0ba4c14c 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/processors/retry_processor/mod.rs @@ -651,6 +651,7 @@ impl Processor for RetryProcessor { NodeControlMsg::TimerTick { .. } => { unreachable!("unused"); } + NodeControlMsg::MemoryPressureChanged { .. } => Ok(()), NodeControlMsg::DrainIngress { .. } => Ok(()), NodeControlMsg::Shutdown { .. } => Ok(()), }, diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/otap_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/otap_receiver/mod.rs index 6afd26e412..8f76fe41a1 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/otap_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/otap_receiver/mod.rs @@ -13,6 +13,7 @@ use otap_df_config::tls::TlsServerConfig; use otap_df_otap::OTAP_RECEIVER_FACTORIES; use otap_df_otap::compression::CompressionMethod; +use otap_df_otap::memory_pressure_layer::{MemoryPressureLayer, MemoryPressureRejectionMetrics}; use otap_df_otap::otap_grpc::middleware::zstd_header::ZstdRequestHeaderAdapter; use otap_df_otap::otap_grpc::otlp::server::{RouteResponse, SharedState}; use otap_df_otap::otap_grpc::{ @@ -32,6 +33,7 @@ use otap_df_engine::config::ReceiverConfig; use otap_df_engine::context::PipelineContext; use otap_df_engine::control::{AckMsg, NackMsg, NodeControlMsg}; use otap_df_engine::error::{Error, ReceiverErrorKind, format_error_sources}; +use otap_df_engine::memory_limiter::SharedReceiverAdmissionState; use otap_df_engine::node::NodeId; use otap_df_engine::receiver::ReceiverWrapper; use otap_df_engine::shared::receiver as shared; @@ -49,6 +51,7 @@ use serde_json::Value; use std::net::SocketAddr; use std::ops::Add; use std::sync::Arc; +use 
std::sync::atomic::{AtomicU64, Ordering}; use std::task::Poll; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; @@ -110,6 +113,8 @@ const fn default_wait_for_result() -> bool { pub struct OTAPReceiver { config: Config, metrics: MetricSet<OtapReceiverMetrics>, + memory_pressure_metrics: Arc<SharedOtapMemoryPressureMetrics>, + admission_state: SharedReceiverAdmissionState, } /// Declares the OTAP receiver as a shared receiver factory @@ -150,7 +155,14 @@ impl OTAPReceiver { // Register OTAP receiver metrics for this node. let metrics = pipeline_ctx.register_metrics::<OtapReceiverMetrics>(); - Ok(OTAPReceiver { config, metrics }) + Ok(OTAPReceiver { + config, + metrics, + memory_pressure_metrics: Arc::new(SharedOtapMemoryPressureMetrics::default()), + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), + }) } fn route_ack_response(&self, states: &SharedStates, ack: AckMsg) -> RouteResponse { @@ -203,6 +215,10 @@ impl OTAPReceiver { RouteResponse::None => {} } } + + fn flush_memory_pressure_metrics(&mut self) { + self.memory_pressure_metrics.flush_into(&mut self.metrics); + } } /// OTAP receiver metrics. @@ -220,6 +236,41 @@ pub struct OtapReceiverMetrics { /// Number of invalid/expired acks/nacks. #[metric(unit = "{ack_or_nack}")] pub acks_nacks_invalid_or_expired: Counter<u64>, + + /// Number of OTAP RPCs rejected before entering the pipeline. + #[metric(unit = "{requests}")] + pub rejected_requests: Counter<u64>, + + /// Number of OTAP RPCs rejected specifically because memory pressure was active. 
+ #[metric(unit = "{requests}")] + pub refused_memory_pressure: Counter<u64>, +} + +#[derive(Default)] +struct SharedOtapMemoryPressureMetrics { + rejected_requests: AtomicU64, + refused_memory_pressure: AtomicU64, +} + +impl SharedOtapMemoryPressureMetrics { + fn flush_into(&self, metrics: &mut MetricSet<OtapReceiverMetrics>) { + let rejected_requests = self.rejected_requests.swap(0, Ordering::Relaxed); + if rejected_requests > 0 { + metrics.rejected_requests.add(rejected_requests); + } + + let refused_memory_pressure = self.refused_memory_pressure.swap(0, Ordering::Relaxed); + if refused_memory_pressure > 0 { + metrics.refused_memory_pressure.add(refused_memory_pressure); + } + } +} + +impl MemoryPressureRejectionMetrics for SharedOtapMemoryPressureMetrics { + fn record_memory_pressure_rejection(&self) { + let _ = self.rejected_requests.fetch_add(1, Ordering::Relaxed); + let _ = self.refused_memory_pressure.fetch_add(1, Ordering::Relaxed); + } } /// State shared between gRPC server task and the effect handler. 
@@ -272,6 +323,8 @@ impl shared::Receiver for OTAPReceiver { response_stream_channel_size: self.config.response_stream_channel_size, max_concurrent_requests: self.config.max_concurrent_requests, wait_for_result: self.config.wait_for_result, + admission_state: self.admission_state.clone(), + memory_pressure_rejection_metrics: Some(self.memory_pressure_metrics.clone()), }; //create services for the grpc server and clone the effect handler to pass message @@ -326,6 +379,10 @@ impl shared::Receiver for OTAPReceiver { let handshake_timeout = self.config.tls.as_ref().and_then(|t| t.handshake_timeout); let server = server_builder + .layer(MemoryPressureLayer::with_metrics( + self.admission_state.clone(), + self.memory_pressure_metrics.clone(), + )) .layer(MiddlewareLayer::new(ZstdRequestHeaderAdapter::default())) .add_service(logs_server) .add_service(metrics_server) @@ -401,6 +458,7 @@ impl shared::Receiver for OTAPReceiver { _ = handle.cancel().await; } effect_handler.notify_receiver_drained().await?; + self.flush_memory_pressure_metrics(); terminal_state = TerminalState::new(deadline, [self.metrics.snapshot()]); break; } @@ -440,12 +498,17 @@ impl shared::Receiver for OTAPReceiver { if let Some(handle) = telemetry_cancel_handle.take() { _ = handle.cancel().await; } + self.flush_memory_pressure_metrics(); terminal_state = TerminalState::new(deadline, [self.metrics.snapshot()]); break; } Ok(NodeControlMsg::CollectTelemetry { mut metrics_reporter }) => { + self.flush_memory_pressure_metrics(); _ = metrics_reporter.report(&mut self.metrics); } + Ok(NodeControlMsg::MemoryPressureChanged { update }) => { + self.admission_state.apply(update); + } Ok(NodeControlMsg::Ack(ack)) => { self.handle_ack_response(self.route_ack_response(&states, ack)); } @@ -478,6 +541,7 @@ impl shared::Receiver for OTAPReceiver { if let Some(handle) = telemetry_cancel_handle.take() { _ = handle.cancel().await; } + self.flush_memory_pressure_metrics(); terminal_state = TerminalState::new( 
clock::now().add(Duration::from_secs(1)), [self.metrics.snapshot()], @@ -526,6 +590,7 @@ mod tests { receiver::{NotSendValidateContext, TestContext, TestRuntime}, test_node, }; + use otap_df_otap::memory_pressure_layer::MemoryPressureRejectionMetrics; use otap_df_otap::otap_mock::create_otap_batch; use otap_df_otap::pdata::OtapPdata; use otap_df_otap::testing::{next_ack, next_nack}; @@ -541,6 +606,7 @@ mod tests { use std::net::SocketAddr; use std::pin::Pin; use std::sync::Arc; + use std::sync::atomic::Ordering; use std::time::Instant; use tokio::time::{Duration, timeout}; @@ -1095,6 +1161,49 @@ mod tests { assert!(receiver.config.timeout.is_none()); } + #[test] + fn shared_memory_pressure_metrics_flush_into_reported_metric_set() { + use serde_json::json; + + let telemetry_registry_handle = otap_df_telemetry::registry::TelemetryRegistryHandle::new(); + let controller_ctx = + otap_df_engine::context::ControllerContext::new(telemetry_registry_handle); + let pipeline_ctx = + controller_ctx.pipeline_context_with("grp".into(), "pipeline".into(), 0, 1, 0); + + let config = json!({ + "listening_addr": "127.0.0.1:4317", + "response_stream_channel_size": 100 + }); + let mut receiver = OTAPReceiver::from_config(pipeline_ctx, &config).unwrap(); + + receiver + .memory_pressure_metrics + .record_memory_pressure_rejection(); + receiver + .memory_pressure_metrics + .record_memory_pressure_rejection(); + + receiver.flush_memory_pressure_metrics(); + + assert_eq!(receiver.metrics.rejected_requests.get(), 2); + assert_eq!(receiver.metrics.refused_memory_pressure.get(), 2); + assert_eq!( + receiver + .memory_pressure_metrics + .rejected_requests + .load(Ordering::Relaxed), + 0 + ); + assert_eq!( + receiver + .memory_pressure_metrics + .refused_memory_pressure + .load(Ordering::Relaxed), + 0 + ); + } + #[test] fn test_otap_receiver_ack() { let test_runtime = TestRuntime::new(); diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/otlp_receiver/mod.rs 
b/rust/otap-dataflow/crates/core-nodes/src/receivers/otlp_receiver/mod.rs index 4563d5957f..4f1a806e47 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/otlp_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/otlp_receiver/mod.rs @@ -34,10 +34,12 @@ use otap_df_engine::config::ReceiverConfig; use otap_df_engine::context::PipelineContext; use otap_df_engine::control::{AckMsg, NackMsg, NodeControlMsg}; use otap_df_engine::error::{Error, ReceiverErrorKind, format_error_sources}; +use otap_df_engine::memory_limiter::SharedReceiverAdmissionState; use otap_df_engine::node::NodeId; use otap_df_engine::receiver::ReceiverWrapper; use otap_df_engine::shared::receiver as shared; use otap_df_engine::terminal_state::TerminalState; +use otap_df_otap::memory_pressure_layer::MemoryPressureLayer; use otap_df_otap::otap_grpc::common; use otap_df_otap::otap_grpc::common::AckRegistry; use otap_df_otap::otap_grpc::server_settings::GrpcServerSettings; @@ -184,6 +186,7 @@ pub struct OTLPReceiver { // Arc> so we can share metrics with the gRPC services which are `Send` due to // tonic requirements. metrics: Arc>>, + admission_state: SharedReceiverAdmissionState, // Global concurrency cap derived from downstream capacity. When both gRPC and HTTP are // enabled, this prevents combined ingress from exceeding what the pipeline can absorb. 
global_max_concurrent_requests: Option, @@ -264,6 +267,9 @@ impl OTLPReceiver { Ok(Self { config, metrics, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), global_max_concurrent_requests: None, }) } @@ -393,6 +399,9 @@ impl OTLPReceiver { } => { _ = metrics_reporter.report(&mut *self.metrics.lock()); } + NodeControlMsg::MemoryPressureChanged { update } => { + self.admission_state.apply(update); + } NodeControlMsg::Ack(ack) => { self.handle_ack(registry, ack); } @@ -531,11 +540,22 @@ impl shared::Receiver for OTLPReceiver { let limit_layer = if let Some(global) = global_semaphore.clone() { Either::Left( ServiceBuilder::new() + .layer(MemoryPressureLayer::with_otlp_metrics( + self.admission_state.clone(), + self.metrics.clone(), + )) .layer(GlobalConcurrencyLimitLayer::new(grpc_max)) .layer(SharedConcurrencyLayer::new(global)), ) } else { - Either::Right(GlobalConcurrencyLimitLayer::new(grpc_max)) + Either::Right( + ServiceBuilder::new() + .layer(MemoryPressureLayer::with_otlp_metrics( + self.admission_state.clone(), + self.metrics.clone(), + )) + .layer(GlobalConcurrencyLimitLayer::new(grpc_max)), + ) }; let mut server = @@ -606,6 +626,7 @@ impl shared::Receiver for OTLPReceiver { http_config, ack_registry.clone(), self.metrics.clone(), + self.admission_state.clone(), global_semaphore.clone(), http_shutdown.clone(), ))) @@ -1059,6 +1080,9 @@ mod tests { metrics: Arc::new(Mutex::new( pipeline_ctx.register_metrics::(), )), + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), global_max_concurrent_requests: None, }; receiver.tune_max_concurrent_requests(16); @@ -1878,6 +1902,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, 
@@ -1923,6 +1950,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -2013,6 +2043,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -2110,6 +2143,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -2170,6 +2206,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -2239,6 +2278,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -2327,6 +2369,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -2389,6 +2434,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, 
test_node(test_runtime.config().name.clone()), node_config, @@ -2450,6 +2498,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -2547,6 +2598,9 @@ mod tests { metrics: Arc::new(Mutex::new( pipeline_ctx.register_metrics::(), )), + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), global_max_concurrent_requests: None, }, test_node(test_runtime.config().name.clone()), @@ -2628,6 +2682,9 @@ mod tests { metrics: Arc::new(Mutex::new( pipeline_ctx.register_metrics::(), )), + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), global_max_concurrent_requests: None, }, test_node(test_runtime.config().name.clone()), @@ -2725,6 +2782,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -2807,6 +2867,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -2869,6 +2932,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -2932,6 +2998,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: 
SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -3018,6 +3087,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -3117,6 +3189,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -3197,6 +3272,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -3285,6 +3363,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -3339,6 +3420,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -3398,6 +3482,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: None, + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, @@ -3517,6 +3604,9 @@ mod tests { pipeline_ctx.register_metrics::(), )), global_max_concurrent_requests: 
Some(1), + admission_state: SharedReceiverAdmissionState::from_process_state( + &pipeline_ctx.memory_pressure_state(), + ), }, test_node(test_runtime.config().name.clone()), node_config, diff --git a/rust/otap-dataflow/crates/core-nodes/src/receivers/syslog_cef_receiver/mod.rs b/rust/otap-dataflow/crates/core-nodes/src/receivers/syslog_cef_receiver/mod.rs index ba1b5a2aaf..08f78f438f 100644 --- a/rust/otap-dataflow/crates/core-nodes/src/receivers/syslog_cef_receiver/mod.rs +++ b/rust/otap-dataflow/crates/core-nodes/src/receivers/syslog_cef_receiver/mod.rs @@ -8,6 +8,7 @@ use otap_df_config::node::NodeUserConfig; use otap_df_engine::config::ReceiverConfig; use otap_df_engine::context::PipelineContext; use otap_df_engine::control::NodeControlMsg; +use otap_df_engine::memory_limiter::LocalReceiverAdmissionState; use otap_df_engine::node::NodeId; use otap_df_engine::receiver::ReceiverWrapper; use otap_df_engine::terminal_state::TerminalState; @@ -239,6 +240,7 @@ struct SyslogCefReceiver { config: Config, /// RFC-aligned internal telemetry for this receiver metrics: Rc>>, + admission_state: LocalReceiverAdmissionState, } impl SyslogCefReceiver { @@ -248,6 +250,9 @@ impl SyslogCefReceiver { SyslogCefReceiver { config, metrics: Rc::new(RefCell::new(metrics)), + admission_state: LocalReceiverAdmissionState::from_process_state( + &pipeline.memory_pressure_state(), + ), } } @@ -265,6 +270,25 @@ impl SyslogCefReceiver { } } +/// Discards any buffered records without sending downstream. +/// +/// Used when memory pressure is active: flushing downstream could block behind the +/// full pipeline that pressure is trying to protect. 
+fn drop_syslog_batch( + metrics: &Rc>>, + arrow_records_builder: &mut ArrowRecordsBuilder, +) { + let items = u64::from(arrow_records_builder.len()); + if items == 0 { + return; + } + *arrow_records_builder = ArrowRecordsBuilder::new(); + metrics + .borrow_mut() + .received_logs_rejected_memory_pressure + .add(items); +} + /// Add the syslog receiver to the receiver factory #[allow(unsafe_code)] #[distributed_slice(OTAP_RECEIVER_FACTORIES)] @@ -393,6 +417,9 @@ impl local::Receiver for SyslogCefReceiver { let mut m = self.metrics.borrow_mut(); let _ = metrics_reporter.report(&mut m); } + Ok(NodeControlMsg::MemoryPressureChanged { update }) => { + self.admission_state.apply(update); + } Err(e) => { return Err(Error::ChannelRecvError(e)); } @@ -406,9 +433,18 @@ impl local::Receiver for SyslogCefReceiver { accept_result = listener.accept() => { match accept_result { Ok((socket, peer_addr)) => { + if self.admission_state.should_shed_ingress() { + self.metrics + .borrow_mut() + .tcp_connections_rejected_memory_pressure + .inc(); + continue; + } + // Clone the effect handler so the spawned task can send messages. let effect_handler = effect_handler.clone(); let metrics = self.metrics.clone(); + let admission_state = self.admission_state.clone(); // Clone TLS acceptor for the spawned task #[cfg(feature = "experimental-tls")] @@ -505,6 +541,19 @@ impl local::Receiver for SyslogCefReceiver { break; } + if admission_state.should_shed_ingress() { + otel_warn!( + "syslog_cef_receiver.memory_pressure.disconnect", + peer = %peer_addr, + message = "Closing TCP syslog connection due to memory pressure" + ); + drop_syslog_batch(&metrics, &mut arrow_records_builder); + metrics.borrow_mut().tcp_connections_rejected_memory_pressure.inc(); + metrics.borrow_mut().tcp_connections_active.dec(); + task_active_count.set(task_active_count.get() - 1); + break; + } + tokio::select! 
{ biased; // Prioritize incoming data over timeout @@ -525,13 +574,29 @@ impl local::Receiver for SyslogCefReceiver { // Count total received at socket level before parsing metrics.borrow_mut().received_logs_total.inc(); - match parser::parse(message_bytes) { - Ok(parsed_message) => { - arrow_records_builder.append_syslog(parsed_message); - } - Err(_e) => { - // parse error => count one failed item - metrics.borrow_mut().received_logs_invalid.inc(); + if admission_state.should_shed_ingress() { + otel_warn!( + "syslog_cef_receiver.memory_pressure.disconnect", + peer = %peer_addr, + message = "Closing TCP syslog connection due to memory pressure" + ); + // Count the current in-flight message (not yet in the builder) + // then drop the already-buffered batch. + metrics.borrow_mut().received_logs_rejected_memory_pressure.inc(); + drop_syslog_batch(&metrics, &mut arrow_records_builder); + metrics.borrow_mut().tcp_connections_rejected_memory_pressure.inc(); + metrics.borrow_mut().tcp_connections_active.dec(); + task_active_count.set(task_active_count.get() - 1); + break; + } else { + match parser::parse(message_bytes) { + Ok(parsed_message) => { + arrow_records_builder.append_syslog(parsed_message); + } + Err(_e) => { + // parse error => count one failed item + metrics.borrow_mut().received_logs_invalid.inc(); + } } } } @@ -587,6 +652,23 @@ impl local::Receiver for SyslogCefReceiver { // Count total received at socket level before parsing metrics.borrow_mut().received_logs_total.inc(); + if admission_state.should_shed_ingress() { + otel_warn!( + "syslog_cef_receiver.memory_pressure.disconnect", + peer = %peer_addr, + message = "Closing TCP syslog connection due to memory pressure" + ); + // Count the current in-flight message (not yet in the builder) + // then drop the already-buffered batch. 
+ metrics.borrow_mut().received_logs_rejected_memory_pressure.inc(); + line_bytes.clear(); + drop_syslog_batch(&metrics, &mut arrow_records_builder); + metrics.borrow_mut().tcp_connections_rejected_memory_pressure.inc(); + metrics.borrow_mut().tcp_connections_active.dec(); + task_active_count.set(task_active_count.get() - 1); + break; + } + match parser::parse(message_to_parse) { Ok(parsed) => { arrow_records_builder.append_syslog(parsed); @@ -774,6 +856,9 @@ impl local::Receiver for SyslogCefReceiver { let mut m = self.metrics.borrow_mut(); let _ = metrics_reporter.report(&mut m); } + Ok(NodeControlMsg::MemoryPressureChanged { update }) => { + self.admission_state.apply(update); + } Err(e) => { return Err(Error::ChannelRecvError(e)); } @@ -798,6 +883,14 @@ impl local::Receiver for SyslogCefReceiver { // Count total received at socket level before parsing self.metrics.borrow_mut().received_logs_total.inc(); + if self.admission_state.should_shed_ingress() { + self.metrics + .borrow_mut() + .received_logs_rejected_memory_pressure + .inc(); + continue; + } + let parsed_message = match parser::parse(&buf[..n]) { Ok(parsed) => parsed, Err(_e) => { @@ -933,6 +1026,14 @@ pub struct SyslogCefReceiverMetrics { #[cfg(feature = "experimental-tls")] #[metric(unit = "{error}")] pub tls_handshake_failures: Counter, + + /// Number of log records dropped due to process-wide memory pressure. + #[metric(unit = "{item}")] + pub received_logs_rejected_memory_pressure: Counter, + + /// Number of TCP connections rejected or closed due to process-wide memory pressure. 
+ #[metric(unit = "{conn}")] + pub tcp_connections_rejected_memory_pressure: Counter, } #[cfg(test)] @@ -952,10 +1053,14 @@ mod tests { SyslogCefReceiver { config, metrics: Rc::new(RefCell::new(metric_set)), + admission_state: LocalReceiverAdmissionState::from_process_state( + &MemoryPressureState::default(), + ), } } } use otap_df_config::node::NodeUserConfig; + use otap_df_engine::memory_limiter::MemoryPressureState; use otap_df_engine::receiver::ReceiverWrapper; use otap_df_engine::testing::{ receiver::{NotSendValidateContext, TestContext, TestRuntime}, @@ -970,6 +1075,31 @@ mod tests { use tokio::net::{TcpStream, UdpSocket}; use tokio::time::{Duration, timeout}; + #[test] + fn drop_syslog_batch_discards_records_without_downstream_send() { + let receiver = SyslogCefReceiver::new(Config::new_tcp( + "127.0.0.1:0".parse().expect("valid loopback address"), + )); + let metrics = receiver.metrics.clone(); + let mut arrow_records_builder = ArrowRecordsBuilder::new(); + let parsed = parser::parse(b"<34>1 2024-01-15T10:30:45.123Z host app - ID1 msg") + .expect("valid syslog line"); + arrow_records_builder.append_syslog(parsed); + + drop_syslog_batch(&metrics, &mut arrow_records_builder); + + assert_eq!(arrow_records_builder.len(), 0); + assert_eq!( + metrics + .borrow() + .received_logs_rejected_memory_pressure + .get(), + 1 + ); + assert_eq!(metrics.borrow().received_logs_forwarded.get(), 0); + assert_eq!(metrics.borrow().received_logs_forward_failed.get(), 0); + } + /// Test closure that simulates a typical UDP syslog receiver scenario. 
fn udp_scenario( listening_addr: SocketAddr, @@ -2188,4 +2318,100 @@ mod telemetry_tests { assert_eq!(m[3].to_u64_lossy(), 1, "forward_failed == 1"); })); } + + #[test] + fn udp_sheds_ingress_under_hard_memory_pressure() { + let (rt, local) = setup_test_runtime(); + rt.block_on(local.run_until(async move { + let telemetry_registry = TelemetryRegistryHandle::new(); + let controller = ControllerContext::new(telemetry_registry.clone()); + let pipeline = controller.pipeline_context_with( + otap_df_config::PipelineGroupId::from("grp".to_string()), + otap_df_config::PipelineId::from("pipe".to_string()), + 0, + 1, + 0, + ); + + let port = portpicker::pick_unused_port().expect("No free ports"); + let addr: SocketAddr = format!("127.0.0.1:{port}").parse().unwrap(); + + pipeline + .memory_pressure_state() + .set_level_for_tests(otap_df_engine::memory_limiter::MemoryPressureLevel::Hard); + + let receiver = SyslogCefReceiver::with_pipeline( + pipeline, + Config { + protocol: Protocol::Udp(UdpConfig { + listening_addr: addr, + }), + batch: Some(BatchConfig { + max_batch_duration_ms: None, + max_size: NonZeroU16::new(1), + }), + }, + ); + + let (out_tx, mut _out_rx) = otap_df_channel::mpsc::Channel::new(8); + let mut senders = std::collections::HashMap::new(); + let _ = senders.insert( + "".into(), + Sender::Local(otap_df_engine::local::message::LocalSender::mpsc(out_tx)), + ); + + let (pipe_tx, _pipe_rx) = otap_df_engine::control::runtime_ctrl_msg_channel(10); + let (metrics_rx, reporter) = MetricsReporter::create_new_and_receiver(4); + let eh = otap_df_engine::local::receiver::EffectHandler::new( + test_node("syslog_memory_pressure"), + senders, + None, + pipe_tx, + reporter.clone(), + ); + + let (ctrl_tx, ctrl_rx) = otap_df_channel::mpsc::Channel::new(16); + let ctrl_rx = otap_df_engine::message::Receiver::Local( + otap_df_engine::local::message::LocalReceiver::mpsc(ctrl_rx), + ); + let ctrl_chan = otap_df_engine::local::receiver::ControlChannel::new(ctrl_rx); + + let handle 
= tokio::task::spawn_local(async move { + let _ = Box::new(receiver).start(ctrl_chan, eh).await; + }); + + tokio::time::sleep(Duration::from_millis(50)).await; + + let sock = UdpSocket::bind("127.0.0.1:0").await.unwrap(); + let _ = sock + .send_to(b"<34>1 2024-01-15T10:30:45.123Z host app - ID1 msg", addr) + .await + .unwrap(); + + tokio::time::sleep(Duration::from_millis(150)).await; + let _ = ctrl_tx.send(NodeControlMsg::CollectTelemetry { + metrics_reporter: reporter.clone(), + }); + let _ = ctrl_tx.send(NodeControlMsg::Shutdown { + deadline: Instant::now(), + reason: "test".into(), + }); + let _ = handle.await; + + let snap = metrics_rx.recv_async().await.unwrap(); + let m = snap.get_metrics(); + assert_eq!(m[4].to_u64_lossy(), 1, "total == 1"); + assert_eq!(m[0].to_u64_lossy(), 0, "forwarded == 0"); + assert_eq!( + m[m.len() - 2].to_u64_lossy(), + 1, + "memory-pressure dropped == 1" + ); + assert_eq!( + m[m.len() - 1].to_u64_lossy(), + 0, + "tcp connection rejects == 0 for UDP" + ); + })); + } } diff --git a/rust/otap-dataflow/crates/engine/Cargo.toml b/rust/otap-dataflow/crates/engine/Cargo.toml index 23d6679463..9305c47688 100644 --- a/rust/otap-dataflow/crates/engine/Cargo.toml +++ b/rust/otap-dataflow/crates/engine/Cargo.toml @@ -14,7 +14,7 @@ workspace = true [features] testing = [] -jemalloc = ["dep:tikv-jemalloc-ctl"] +jemalloc = ["dep:tikv-jemalloc-ctl", "dep:tikv-jemalloc-sys"] jemalloc-testing = ["jemalloc"] test-utils = [] @@ -53,6 +53,7 @@ async-channel = { workspace = true } [target.'cfg(not(windows))'.dependencies] tikv-jemalloc-ctl = { workspace = true, optional = true } +tikv-jemalloc-sys = { workspace = true, optional = true } [dev-dependencies] otap-df-engine = { workspace = true, features = ["test-utils"] } diff --git a/rust/otap-dataflow/crates/engine/src/context.rs b/rust/otap-dataflow/crates/engine/src/context.rs index dc74cb5cc5..0e42692a2b 100644 --- a/rust/otap-dataflow/crates/engine/src/context.rs +++ 
b/rust/otap-dataflow/crates/engine/src/context.rs @@ -9,6 +9,7 @@ use crate::attributes::{ PipelineAttributeSet, config_map_to_telemetry, }; use crate::entity_context::{current_node_telemetry_handle, node_entity_key}; +use crate::memory_limiter::MemoryPressureState; use crate::node::NodeId as EngineNodeId; use otap_df_config::node::NodeKind; use otap_df_config::pipeline::telemetry::TelemetryAttribute; @@ -101,6 +102,7 @@ pub struct ControllerContext { host_id: Cow<'static, str>, container_id: Cow<'static, str>, numa_node_id: usize, + memory_pressure_state: MemoryPressureState, } /// Parameters required to create a pipeline context. @@ -152,6 +154,7 @@ impl ControllerContext { host_id: HOST_ID.clone(), container_id: CONTAINER_ID.clone(), numa_node_id: 0, // ToDo(LQ): Set NUMA node ID if available + memory_pressure_state: MemoryPressureState::default(), } } @@ -199,6 +202,12 @@ impl ControllerContext { pub fn telemetry_registry(&self) -> TelemetryRegistryHandle { self.telemetry_registry_handle.clone() } + + /// Returns the shared process-wide memory pressure state. + #[must_use] + pub fn memory_pressure_state(&self) -> MemoryPressureState { + self.memory_pressure_state.clone() + } } impl PipelineContext { @@ -265,6 +274,12 @@ impl PipelineContext { self.internal_telemetry.as_ref() } + /// Returns the shared process-wide memory pressure state. + #[must_use] + pub fn memory_pressure_state(&self) -> MemoryPressureState { + self.controller_context.memory_pressure_state() + } + /// Sets the shared node-name-to-index mapping for this pipeline context. 
pub fn set_node_names(&mut self, node_names: NodeNameIndex) { self.node_names = node_names; diff --git a/rust/otap-dataflow/crates/engine/src/control.rs b/rust/otap-dataflow/crates/engine/src/control.rs index 09a30065ad..f1335bc1a7 100644 --- a/rust/otap-dataflow/crates/engine/src/control.rs +++ b/rust/otap-dataflow/crates/engine/src/control.rs @@ -7,6 +7,7 @@ use crate::clock; use crate::error::{Error, TypedError}; +use crate::memory_limiter::MemoryPressureChanged; use crate::message::Sender; use crate::node::{NodeId, NodeType}; use crate::shared::message::{SharedReceiver, SharedSender}; @@ -231,6 +232,12 @@ pub enum NodeControlMsg { data: Box, }, + /// Announces a process-wide memory pressure transition to receiver-local admission state. + MemoryPressureChanged { + /// Latest process-wide pressure transition snapshot. + update: MemoryPressureChanged, + }, + /// Requests that a receiver stop admitting new external work while keeping /// already-admitted work alive until it can finish receiver-local drain work. DrainIngress { diff --git a/rust/otap-dataflow/crates/engine/src/engine_metrics.rs b/rust/otap-dataflow/crates/engine/src/engine_metrics.rs index 5eee0a94d5..2b15391eb1 100644 --- a/rust/otap-dataflow/crates/engine/src/engine_metrics.rs +++ b/rust/otap-dataflow/crates/engine/src/engine_metrics.rs @@ -20,6 +20,13 @@ //! in use; `0.5` on an 8-core machine corresponds to 4 fully loaded cores. //! Aligned with the OTel semantic convention `process.cpu.utilization`. //! +//! - `memory_pressure_state` (`Gauge`, `{state}`): +//! Process-wide memory limiter state encoded as `0=normal`, `1=soft`, `2=hard`. +//! +//! - `process_memory_usage_bytes`, `process_memory_soft_limit_bytes`, +//! `process_memory_hard_limit_bytes` (`Gauge`, `{By}`): +//! Process-wide memory limiter sample and effective limits. +//! //! We emit utilization directly (rather than a cumulative `cpu_time` counter) //! so that users can read the metric as-is without requiring PromQL `rate()` //! 
or similar query-time derivations. @@ -27,6 +34,7 @@ //! TODO: Also emit a cumulative `cpu_time` counter (like the Go Collector's //! `process_cpu_seconds_total`) for users who prefer query-time computation. +use crate::memory_limiter::MemoryPressureState; use cpu_time::ProcessTime; use otap_df_telemetry::instrument::{Gauge, ObserveUpDownCounter}; use otap_df_telemetry::metrics::MetricSet; @@ -51,6 +59,22 @@ pub struct EngineMetrics { /// The `cpu.mode` attribute is not set; this reports combined user + system time. #[metric(unit = "{1}")] pub cpu_utilization: Gauge, + + /// Process-wide memory limiter state encoded as `0=normal`, `1=soft`, `2=hard`. + #[metric(unit = "{state}")] + pub memory_pressure_state: Gauge, + + /// Most recent process-wide memory limiter sample, in bytes. + #[metric(unit = "{By}")] + pub process_memory_usage_bytes: Gauge, + + /// Effective process-wide memory limiter soft limit, in bytes. + #[metric(unit = "{By}")] + pub process_memory_soft_limit_bytes: Gauge, + + /// Effective process-wide memory limiter hard limit, in bytes. + #[metric(unit = "{By}")] + pub process_memory_hard_limit_bytes: Gauge, } /// Monitors and reports engine-wide metrics. @@ -68,6 +92,8 @@ pub struct EngineMetricsMonitor { cpu_start: ProcessTime, /// Total number of logical CPU cores available on the system. num_cores: usize, + /// Shared process-wide memory limiter state. 
+ memory_pressure_state: MemoryPressureState, } impl EngineMetricsMonitor { @@ -80,6 +106,7 @@ impl EngineMetricsMonitor { registry: TelemetryRegistryHandle, entity_key: EntityKey, reporter: MetricsReporter, + memory_pressure_state: MemoryPressureState, ) -> Self { let metrics = registry.register_metric_set_for_entity::(entity_key); let num_cores = std::thread::available_parallelism() @@ -92,6 +119,7 @@ impl EngineMetricsMonitor { wall_start: Instant::now(), cpu_start: ProcessTime::now(), num_cores, + memory_pressure_state, } } @@ -112,6 +140,18 @@ impl EngineMetricsMonitor { } else { self.metrics.cpu_utilization.set(0.0); } + self.metrics + .memory_pressure_state + .set(self.memory_pressure_state.level() as u64); + self.metrics + .process_memory_usage_bytes + .set(self.memory_pressure_state.usage_bytes()); + self.metrics + .process_memory_soft_limit_bytes + .set(self.memory_pressure_state.soft_limit_bytes()); + self.metrics + .process_memory_hard_limit_bytes + .set(self.memory_pressure_state.hard_limit_bytes()); self.wall_start = now_wall; self.cpu_start = now_cpu; } @@ -153,7 +193,12 @@ mod tests { let entity_key = controller.register_engine_entity(); let (_rx, reporter) = MetricsReporter::create_new_and_receiver(16); - let mut monitor = EngineMetricsMonitor::new(registry, entity_key, reporter); + let mut monitor = EngineMetricsMonitor::new( + registry, + entity_key, + reporter, + controller.memory_pressure_state(), + ); monitor.update(); assert!( @@ -169,7 +214,12 @@ mod tests { let entity_key = controller.register_engine_entity(); let (_rx, reporter) = MetricsReporter::create_new_and_receiver(16); - let mut monitor = EngineMetricsMonitor::new(registry, entity_key, reporter); + let mut monitor = EngineMetricsMonitor::new( + registry, + entity_key, + reporter, + controller.memory_pressure_state(), + ); monitor.update(); assert!(monitor.report().is_ok()); } @@ -181,7 +231,12 @@ mod tests { let entity_key = controller.register_engine_entity(); let (_rx, reporter) = 
MetricsReporter::create_new_and_receiver(16); - let mut monitor = EngineMetricsMonitor::new(registry, entity_key, reporter); + let mut monitor = EngineMetricsMonitor::new( + registry, + entity_key, + reporter, + controller.memory_pressure_state(), + ); // Do a small busy-spin so there is measurable CPU time. let start = Instant::now(); @@ -196,4 +251,33 @@ mod tests { "cpu_utilization should be in [0, 1], got {util}" ); } + + #[test] + fn engine_metrics_expose_process_memory_limiter_usage_and_limits() { + let registry = TelemetryRegistryHandle::new(); + let controller = ControllerContext::new(registry.clone()); + let state = controller.memory_pressure_state(); + state.configure(crate::memory_limiter::MemoryPressureBehaviorConfig { + retry_after_secs: 1, + fail_readiness_on_hard: true, + mode: otap_df_config::policy::MemoryLimiterMode::Enforce, + }); + state.set_sample_for_tests( + crate::memory_limiter::MemoryPressureLevel::Soft, + 95, + 90, + 100, + ); + + let entity_key = controller.register_engine_entity(); + let (_rx, reporter) = MetricsReporter::create_new_and_receiver(16); + let mut monitor = EngineMetricsMonitor::new(registry, entity_key, reporter, state); + + monitor.update(); + + assert_eq!(monitor.metrics.memory_pressure_state.get(), 1); + assert_eq!(monitor.metrics.process_memory_usage_bytes.get(), 95); + assert_eq!(monitor.metrics.process_memory_soft_limit_bytes.get(), 90); + assert_eq!(monitor.metrics.process_memory_hard_limit_bytes.get(), 100); + } } diff --git a/rust/otap-dataflow/crates/engine/src/lib.rs b/rust/otap-dataflow/crates/engine/src/lib.rs index 94d9eb2292..bec1a4896d 100644 --- a/rust/otap-dataflow/crates/engine/src/lib.rs +++ b/rust/otap-dataflow/crates/engine/src/lib.rs @@ -70,6 +70,7 @@ pub mod effect_handler; pub mod engine_metrics; pub mod entity_context; pub mod local; +pub mod memory_limiter; pub mod node; pub mod output_router; pub mod pipeline_ctrl; diff --git a/rust/otap-dataflow/crates/engine/src/memory_limiter.rs 
b/rust/otap-dataflow/crates/engine/src/memory_limiter.rs new file mode 100644 index 0000000000..a508fa568e --- /dev/null +++ b/rust/otap-dataflow/crates/engine/src/memory_limiter.rs @@ -0,0 +1,1476 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Process-wide memory limiter state and sampling. + +use otap_df_config::policy::{MemoryLimiterMode, MemoryLimiterPolicy, MemoryLimiterSource}; +use std::cell::Cell; +#[cfg(all(not(windows), feature = "jemalloc"))] +use std::ffi::c_char; +use std::fs; +use std::path::{Path, PathBuf}; +#[cfg(all(not(windows), feature = "jemalloc"))] +use std::ptr; +use std::rc::Rc; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU32, AtomicU64, Ordering}; +use std::time::{Duration, Instant}; + +#[cfg(test)] +use std::collections::VecDeque; + +#[cfg(all(not(windows), feature = "jemalloc"))] +use tikv_jemalloc_ctl::{epoch, stats}; + +/// Values at or above this threshold are treated as "no limit set" by the +/// cgroup memory controller (e.g. `memory.max = max` parses to `u64::MAX`). +const CGROUP_UNLIMITED_THRESHOLD_BYTES: u64 = 1 << 60; + +/// When `source = auto` and no explicit limits are configured, soft and hard +/// limits are derived as percentages of the detected cgroup memory cap: +/// soft = 90 %, hard = 95 %. +const AUTO_DERIVED_SOFT_NUMERATOR: u64 = 90; +const AUTO_DERIVED_HARD_NUMERATOR: u64 = 95; +const AUTO_DERIVED_DENOMINATOR: u64 = 100; + +/// Process-wide memory pressure level. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[repr(u8)] +pub enum MemoryPressureLevel { + /// Below the configured soft limit. + Normal = 0, + /// Above the soft limit; pressure is elevated but ingress still flows in Phase 1. + Soft = 1, + /// Above the hard limit; ingress should be shed and readiness can fail. 
+ Hard = 2, +} + +impl MemoryPressureLevel { + const fn from_u8(value: u8) -> Self { + match value { + 1 => Self::Soft, + 2 => Self::Hard, + _ => Self::Normal, + } + } +} + +/// Transition payload emitted when the process-wide limiter changes pressure level. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct MemoryPressureChanged { + /// Monotonic update number assigned by the global sampler. + pub generation: u64, + /// Newly classified pressure level. + pub level: MemoryPressureLevel, + /// Receiver-facing retry hint to use while shedding ingress. + pub retry_after_secs: u32, + /// Most recent sampled process memory usage in bytes. + pub usage_bytes: u64, +} + +impl MemoryPressureChanged { + /// Initial watch-channel value before the first real transition. + #[must_use] + pub const fn initial() -> Self { + Self { + generation: 0, + level: MemoryPressureLevel::Normal, + retry_after_secs: 1, + usage_bytes: 0, + } + } +} + +const fn mode_to_u8(mode: MemoryLimiterMode) -> u8 { + match mode { + MemoryLimiterMode::Enforce => 0, + MemoryLimiterMode::ObserveOnly => 1, + } +} + +const fn mode_from_u8(val: u8) -> MemoryLimiterMode { + match val { + 1 => MemoryLimiterMode::ObserveOnly, + _ => MemoryLimiterMode::Enforce, + } +} + +/// Shared process-wide memory pressure state. 
+#[derive(Clone, Debug)] +pub struct MemoryPressureState { + inner: Arc, +} + +#[derive(Debug)] +struct MemoryPressureStateInner { + level: AtomicU8, + usage_bytes: AtomicU64, + soft_limit_bytes: AtomicU64, + hard_limit_bytes: AtomicU64, + retry_after_secs: AtomicU32, + mode: AtomicU8, + fail_readiness_on_hard: AtomicBool, +} + +impl Default for MemoryPressureState { + fn default() -> Self { + Self { + inner: Arc::new(MemoryPressureStateInner { + level: AtomicU8::new(MemoryPressureLevel::Normal as u8), + usage_bytes: AtomicU64::new(0), + soft_limit_bytes: AtomicU64::new(0), + hard_limit_bytes: AtomicU64::new(0), + retry_after_secs: AtomicU32::new(1), + mode: AtomicU8::new(mode_to_u8(MemoryLimiterMode::Enforce)), + fail_readiness_on_hard: AtomicBool::new(true), + }), + } + } +} + +/// Runtime behavior applied by the shared memory pressure state. +/// +/// This is configured once at engine startup. Live mode switching is not +/// supported. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct MemoryPressureBehaviorConfig { + /// Retry-After seconds advertised by receivers while shedding ingress. + pub retry_after_secs: u32, + /// Whether readiness should fail while in `Hard` pressure in `enforce` mode. + pub fail_readiness_on_hard: bool, + /// Whether `Hard` pressure is enforced or only observed. + pub mode: MemoryLimiterMode, +} + +impl MemoryPressureState { + /// Configures shared limiter metadata. + /// + /// This is expected to be called once during engine startup. Reconfiguring + /// the limiter mode while receivers are active is not supported. + pub fn configure(&self, config: MemoryPressureBehaviorConfig) { + self.inner + .retry_after_secs + .store(config.retry_after_secs.max(1), Ordering::Relaxed); + self.inner + .mode + .store(mode_to_u8(config.mode), Ordering::Relaxed); + self.inner + .fail_readiness_on_hard + .store(config.fail_readiness_on_hard, Ordering::Relaxed); + } + + /// Returns the current pressure level. 
+ #[must_use] + pub fn level(&self) -> MemoryPressureLevel { + MemoryPressureLevel::from_u8(self.inner.level.load(Ordering::Relaxed)) + } + + /// Returns whether ingress should be rejected. + /// + /// In Phase 1, only `Hard` pressure sheds ingress. `Soft` remains advisory so + /// operators can observe rising pressure without immediately rejecting traffic. + #[must_use] + pub fn should_shed_ingress(&self) -> bool { + mode_from_u8(self.inner.mode.load(Ordering::Relaxed)) == MemoryLimiterMode::Enforce + && self.level() == MemoryPressureLevel::Hard + } + + /// Returns whether the admin readiness endpoint should fail. + #[must_use] + pub fn should_fail_readiness(&self) -> bool { + mode_from_u8(self.inner.mode.load(Ordering::Relaxed)) == MemoryLimiterMode::Enforce + && self.level() == MemoryPressureLevel::Hard + && self.inner.fail_readiness_on_hard.load(Ordering::Relaxed) + } + + /// Returns the configured limiter mode. + #[must_use] + pub fn mode(&self) -> MemoryLimiterMode { + mode_from_u8(self.inner.mode.load(Ordering::Relaxed)) + } + + /// Returns the Retry-After value for HTTP shedding responses. + #[must_use] + pub fn retry_after_secs(&self) -> u32 { + self.inner.retry_after_secs.load(Ordering::Relaxed) + } + + /// Returns the most recently sampled memory usage in bytes. + #[must_use] + pub fn usage_bytes(&self) -> u64 { + self.inner.usage_bytes.load(Ordering::Relaxed) + } + + /// Returns the configured soft limit in bytes. + #[must_use] + pub fn soft_limit_bytes(&self) -> u64 { + self.inner.soft_limit_bytes.load(Ordering::Relaxed) + } + + /// Returns the configured hard limit in bytes. + #[must_use] + pub fn hard_limit_bytes(&self) -> u64 { + self.inner.hard_limit_bytes.load(Ordering::Relaxed) + } + + /// Returns the current process-wide state as a receiver-facing transition payload. 
+ #[must_use] + pub fn current_update(&self, generation: u64) -> MemoryPressureChanged { + MemoryPressureChanged { + generation, + level: self.level(), + retry_after_secs: self.retry_after_secs(), + usage_bytes: self.usage_bytes(), + } + } + + fn update_limits(&self, soft_limit_bytes: u64, hard_limit_bytes: u64) { + self.inner + .soft_limit_bytes + .store(soft_limit_bytes, Ordering::Relaxed); + self.inner + .hard_limit_bytes + .store(hard_limit_bytes, Ordering::Relaxed); + } + + fn update_level(&self, level: MemoryPressureLevel, usage_bytes: u64) -> MemoryPressureLevel { + let previous = self.level(); + self.inner.level.store(level as u8, Ordering::Relaxed); + self.inner.usage_bytes.store(usage_bytes, Ordering::Relaxed); + previous + } + + /// Sets the current pressure level for tests without sampling memory. + #[cfg(any(test, feature = "test-utils"))] + pub fn set_level_for_tests(&self, level: MemoryPressureLevel) { + self.inner.level.store(level as u8, Ordering::Relaxed); + } + + /// Sets the sampled usage and effective limits for tests. + #[cfg(any(test, feature = "test-utils"))] + pub fn set_sample_for_tests( + &self, + level: MemoryPressureLevel, + usage_bytes: u64, + soft_limit_bytes: u64, + hard_limit_bytes: u64, + ) { + self.update_limits(soft_limit_bytes, hard_limit_bytes); + _ = self.update_level(level, usage_bytes); + } +} + +#[derive(Debug)] +struct SharedReceiverAdmissionStateInner { + generation: AtomicU64, + level: AtomicU8, + retry_after_secs: AtomicU32, + usage_bytes: AtomicU64, + mode: MemoryLimiterMode, +} + +/// Receiver-local admission state shared across task/service clones inside a receiver. +#[derive(Clone, Debug)] +pub struct SharedReceiverAdmissionState { + inner: Arc, +} + +impl Default for SharedReceiverAdmissionState { + fn default() -> Self { + Self::from_process_state(&MemoryPressureState::default()) + } +} + +impl SharedReceiverAdmissionState { + /// Bootstraps receiver-local admission state from the current process-wide snapshot. 
+ #[must_use] + pub fn from_process_state(state: &MemoryPressureState) -> Self { + Self { + inner: Arc::new(SharedReceiverAdmissionStateInner { + generation: AtomicU64::new(0), + level: AtomicU8::new(state.level() as u8), + retry_after_secs: AtomicU32::new(state.retry_after_secs()), + usage_bytes: AtomicU64::new(state.usage_bytes()), + mode: state.mode(), + }), + } + } + + /// Applies a transition update, ignoring stale generations. + pub fn apply(&self, update: MemoryPressureChanged) { + let current = self.inner.generation.load(Ordering::Relaxed); + if update.generation <= current { + return; + } + + self.inner + .level + .store(update.level as u8, Ordering::Relaxed); + self.inner + .retry_after_secs + .store(update.retry_after_secs.max(1), Ordering::Relaxed); + self.inner + .usage_bytes + .store(update.usage_bytes, Ordering::Relaxed); + self.inner + .generation + .store(update.generation, Ordering::Relaxed); + } + + /// Returns whether ingress should be shed for this receiver. + #[must_use] + pub fn should_shed_ingress(&self) -> bool { + self.inner.mode == MemoryLimiterMode::Enforce + && MemoryPressureLevel::from_u8(self.inner.level.load(Ordering::Relaxed)) + == MemoryPressureLevel::Hard + } + + /// Returns the Retry-After value advertised while shedding ingress. + #[must_use] + pub fn retry_after_secs(&self) -> u32 { + self.inner.retry_after_secs.load(Ordering::Relaxed) + } + + /// Returns the current local pressure level. + #[must_use] + pub fn level(&self) -> MemoryPressureLevel { + MemoryPressureLevel::from_u8(self.inner.level.load(Ordering::Relaxed)) + } +} + +#[derive(Debug)] +struct LocalReceiverAdmissionStateInner { + generation: Cell, + level: Cell, + retry_after_secs: Cell, + usage_bytes: Cell, + mode: MemoryLimiterMode, +} + +/// Receiver-local admission state for LocalSet-only receivers that do not cross task boundaries. 
+#[derive(Clone, Debug)] +pub struct LocalReceiverAdmissionState { + inner: Rc, +} + +impl LocalReceiverAdmissionState { + /// Bootstraps receiver-local admission state from the current process-wide snapshot. + #[must_use] + pub fn from_process_state(state: &MemoryPressureState) -> Self { + Self { + inner: Rc::new(LocalReceiverAdmissionStateInner { + generation: Cell::new(0), + level: Cell::new(state.level()), + retry_after_secs: Cell::new(state.retry_after_secs()), + usage_bytes: Cell::new(state.usage_bytes()), + mode: state.mode(), + }), + } + } + + /// Applies a transition update, ignoring stale generations. + pub fn apply(&self, update: MemoryPressureChanged) { + if update.generation <= self.inner.generation.get() { + return; + } + + self.inner.level.set(update.level); + self.inner + .retry_after_secs + .set(update.retry_after_secs.max(1)); + self.inner.usage_bytes.set(update.usage_bytes); + self.inner.generation.set(update.generation); + } + + /// Returns whether ingress should be shed for this receiver. + #[must_use] + pub fn should_shed_ingress(&self) -> bool { + self.inner.mode == MemoryLimiterMode::Enforce + && self.inner.level.get() == MemoryPressureLevel::Hard + } + + /// Returns the Retry-After value advertised while shedding ingress. + #[must_use] + pub fn retry_after_secs(&self) -> u32 { + self.inner.retry_after_secs.get() + } + + /// Returns the current local pressure level. + #[must_use] + pub fn level(&self) -> MemoryPressureLevel { + self.inner.level.get() + } +} + +/// Runtime source used for memory sampling. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MemorySampleSource { + /// Linux cgroup accounting working set. + Cgroup, + /// Process RSS. + Rss, + /// jemalloc resident bytes. + JemallocResident, +} + +impl MemorySampleSource { + /// Returns a stable string form for logs. 
+ #[must_use] + pub const fn as_str(self) -> &'static str { + match self { + Self::Cgroup => "cgroup", + Self::Rss => "rss", + Self::JemallocResident => "jemalloc_resident", + } + } +} + +/// One memory sample from the selected source. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct MemorySample { + /// Sampled usage in bytes. + pub usage_bytes: u64, + /// Source used for the sample. + pub source: MemorySampleSource, +} + +/// Effective process-wide memory limiter configuration. +#[derive(Debug)] +pub struct EffectiveMemoryLimiter { + mode: MemoryLimiterMode, + check_interval: Duration, + soft_limit_bytes: u64, + hard_limit_bytes: u64, + hysteresis_bytes: u64, + purge_on_hard: bool, + purge_min_interval: Duration, + last_purge_at: Option, + sampler: MemoryUsageSampler, +} + +impl EffectiveMemoryLimiter { + /// Builds a limiter from user policy. + pub fn from_policy(policy: &MemoryLimiterPolicy) -> Result { + let sampler = MemoryUsageSampler::new(policy.source)?; + let detected_limit = sampler.detected_limit_bytes(); + + let (soft_limit_bytes, hard_limit_bytes) = match (policy.soft_limit, policy.hard_limit) { + (Some(soft), Some(hard)) => (soft, hard), + (None, None) if policy.source == MemoryLimiterSource::Auto => { + let limit = detected_limit.ok_or_else(|| { + "memory_limiter.soft_limit and hard_limit must be set when no cgroup memory.max limit is available".to_string() + })?; + let soft = + limit.saturating_mul(AUTO_DERIVED_SOFT_NUMERATOR) / AUTO_DERIVED_DENOMINATOR; + let hard = + limit.saturating_mul(AUTO_DERIVED_HARD_NUMERATOR) / AUTO_DERIVED_DENOMINATOR; + (soft, hard) + } + (None, None) => { + return Err( + "memory_limiter.soft_limit and hard_limit must be set when memory_limiter.source is not auto" + .to_string(), + ); + } + _ => { + return Err( + "memory_limiter.soft_limit and hard_limit must either both be set or both be omitted" + .to_string(), + ); + } + }; + + if hard_limit_bytes <= soft_limit_bytes { + return Err( + 
"memory_limiter.hard_limit must be greater than memory_limiter.soft_limit" + .to_string(), + ); + } + + let hysteresis_bytes = policy.hysteresis.unwrap_or_else(|| { + hard_limit_bytes + .saturating_sub(soft_limit_bytes) + .min(soft_limit_bytes.saturating_sub(1)) + }); + + if hysteresis_bytes >= soft_limit_bytes { + return Err( + "memory_limiter.hysteresis must be smaller than memory_limiter.soft_limit" + .to_string(), + ); + } + + Ok(Self { + mode: policy.mode, + check_interval: policy.check_interval, + soft_limit_bytes, + hard_limit_bytes, + hysteresis_bytes, + purge_on_hard: policy.purge_on_hard, + purge_min_interval: policy.purge_min_interval, + last_purge_at: None, + sampler, + }) + } + + /// Returns the sampling interval. + #[must_use] + pub const fn check_interval(&self) -> Duration { + self.check_interval + } + + fn classify(&self, current: MemoryPressureLevel, usage_bytes: u64) -> MemoryPressureLevel { + match current { + MemoryPressureLevel::Normal => { + if usage_bytes >= self.hard_limit_bytes { + MemoryPressureLevel::Hard + } else if usage_bytes >= self.soft_limit_bytes { + MemoryPressureLevel::Soft + } else { + MemoryPressureLevel::Normal + } + } + MemoryPressureLevel::Soft => { + if usage_bytes >= self.hard_limit_bytes { + MemoryPressureLevel::Hard + } else if usage_bytes < self.soft_limit_bytes.saturating_sub(self.hysteresis_bytes) + { + MemoryPressureLevel::Normal + } else { + MemoryPressureLevel::Soft + } + } + MemoryPressureLevel::Hard => { + if usage_bytes < self.soft_limit_bytes { + MemoryPressureLevel::Soft + } else { + MemoryPressureLevel::Hard + } + } + } + } + + fn should_attempt_purge(&self, level: MemoryPressureLevel, now: Instant) -> bool { + self.mode == MemoryLimiterMode::Enforce + && self.purge_on_hard + && level == MemoryPressureLevel::Hard + && self.sampler.supports_purge() + && self.last_purge_at.is_none_or(|last_purge_at| { + now.duration_since(last_purge_at) >= self.purge_min_interval + }) + } + + /// Returns whether purge support 
is available for this limiter build. + #[must_use] + pub fn purge_supported(&self) -> bool { + self.sampler.supports_purge() + } + + /// Returns whether forced purge is enabled in policy. + #[must_use] + pub const fn purge_on_hard(&self) -> bool { + self.purge_on_hard + } + + /// Samples memory and updates the shared state. + pub fn tick(&mut self, state: &MemoryPressureState) -> Result { + let current = state.level(); + let mut sample = self.sampler.sample()?; + state.update_limits(self.soft_limit_bytes, self.hard_limit_bytes); + let mut level = self.classify(current, sample.usage_bytes); + let mut pre_purge_usage_bytes = None; + let mut purge_duration = None; + let mut purge_error = None; + + let started_at = Instant::now(); + if self.should_attempt_purge(level, started_at) { + pre_purge_usage_bytes = Some(sample.usage_bytes); + self.last_purge_at = Some(started_at); + match self.sampler.purge() { + Ok(()) => { + let elapsed = started_at.elapsed(); + purge_duration = Some(elapsed); + match self.sampler.sample() { + Ok(post_purge_sample) => { + sample = post_purge_sample; + level = self.classify(current, sample.usage_bytes); + } + Err(err) => { + purge_error = Some(format!("post-purge sample failed: {err}")); + } + } + } + Err(err) => { + purge_error = Some(err); + } + } + } + + let previous = state.update_level(level, sample.usage_bytes); + Ok(MemoryLimiterTick { + previous_level: previous, + current_level: level, + sample, + soft_limit_bytes: self.soft_limit_bytes, + hard_limit_bytes: self.hard_limit_bytes, + pre_purge_usage_bytes, + purge_duration, + purge_error, + }) + } +} + +/// Result of one limiter iteration. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct MemoryLimiterTick { + /// Level before the sample was applied. + pub previous_level: MemoryPressureLevel, + /// Level after the sample was applied. + pub current_level: MemoryPressureLevel, + /// Memory sample used to drive the transition. + pub sample: MemorySample, + /// Configured soft limit. 
+ pub soft_limit_bytes: u64, + /// Configured hard limit. + pub hard_limit_bytes: u64, + /// Usage before a forced purge, when one was attempted during this tick. + pub pre_purge_usage_bytes: Option, + /// Duration of a forced purge, when one was attempted during this tick. + pub purge_duration: Option, + /// Error from a forced purge attempt, when one failed during this tick. + pub purge_error: Option, +} + +impl MemoryLimiterTick { + /// Returns whether the level changed. + #[must_use] + pub fn transitioned(&self) -> bool { + self.previous_level != self.current_level + } +} + +trait MemoryUsageProbe: Send { + fn sample_usage(&mut self) -> Result; +} + +trait MemoryLimitProbe: Send + Sync { + fn detect_limit(&self) -> Option; +} + +trait MemoryPurgeHook: Send { + fn purge(&mut self) -> Result<(), String>; +} + +struct MemoryUsageSampler { + usage_probe: Box, + limit_probe: Option>, + purge_hook: Option>, +} + +impl std::fmt::Debug for MemoryUsageSampler { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MemoryUsageSampler") + .field("has_limit_probe", &self.limit_probe.is_some()) + .field("has_purge_hook", &self.purge_hook.is_some()) + .finish_non_exhaustive() + } +} + +impl MemoryUsageSampler { + fn new(source: MemoryLimiterSource) -> Result { + let cgroup = CgroupMemorySampler::discover(); + let rss_available = rss_bytes().is_some(); + #[cfg(all(not(windows), feature = "jemalloc"))] + let jemalloc_resident = JemallocResidentProbe::new(); + + let (usage_probe, limit_probe): ( + Box, + Option>, + ) = match source { + MemoryLimiterSource::Auto => { + if let Some(cgroup_probe) = cgroup { + (Box::new(cgroup_probe.clone()), Some(Box::new(cgroup_probe))) + } else if rss_available { + (Box::new(RssProbe), None) + } else { + #[cfg(all(not(windows), feature = "jemalloc"))] + { + if let Some(jemalloc_probe) = jemalloc_resident { + (Box::new(jemalloc_probe), None) + } else { + return Err("no supported memory source is 
available".to_string()); + } + } + #[cfg(any(windows, not(feature = "jemalloc")))] + { + return Err("no supported memory source is available".to_string()); + } + } + } + MemoryLimiterSource::Cgroup => { + let cgroup_probe = cgroup.ok_or_else(|| { + "memory limiter source `cgroup` requested, but no cgroup memory controller was detected".to_string() + })?; + (Box::new(cgroup_probe.clone()), Some(Box::new(cgroup_probe))) + } + MemoryLimiterSource::Rss => { + if !rss_available { + return Err( + "memory limiter source `rss` requested, but process RSS sampling is unavailable" + .to_string(), + ); + } + (Box::new(RssProbe), None) + } + MemoryLimiterSource::JemallocResident => { + #[cfg(all(not(windows), feature = "jemalloc"))] + { + let jemalloc_probe = jemalloc_resident.ok_or_else(|| { + "memory limiter source `jemalloc_resident` requested, but jemalloc resident metrics are unavailable".to_string() + })?; + (Box::new(jemalloc_probe), None) + } + #[cfg(any(windows, not(feature = "jemalloc")))] + { + return Err("memory limiter source `jemalloc_resident` requested, but this build does not expose jemalloc resident metrics".to_string()); + } + } + }; + + Ok(Self { + usage_probe, + limit_probe, + purge_hook: Self::build_purge_hook(), + }) + } + + fn detected_limit_bytes(&self) -> Option { + self.limit_probe + .as_ref() + .and_then(|probe| probe.detect_limit()) + } + + fn sample(&mut self) -> Result { + self.usage_probe.sample_usage() + } + + fn supports_purge(&self) -> bool { + self.purge_hook.is_some() + } + + fn purge(&mut self) -> Result<(), String> { + self.purge_hook + .as_mut() + .ok_or_else(|| "memory purge is unavailable for this build".to_string())? + .purge() + } + + /// Returns the best available allocator purge hook for this build. + /// + /// Backends are evaluated in priority order. To add a new backend, + /// insert a `#[cfg]`-gated `return` above the final `None`. 
+ #[allow(unreachable_code)] + fn build_purge_hook() -> Option> { + // Priority 1: jemalloc (non-Windows builds with the jemalloc feature). + #[cfg(all(not(windows), feature = "jemalloc"))] + return Some(Box::new(JemallocPurgeHook)); + + None + } + + #[cfg(test)] + fn for_tests(source: MemorySampleSource) -> Self { + let usage_probe: Box = match source { + MemorySampleSource::Rss => Box::new(RssProbe), + MemorySampleSource::Cgroup => { + panic!("cgroup test probe must be constructed explicitly") + } + MemorySampleSource::JemallocResident => { + #[cfg(all(not(windows), feature = "jemalloc"))] + { + Box::new(JemallocResidentProbe) + } + #[cfg(any(windows, not(feature = "jemalloc")))] + { + panic!("jemalloc resident test probe unavailable on this platform") + } + } + }; + Self { + usage_probe, + limit_probe: None, + purge_hook: None, + } + } + + #[cfg(test)] + fn from_test_probes( + usage_probe: Box, + purge_hook: Option>, + ) -> Self { + Self { + usage_probe, + limit_probe: None, + purge_hook, + } + } +} + +#[derive(Debug, Clone)] +struct CgroupMemorySampler { + current_path: PathBuf, + stat_path: PathBuf, + limit_path: PathBuf, + stat_key: &'static str, +} + +impl CgroupMemorySampler { + fn discover() -> Option { + let cgroup_file = fs::read_to_string("/proc/self/cgroup").ok()?; + + // cgroup v2: `0::/path` + if let Some(path) = cgroup_file + .lines() + .find_map(|line| line.strip_prefix("0::")) + .map(str::trim) + { + let base = cgroup_path(Path::new("/sys/fs/cgroup"), path); + let current_path = base.join("memory.current"); + let stat_path = base.join("memory.stat"); + let limit_path = base.join("memory.max"); + if current_path.exists() && stat_path.exists() && limit_path.exists() { + return Some(Self { + current_path, + stat_path, + limit_path, + stat_key: "inactive_file", + }); + } + } + + // cgroup v1: `hierarchy:controllers:/path` + for line in cgroup_file.lines() { + let mut parts = line.splitn(3, ':'); + let _ = parts.next(); + let controllers = 
parts.next().unwrap_or_default(); + let path = parts.next().unwrap_or_default().trim(); + if !controllers + .split(',') + .any(|controller| controller == "memory") + { + continue; + } + let base = cgroup_path(Path::new("/sys/fs/cgroup/memory"), path); + let current_path = base.join("memory.usage_in_bytes"); + let stat_path = base.join("memory.stat"); + let limit_path = base.join("memory.limit_in_bytes"); + if current_path.exists() && stat_path.exists() && limit_path.exists() { + return Some(Self { + current_path, + stat_path, + limit_path, + stat_key: "total_inactive_file", + }); + } + } + + None + } + + fn limit_bytes(&self) -> Option { + let raw = read_u64_from_file(&self.limit_path).ok()?; + (!is_unlimited_limit(raw)).then_some(raw) + } + + fn sample(&self) -> Result { + let usage = read_u64_from_file(&self.current_path) + .map_err(|err| format!("failed to read {}: {err}", self.current_path.display()))?; + let inactive_file = read_memory_stat_value(&self.stat_path, self.stat_key).unwrap_or(0); + + Ok(MemorySample { + usage_bytes: usage.saturating_sub(inactive_file), + source: MemorySampleSource::Cgroup, + }) + } +} + +impl MemoryUsageProbe for CgroupMemorySampler { + fn sample_usage(&mut self) -> Result { + self.sample() + } +} + +impl MemoryLimitProbe for CgroupMemorySampler { + fn detect_limit(&self) -> Option { + self.limit_bytes() + } +} + +fn cgroup_path(base: &Path, relative: &str) -> PathBuf { + if relative == "/" || relative.is_empty() { + return base.to_path_buf(); + } + base.join(relative.trim_start_matches('/')) +} + +fn read_u64_from_file(path: &Path) -> Result { + let raw = fs::read_to_string(path).map_err(|err| err.to_string())?; + let trimmed = raw.trim(); + if trimmed.eq_ignore_ascii_case("max") { + return Ok(u64::MAX); + } + trimmed.parse::().map_err(|err| err.to_string()) +} + +fn read_memory_stat_value(path: &Path, key: &str) -> Option { + let stats = fs::read_to_string(path).ok()?; + for line in stats.lines() { + let mut parts = 
line.split_whitespace(); + let stat_key = parts.next()?; + let stat_value = parts.next()?; + if stat_key == key { + return stat_value.parse::().ok(); + } + } + None +} + +fn is_unlimited_limit(limit_bytes: u64) -> bool { + limit_bytes == u64::MAX || limit_bytes >= CGROUP_UNLIMITED_THRESHOLD_BYTES +} + +fn rss_bytes() -> Option { + memory_stats::memory_stats().map(|stats| stats.physical_mem as u64) +} + +#[derive(Debug, Clone, Copy)] +struct RssProbe; + +impl MemoryUsageProbe for RssProbe { + fn sample_usage(&mut self) -> Result { + rss_bytes() + .map(|usage_bytes| MemorySample { + usage_bytes, + source: MemorySampleSource::Rss, + }) + .ok_or_else(|| "failed to sample process RSS".to_string()) + } +} + +#[cfg(all(not(windows), feature = "jemalloc"))] +#[derive(Debug, Clone, Copy)] +struct JemallocResidentProbe; + +#[cfg(all(not(windows), feature = "jemalloc"))] +impl JemallocResidentProbe { + fn new() -> Option { + Some(Self) + } +} + +#[cfg(all(not(windows), feature = "jemalloc"))] +impl MemoryUsageProbe for JemallocResidentProbe { + fn sample_usage(&mut self) -> Result { + _ = epoch::advance().map_err(|err| format!("failed to advance jemalloc epoch: {err}"))?; + let usage_bytes = stats::resident::read() + .map_err(|err| format!("failed to read jemalloc resident bytes: {err}"))?; + Ok(MemorySample { + usage_bytes: usage_bytes as u64, + source: MemorySampleSource::JemallocResident, + }) + } +} + +#[cfg(all(not(windows), feature = "jemalloc"))] +#[derive(Debug, Clone, Copy)] +struct JemallocPurgeHook; + +#[cfg(all(not(windows), feature = "jemalloc"))] +impl MemoryPurgeHook for JemallocPurgeHook { + #[allow(unsafe_code)] + fn purge(&mut self) -> Result<(), String> { + const PURGE_MALLCTL: &[u8] = b"arena.4096.purge\0"; + + // Safety: `PURGE_MALLCTL` is a static NUL-terminated mallctl name, and + // `arena..purge` is a void control, so both read and write + // pointers must be null with a zero write length. 
+ let rc = unsafe { + tikv_jemalloc_sys::mallctl( + PURGE_MALLCTL.as_ptr().cast::(), + ptr::null_mut(), + ptr::null_mut(), + ptr::null_mut(), + 0, + ) + }; + + if rc == 0 { + Ok(()) + } else { + Err(format!( + "failed to purge jemalloc arenas: {}", + std::io::Error::from_raw_os_error(rc) + )) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, + }; + + struct SequenceProbe { + samples: VecDeque, + } + + impl MemoryUsageProbe for SequenceProbe { + fn sample_usage(&mut self) -> Result { + self.samples + .pop_front() + .ok_or_else(|| "no test sample available".to_string()) + } + } + + struct CountingPurgeHook { + calls: Arc, + } + + impl MemoryPurgeHook for CountingPurgeHook { + fn purge(&mut self) -> Result<(), String> { + _ = self.calls.fetch_add(1, Ordering::Relaxed); + Ok(()) + } + } + + struct FailingPurgeHook; + + impl MemoryPurgeHook for FailingPurgeHook { + fn purge(&mut self) -> Result<(), String> { + Err("purge failed".to_string()) + } + } + + #[cfg(all(not(windows), feature = "jemalloc-testing"))] + #[test] + fn jemalloc_purge_hook_succeeds() { + let mut hook = JemallocPurgeHook; + hook.purge().expect("jemalloc purge should succeed"); + } + + #[test] + fn limiter_escalates_and_recovers_with_hysteresis() { + let limiter = EffectiveMemoryLimiter { + mode: MemoryLimiterMode::Enforce, + check_interval: Duration::from_secs(1), + soft_limit_bytes: 90, + hard_limit_bytes: 95, + hysteresis_bytes: 5, + purge_on_hard: false, + purge_min_interval: Duration::from_secs(5), + last_purge_at: None, + sampler: MemoryUsageSampler::for_tests(MemorySampleSource::Rss), + }; + + assert_eq!( + limiter.classify(MemoryPressureLevel::Normal, 89), + MemoryPressureLevel::Normal + ); + assert_eq!( + limiter.classify(MemoryPressureLevel::Normal, 90), + MemoryPressureLevel::Soft + ); + assert_eq!( + limiter.classify(MemoryPressureLevel::Normal, 95), + MemoryPressureLevel::Hard + ); + assert_eq!( + 
limiter.classify(MemoryPressureLevel::Soft, 90), + MemoryPressureLevel::Soft + ); + assert_eq!( + limiter.classify(MemoryPressureLevel::Soft, 84), + MemoryPressureLevel::Normal + ); + assert_eq!( + limiter.classify(MemoryPressureLevel::Hard, 89), + MemoryPressureLevel::Soft + ); + } + + #[test] + fn readiness_only_fails_on_hard_pressure_when_enabled() { + let state = MemoryPressureState::default(); + state.configure(MemoryPressureBehaviorConfig { + retry_after_secs: 3, + fail_readiness_on_hard: true, + mode: MemoryLimiterMode::Enforce, + }); + _ = state.update_level(MemoryPressureLevel::Soft, 91); + assert!(!state.should_fail_readiness()); + + _ = state.update_level(MemoryPressureLevel::Hard, 96); + assert!(state.should_fail_readiness()); + + state.configure(MemoryPressureBehaviorConfig { + retry_after_secs: 3, + fail_readiness_on_hard: false, + mode: MemoryLimiterMode::Enforce, + }); + assert!(!state.should_fail_readiness()); + } + + #[test] + fn observe_only_never_sheds_or_fails_readiness() { + let state = MemoryPressureState::default(); + state.configure(MemoryPressureBehaviorConfig { + retry_after_secs: 3, + fail_readiness_on_hard: true, + mode: MemoryLimiterMode::ObserveOnly, + }); + _ = state.update_level(MemoryPressureLevel::Hard, 96); + + assert_eq!(state.mode(), MemoryLimiterMode::ObserveOnly); + assert!(!state.should_shed_ingress()); + assert!(!state.should_fail_readiness()); + } + + #[test] + fn omitted_hysteresis_defaults_to_a_value_below_soft_limit() { + let limiter = EffectiveMemoryLimiter::from_policy(&MemoryLimiterPolicy { + mode: MemoryLimiterMode::Enforce, + source: MemoryLimiterSource::Rss, + check_interval: Duration::from_secs(1), + soft_limit: Some(100), + hard_limit: Some(250), + hysteresis: None, + retry_after_secs: 1, + fail_readiness_on_hard: true, + purge_on_hard: false, + purge_min_interval: Duration::from_secs(5), + }) + .expect("limiter should accept omitted hysteresis"); + + assert_eq!(limiter.hysteresis_bytes, 99); + } + + #[test] + 
fn state_exposes_latest_usage_and_limits() { + let state = MemoryPressureState::default(); + state.update_limits(90, 95); + _ = state.update_level(MemoryPressureLevel::Hard, 96); + + assert_eq!(state.usage_bytes(), 96); + assert_eq!(state.soft_limit_bytes(), 90); + assert_eq!(state.hard_limit_bytes(), 95); + } + + #[test] + fn shared_receiver_admission_state_bootstraps_from_process_state() { + let state = MemoryPressureState::default(); + state.configure(MemoryPressureBehaviorConfig { + retry_after_secs: 7, + fail_readiness_on_hard: true, + mode: MemoryLimiterMode::Enforce, + }); + _ = state.update_level(MemoryPressureLevel::Hard, 96); + + let local = SharedReceiverAdmissionState::from_process_state(&state); + assert!(local.should_shed_ingress()); + assert_eq!(local.retry_after_secs(), 7); + assert_eq!(local.level(), MemoryPressureLevel::Hard); + } + + #[test] + fn shared_receiver_admission_state_ignores_stale_generations() { + let state = MemoryPressureState::default(); + let local = SharedReceiverAdmissionState::from_process_state(&state); + + local.apply(MemoryPressureChanged { + generation: 2, + level: MemoryPressureLevel::Hard, + retry_after_secs: 9, + usage_bytes: 123, + }); + local.apply(MemoryPressureChanged { + generation: 1, + level: MemoryPressureLevel::Normal, + retry_after_secs: 1, + usage_bytes: 0, + }); + + assert!(local.should_shed_ingress()); + assert_eq!(local.retry_after_secs(), 9); + assert_eq!(local.level(), MemoryPressureLevel::Hard); + } + + #[test] + fn shared_receiver_admission_state_clones_observe_updates() { + let state = MemoryPressureState::default(); + let local = SharedReceiverAdmissionState::from_process_state(&state); + let clone = local.clone(); + + local.apply(MemoryPressureChanged { + generation: 1, + level: MemoryPressureLevel::Hard, + retry_after_secs: 5, + usage_bytes: 88, + }); + + assert!(clone.should_shed_ingress()); + assert_eq!(clone.retry_after_secs(), 5); + assert_eq!(clone.level(), MemoryPressureLevel::Hard); + } + + 
#[test] + fn local_receiver_admission_state_ignores_stale_generations() { + let state = MemoryPressureState::default(); + let local = LocalReceiverAdmissionState::from_process_state(&state); + + local.apply(MemoryPressureChanged { + generation: 3, + level: MemoryPressureLevel::Soft, + retry_after_secs: 4, + usage_bytes: 22, + }); + local.apply(MemoryPressureChanged { + generation: 2, + level: MemoryPressureLevel::Normal, + retry_after_secs: 1, + usage_bytes: 0, + }); + + assert!(!local.should_shed_ingress()); + assert_eq!(local.retry_after_secs(), 4); + assert_eq!(local.level(), MemoryPressureLevel::Soft); + } + + #[test] + fn local_receiver_admission_state_clones_observe_updates() { + let state = MemoryPressureState::default(); + let local = LocalReceiverAdmissionState::from_process_state(&state); + let clone = local.clone(); + + local.apply(MemoryPressureChanged { + generation: 1, + level: MemoryPressureLevel::Hard, + retry_after_secs: 6, + usage_bytes: 77, + }); + + assert!(clone.should_shed_ingress()); + assert_eq!(clone.retry_after_secs(), 6); + assert_eq!(clone.level(), MemoryPressureLevel::Hard); + } + + #[test] + fn non_auto_sources_require_explicit_limits() { + let err = EffectiveMemoryLimiter::from_policy(&MemoryLimiterPolicy { + mode: MemoryLimiterMode::Enforce, + source: MemoryLimiterSource::Rss, + check_interval: Duration::from_secs(1), + soft_limit: None, + hard_limit: None, + hysteresis: None, + retry_after_secs: 1, + fail_readiness_on_hard: true, + purge_on_hard: false, + purge_min_interval: Duration::from_secs(5), + }) + .expect_err("non-auto source should require explicit limits"); + + assert!(err.contains("source is not auto")); + } + + #[test] + fn tick_can_purge_before_entering_hard() { + let purge_calls = Arc::new(AtomicUsize::new(0)); + let sampler = MemoryUsageSampler::from_test_probes( + Box::new(SequenceProbe { + samples: VecDeque::from([ + MemorySample { + usage_bytes: 96, + source: MemorySampleSource::Rss, + }, + MemorySample { + 
usage_bytes: 80, + source: MemorySampleSource::Rss, + }, + ]), + }), + Some(Box::new(CountingPurgeHook { + calls: purge_calls.clone(), + })), + ); + let mut limiter = EffectiveMemoryLimiter { + mode: MemoryLimiterMode::Enforce, + check_interval: Duration::from_secs(1), + soft_limit_bytes: 90, + hard_limit_bytes: 95, + hysteresis_bytes: 5, + purge_on_hard: true, + purge_min_interval: Duration::from_secs(5), + last_purge_at: None, + sampler, + }; + let state = MemoryPressureState::default(); + + let tick = limiter.tick(&state).expect("tick should succeed"); + + assert_eq!(purge_calls.load(Ordering::Relaxed), 1); + assert_eq!(tick.pre_purge_usage_bytes, Some(96)); + assert!(tick.purge_duration.is_some()); + assert_eq!(tick.current_level, MemoryPressureLevel::Normal); + assert_eq!(state.level(), MemoryPressureLevel::Normal); + assert_eq!(state.usage_bytes(), 80); + } + + #[test] + fn tick_rate_limits_purge_attempts() { + let purge_calls = Arc::new(AtomicUsize::new(0)); + let sampler = MemoryUsageSampler::from_test_probes( + Box::new(SequenceProbe { + samples: VecDeque::from([MemorySample { + usage_bytes: 96, + source: MemorySampleSource::Rss, + }]), + }), + Some(Box::new(CountingPurgeHook { + calls: purge_calls.clone(), + })), + ); + let mut limiter = EffectiveMemoryLimiter { + mode: MemoryLimiterMode::Enforce, + check_interval: Duration::from_secs(1), + soft_limit_bytes: 90, + hard_limit_bytes: 95, + hysteresis_bytes: 5, + purge_on_hard: true, + purge_min_interval: Duration::from_secs(5), + last_purge_at: Some(Instant::now()), + sampler, + }; + let state = MemoryPressureState::default(); + + let tick = limiter.tick(&state).expect("tick should succeed"); + + assert_eq!(purge_calls.load(Ordering::Relaxed), 0); + assert_eq!(tick.pre_purge_usage_bytes, None); + assert!(tick.purge_duration.is_none()); + assert_eq!(tick.current_level, MemoryPressureLevel::Hard); + } + + #[test] + fn tick_purge_failure_is_non_fatal() { + let sampler = MemoryUsageSampler::from_test_probes( + 
Box::new(SequenceProbe { + samples: VecDeque::from([MemorySample { + usage_bytes: 96, + source: MemorySampleSource::Rss, + }]), + }), + Some(Box::new(FailingPurgeHook)), + ); + let mut limiter = EffectiveMemoryLimiter { + mode: MemoryLimiterMode::Enforce, + check_interval: Duration::from_secs(1), + soft_limit_bytes: 90, + hard_limit_bytes: 95, + hysteresis_bytes: 5, + purge_on_hard: true, + purge_min_interval: Duration::from_secs(5), + last_purge_at: None, + sampler, + }; + let state = MemoryPressureState::default(); + + let tick = limiter.tick(&state).expect("tick should succeed"); + + assert_eq!(tick.pre_purge_usage_bytes, Some(96)); + assert_eq!(tick.current_level, MemoryPressureLevel::Hard); + assert!(tick.purge_duration.is_none()); + assert_eq!(tick.purge_error.as_deref(), Some("purge failed")); + assert!(limiter.last_purge_at.is_some()); + assert_eq!(state.level(), MemoryPressureLevel::Hard); + assert_eq!(state.usage_bytes(), 96); + } + + #[test] + fn tick_post_purge_sample_failure_is_non_fatal() { + let purge_calls = Arc::new(AtomicUsize::new(0)); + let sampler = MemoryUsageSampler::from_test_probes( + Box::new(SequenceProbe { + samples: VecDeque::from([MemorySample { + usage_bytes: 96, + source: MemorySampleSource::Rss, + }]), + }), + Some(Box::new(CountingPurgeHook { + calls: purge_calls.clone(), + })), + ); + let mut limiter = EffectiveMemoryLimiter { + mode: MemoryLimiterMode::Enforce, + check_interval: Duration::from_secs(1), + soft_limit_bytes: 90, + hard_limit_bytes: 95, + hysteresis_bytes: 5, + purge_on_hard: true, + purge_min_interval: Duration::from_secs(5), + last_purge_at: None, + sampler, + }; + let state = MemoryPressureState::default(); + + let tick = limiter.tick(&state).expect("tick should succeed"); + + assert_eq!(purge_calls.load(Ordering::Relaxed), 1); + assert_eq!(tick.pre_purge_usage_bytes, Some(96)); + assert!(tick.purge_duration.is_some()); + assert_eq!( + tick.purge_error.as_deref(), + Some("post-purge sample failed: no test sample 
available") + ); + assert_eq!(tick.current_level, MemoryPressureLevel::Hard); + assert_eq!(state.level(), MemoryPressureLevel::Hard); + assert_eq!(state.usage_bytes(), 96); + } + + #[test] + fn observe_only_suppresses_purge_attempts() { + let purge_calls = Arc::new(AtomicUsize::new(0)); + let sampler = MemoryUsageSampler::from_test_probes( + Box::new(SequenceProbe { + samples: VecDeque::from([MemorySample { + usage_bytes: 96, + source: MemorySampleSource::Rss, + }]), + }), + Some(Box::new(CountingPurgeHook { + calls: purge_calls.clone(), + })), + ); + let mut limiter = EffectiveMemoryLimiter { + mode: MemoryLimiterMode::ObserveOnly, + check_interval: Duration::from_secs(1), + soft_limit_bytes: 90, + hard_limit_bytes: 95, + hysteresis_bytes: 5, + purge_on_hard: true, + purge_min_interval: Duration::from_secs(5), + last_purge_at: None, + sampler, + }; + let state = MemoryPressureState::default(); + + let tick = limiter.tick(&state).expect("tick should succeed"); + + assert_eq!(purge_calls.load(Ordering::Relaxed), 0); + assert_eq!(tick.pre_purge_usage_bytes, None); + assert!(tick.purge_duration.is_none()); + assert_eq!(tick.current_level, MemoryPressureLevel::Hard); + } +} diff --git a/rust/otap-dataflow/crates/engine/src/pipeline_ctrl.rs b/rust/otap-dataflow/crates/engine/src/pipeline_ctrl.rs index d9cd56ed23..60827c7a35 100644 --- a/rust/otap-dataflow/crates/engine/src/pipeline_ctrl.rs +++ b/rust/otap-dataflow/crates/engine/src/pipeline_ctrl.rs @@ -23,6 +23,7 @@ use crate::control::{ }; use crate::control_plane_metrics::{PipelineCompletionMetricsState, RuntimeControlMetricsState}; use crate::error::Error; +use crate::memory_limiter::MemoryPressureChanged; use crate::pipeline_metrics::PipelineMetricsMonitor; use crate::{Interests, RequestOutcome, Unwindable}; use otap_df_config::DeployedPipelineKey; @@ -39,6 +40,7 @@ use std::cmp::Reverse; use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::rc::Rc; use std::time::{Duration, Instant}; +use 
tokio::sync::watch; /// Threshold for the pending sends buffer. When the buffer exceeds this size, /// a warning is logged to help operators diagnose sustained backpressure. @@ -244,6 +246,8 @@ pub struct RuntimeCtrlMsgManager { pipeline_context: PipelineContext, /// Receives control messages from nodes (e.g., start/cancel timer). runtime_ctrl_msg_receiver: RuntimeCtrlMsgReceiver, + /// Receives process-wide memory pressure transitions from the controller. + memory_pressure_rx: watch::Receiver, /// Allows sending control messages back to nodes. control_senders: ControlSenders, /// Repeating timers for generic TimerTick. @@ -282,6 +286,7 @@ impl RuntimeCtrlMsgManager { pipeline_key: DeployedPipelineKey, pipeline_context: PipelineContext, runtime_ctrl_msg_receiver: RuntimeCtrlMsgReceiver, + memory_pressure_rx: watch::Receiver, control_senders: ControlSenders, event_reporter: ObservedEventReporter, metrics_reporter: MetricsReporter, @@ -302,6 +307,7 @@ impl RuntimeCtrlMsgManager { pipeline_key, pipeline_context, runtime_ctrl_msg_receiver, + memory_pressure_rx, control_senders, tick_timers: TimerSet::new(), telemetry_timers: TimerSet::new(), @@ -340,6 +346,7 @@ impl RuntimeCtrlMsgManager { let mut retry_delay: Option = None; let mut metrics_flush_delay: Option = None; let mut shutdown_deadline_forced = false; + let mut memory_pressure_updates_open = true; loop { // Drain any buffered sends before processing new messages. 
@@ -630,6 +637,16 @@ impl RuntimeCtrlMsgManager { } } } + changed = self.memory_pressure_rx.changed(), if memory_pressure_updates_open => { + if changed.is_err() { + memory_pressure_updates_open = false; + continue; + } + let update = *self.memory_pressure_rx.borrow_and_update(); + for node_id in self.control_senders.receiver_ids() { + self.send(node_id, NodeControlMsg::MemoryPressureChanged { update }); + } + } _ = async { if let Some(when) = next_earliest { if when > now { @@ -1285,6 +1302,8 @@ mod tests { crate::entity_context::PipelineEntityScope, ) { let (pipeline_tx, pipeline_rx) = runtime_ctrl_msg_channel(pipeline_capacity); + let (_memory_pressure_tx, memory_pressure_rx) = + watch::channel(MemoryPressureChanged::initial()); let metrics_system = otap_df_telemetry::InternalTelemetrySystem::default(); let metrics_reporter = metrics_system.reporter(); @@ -1316,6 +1335,7 @@ mod tests { }, pipeline_context, pipeline_rx, + memory_pressure_rx, control_senders, observed_state_store.reporter(SendPolicy::default()), metrics_reporter, @@ -1809,12 +1829,15 @@ mod tests { pipeline_context.metrics_registry(), pipeline_entity_key, ); + let (_memory_pressure_tx, memory_pressure_rx) = + watch::channel(MemoryPressureChanged::initial()); // Create manager with empty control_senders map (no registered nodes) let manager = RuntimeCtrlMsgManager::<()>::new( pipeline_key, pipeline_context, pipeline_rx, + memory_pressure_rx, ControlSenders::new(), observed_state_store.reporter(SendPolicy::default()), metrics_reporter, @@ -3144,6 +3167,8 @@ mod tests { }; let node_metric_handles = Rc::new(RefCell::new(node_metric_handles)); + let (_memory_pressure_tx, memory_pressure_rx) = + watch::channel(MemoryPressureChanged::initial()); let manager = RuntimeCtrlMsgManager::new( DeployedPipelineKey { @@ -3153,6 +3178,7 @@ mod tests { }, pipeline_context.clone(), pipeline_rx, + memory_pressure_rx, control_senders, observed_state_store.reporter(SendPolicy::default()), metrics_reporter.clone(), @@ 
-3322,6 +3348,15 @@ mod tests { _guard: crate::entity_context::PipelineEntityScope, } + struct MemoryPressureFanoutHarness { + manager: RuntimeCtrlMsgManager, + _pipeline_tx: crate::control::RuntimeCtrlMsgSender, + memory_pressure_tx: watch::Sender, + control_receivers: HashMap>>, + nodes: Vec, + _guard: crate::entity_context::PipelineEntityScope, + } + fn setup_runtime_control_telemetry_harness( node_specs: Vec<(&'static str, NodeType, usize)>, metric_level: MetricLevel, @@ -3373,6 +3408,8 @@ mod tests { _log_tx, engine_tx, ); + let (_memory_pressure_tx, memory_pressure_rx) = + watch::channel(MemoryPressureChanged::initial()); let manager = RuntimeCtrlMsgManager::new( DeployedPipelineKey { @@ -3382,6 +3419,7 @@ mod tests { }, pipeline_context, pipeline_rx, + memory_pressure_rx, control_senders, event_reporter, metrics_reporter, @@ -3408,6 +3446,79 @@ mod tests { } } + fn setup_memory_pressure_fanout_harness( + node_specs: Vec<(&'static str, NodeType, usize)>, + ) -> MemoryPressureFanoutHarness { + let (pipeline_tx, pipeline_rx) = runtime_ctrl_msg_channel(16); + let metrics_system = otap_df_telemetry::InternalTelemetrySystem::default(); + let metrics_reporter = metrics_system.reporter(); + let controller_context = ControllerContext::new(metrics_system.registry()); + let pipeline_group_id: PipelineGroupId = Default::default(); + let pipeline_id: PipelineId = Default::default(); + let pipeline_context_params = PipelineContextParams { + pipeline_group_id: pipeline_group_id.clone(), + pipeline_id: pipeline_id.clone(), + core_id: 0, + num_cores: 1, + thread_id: 0, + }; + let pipeline_context = PipelineContext::new(controller_context, pipeline_context_params); + let pipeline_entity_key = pipeline_context.register_pipeline_entity(); + let pipeline_entity_guard = crate::entity_context::set_pipeline_entity_key( + pipeline_context.metrics_registry(), + pipeline_entity_key, + ); + let nodes = test_nodes(node_specs.iter().map(|(name, _, _)| *name).collect()); + let mut 
control_senders = ControlSenders::new(); + let mut control_receivers = HashMap::new(); + for (node, (_, node_type, capacity)) in nodes.iter().zip(node_specs.iter()) { + let (sender, receiver) = create_mock_control_sender_with_capacity(*capacity); + control_senders.register(node.clone(), *node_type, sender); + let _ = control_receivers.insert(node.index, receiver); + } + + let (_log_tx, _log_rx) = flume::bounded(1); + let (engine_tx, _engine_rx) = flume::unbounded(); + let event_reporter = ObservedEventReporter::new_with_engine_sender( + SendPolicy::default(), + _log_tx, + engine_tx, + ); + let (memory_pressure_tx, memory_pressure_rx) = + watch::channel(MemoryPressureChanged::initial()); + + let manager = RuntimeCtrlMsgManager::new( + DeployedPipelineKey { + pipeline_group_id, + pipeline_id, + core_id: 0, + }, + pipeline_context, + pipeline_rx, + memory_pressure_rx, + control_senders, + event_reporter, + metrics_reporter, + TEST_CONTROL_PLANE_METRICS_FLUSH_INTERVAL, + TelemetryPolicy { + pipeline_metrics: false, + tokio_metrics: false, + runtime_metrics: MetricLevel::None, + }, + Vec::new(), + empty_node_metric_handles(), + ); + + MemoryPressureFanoutHarness { + manager, + _pipeline_tx: pipeline_tx, + memory_pressure_tx, + control_receivers, + nodes, + _guard: pipeline_entity_guard, + } + } + struct CompletionTelemetryHarness { dispatcher: PipelineCompletionMsgDispatcher, completion_tx: crate::control::PipelineCompletionMsgSender, @@ -4053,6 +4164,67 @@ mod tests { // Shutdown of a receiver+processor pipeline should first stop ingress, then // wait for the receiver to report drained before sending downstream // shutdown. The runtime-control metrics should expose both phases. 
+ #[tokio::test] + async fn test_memory_pressure_updates_are_fanned_out_only_to_receivers() { + let local = LocalSet::new(); + + local + .run_until(async { + let MemoryPressureFanoutHarness { + manager, + _pipeline_tx, + memory_pressure_tx, + mut control_receivers, + nodes, + _guard: _, + } = setup_memory_pressure_fanout_harness::(vec![ + ("receiver", NodeType::Receiver, 16), + ("processor", NodeType::Processor, 16), + ]); + + let receiver = nodes[0].clone(); + let processor = nodes[1].clone(); + let manager_handle = tokio::task::spawn_local(async move { manager.run().await }); + + memory_pressure_tx + .send(MemoryPressureChanged { + generation: 1, + level: crate::memory_limiter::MemoryPressureLevel::Hard, + retry_after_secs: 5, + usage_bytes: 123, + }) + .expect("watch send should succeed"); + + let mut receiver_ctrl = control_receivers.remove(&receiver.index).unwrap(); + let receiver_msg = timeout(Duration::from_millis(100), receiver_ctrl.recv()) + .await + .expect("receiver should get memory pressure update") + .expect("receiver control channel should stay open"); + assert!(matches!( + receiver_msg, + NodeControlMsg::MemoryPressureChanged { + update: MemoryPressureChanged { + generation: 1, + level: crate::memory_limiter::MemoryPressureLevel::Hard, + retry_after_secs: 5, + usage_bytes: 123, + } + } + )); + + let mut processor_ctrl = control_receivers.remove(&processor.index).unwrap(); + assert!( + timeout(Duration::from_millis(50), processor_ctrl.recv()) + .await + .is_err(), + "non-receiver nodes should not get memory pressure updates" + ); + + manager_handle.abort(); + }) + .await; + } + #[tokio::test] async fn test_runtime_control_metrics_track_receiver_first_drain() { let local = LocalSet::new(); diff --git a/rust/otap-dataflow/crates/engine/src/runtime_pipeline.rs b/rust/otap-dataflow/crates/engine/src/runtime_pipeline.rs index adb800d014..515f0590fd 100644 --- a/rust/otap-dataflow/crates/engine/src/runtime_pipeline.rs +++ 
b/rust/otap-dataflow/crates/engine/src/runtime_pipeline.rs @@ -17,6 +17,7 @@ use crate::control::{ }; use crate::entity_context::{NodeTaskContext, NodeTelemetryHandle, instrument_with_node_context}; use crate::error::{Error, TypedError}; +use crate::memory_limiter::MemoryPressureChanged; use crate::node::{Node, NodeDefs, NodeId, NodeType, NodeWithPDataReceiver, NodeWithPDataSender}; use crate::pipeline_ctrl::{ NodeMetricHandles, PipelineCompletionMsgDispatcher, RuntimeCtrlMsgManager, @@ -35,6 +36,7 @@ use std::fmt::Debug; use std::rc::Rc; use std::time::Duration; use tokio::runtime::Builder; +use tokio::sync::watch; use tokio::task::LocalSet; /// Build produced-request metric sets indexed by sorted output port name, @@ -180,6 +182,7 @@ impl RuntimePipeli event_reporter: ObservedEventReporter, metrics_reporter: MetricsReporter, control_plane_metrics_flush_interval: Duration, + memory_pressure_rx: watch::Receiver, runtime_ctrl_msg_tx: RuntimeCtrlMsgSender, runtime_ctrl_msg_rx: RuntimeCtrlMsgReceiver, pipeline_completion_msg_tx: PipelineCompletionMsgSender, @@ -398,6 +401,7 @@ impl RuntimePipeli let manager_pipeline_context = pipeline_context.clone(); let manager_metrics_reporter = metrics_reporter.clone(); let manager_telemetry_policy = telemetry_policy.clone(); + let manager_memory_pressure_rx = memory_pressure_rx; let dispatcher_pipeline_context = pipeline_context.clone(); let dispatcher_metrics_reporter = metrics_reporter.clone(); let dispatcher_telemetry_policy = telemetry_policy.clone(); @@ -406,6 +410,7 @@ impl RuntimePipeli pipeline_key, manager_pipeline_context, runtime_ctrl_msg_rx, + manager_memory_pressure_rx, control_senders, event_reporter, manager_metrics_reporter, diff --git a/rust/otap-dataflow/crates/engine/src/testing/dst/common.rs b/rust/otap-dataflow/crates/engine/src/testing/dst/common.rs index 447ca9c40d..b53f9530e7 100644 --- a/rust/otap-dataflow/crates/engine/src/testing/dst/common.rs +++ 
b/rust/otap-dataflow/crates/engine/src/testing/dst/common.rs @@ -5,6 +5,7 @@ use crate::clock; use crate::context::{ControllerContext, PipelineContext, PipelineContextParams}; use crate::control::{ControlSenders, Frame, NodeControlMsg, RouteData, runtime_ctrl_msg_channel}; use crate::entity_context::set_pipeline_entity_key; +use crate::memory_limiter::MemoryPressureChanged; use crate::message::{Receiver, Sender}; use crate::pipeline_ctrl::{NodeMetricHandles, RuntimeCtrlMsgManager}; use crate::shared::message::{SharedReceiver, SharedSender}; @@ -19,6 +20,7 @@ use smallvec::smallvec; use std::cell::RefCell; use std::rc::Rc; use std::time::Duration; +use tokio::sync::watch; use tokio::time::timeout; // Keep DST control-plane metrics on a short cadence so any metric-side @@ -129,6 +131,8 @@ pub(super) fn build_manager( let pipeline_entity_key = pipeline_context.register_pipeline_entity(); let pipeline_entity_guard = set_pipeline_entity_key(pipeline_context.metrics_registry(), pipeline_entity_key); + let (_memory_pressure_tx, memory_pressure_rx) = + watch::channel(MemoryPressureChanged::initial()); let manager = RuntimeCtrlMsgManager::new( otap_df_config::DeployedPipelineKey { @@ -138,6 +142,7 @@ pub(super) fn build_manager( }, pipeline_context.clone(), pipeline_rx, + memory_pressure_rx, control_senders, observed_state_store.reporter(SendPolicy::default()), metrics_reporter, diff --git a/rust/otap-dataflow/crates/otap/src/lib.rs b/rust/otap-dataflow/crates/otap/src/lib.rs index a6c483ef75..855cba238f 100644 --- a/rust/otap-dataflow/crates/otap/src/lib.rs +++ b/rust/otap-dataflow/crates/otap/src/lib.rs @@ -39,6 +39,9 @@ pub(crate) mod socket_options; /// Shared concurrency limiting across protocol servers pub mod shared_concurrency; +/// Shared ingress shedding based on process-wide memory pressure. 
+pub mod memory_pressure_layer; + /// gRPC service implementation pub mod otlp_grpc; diff --git a/rust/otap-dataflow/crates/otap/src/memory_pressure_layer.rs b/rust/otap-dataflow/crates/otap/src/memory_pressure_layer.rs new file mode 100644 index 0000000000..bfa887a4ad --- /dev/null +++ b/rust/otap-dataflow/crates/otap/src/memory_pressure_layer.rs @@ -0,0 +1,280 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Tower middleware that rejects requests while hard memory pressure is active. + +use futures::future::BoxFuture; +use http::{Request, Response}; +use otap_df_engine::memory_limiter::SharedReceiverAdmissionState; +use otap_df_telemetry::metrics::MetricSet; +use parking_lot::Mutex; +use std::sync::Arc; +use std::task::{Context, Poll}; +use tonic::{Code, Status, body::Body, metadata::MetadataMap}; +use tower::{Layer, Service}; + +use crate::otlp_metrics::OtlpReceiverMetrics; + +/// Records request rejections caused by process-wide hard memory pressure. +pub trait MemoryPressureRejectionMetrics: Send + Sync { + /// Records one request rejected before entering the pipeline due to hard memory pressure. + fn record_memory_pressure_rejection(&self); +} + +/// Builds a gRPC `resource_exhausted` status with retry pushback metadata. 
+#[must_use] +pub fn grpc_memory_pressure_status(state: &SharedReceiverAdmissionState) -> Status { + let mut metadata = MetadataMap::new(); + let retry_pushback_ms = u64::from(state.retry_after_secs().max(1)) * 1_000; + let _ = metadata.insert( + "grpc-retry-pushback-ms", + retry_pushback_ms + .to_string() + .parse() + .expect("retry pushback metadata should be valid ASCII"), + ); + Status::with_metadata(Code::ResourceExhausted, "memory pressure", metadata) +} + +impl MemoryPressureRejectionMetrics for Mutex> { + fn record_memory_pressure_rejection(&self) { + let mut metrics = self.lock(); + metrics.rejected_requests.inc(); + metrics.refused_memory_pressure.inc(); + } +} + +/// Layer that fails fast with `resource_exhausted` before tonic decodes request bodies. +/// +/// This is only enforced at `Hard` pressure. `Soft` remains advisory in the +/// process-wide state machine for this Phase 1 implementation. +#[derive(Clone)] +pub struct MemoryPressureLayer { + state: SharedReceiverAdmissionState, + metrics: Option>, +} + +impl MemoryPressureLayer { + /// Creates a new layer backed by the shared process-wide memory pressure state. + #[must_use] + pub const fn new(state: SharedReceiverAdmissionState) -> Self { + Self { + state, + metrics: None, + } + } + + /// Creates a new layer that also records dedicated rejection metrics. + #[must_use] + pub fn with_metrics(state: SharedReceiverAdmissionState, metrics: Arc) -> Self + where + M: MemoryPressureRejectionMetrics + 'static, + { + Self { + state, + metrics: Some(metrics), + } + } + + /// Creates a new layer that also records dedicated OTLP rejection metrics. 
+ #[must_use] + pub fn with_otlp_metrics( + state: SharedReceiverAdmissionState, + metrics: Arc>>, + ) -> Self { + Self::with_metrics(state, metrics) + } +} + +impl Layer for MemoryPressureLayer { + type Service = MemoryPressureService; + + fn layer(&self, inner: S) -> Self::Service { + MemoryPressureService { + inner, + state: self.state.clone(), + metrics: self.metrics.clone(), + reject_next_call: false, + } + } +} + +/// Service implementation for [`MemoryPressureLayer`]. +#[derive(Clone)] +pub struct MemoryPressureService { + inner: S, + state: SharedReceiverAdmissionState, + metrics: Option>, + reject_next_call: bool, +} + +impl Service> for MemoryPressureService +where + S: Service, Response = Response> + Send + 'static, + S::Future: Send + 'static, +{ + type Response = Response; + type Error = S::Error; + type Future = BoxFuture<'static, Result>; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + if self.state.should_shed_ingress() { + self.reject_next_call = true; + return Poll::Ready(Ok(())); + } + self.reject_next_call = false; + self.inner.poll_ready(cx) + } + + fn call(&mut self, request: Request) -> Self::Future { + if self.reject_next_call || self.state.should_shed_ingress() { + self.reject_next_call = false; + if let Some(metrics) = &self.metrics { + metrics.record_memory_pressure_rejection(); + } + let response = grpc_memory_pressure_status(&self.state).into_http(); + return Box::pin(async move { Ok(response) }); + } + + let future = self.inner.call(request); + Box::pin(future) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use http::{Request, Response, StatusCode}; + use otap_df_config::policy::MemoryLimiterMode; + use otap_df_engine::memory_limiter::{MemoryPressureBehaviorConfig, MemoryPressureState}; + use std::convert::Infallible; + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::task::{Context, Poll, Waker}; + + #[derive(Clone)] + struct CountingService { + poll_ready_calls: Arc, + 
call_count: Arc, + } + + impl CountingService { + fn new() -> Self { + Self { + poll_ready_calls: Arc::new(AtomicUsize::new(0)), + call_count: Arc::new(AtomicUsize::new(0)), + } + } + } + + impl Service> for CountingService { + type Response = Response; + type Error = Infallible; + type Future = futures::future::Ready>; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + let _ = self.poll_ready_calls.fetch_add(1, Ordering::Relaxed); + Poll::Ready(Ok(())) + } + + fn call(&mut self, _request: Request) -> Self::Future { + let _ = self.call_count.fetch_add(1, Ordering::Relaxed); + futures::future::ready(Ok(Response::new(Body::empty()))) + } + } + + #[test] + fn hard_pressure_short_circuits_before_inner_readiness_and_call() { + let state = MemoryPressureState::default(); + state.set_level_for_tests(otap_df_engine::memory_limiter::MemoryPressureLevel::Hard); + state.configure(MemoryPressureBehaviorConfig { + retry_after_secs: 3, + fail_readiness_on_hard: true, + mode: MemoryLimiterMode::Enforce, + }); + + let inner = CountingService::new(); + let poll_ready_calls = inner.poll_ready_calls.clone(); + let call_count = inner.call_count.clone(); + + let mut service = + MemoryPressureLayer::new(SharedReceiverAdmissionState::from_process_state(&state)) + .layer(inner); + let waker = Waker::noop(); + let mut cx = Context::from_waker(waker); + + assert!(matches!(service.poll_ready(&mut cx), Poll::Ready(Ok(())))); + + let response = futures::executor::block_on(service.call(Request::new(Body::empty()))) + .expect("memory pressure rejection should not error"); + + assert_eq!(poll_ready_calls.load(Ordering::Relaxed), 0); + assert_eq!(call_count.load(Ordering::Relaxed), 0); + assert_eq!(response.status(), StatusCode::OK); + assert_eq!( + response + .headers() + .get("grpc-status") + .and_then(|v| v.to_str().ok()), + Some("8") + ); + assert_eq!( + response + .headers() + .get("grpc-retry-pushback-ms") + .and_then(|v| v.to_str().ok()), + Some("3000") + ); + } + + #[test] + 
fn hard_rejection_decision_from_poll_ready_is_sticky_for_the_following_call() { + let state = MemoryPressureState::default(); + state.set_level_for_tests(otap_df_engine::memory_limiter::MemoryPressureLevel::Hard); + + let inner = CountingService::new(); + let poll_ready_calls = inner.poll_ready_calls.clone(); + let call_count = inner.call_count.clone(); + + let mut service = + MemoryPressureLayer::new(SharedReceiverAdmissionState::from_process_state(&state)) + .layer(inner); + let waker = Waker::noop(); + let mut cx = Context::from_waker(waker); + + assert!(matches!(service.poll_ready(&mut cx), Poll::Ready(Ok(())))); + state.set_level_for_tests(otap_df_engine::memory_limiter::MemoryPressureLevel::Normal); + + let response = futures::executor::block_on(service.call(Request::new(Body::empty()))) + .expect("memory pressure rejection should not error"); + + assert_eq!(poll_ready_calls.load(Ordering::Relaxed), 0); + assert_eq!(call_count.load(Ordering::Relaxed), 0); + assert_eq!(response.status(), StatusCode::OK); + } + + #[test] + fn soft_pressure_remains_advisory() { + let state = MemoryPressureState::default(); + state.set_level_for_tests(otap_df_engine::memory_limiter::MemoryPressureLevel::Soft); + + let inner = CountingService::new(); + let poll_ready_calls = inner.poll_ready_calls.clone(); + let call_count = inner.call_count.clone(); + + let mut service = + MemoryPressureLayer::new(SharedReceiverAdmissionState::from_process_state(&state)) + .layer(inner); + let waker = Waker::noop(); + let mut cx = Context::from_waker(waker); + + assert!(matches!(service.poll_ready(&mut cx), Poll::Ready(Ok(())))); + + let response = futures::executor::block_on(service.call(Request::new(Body::empty()))) + .expect("soft pressure should not error"); + + assert_eq!(poll_ready_calls.load(Ordering::Relaxed), 1); + assert_eq!(call_count.load(Ordering::Relaxed), 1); + assert_eq!(response.status(), StatusCode::OK); + } +} diff --git a/rust/otap-dataflow/crates/otap/src/otap_grpc.rs 
b/rust/otap-dataflow/crates/otap/src/otap_grpc.rs index aeb98b4c85..73b1009ccd 100644 --- a/rust/otap-dataflow/crates/otap/src/otap_grpc.rs +++ b/rust/otap-dataflow/crates/otap/src/otap_grpc.rs @@ -13,7 +13,7 @@ use crate::pdata::{Context, OtapPdata}; use otap_df_engine::{ Interests, MessageSourceSharedEffectHandlerExtension, ProducerEffectHandlerExtension, - shared::receiver as shared, + memory_limiter::SharedReceiverAdmissionState, shared::receiver as shared, }; use otap_df_pdata::{ Consumer, @@ -24,8 +24,9 @@ use otap_df_pdata::{ arrow_traces_service_server::ArrowTracesService, }, }; -use otap_df_telemetry::otel_error; +use otap_df_telemetry::{otel_error, otel_warn}; use std::pin::Pin; +use std::sync::Arc; use tokio::sync::oneshot; use tokio_stream::Stream; use tokio_stream::wrappers::ReceiverStream; @@ -38,6 +39,7 @@ pub mod otlp; pub mod proxy; pub mod server_settings; +use crate::memory_pressure_layer::{MemoryPressureRejectionMetrics, grpc_memory_pressure_status}; use crate::otap_grpc::otlp::server::SharedState; pub use client_settings::GrpcClientSettings; pub use server_settings::GrpcServerSettings; @@ -52,7 +54,7 @@ pub struct NewSettings { } /// Common settings for OTLP receivers. -#[derive(Clone, Debug)] +#[derive(Clone)] pub struct Settings { /// Size of the channel used to buffer outgoing responses to the client. pub response_stream_channel_size: usize, @@ -60,6 +62,10 @@ pub struct Settings { pub max_concurrent_requests: usize, /// Whether the receiver should wait. pub wait_for_result: bool, + /// Receiver-local memory pressure admission state. + pub admission_state: SharedReceiverAdmissionState, + /// Shared rejection counters used by both stream-open and per-batch shedding. 
+ pub memory_pressure_rejection_metrics: Option>, } /// struct that implements the ArrowLogsService trait @@ -155,6 +161,7 @@ impl ArrowLogsService for ArrowLogsServiceImpl { let (tx, rx) = tokio::sync::mpsc::channel(self.settings.response_stream_channel_size); let effect_handler_clone = self.effect_handler.clone(); let state_clone = self.state.clone(); + let settings = self.settings.clone(); // Provide client a stream to listen to let output = ReceiverStream::new(rx); @@ -165,7 +172,22 @@ impl ArrowLogsService for ArrowLogsServiceImpl { let mut consumer = Consumer::default(); // Process messages until stream ends or error occurs - while let Ok(Some(batch)) = input_stream.message().await { + loop { + if reject_open_stream_for_memory_pressure( + &settings.admission_state, + settings.memory_pressure_rejection_metrics.as_deref(), + &tx, + ) + .await + { + break; + } + + let batch = match input_stream.message().await { + Ok(Some(batch)) => batch, + Ok(None) | Err(_) => break, + }; + // accept the batch data and handle output response if accept_data::( OtapArrowRecords::Logs, @@ -173,6 +195,8 @@ impl ArrowLogsService for ArrowLogsServiceImpl { batch, &effect_handler_clone, state_clone.clone(), + &settings.admission_state, + settings.memory_pressure_rejection_metrics.as_deref(), &tx, ) .await @@ -200,6 +224,7 @@ impl ArrowMetricsService for ArrowMetricsServiceImpl { let (tx, rx) = tokio::sync::mpsc::channel(self.settings.response_stream_channel_size); let effect_handler_clone = self.effect_handler.clone(); let state_clone = self.state.clone(); + let settings = self.settings.clone(); // Provide client a stream to listen to let output = ReceiverStream::new(rx); @@ -209,7 +234,22 @@ impl ArrowMetricsService for ArrowMetricsServiceImpl { let mut consumer = Consumer::default(); // Process messages until stream ends or error occurs - while let Ok(Some(batch)) = input_stream.message().await { + loop { + if reject_open_stream_for_memory_pressure( + &settings.admission_state, + 
settings.memory_pressure_rejection_metrics.as_deref(), + &tx, + ) + .await + { + break; + } + + let batch = match input_stream.message().await { + Ok(Some(batch)) => batch, + Ok(None) | Err(_) => break, + }; + // accept the batch data and handle output response if accept_data::( OtapArrowRecords::Metrics, @@ -217,6 +257,8 @@ impl ArrowMetricsService for ArrowMetricsServiceImpl { batch, &effect_handler_clone, state_clone.clone(), + &settings.admission_state, + settings.memory_pressure_rejection_metrics.as_deref(), &tx, ) .await @@ -244,6 +286,7 @@ impl ArrowTracesService for ArrowTracesServiceImpl { let (tx, rx) = tokio::sync::mpsc::channel(self.settings.response_stream_channel_size); let effect_handler_clone = self.effect_handler.clone(); let state_clone = self.state.clone(); + let settings = self.settings.clone(); // create a stream to output result to let output = ReceiverStream::new(rx); @@ -253,7 +296,22 @@ impl ArrowTracesService for ArrowTracesServiceImpl { let mut consumer = Consumer::default(); // Process messages until stream ends or error occurs - while let Ok(Some(batch)) = input_stream.message().await { + loop { + if reject_open_stream_for_memory_pressure( + &settings.admission_state, + settings.memory_pressure_rejection_metrics.as_deref(), + &tx, + ) + .await + { + break; + } + + let batch = match input_stream.message().await { + Ok(Some(batch)) => batch, + Ok(None) | Err(_) => break, + }; + // accept the batch data and handle output response if accept_data::( OtapArrowRecords::Traces, @@ -261,6 +319,8 @@ impl ArrowTracesService for ArrowTracesServiceImpl { batch, &effect_handler_clone, state_clone.clone(), + &settings.admission_state, + settings.memory_pressure_rejection_metrics.as_deref(), &tx, ) .await @@ -276,6 +336,39 @@ impl ArrowTracesService for ArrowTracesServiceImpl { } } +async fn reject_open_stream_for_memory_pressure( + admission_state: &SharedReceiverAdmissionState, + memory_pressure_metrics: Option<&dyn MemoryPressureRejectionMetrics>, + 
tx: &tokio::sync::mpsc::Sender>, +) -> bool { + if !admission_state.should_shed_ingress() { + return false; + } + + if let Some(metrics) = memory_pressure_metrics { + metrics.record_memory_pressure_rejection(); + } + + otel_warn!( + "otap.stream.memory_pressure", + message = "Process memory pressure active while receiving an OTAP stream" + ); + + let _ = tx + .send(Err(grpc_memory_pressure_status(admission_state))) + .await + .map_err(|e| { + otel_error!( + "otap.response.send_failed", + error = ?e, + message = "Error sending streamed memory pressure response" + ); + }) + .ok(); + + true +} + /// handles sending the data down the pipeline via effect_handler and generating the appropriate response async fn accept_data( otap_batch: F, @@ -283,12 +376,37 @@ async fn accept_data( mut batch: BatchArrowRecords, effect_handler: &shared::EffectHandler, state: Option, + admission_state: &SharedReceiverAdmissionState, + memory_pressure_metrics: Option<&dyn MemoryPressureRejectionMetrics>, tx: &tokio::sync::mpsc::Sender>, ) -> Result<(), ()> where F: Fn(T) -> OtapArrowRecords, { let batch_id = batch.batch_id; + if admission_state.should_shed_ingress() { + if let Some(metrics) = memory_pressure_metrics { + metrics.record_memory_pressure_rejection(); + } + + otel_warn!( + "otap.request.memory_pressure", + message = "Process memory pressure active while receiving streamed batch" + ); + + tx.send(Ok(BatchStatus { + batch_id, + status_code: StatusCode::ResourceExhausted as i32, + status_message: "Process memory pressure".to_string(), + })) + .await + .map_err(|e| { + otel_error!("otap.response.send_failed", error = ?e, message = "Error sending BatchStatus response"); + })?; + + return Ok(()); + } + let batch = consumer.consume_bar(&mut batch).map_err(|e| { otel_error!("otap.batch.decode_failed", error = ?e, message = "Error decoding OTAP Batch. 
Closing stream"); })?; @@ -409,6 +527,60 @@ where }) } +#[cfg(test)] +mod tests { + use super::*; + use otap_df_config::policy::MemoryLimiterMode; + use otap_df_engine::memory_limiter::{ + MemoryPressureBehaviorConfig, MemoryPressureLevel, MemoryPressureState, + }; + use std::sync::atomic::{AtomicUsize, Ordering}; + use tonic::Code; + + #[derive(Default)] + struct CountingMemoryPressureMetrics { + calls: AtomicUsize, + } + + impl MemoryPressureRejectionMetrics for CountingMemoryPressureMetrics { + fn record_memory_pressure_rejection(&self) { + let _ = self.calls.fetch_add(1, Ordering::Relaxed); + } + } + + #[tokio::test] + async fn open_stream_rejection_stops_before_reading_next_batch() { + let state = MemoryPressureState::default(); + state.configure(MemoryPressureBehaviorConfig { + retry_after_secs: 3, + fail_readiness_on_hard: true, + mode: MemoryLimiterMode::Enforce, + }); + state.set_level_for_tests(MemoryPressureLevel::Hard); + + let metrics = CountingMemoryPressureMetrics::default(); + let local_state = SharedReceiverAdmissionState::from_process_state(&state); + let (tx, mut rx) = tokio::sync::mpsc::channel(1); + + assert!( + reject_open_stream_for_memory_pressure(&local_state, Some(&metrics), &tx).await, + "hard pressure should reject an already-open stream before reading the next batch" + ); + + let response = rx.recv().await.expect("stream rejection should be emitted"); + let status = response.expect_err("memory pressure should surface as a gRPC stream error"); + assert_eq!(status.code(), Code::ResourceExhausted); + assert_eq!( + status + .metadata() + .get("grpc-retry-pushback-ms") + .and_then(|value| value.to_str().ok()), + Some("3000") + ); + assert_eq!(metrics.calls.load(Ordering::Relaxed), 1); + } +} + /// Enum to describe the Arrow data. 
/// /// Within this type, the Arrow batches are serialized as Arrow IPC inside the diff --git a/rust/otap-dataflow/crates/otap/src/otlp_http.rs b/rust/otap-dataflow/crates/otap/src/otlp_http.rs index e260a61b98..f6d90af642 100644 --- a/rust/otap-dataflow/crates/otap/src/otlp_http.rs +++ b/rust/otap-dataflow/crates/otap/src/otlp_http.rs @@ -25,6 +25,7 @@ use hyper::service::service_fn; use hyper_util::rt::TokioIo; use otap_df_config::SignalType; use otap_df_config::byte_units; +use otap_df_engine::memory_limiter::SharedReceiverAdmissionState; use otap_df_engine::shared::receiver::EffectHandler; use otap_df_engine::{ Interests, MessageSourceSharedEffectHandlerExtension, ProducerEffectHandlerExtension, @@ -321,6 +322,16 @@ fn service_unavailable() -> Response> { rpc_status_response(StatusCode::SERVICE_UNAVAILABLE, 14, "service unavailable") } +fn memory_pressure_unavailable(retry_after_secs: u32) -> Response> { + let mut response = rpc_status_response(StatusCode::SERVICE_UNAVAILABLE, 8, "memory pressure"); + if let Ok(retry_after) = HeaderValue::from_str(&retry_after_secs.max(1).to_string()) { + _ = response + .headers_mut() + .insert(http::header::RETRY_AFTER, retry_after); + } + response +} + fn internal_error() -> Response> { rpc_status_response(StatusCode::INTERNAL_SERVER_ERROR, 13, "internal error") } @@ -503,6 +514,7 @@ struct HttpHandler { ack_registry: AckRegistry, metrics: Arc>>, settings: HttpServerSettings, + admission_state: SharedReceiverAdmissionState, /// Optional global semaphore shared across protocols (e.g., gRPC + HTTP) to enforce /// receiver-wide backpressure tied to downstream capacity. 
global_semaphore: Option>, @@ -537,6 +549,15 @@ impl HttpHandler { let permit_timeout = self.settings.timeout.unwrap_or(Duration::from_secs(5)); let fut = async move { + if self.admission_state.should_shed_ingress() { + let mut metrics = self.metrics.lock(); + metrics.rejected_requests.inc(); + metrics.refused_memory_pressure.inc(); + return Err(memory_pressure_unavailable( + self.admission_state.retry_after_secs(), + )); + } + // Acquire permits in a consistent order to avoid deadlocks when both gRPC and // HTTP are enabled: global (if any) first, then protocol-local. let _global_permit = if let Some(global) = &self.global_semaphore { @@ -569,6 +590,17 @@ impl HttpHandler { None }; + // Re-check after potentially waiting for the global permit: pressure may have + // escalated while this request was queued. + if self.admission_state.should_shed_ingress() { + let mut metrics = self.metrics.lock(); + metrics.rejected_requests.inc(); + metrics.refused_memory_pressure.inc(); + return Err(memory_pressure_unavailable( + self.admission_state.retry_after_secs(), + )); + } + let permit_result = tokio::time::timeout(permit_timeout, self.local_semaphore.clone().acquire_owned()) .await; @@ -599,6 +631,16 @@ impl HttpHandler { } }; + // Re-check after waiting for the local permit. 
+ if self.admission_state.should_shed_ingress() { + let mut metrics = self.metrics.lock(); + metrics.rejected_requests.inc(); + metrics.refused_memory_pressure.inc(); + return Err(memory_pressure_unavailable( + self.admission_state.retry_after_secs(), + )); + } + self.metrics.lock().requests_started.inc(); let max_len = self.settings.max_request_body_size as usize; @@ -781,6 +823,7 @@ pub async fn serve( settings: HttpServerSettings, ack_registry: AckRegistry, metrics: Arc>>, + admission_state: SharedReceiverAdmissionState, global_semaphore: Option>, shutdown: CancellationToken, ) -> std::io::Result<()> { @@ -836,6 +879,7 @@ pub async fn serve( ack_registry: ack_registry.clone(), metrics: metrics.clone(), settings: settings.clone(), + admission_state: admission_state.clone(), global_semaphore: global_semaphore.clone(), local_semaphore: local_semaphore.clone(), }; @@ -923,7 +967,9 @@ pub async fn serve( mod tests { use super::*; + use otap_df_engine::memory_limiter::{MemoryPressureLevel, MemoryPressureState}; use std::collections::HashMap; + use std::sync::Arc; use std::time::Duration; #[test] @@ -986,6 +1032,7 @@ mod tests { settings.clone(), ack_registry.clone(), metrics, + SharedReceiverAdmissionState::default(), None, shutdown.clone(), )); @@ -1059,4 +1106,276 @@ mod tests { .expect("server finished"); assert!(server_result.unwrap().is_ok()); } + + #[tokio::test] + async fn queued_request_rechecks_process_memory_pressure_before_body_read() { + use hyper::Method; + use hyper::client::conn::http1; + use hyper::header::{CONTENT_TYPE, HOST, RETRY_AFTER}; + use hyper_util::rt::TokioIo; + use otap_df_engine::control::runtime_ctrl_msg_channel; + use otap_df_engine::memory_limiter::MemoryPressureLevel; + use otap_df_engine::shared::message::SharedSender; + use otap_df_engine::testing::test_node; + use otap_df_pdata::proto::opentelemetry::collector::logs::v1::ExportLogsServiceRequest; + use otap_df_telemetry::registry::TelemetryRegistryHandle; + use 
otap_df_telemetry::reporter::MetricsReporter; + use tokio::net::TcpStream; + use tokio::sync::Semaphore; + use tokio::sync::mpsc as tokio_mpsc; + use tokio_util::sync::CancellationToken; + + let port = portpicker::pick_unused_port().expect("free port"); + let addr: SocketAddr = format!("127.0.0.1:{port}").parse().unwrap(); + + let gate = Arc::new(Semaphore::new(1)); + let held_permit = gate + .clone() + .acquire_owned() + .await + .expect("initial semaphore permit"); + + let (msg_tx, _msg_rx) = tokio_mpsc::channel(4); + let mut senders = HashMap::new(); + let _ = senders.insert("default".into(), SharedSender::mpsc(msg_tx)); + let (ctrl_tx, _ctrl_rx) = runtime_ctrl_msg_channel(4); + let (_metrics_rx, metrics_reporter) = MetricsReporter::create_new_and_receiver(1); + let effect_handler = EffectHandler::new( + test_node("http_process_pressure"), + senders, + None, + ctrl_tx, + metrics_reporter, + ); + + let settings = HttpServerSettings { + listening_addr: addr, + max_concurrent_requests: 1, + timeout: Some(Duration::from_secs(2)), + ..Default::default() + }; + let shutdown = CancellationToken::new(); + + let metrics_registry_handle = TelemetryRegistryHandle::new(); + let controller_ctx = + otap_df_engine::context::ControllerContext::new(metrics_registry_handle); + let pipeline_ctx = + controller_ctx.pipeline_context_with("grp".into(), "pipeline".into(), 0, 1, 0); + let metrics = Arc::new(Mutex::new( + pipeline_ctx.register_metrics::(), + )); + let memory_pressure_state = MemoryPressureState::default(); + let admission_state = + SharedReceiverAdmissionState::from_process_state(&memory_pressure_state); + + let server = tokio::spawn(serve( + effect_handler, + settings, + AckRegistry::new(None, None, None), + metrics.clone(), + admission_state.clone(), + Some(gate.clone()), + shutdown.clone(), + )); + + let mut stream = None; + for _ in 0..10 { + match TcpStream::connect(addr).await { + Ok(s) => { + stream = Some(s); + break; + } + Err(_) => 
tokio::time::sleep(Duration::from_millis(50)).await, + } + } + let stream = stream.expect("Failed to connect to server"); + + let (mut sender, conn) = http1::handshake(TokioIo::new(stream)).await.unwrap(); + drop(tokio::spawn(async move { + let _ = conn.await; + })); + + let mut request_bytes = Vec::new(); + ExportLogsServiceRequest::default() + .encode(&mut request_bytes) + .unwrap(); + + let req = Request::builder() + .method(Method::POST) + .uri("/v1/logs") + .header(HOST, "localhost") + .header(CONTENT_TYPE, PROTOBUF_CONTENT_TYPE) + .body(Full::new(Bytes::from(request_bytes))) + .unwrap(); + + let response = tokio::spawn(async move { + let resp = sender.send_request(req).await.unwrap(); + ( + resp.status(), + resp.headers() + .get(RETRY_AFTER) + .and_then(|value| value.to_str().ok()) + .map(str::to_owned), + ) + }); + + tokio::time::sleep(Duration::from_millis(100)).await; + admission_state.apply(otap_df_engine::memory_limiter::MemoryPressureChanged { + generation: 1, + level: MemoryPressureLevel::Hard, + retry_after_secs: 1, + usage_bytes: 0, + }); + drop(held_permit); + + let (status, retry_after) = tokio::time::timeout(Duration::from_secs(2), response) + .await + .expect("request completed") + .expect("request task succeeded"); + assert_eq!(status, StatusCode::SERVICE_UNAVAILABLE); + assert_eq!(retry_after.as_deref(), Some("1")); + + { + let metrics = metrics.lock(); + assert_eq!(metrics.requests_started.get(), 0); + assert_eq!(metrics.request_bytes.get(), 0); + assert_eq!(metrics.rejected_requests.get(), 1); + assert_eq!(metrics.refused_memory_pressure.get(), 1); + } + + shutdown.cancel(); + let server_result = tokio::time::timeout(Duration::from_secs(2), server) + .await + .expect("server finished"); + assert!(server_result.unwrap().is_ok()); + } + + #[tokio::test] + async fn soft_pressure_does_not_reject_after_waiting_for_permit() { + use http_body_util::Full; + use hyper::Method; + use hyper::client::conn::http1; + use hyper::header::{CONTENT_TYPE, 
HOST}; + use hyper_util::rt::TokioIo; + use otap_df_engine::control::runtime_ctrl_msg_channel; + use otap_df_engine::shared::message::SharedSender; + use otap_df_engine::testing::test_node; + use otap_df_pdata::proto::opentelemetry::collector::logs::v1::ExportLogsServiceRequest; + use otap_df_telemetry::registry::TelemetryRegistryHandle; + use otap_df_telemetry::reporter::MetricsReporter; + use tokio::net::TcpStream; + use tokio::sync::mpsc as tokio_mpsc; + use tokio_util::sync::CancellationToken; + + let port = portpicker::pick_unused_port().expect("free port"); + let addr: SocketAddr = format!("127.0.0.1:{port}").parse().unwrap(); + + let (msg_tx, mut msg_rx) = tokio_mpsc::channel(4); + let mut senders = HashMap::new(); + let _ = senders.insert("default".into(), SharedSender::mpsc(msg_tx)); + let (ctrl_tx, _ctrl_rx) = runtime_ctrl_msg_channel(4); + let (_metrics_rx, metrics_reporter) = MetricsReporter::create_new_and_receiver(1); + let effect_handler = + EffectHandler::new(test_node("http"), senders, None, ctrl_tx, metrics_reporter); + + let settings = HttpServerSettings { + listening_addr: addr, + max_concurrent_requests: 1, + wait_for_result: false, + ..Default::default() + }; + let shutdown = CancellationToken::new(); + + let metrics_registry_handle = TelemetryRegistryHandle::new(); + let controller_ctx = + otap_df_engine::context::ControllerContext::new(metrics_registry_handle); + let pipeline_ctx = + controller_ctx.pipeline_context_with("grp".into(), "pipeline".into(), 0, 1, 0); + let metrics = Arc::new(Mutex::new( + pipeline_ctx.register_metrics::(), + )); + + let local_semaphore = Arc::new(Semaphore::new(1)); + let held_permit = local_semaphore + .clone() + .acquire_owned() + .await + .expect("permit"); + + let memory_pressure_state = MemoryPressureState::default(); + let admission_state = + SharedReceiverAdmissionState::from_process_state(&memory_pressure_state); + let server = tokio::spawn(serve( + effect_handler.clone(), + settings.clone(), + 
AckRegistry::new(None, None, None), + metrics.clone(), + admission_state.clone(), + Some(local_semaphore), + shutdown.clone(), + )); + + let mut stream = None; + for _ in 0..10 { + match TcpStream::connect(addr).await { + Ok(s) => { + stream = Some(s); + break; + } + Err(_) => tokio::time::sleep(Duration::from_millis(50)).await, + } + } + let stream = stream.expect("Failed to connect to server"); + + let (mut sender, conn) = http1::handshake(TokioIo::new(stream)).await.unwrap(); + drop(tokio::spawn(async move { + let _ = conn.await; + })); + + let mut request_bytes = Vec::new(); + ExportLogsServiceRequest::default() + .encode(&mut request_bytes) + .unwrap(); + + let req = Request::builder() + .method(Method::POST) + .uri("/v1/logs") + .header(HOST, "localhost") + .header(CONTENT_TYPE, PROTOBUF_CONTENT_TYPE) + .body(Full::new(Bytes::from(request_bytes))) + .unwrap(); + + let response = + tokio::spawn(async move { sender.send_request(req).await.unwrap().status() }); + + tokio::time::sleep(Duration::from_millis(100)).await; + admission_state.apply(otap_df_engine::memory_limiter::MemoryPressureChanged { + generation: 1, + level: MemoryPressureLevel::Soft, + retry_after_secs: 1, + usage_bytes: 0, + }); + drop(held_permit); + + let status = tokio::time::timeout(Duration::from_secs(2), response) + .await + .expect("request completed") + .expect("request task succeeded"); + assert_eq!(status, StatusCode::OK); + + { + let metrics = metrics.lock(); + assert_eq!(metrics.rejected_requests.get(), 0); + assert_eq!(metrics.refused_memory_pressure.get(), 0); + assert_eq!(metrics.requests_started.get(), 1); + } + + let _ = msg_rx.recv().await.expect("request forwarded downstream"); + + shutdown.cancel(); + let server_result = tokio::time::timeout(Duration::from_secs(2), server) + .await + .expect("server finished"); + assert!(server_result.unwrap().is_ok()); + } } diff --git a/rust/otap-dataflow/crates/otap/src/otlp_metrics.rs b/rust/otap-dataflow/crates/otap/src/otlp_metrics.rs 
index 8e08dbce3f..1077919a13 100644 --- a/rust/otap-dataflow/crates/otap/src/otlp_metrics.rs +++ b/rust/otap-dataflow/crates/otap/src/otlp_metrics.rs @@ -34,6 +34,10 @@ pub struct OtlpReceiverMetrics { #[metric(unit = "{requests}")] pub rejected_requests: Counter, + /// Number of OTLP RPCs rejected specifically because process-wide memory pressure was active. + #[metric(unit = "{requests}")] + pub refused_memory_pressure: Counter, + /// Number of transport-level errors surfaced by tonic/server. #[metric(unit = "{errors}")] pub transport_errors: Counter, diff --git a/rust/otap-dataflow/crates/otap/tests/core_node_liveness_tests.rs b/rust/otap-dataflow/crates/otap/tests/core_node_liveness_tests.rs index 4e84e2091b..0ce4b565de 100644 --- a/rust/otap-dataflow/crates/otap/tests/core_node_liveness_tests.rs +++ b/rust/otap-dataflow/crates/otap/tests/core_node_liveness_tests.rs @@ -200,12 +200,16 @@ fn run_pipeline_with_condition( let run_result = { let _pipeline_entity_guard = set_pipeline_entity_key(pipeline_ctx.metrics_registry(), pipeline_entity_key); + let (_memory_pressure_tx, memory_pressure_rx) = tokio::sync::watch::channel( + otap_df_engine::memory_limiter::MemoryPressureChanged::initial(), + ); runtime_pipeline.run_forever( pipeline_key, pipeline_ctx, event_reporter, metrics_reporter, Duration::from_secs(1), + memory_pressure_rx, runtime_ctrl_tx, runtime_ctrl_rx, pipeline_completion_tx, diff --git a/rust/otap-dataflow/crates/otap/tests/durable_buffer_processor_tests.rs b/rust/otap-dataflow/crates/otap/tests/durable_buffer_processor_tests.rs index ef85ccc120..488e4c4072 100644 --- a/rust/otap-dataflow/crates/otap/tests/durable_buffer_processor_tests.rs +++ b/rust/otap-dataflow/crates/otap/tests/durable_buffer_processor_tests.rs @@ -583,12 +583,16 @@ where let run_result = { let _pipeline_entity_guard = set_pipeline_entity_key(pipeline_ctx.metrics_registry(), pipeline_entity_key); + let (_memory_pressure_tx, memory_pressure_rx) = tokio::sync::watch::channel( + 
otap_df_engine::memory_limiter::MemoryPressureChanged::initial(), + ); runtime_pipeline.run_forever( pipeline_key, pipeline_ctx, event_reporter, metrics_reporter, Duration::from_secs(1), + memory_pressure_rx, runtime_ctrl_tx, runtime_ctrl_rx, pipeline_completion_tx, @@ -877,12 +881,16 @@ where let run_result = { let _pipeline_entity_guard = set_pipeline_entity_key(pipeline_ctx.metrics_registry(), pipeline_entity_key); + let (_memory_pressure_tx, memory_pressure_rx) = tokio::sync::watch::channel( + otap_df_engine::memory_limiter::MemoryPressureChanged::initial(), + ); runtime_pipeline.run_forever( pipeline_key, pipeline_ctx, event_reporter, metrics_reporter, Duration::from_secs(1), + memory_pressure_rx, runtime_ctrl_tx, runtime_ctrl_rx, pipeline_completion_tx, diff --git a/rust/otap-dataflow/crates/otap/tests/pipeline_tests.rs b/rust/otap-dataflow/crates/otap/tests/pipeline_tests.rs index a50551a7da..dc3d111cb2 100644 --- a/rust/otap-dataflow/crates/otap/tests/pipeline_tests.rs +++ b/rust/otap-dataflow/crates/otap/tests/pipeline_tests.rs @@ -105,12 +105,16 @@ fn test_telemetry_registries_cleanup() { let run_result = { let _pipeline_entity_guard = set_pipeline_entity_key(pipeline_ctx.metrics_registry(), pipeline_entity_key); + let (_memory_pressure_tx, memory_pressure_rx) = tokio::sync::watch::channel( + otap_df_engine::memory_limiter::MemoryPressureChanged::initial(), + ); runtime_pipeline.run_forever( pipeline_key, pipeline_ctx, event_reporter, metrics_reporter, Duration::from_secs(1), + memory_pressure_rx, runtime_ctrl_tx, runtime_ctrl_rx, pipeline_completion_tx, diff --git a/rust/otap-dataflow/docs/configuration-model.md b/rust/otap-dataflow/docs/configuration-model.md index 62c55cfee1..369dcce0f5 100644 --- a/rust/otap-dataflow/docs/configuration-model.md +++ b/rust/otap-dataflow/docs/configuration-model.md @@ -229,6 +229,11 @@ policies: resources: core_allocation: type: all_cores + memory_limiter: + mode: observe_only + source: auto + soft_limit: 7 GiB + 
hard_limit: 8 GiB transport_headers: header_capture: headers: @@ -260,7 +265,18 @@ Defaults at top-level: - `telemetry.tokio_metrics = true` - `telemetry.runtime_metrics = basic` - `resources.core_allocation = all_cores` -- `transport_headers` = not set (opt-in; no headers captured or propagated) +- `transport_headers = not set` (opt-in; no headers captured or propagated) + +Memory limiter configuration: + +- `policies.resources.memory_limiter` is optional and process-wide. +- If configured, `mode` must be explicitly set to `enforce` or `observe_only`. +- The policy is supported only at top-level `policies.resources`. + Group, pipeline, and observability-pipeline resource placements are rejected. +- In Phase 1, `Soft` remains informational; `Hard` is the ingress-shedding + threshold. +- Detailed runtime behavior and rollout guidance are documented in + [memory-limiter-phase1.md](memory-limiter-phase1.md). Control channel keys: diff --git a/rust/otap-dataflow/docs/memory-limiter-phase1.md b/rust/otap-dataflow/docs/memory-limiter-phase1.md new file mode 100644 index 0000000000..28b9630724 --- /dev/null +++ b/rust/otap-dataflow/docs/memory-limiter-phase1.md @@ -0,0 +1,427 @@ +# Memory Limiter - Phase 1 + +This document describes the **Phase 1 implementation** of the process-wide memory +limiter. It covers current behavior only. The longer-term hierarchical +lease-and-ticket design is planned for a separate document. + +## Problem + +The collector already has bounded channels, topic publish limits, and +receiver-side backpressure. Those controls are local: each one protects a +single queue or subsystem, but nothing enforces a shared RAM ceiling across: + +- concurrent receiver ingress +- decoded and buffered request bodies +- multiple queues and topics simultaneously +- allocator overhead and fragmentation +- retained protocol state + +Phase 1 adds a **process-wide guardrail** against sustained memory pressure. 
+ +## Scope + +Phase 1 is a **process-wide observed-memory limiter** implemented as an engine +service. It samples actual process memory on a fixed interval and gates +receiver ingress based on the result. + +Implementation note: + +- Sampling and pressure classification remain process-wide in the controller. +- On pressure transitions, the controller propagates updates to receivers + through the pipeline control plane. +- These transitions are delivered as receiver control messages + (`NodeControlMsg::MemoryPressureChanged`). +- Each receiver maintains receiver-local admission state and consults that + local state on ingress hot paths. + +**What it does:** + +- Sample process memory on a configurable interval +- Classify pressure as `Normal`, `Soft`, or `Hard` +- Keep `Soft` informational - requests continue flowing +- Shed ingress at the receiver boundary only under `Hard` (in `enforce` mode) +- Optionally fail the readiness probe under `Hard` (in `enforce` mode) +- Optionally run in `observe_only` mode for metrics and logs without + enforcement +- Expose process-level memory and pressure metrics + +**What it does not do:** + +- Per-pipeline memory budgets +- Ticketed byte accounting +- Per-core local leases +- Queue or topic byte charging +- Reclaim hooks for stateful components +- OTAP stream recycling + +## Why This Complements Bounded Channels + +Bounded channels and topic policies are not replaced by the memory limiter. +They serve different purposes: + +- **Bounded channels / topics** control local backlog growth within one queue +- **Memory limiter** controls total process memory at the outer ingress boundary + +When an internal queue fills, cooperative producers block or drop according to +that queue's policy. When the **process as a whole** approaches its memory +limit, a separate ingress policy is needed at the boundary - one that does not +depend on knowing which internal queue is the cause. 
+ +A key difference in timing: bounded channels react after a message has already +been accepted, decoded, and buffered. The memory limiter acts earlier - at the +receiver ingress boundary, before expensive body accumulation or downstream +admission. This means the limiter can shed load without the full cost of +accepting the work first. + +## Configuration + +The memory limiter is configured under `policies.resources.memory_limiter` in +the engine config. This field is supported only at the top-level `policies` +scope; group and pipeline overrides are rejected during validation. + +```yaml +policies: + resources: + memory_limiter: + mode: enforce # or observe_only for metrics/logs without rejection + source: auto # prefer auto/cgroup on Linux containers + check_interval: 1s # minimum 100ms + soft_limit: 7 GiB # if set explicitly, keep headroom above idle + hard_limit: 8 GiB # shedding threshold; not a strict cap + hysteresis: 512 MiB # bytes below soft_limit required to leave Soft + retry_after_secs: 5 # used only in enforce mode + fail_readiness_on_hard: true # used only in enforce mode + purge_on_hard: false # optional jemalloc purge hook, disabled by default + purge_min_interval: 5s +``` + +### Limit selection + +Limits are resolved in this order: + +1. Explicit `soft_limit` and `hard_limit` (both required if either is set) +2. `source: auto` with cgroup-derived limits when a cgroup memory controller + is detected + +When `source: auto` is used and no explicit limits are configured, the limiter +reads the cgroup hard cap (`memory.max` on cgroup v2, `memory.limit_in_bytes` +on v1) and derives both thresholds from it: + +- `soft_limit` = 90% of the cgroup limit +- `hard_limit` = 95% of the cgroup limit + +This leaves a 5% buffer between the limiter's shedding threshold and the +kernel's actual OOM kill boundary. 
+ +When no cgroup memory controller is detected (for example on macOS, Windows, +or a bare-metal Linux process without a memory cgroup), `auto` does **not** +fall back to deriving limits from total physical RAM. Explicit `soft_limit` +and `hard_limit` must be provided, or startup fails with a configuration +error. This is intentional: silently deriving limits from total host RAM +could produce dangerously high thresholds on large machines. + +### Sizing guidance + +- Do not set `soft_limit` and `hard_limit` only a few MiB above observed idle + RSS. Small run-to-run variance can be enough to flip the limiter from "never + reaches Hard" to "enters Hard and stays there". +- Treat `hard_limit` as an ingress-shedding threshold, not as a promise that + process memory will stay below that value. The limiter is periodic and + reactive, so bursty workloads can overshoot it before shedding takes effect. +- For RSS-based configurations, size limits with explicit headroom above + sustained steady-state memory, not just above an idle snapshot. As a rule of + thumb, start with at least 15-20% headroom above observed steady-state + memory, then adjust using production measurements. +- For Linux containerized deployments, prefer `source: auto` so the limiter can + derive cgroup-based limits instead of relying on RSS alone. +- Auto-derived cgroup limits use 90%/95% of the cgroup cap. If you want the + limiter to begin shedding well before the container approaches its memory + limit, configure explicit `soft_limit` and `hard_limit` values relative to + expected peak working-set usage. +- Recovery from `Hard` requires usage to fall below `soft_limit`, not merely + below `hard_limit`. In practice, `soft_limit - steady_state_usage` is the + real recovery headroom. If `soft_limit` is set below the process's + irreducible working set, `Hard` becomes a permanent state. + +### Mode + +- `mode` is required when `memory_limiter` is configured. 
This is an explicit + operator choice, not an implicit default. +- `mode: enforce` sheds ingress under `Hard` pressure and can fail readiness. +- `mode: observe_only` keeps classification, logs, and metrics enabled but + suppresses ingress shedding, readiness failure, and forced jemalloc purge. +- For a first rollout, prefer `mode: observe_only` so you can validate sampled + usage, pressure transitions, dashboards, and readiness behavior before + enabling enforcement. + +Mode is set at startup and cannot be changed while the process is running. + +### Purge + +- `purge_on_hard` is an optional jemalloc-only mitigation for RSS-based + retention. When enabled, a tick whose pre-purge sample classifies as `Hard` + attempts a forced jemalloc purge, then re-samples memory before classifying + the next state. +- `purge_min_interval` controls the minimum time between purge attempts. Both + successful and failed attempts count toward the rate limit, which prevents + the limiter from spamming a broken purge call on every tick. +- Purge is best-effort. If the purge call or the post-purge re-sample fails, + the limiter logs a warning (`process_memory_limiter.purge_failed`) and + commits the pre-purge `Hard` classification. A purge failure never prevents + the limiter from updating shared state. +- `purge_on_hard` is ignored in `observe_only` mode. If `purge_on_hard` is + enabled but jemalloc purge support is not available in the build, the + limiter logs a startup warning (`process_memory_limiter.purge_unavailable`) + and continues without purge. +- Keep `purge_on_hard` disabled unless you have validated it on your workload. + It is intended as an escape hatch for allocator-retained resident pages, not + as the default recovery mechanism. 
+ +### Memory source + + +| Source | Description | +| --- | --- | +| `auto` | Cgroup working set if available, otherwise RSS, otherwise jemalloc resident | +| `cgroup` | Cgroup working set (v1 and v2 supported); fails if no cgroup controller detected | +| `rss` | Process RSS via the `memory_stats` crate | +| `jemalloc_resident` | jemalloc resident bytes; requires `jemalloc` feature | + + +Cgroup sampling subtracts `inactive_file` pages from the raw usage counter, +consistent with how container orchestrators report memory usage. + +### Source selection guidance + +- Prefer `source: auto` for Linux containerized deployments. In Kubernetes and + other cgroup-managed environments, this usually resolves to `cgroup`. +- `cgroup` is not Kubernetes-specific. It is also meaningful for Linux services + that run inside a memory-constrained systemd slice or another configured + cgroup. +- For a plain Linux process started from a shell without a meaningful cgroup + memory limit, explicit `soft_limit` and `hard_limit` are typically required. +- `jemalloc_resident` is a resident-memory signal, not a live-allocation + signal. It is not a recovery-oriented alternative to `rss`. + +## Platform Support + +The Phase 1 limiter architecture is portable, but the current implementation is +strongest on Linux. + +- Linux has the best support today because `auto` can use cgroup working-set + sampling and cgroup-derived limits. +- On non-Linux platforms, the limiter can still use RSS-based sampling, and it + can use explicit configured limits. +- `auto` does **not** fall back to total physical RAM on macOS or Windows. + If no cgroup limit is available and no explicit limits are set, startup + fails. + +Internally, the implementation keeps the platform-specific logic at the memory +probe boundary. The pressure-state logic, receiver shedding behavior, and +controller integration remain platform-independent. 
+ +### Recovery caveat for `rss` + +When the collector is built with jemalloc and configured with `source: rss`, +recovery after burst load can be slow. Freed allocations may remain resident in +jemalloc arenas for reuse, so process RSS can stay above `hard_limit` even +after live workload drops. In that state the limiter may continue to reject new +ingress until resident pages are released or the process restarts. + +For containerized Linux deployments, prefer `source: auto` or `source: cgroup` +when available. Resident-memory sources are conservative protection signals, +but they are not ideal recovery signals after allocator-heavy bursts. + +### Container deployment notes + +- In containers, `source: auto` should resolve to `cgroup` and generally + provides better recovery behavior than `rss`. +- If you rely on `/readyz` from Kubernetes or another orchestrator, bind the + admin server to a pod-reachable address such as `--http-admin-bind + 0.0.0.0:8080`. The default loopback bind is not sufficient for external + readiness probes. +- Set `--num-cores` in line with the container CPU limit. Otherwise the engine + may start more worker threads than the container is intended to run, which + increases idle memory overhead. + +## Pressure Semantics + +The limiter maintains a three-level pressure state: + +| Level | Meaning | Receiver behavior | +| --- | --- | --- | +| `Normal` | Below `soft_limit` | No action | +| `Soft` | Above `soft_limit` | Informational only; requests continue flowing | +| `Hard` | Above `hard_limit` | Ingress shedding enabled (`enforce` mode only) | + +When `mode: observe_only` is configured, the same state transitions still +occur, but `Hard` remains advisory: receivers continue accepting requests and +the readiness endpoint stays healthy. 
+ +```mermaid +stateDiagram-v2 + [*] --> Normal + + Normal --> Soft : usage >= soft_limit + Normal --> Hard : usage >= hard_limit + + Soft --> Hard : usage >= hard_limit + Soft --> Normal : usage < soft_limit - hysteresis + + Hard --> Soft : usage < soft_limit +``` + +### Transitions + +- **Escalation** (Normal -> Soft, Soft -> Hard, Normal -> Hard) is immediate + when the threshold is crossed. +- **Recovery from Soft** requires usage to drop below + `soft_limit - hysteresis` before returning to `Normal`. This prevents + oscillation when usage hovers near the soft threshold. +- **Recovery from Hard** requires usage to drop below `soft_limit` + before returning to `Soft`. +- Phase 1 does **not** implement cooldown timers. Those are planned for a + later phase. + +Because sampling is periodic, the limiter can move directly from `Normal` to +`Hard` under fast bursts without spending a full interval in `Soft`. + +Operationally, this means `soft_limit` is the reopening threshold after +shedding begins. `hard_limit` starts rejection, but recovery does not begin +until usage has fallen below `soft_limit`. + +## Receiver Behavior Under Hard Pressure + +Phase 1 applies protocol-native overload signals at each receiver. The +following behaviors apply in `enforce` mode only. In `observe_only` mode, +receivers continue accepting requests regardless of pressure level. 
+ + +| Receiver | Hard-pressure behavior | +| --- | --- | +| OTLP HTTP | `503 Service Unavailable` with `Retry-After: ` header | +| OTLP gRPC | `RESOURCE_EXHAUSTED` with `grpc-retry-pushback-ms: ` metadata | +| OTAP gRPC stream open / next-read boundary | `RESOURCE_EXHAUSTED` + `grpc-retry-pushback-ms` before stream admission, and for already-open streams at the next read boundary | +| OTAP gRPC per-batch | `ResourceExhausted` in the OTAP Arrow batch status (ArrowStatus code 8) | +| Syslog / CEF TCP | Accept then immediately drop new connections; close active connections mid-stream | +| Syslog / CEF UDP | Drop incoming datagrams | + + +**Soft pressure:** all receivers continue operating normally - no requests are +rejected and no receiver-level rejection counters increment. The engine-level +`memory_pressure_state` metric reflects `1` (Soft) and +`process_memory_usage_bytes` reflects the elevated usage. A +`process_memory_limiter.transition` log event is emitted at `info` level on +entry to `Soft`. The behaviors in the table above apply only at `Hard` in +`enforce` mode. + +**Syslog / CEF client behavior under Hard pressure:** + +- **TCP:** The receiver accepts new connections and then immediately drops the + socket, closing active connections mid-stream. The connection is closed at the + transport layer with no application-level retry hint - unlike OTLP/OTAP, + no `Retry-After` or pushback value is sent. Most syslog clients (rsyslog, + syslog-ng, Fluent Bit) have their own reconnect backoff, but they have no + signal about why the connection was closed or how long to wait before + reconnecting. +- **UDP:** Datagrams are silently dropped at the receiver. UDP is fire-and-forget, + so the sender receives no feedback at all. Events are permanently lost with no + indication to the sending client. Operators relying on UDP syslog should treat + Hard pressure as a potential data-loss event and monitor + `received_logs_rejected_memory_pressure` to detect it. 
+ +**Design rationale:** explicit rejection is preferred over transport-level +stalling. For TCP, holding large numbers of stalled open connections under +pressure can consume more resources than the data they carry. Explicit close or +refusal is observable, bounded, and gives senders a clear signal to back off. + +**Known gap:** OTAP stream reads are checked for memory pressure at the next +read boundary. If pressure flips to `Hard` while a stream task is already +blocked in `message().await`, one additional batch may still be read before the +stream is rejected. + +## Readiness Integration + +When `fail_readiness_on_hard` is enabled (default: `true`), the `/readyz` +endpoint returns `503 Service Unavailable` while the limiter is in `Hard` +pressure in `enforce` mode. In `observe_only`, readiness remains healthy even +if pressure reaches `Hard`. The `/livez` endpoint is unaffected. + +## Metrics + +### Engine-level (emitted by the engine metrics monitor) + +All engine metrics are registered under the `engine.metrics` metric-set. 
+ + +| Metric | Description | +| --- | --- | +| `memory_rss` | Current process RSS in bytes | +| `process_memory_usage_bytes` | Most recent memory limiter sample in bytes | +| `process_memory_soft_limit_bytes` | Effective soft limit in bytes | +| `process_memory_hard_limit_bytes` | Effective hard limit in bytes | +| `memory_pressure_state` | Current pressure level (0=Normal, 1=Soft, 2=Hard) | +| `cpu_utilization` | Process CPU utilization as a ratio in [0, 1], normalized across all system cores | + + +### Receiver-level + + +| Metric | Receiver | Description | +| --- | --- | --- | +| `otlp.receiver.metrics.refused_memory_pressure` | OTLP (gRPC + HTTP) | Requests rejected due to memory pressure | +| `otlp.receiver.metrics.rejected_requests` | OTLP (gRPC + HTTP) | Total rejected requests (includes memory pressure) | +| `otap.receiver.metrics.refused_memory_pressure` | OTAP gRPC | Requests rejected due to memory pressure | +| `otap.receiver.metrics.rejected_requests` | OTAP gRPC | Total rejected requests (includes memory pressure) | +| `syslog_cef.receiver.metrics.tcp_connections_rejected_memory_pressure` | Syslog / CEF TCP | Connections rejected or closed | +| `syslog_cef.receiver.metrics.received_logs_rejected_memory_pressure` | Syslog / CEF | Log records dropped under pressure | + + +### Structured log events + + +| Event | Level | Description | +| --- | --- | --- | +| `process_memory_limiter.transition` | info/warn | Emitted on every pressure level change. `Hard` transitions log at warn level. | +| `process_memory_limiter.purge` | info | Emitted after a successful forced jemalloc purge. Includes pre/post usage and duration. | +| `process_memory_limiter.purge_failed` | warn | Emitted when a purge attempt or post-purge re-sample fails. | +| `process_memory_limiter.purge_unavailable` | warn | Emitted at startup when `purge_on_hard` is enabled but no allocator purge backend is available in this build. 
| +| `process_memory_limiter.sample_failed` | warn | Emitted when a periodic memory sample fails. | +| `process_memory_limiter.observe_only_ignored_setting` | warn | Emitted at startup when `purge_on_hard: true` is set with `mode: observe_only` (purge is suppressed in that mode). | + + +## Tradeoffs + +Phase 1 is deliberately simpler than the long-term design. + +**Benefits:** + +- Low implementation risk +- Additional process-wide protection against memory pressure +- Enforcement hot paths are receiver-local and NUMA-friendly; the process-wide + sampler is not consulted on ingress +- Clean fit with existing receiver admission controls +- No invasive queue or pdata instrumentation required + +**Limitations:** + +- Reactive: detects pressure after memory is already allocated +- The configured `hard_limit` is a shedding threshold, not a strict cap on + peak process memory +- Process-wide only: cannot isolate one misbehaving pipeline +- No accounting for bytes retained in queues, topics, or processor state +- No reclaim actions by default; optional `purge_on_hard` can force a + jemalloc purge before reclassification + +## Relationship to Later Phases + +Later phases will add: + +- Queue and topic byte accounting +- Per-pipeline memory budgets +- Per-core local leases with bounded overshoot +- `MemoryTicket` ownership on retained work items +- Reclaim hooks for stateful components (batch processor, retry, durable buffer) +- OTAP stream-state accounting and recycling + +Those are out of scope for this document and this branch.