diff --git a/pkg/apis/clickhouse.altinity.com/v1/type_configuration_chop.go b/pkg/apis/clickhouse.altinity.com/v1/type_configuration_chop.go index d1ffbfb6f..2c88de135 100644 --- a/pkg/apis/clickhouse.altinity.com/v1/type_configuration_chop.go +++ b/pkg/apis/clickhouse.altinity.com/v1/type_configuration_chop.go @@ -58,11 +58,15 @@ const ( // OnConfigurationChangeRestart means exit the process so the pod restarts with the new config. OnConfigurationChangeRestart = "restart" - // RecoveryActionNone means do nothing, CHI stays Aborted. + // RecoveryActionNone means do nothing, CHI stays in its current state. RecoveryActionNone = "none" // RecoveryActionRetry means re-enqueue CHI for reconcile (default). RecoveryActionRetry = "retry" + // defaultCompletedOnPodNotReadyThreshold is the minimum time a pod must remain in + // Ready=False before the operator considers the host stuck and re-enqueues a reconcile + defaultCompletedOnPodNotReadyThreshold = 5 * time.Minute + // Default values for ClickHouse user configuration // 1. user/profile // 2. user/quota @@ -578,6 +582,8 @@ type OperatorConfigReconcileRecovery struct { type OperatorConfigReconcileRecoveryFrom struct { // Aborted scope — recovery from Status=Aborted. Aborted OperatorConfigReconcileRecoveryScope `json:"aborted,omitempty" yaml:"aborted,omitempty"` + // Completed scope — recovery from Status=Completed when a child pod regresses to Ready=False + Completed OperatorConfigReconcileRecoveryCompletedScope `json:"completed,omitempty" yaml:"completed,omitempty"` // Future: Failed, Broken, etc. } @@ -592,6 +598,21 @@ type OperatorConfigReconcileRecoveryScope struct { // Future: OnKeeperReady, OnOperatorRestart. } +// OperatorConfigReconcileRecoveryCompletedScope holds the event→action mappings for the +// Completed scope. +type OperatorConfigReconcileRecoveryCompletedScope struct { + // OnPodNotReady controls reaction when a pod belonging to a Completed CHI flips + // Ready=True → Ready=False and stays NotReady for at least OnPodNotReadyThreshold: + // nil / "retry" (default) — re-enqueue CHI for reconcile so shouldForceRestartHost + // can decide whether to restart the host + // "none" — do nothing, host stays Ready=False until external action + OnPodNotReady *types.String `json:"onPodNotReady,omitempty" yaml:"onPodNotReady,omitempty"` + // OnPodNotReadyThreshold is the minimum duration a pod must remain in Ready=False + // before this scope fires. Accepts any time.ParseDuration string (default "5m" + // when unset, empty, or unparseable). + OnPodNotReadyThreshold *types.String `json:"onPodNotReadyThreshold,omitempty" yaml:"onPodNotReadyThreshold,omitempty"` +} + type OperatorConfigReconcileRuntime struct { ReconcileCHIsThreadsNumber int `json:"reconcileCHIsThreadsNumber" yaml:"reconcileCHIsThreadsNumber"` ReconcileShardsThreadsNumber int `json:"reconcileShardsThreadsNumber" yaml:"reconcileShardsThreadsNumber"` @@ -1634,6 +1655,35 @@ func (c *OperatorConfig) ShouldRecoverAbortedOnPodReady() bool { return value == RecoveryActionRetry } +// ShouldRecoverCompletedOnPodNotReady reports whether the operator should re-enqueue a +// CHI reconcile when a pod belonging to a Completed CHI flips to Ready=False and stays +// there for longer than CompletedOnPodNotReadyThreshold. Default is to retry. +// Backed by reconcile.recovery.from.completed.onPodNotReady config key. +func (c *OperatorConfig) ShouldRecoverCompletedOnPodNotReady() bool { + value := strings.ToLower(c.Reconcile.Recovery.From.Completed.OnPodNotReady.String()) + if value == "" { + // Default behavior — retry + return true + } + return value == RecoveryActionRetry +} + +// CompletedOnPodNotReadyThreshold returns the minimum duration a pod must remain in +// Ready=False before the Completed recovery scope fires. Falls back to the package +// default (5m) if the config value is unset, empty, or unparseable. +// Backed by reconcile.recovery.from.completed.onPodNotReadyThreshold config key. +func (c *OperatorConfig) CompletedOnPodNotReadyThreshold() time.Duration { + raw := strings.TrimSpace(c.Reconcile.Recovery.From.Completed.OnPodNotReadyThreshold.String()) + if raw == "" { + return defaultCompletedOnPodNotReadyThreshold + } + d, err := time.ParseDuration(raw) + if err != nil || d <= 0 { + return defaultCompletedOnPodNotReadyThreshold + } + return d +} + // IsNamespaceWatched returns whether specified namespace is in a list of watched // TODO unify with GetInformerNamespace func (c *OperatorConfig) IsNamespaceWatched(namespace string) bool { diff --git a/pkg/apis/clickhouse.altinity.com/v1/type_configuration_chop_recovery_test.go b/pkg/apis/clickhouse.altinity.com/v1/type_configuration_chop_recovery_test.go index 1efc717f2..f5c1b8be2 100644 --- a/pkg/apis/clickhouse.altinity.com/v1/type_configuration_chop_recovery_test.go +++ b/pkg/apis/clickhouse.altinity.com/v1/type_configuration_chop_recovery_test.go @@ -16,6 +16,7 @@ package v1 import ( "testing" + "time" "github.com/stretchr/testify/require" @@ -57,3 +58,66 @@ func TestRecoveryActionConstants(t *testing.T) { require.Equal(t, "none", RecoveryActionNone) require.Equal(t, "retry", RecoveryActionRetry) } + +// TestShouldRecoverCompletedOnPodNotReady verifies the accessor's behavior across the +// full matrix of possible values for reconcile.recovery.from.completed.onPodNotReady. +// Mirrors TestShouldRecoverAbortedOnPodReady so symmetric config keys behave identically. +func TestShouldRecoverCompletedOnPodNotReady(t *testing.T) { + tests := []struct { + name string + onPodNotRdy *types.String + expected bool + }{ + {"nil defaults to retry (close the gap by default)", nil, true}, + {"empty string defaults to retry", types.NewString(""), true}, + {"retry lowercase", types.NewString("retry"), true}, + {"Retry mixed case", types.NewString("Retry"), true}, + {"RETRY upper case", types.NewString("RETRY"), true}, + {"none lowercase — opt-out", types.NewString("none"), false}, + {"None mixed case", types.NewString("None"), false}, + {"NONE upper case", types.NewString("NONE"), false}, + {"unknown value treated as no-retry (fail safe)", types.NewString("bogus"), false}, + {"whitespace-only treated as no-retry", types.NewString(" "), false}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + c := &OperatorConfig{} + c.Reconcile.Recovery.From.Completed.OnPodNotReady = tc.onPodNotRdy + require.Equal(t, tc.expected, c.ShouldRecoverCompletedOnPodNotReady()) + }) + } +} + +// TestCompletedOnPodNotReadyThreshold verifies the threshold parser. Unparseable, +// empty, and non-positive values must fall back to the package default — operators +// who *really* want to disable the safety net should use onPodNotReady=none, not +// pass a malformed duration. +func TestCompletedOnPodNotReadyThreshold(t *testing.T) { + tests := []struct { + name string + raw *types.String + expected time.Duration + }{ + {"nil falls back to default", nil, defaultCompletedOnPodNotReadyThreshold}, + {"empty falls back to default", types.NewString(""), defaultCompletedOnPodNotReadyThreshold}, + {"whitespace falls back to default", types.NewString(" "), defaultCompletedOnPodNotReadyThreshold}, + {"unparseable falls back to default", types.NewString("five minutes"), defaultCompletedOnPodNotReadyThreshold}, + {"zero falls back to default (don't accidentally disable)", + types.NewString("0s"), defaultCompletedOnPodNotReadyThreshold}, + {"negative falls back to default", types.NewString("-30s"), defaultCompletedOnPodNotReadyThreshold}, + {"30 seconds — aggressive", types.NewString("30s"), 30 * time.Second}, + {"5 minutes — the documented default in string form", + types.NewString("5m"), 5 * time.Minute}, + {"1 hour — conservative", types.NewString("1h"), time.Hour}, + {"complex duration: 1h30m", types.NewString("1h30m"), 90 * time.Minute}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + c := &OperatorConfig{} + c.Reconcile.Recovery.From.Completed.OnPodNotReadyThreshold = tc.raw + require.Equal(t, tc.expected, c.CompletedOnPodNotReadyThreshold()) + }) + } +} diff --git a/pkg/controller/chi/worker-boilerplate.go b/pkg/controller/chi/worker-boilerplate.go index 97b157602..d8b28fde1 100644 --- a/pkg/controller/chi/worker-boilerplate.go +++ b/pkg/controller/chi/worker-boilerplate.go @@ -151,6 +151,8 @@ func (w *worker) processReconcilePod(ctx context.Context, cmd *cmd_queue.Reconci // and re-enqueue the CHI for reconcile. Controlled by config option // reconcile.recovery.from.aborted.onPodReady (default: retry). w.recoverAbortedReconcileOnPodReady(ctx, cmd.Old, cmd.New) + // Symmetric path for Ready→NotReady on Completed CHIs. + w.recoverCompletedReconcileOnPodNotReady(ctx, cmd.Old, cmd.New) return nil case cmd_queue.ReconcileDelete: w.a.V(1).M(cmd.Old).F().Info("Delete Pod. %s/%s", cmd.Old.Namespace, cmd.Old.Name) diff --git a/pkg/controller/chi/worker-pod-retry.go b/pkg/controller/chi/worker-pod-retry.go index 86b901b46..3cb4d2980 100644 --- a/pkg/controller/chi/worker-pod-retry.go +++ b/pkg/controller/chi/worker-pod-retry.go @@ -17,6 +17,7 @@ package chi import ( "context" "strings" + "time" core "k8s.io/api/core/v1" @@ -28,6 +29,14 @@ import ( "github.com/altinity/clickhouse-operator/pkg/model/k8s" ) +// stuckHostMinDelay floors the deferred-re-enqueue delay so a 1-second flap +// can't produce an immediate reconcile. +const stuckHostMinDelay = 5 * time.Second + +// stuckHostExtraDelay buffers threshold against apiserver/informer latency +// so the eventual reconcile observes an up-to-date LastTransitionTime. +const stuckHostExtraDelay = 2 * time.Second + // normalizeTimeAbortReasons enumerates Aborted reasons that cannot recover via // pod transitions — the spec itself must be edited. Auto-recovery skips these // to avoid metrics churn on pod-Ready flips that would just re-trigger the same @@ -132,3 +141,104 @@ func isPodNotReadyToReadyTransition(oldPod, newPod *core.Pod) bool { isReadyNow := !k8s.PodHasNotReadyContainers(newPod) return wasNotReady && isReadyNow } + +// recoverCompletedReconcileOnPodNotReady is the symmetric counterpart of +// recoverAbortedReconcileOnPodReady. It inspects a pod update event and schedules a +// delayed CHI reconcile when a child pod of a Completed CHI transitions Ready → NotReady. +// The delay equals the configured threshold (default 5m), so by the time the reconcile +// fires, shouldForceRestartHost can observe a sustained Ready=False and decide whether +// to restart the host. +func (w *worker) recoverCompletedReconcileOnPodNotReady(ctx context.Context, oldPod, newPod *core.Pod) { + if !chop.Config().ShouldRecoverCompletedOnPodNotReady() { + return + } + + if !isPodReadyToNotReadyTransition(oldPod, newPod) { + return + } + + // Skip pods that are terminating — the Ready→NotReady flip is normal + // shutdown bookkeeping, not a host regression. + if newPod.GetDeletionTimestamp() != nil && !newPod.GetDeletionTimestamp().IsZero() { + return + } + + // Skip pods already in a kubelet-driven failure mode (ImagePullBackOff, + // CrashLoopBackOff, Pending, etc.) — kubelet is handling those and an + // operator-driven StatefulSet rollout would just race it. + if podIsInKubeletFailureMode(newPod) { + return + } + + cr, err := w.c.GetCR(&newPod.ObjectMeta) + if err != nil || cr == nil { + return + } + + if !shouldTriggerStuckHostRecovery(cr) { + return + } + + threshold := chop.Config().CompletedOnPodNotReadyThreshold() + delay := stuckHostScheduleDelay(newPod, threshold, time.Now()) + + w.a.V(1).M(cr).F(). + WithEvent(cr, a.EventActionReconcile, a.EventReasonStuckHostRecoveryTriggered). + Info( + "Stuck-host recovery scheduled: pod %s became NotReady while CHI %s/%s is Completed; "+ + "re-enqueue in %s (threshold %s)", + newPod.Name, cr.Namespace, cr.Name, delay.Truncate(time.Second), threshold, + ) + + scheduled := cr + time.AfterFunc(delay, func() { + w.c.enqueueObject(cmd_queue.NewReconcileCHI(cmd_queue.ReconcileAdd, nil, scheduled)) + }) +} + +// shouldTriggerStuckHostRecovery reports whether the given CHI is a valid stuck-host +// recovery target: status is Completed and the CHI is not being deleted. +func shouldTriggerStuckHostRecovery(cr *api.ClickHouseInstallation) bool { + if cr == nil { + return false + } + status := cr.EnsureStatus() + if status.GetStatus() != api.StatusCompleted { + return false + } + if !cr.GetDeletionTimestamp().IsZero() { + return false + } + return true +} + +// isPodReadyToNotReadyTransition reports whether the pod transitioned from "all containers +// ready" to "some container not ready". The dual of isPodNotReadyToReadyTransition. +func isPodReadyToNotReadyTransition(oldPod, newPod *core.Pod) bool { + if oldPod == nil || newPod == nil { + return false + } + wasReady := !k8s.PodHasNotReadyContainers(oldPod) + isNotReadyNow := k8s.PodHasNotReadyContainers(newPod) + return wasReady && isNotReadyNow +} + +// stuckHostScheduleDelay computes how long to wait before firing the stuck-host +// re-enqueue. It returns max(threshold − elapsed + extra, minDelay), clamped to +// non-negative. +func stuckHostScheduleDelay(newPod *core.Pod, threshold time.Duration, now time.Time) time.Duration { + elapsed := time.Duration(0) + if newPod != nil { + for _, cond := range newPod.Status.Conditions { + if cond.Type == core.PodReady && !cond.LastTransitionTime.IsZero() { + elapsed = now.Sub(cond.LastTransitionTime.Time) + break + } + } + } + delay := threshold - elapsed + stuckHostExtraDelay + if delay < stuckHostMinDelay { + delay = stuckHostMinDelay + } + return delay +} diff --git a/pkg/controller/chi/worker-pod-retry_test.go b/pkg/controller/chi/worker-pod-retry_test.go index 39893bb18..829375919 100644 --- a/pkg/controller/chi/worker-pod-retry_test.go +++ b/pkg/controller/chi/worker-pod-retry_test.go @@ -145,3 +145,186 @@ func TestShouldTriggerAutoRecovery(t *testing.T) { }) } } + +// TestIsPodReadyToNotReadyTransition verifies the dual of isPodNotReadyToReadyTransition: +// fires only on Ready→NotReady, mirrors the same nil/edge-case handling. +func TestIsPodReadyToNotReadyTransition(t *testing.T) { + tests := []struct { + name string + old, new *core.Pod + expected bool + }{ + {"nil old", nil, pod(false), false}, + {"nil new", pod(true), nil, false}, + {"both nil", nil, nil, false}, + {"ready → not ready (the target case)", pod(true), pod(false), true}, + {"ready → ready (no transition)", pod(true), pod(true), false}, + {"not ready → ready (wrong direction, handled by sibling)", pod(false), pod(true), false}, + {"not ready → not ready", pod(false), pod(false), false}, + {"multi-container: all ready → one not ready", multiContainerPod(true, true), multiContainerPod(false, true), true}, + {"multi-container: one not ready → all ready", multiContainerPod(true, false), multiContainerPod(true, true), false}, + {"multi-container: all ready → all ready", multiContainerPod(true, true), multiContainerPod(true, true), false}, + {"empty statuses → not ready (fires; empty counts as ready)", + &core.Pod{}, pod(false), true}, + {"12-container pod: last flips to not ready", + multiContainerPod(true, true, true, true, true, true, true, true, true, true, true, true), + multiContainerPod(true, true, true, true, true, true, true, true, true, true, true, false), + true}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := isPodReadyToNotReadyTransition(tc.old, tc.new) + require.Equal(t, tc.expected, got) + }) + } +} + +// TestShouldTriggerStuckHostRecovery verifies the CR-state gate used by +// recoverCompletedReconcileOnPodNotReady. +func TestShouldTriggerStuckHostRecovery(t *testing.T) { + makeCR := func(status string, deleting bool) *api.ClickHouseInstallation { + cr := &api.ClickHouseInstallation{ + ObjectMeta: meta.ObjectMeta{Name: "chi", Namespace: "ns"}, + } + cr.EnsureStatus().Status = status + if deleting { + now := meta.NewTime(time.Now()) + cr.ObjectMeta.DeletionTimestamp = &now + } + return cr + } + + tests := []struct { + name string + cr *api.ClickHouseInstallation + expected bool + }{ + {"nil CR — reject", nil, false}, + // The target case: Completed CHI whose host has just regressed. + {"Completed, not deleting — accept (the target case)", makeCR(api.StatusCompleted, false), true}, + // Aborted is the sibling path's responsibility; firing stuck-host recovery on it + // would double-enqueue with recoverAbortedReconcileOnPodReady once the pod + // eventually becomes Ready again. + {"Aborted — reject (handled by sibling recoverAbortedReconcileOnPodReady path)", + makeCR(api.StatusAborted, false), false}, + // InProgress means a reconcile is already in flight; let it observe the pod state + // on its own rather than racing another enqueue. + {"InProgress — reject (reconcile already running)", makeCR(api.StatusInProgress, false), false}, + {"Terminating — reject", makeCR(api.StatusTerminating, false), false}, + {"Completed but being deleted — reject", makeCR(api.StatusCompleted, true), false}, + // Fresh CR with no status field set yet — happens between Create and the first + // status update by the operator. + {"empty status (fresh CR) — reject", makeCR("", false), false}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + require.Equal(t, tc.expected, shouldTriggerStuckHostRecovery(tc.cr)) + }) + } +} + +// TestStuckHostScheduleDelay verifies the delay computation for the deferred re-enqueue. +// The helper is pure (clock + threshold injected as args), so we can exercise the +// boundary conditions without time-mocking the rest of the controller. +func TestStuckHostScheduleDelay(t *testing.T) { + now := time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) + + // podWithReadyTransition builds a pod whose PodReady condition transitioned at the + // given offset from "now". Negative offset means the transition is in the past. + podWithReadyTransition := func(offset time.Duration) *core.Pod { + return &core.Pod{ + Status: core.PodStatus{ + Conditions: []core.PodCondition{ + {Type: core.PodReady, Status: core.ConditionFalse, + LastTransitionTime: meta.NewTime(now.Add(offset))}, + }, + }, + } + } + + tests := []struct { + name string + pod *core.Pod + threshold time.Duration + // expected delay must satisfy lo <= got <= hi (small tolerance for arithmetic). + expectMin time.Duration + expectMax time.Duration + }{ + { + // Fresh transition: full threshold + small extra padding for apiserver + // catch-up. With threshold=5m and elapsed=0, delay should be ~5m02s. + name: "fresh transition: schedule full threshold + extra", + pod: podWithReadyTransition(0), + threshold: 5 * time.Minute, + expectMin: 5*time.Minute + stuckHostExtraDelay - time.Second, + expectMax: 5*time.Minute + stuckHostExtraDelay + time.Second, + }, + { + // Already half-elapsed: remaining ~2.5m + extra. + name: "half-elapsed: schedule the remainder", + pod: podWithReadyTransition(-150 * time.Second), + threshold: 5 * time.Minute, + expectMin: 150*time.Second + stuckHostExtraDelay - time.Second, + expectMax: 150*time.Second + stuckHostExtraDelay + time.Second, + }, + { + // Threshold already past at schedule time (e.g. operator restart after + // long outage): clamp to stuckHostMinDelay rather than firing instantly, + // so a single quick flap doesn't produce an immediate restart. + name: "threshold already past: clamp to minDelay", + pod: podWithReadyTransition(-10 * time.Minute), + threshold: 5 * time.Minute, + expectMin: stuckHostMinDelay, + expectMax: stuckHostMinDelay, + }, + { + // Nil pod: no LastTransitionTime info → treat as elapsed=0 → full threshold. + name: "nil pod: full threshold", + pod: nil, + threshold: 5 * time.Minute, + expectMin: 5*time.Minute + stuckHostExtraDelay, + expectMax: 5*time.Minute + stuckHostExtraDelay, + }, + { + // Pod has no PodReady condition (very early in lifecycle): elapsed=0. + name: "pod missing PodReady condition: full threshold", + pod: &core.Pod{Status: core.PodStatus{Conditions: []core.PodCondition{}}}, + threshold: 5 * time.Minute, + expectMin: 5*time.Minute + stuckHostExtraDelay, + expectMax: 5*time.Minute + stuckHostExtraDelay, + }, + { + // Zero LastTransitionTime (apiserver hasn't stamped it yet): treat as + // elapsed=0, schedule full threshold. + name: "zero LastTransitionTime: full threshold", + pod: &core.Pod{ + Status: core.PodStatus{ + Conditions: []core.PodCondition{ + {Type: core.PodReady, Status: core.ConditionFalse}, + }, + }, + }, + threshold: 5 * time.Minute, + expectMin: 5*time.Minute + stuckHostExtraDelay, + expectMax: 5*time.Minute + stuckHostExtraDelay, + }, + { + // Threshold smaller than minDelay: minDelay still floors the result. + name: "tiny threshold: clamp to minDelay", + pod: podWithReadyTransition(0), + threshold: 1 * time.Second, + expectMin: stuckHostMinDelay, + expectMax: stuckHostMinDelay, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := stuckHostScheduleDelay(tc.pod, tc.threshold, now) + require.GreaterOrEqual(t, got, tc.expectMin, "delay below expected minimum") + require.LessOrEqual(t, got, tc.expectMax, "delay above expected maximum") + }) + } +} diff --git a/pkg/controller/chi/worker-status-helpers.go b/pkg/controller/chi/worker-status-helpers.go index 8da4e493d..f04cb3f2d 100644 --- a/pkg/controller/chi/worker-status-helpers.go +++ b/pkg/controller/chi/worker-status-helpers.go @@ -18,6 +18,8 @@ import ( "context" "time" + core "k8s.io/api/core/v1" + log "github.com/altinity/clickhouse-operator/pkg/announcer" api "github.com/altinity/clickhouse-operator/pkg/apis/clickhouse.altinity.com/v1" "github.com/altinity/clickhouse-operator/pkg/apis/common/types" @@ -52,6 +54,94 @@ func (w *worker) isPodReady(ctx context.Context, host *api.Host) bool { return false } +// isPodSustainedNotReady reports whether the host's pod is currently Ready=False AND +// has been so for at least `threshold`. Returns false for pods whose failure mode is +// already being handled by kubelet (ImagePullBackOff, CrashLoopBackOff, Pending, etc.) +// so the operator does not race kubelet on its own recovery path. +func (w *worker) isPodSustainedNotReady(ctx context.Context, host *api.Host, threshold time.Duration) bool { + if threshold <= 0 { + // Threshold of 0/negative means "feature disabled" + return false + } + pod, err := w.c.kube.Pod().Get(ctx, host) + if err != nil || pod == nil { + return false + } + if podIsInKubeletFailureMode(pod) { + return false + } + return podIsSustainedNotReady(pod, threshold, time.Now()) +} + +// podIsSustainedNotReady is the pure inner predicate of isPodSustainedNotReady, +// extracted so it can be exercised without a kube client. Returns true iff the pod +// has a PodReady condition that is currently not True and whose LastTransitionTime +// is at least `threshold` in the past relative to `now`. +func podIsSustainedNotReady(pod *core.Pod, threshold time.Duration, now time.Time) bool { + if pod == nil || threshold <= 0 { + return false + } + for _, cond := range pod.Status.Conditions { + if cond.Type != core.PodReady { + continue + } + if cond.Status == core.ConditionTrue { + return false + } + // Status is False or Unknown. Treat both as "not ready" + if cond.LastTransitionTime.IsZero() { + return false + } + return now.Sub(cond.LastTransitionTime.Time) >= threshold + } + // No PodReady condition at all. + return false +} + +// kubeletDrivenWaitingReasons is the set of container Waiting.Reason values that +// indicate kubelet is already actively recovering the pod and a parallel +// operator-driven StatefulSet rollout would just race kubelet. +var kubeletDrivenWaitingReasons = map[string]struct{}{ + "CrashLoopBackOff": {}, + "ImagePullBackOff": {}, + "ErrImagePull": {}, + "InvalidImageName": {}, + "CreateContainerError": {}, + "RunContainerError": {}, + "ContainerCannotRun": {}, + "CreateContainerConfigError": {}, +} + +// podIsInKubeletFailureMode reports whether the pod is in a state where kubelet +// (or the kube-scheduler) is already handling the failure: not yet scheduled, +// in Pending phase, or any container in a kubelet-driven waiting reason. +// In those states an operator-driven reconcile would race kubelet without value. +func podIsInKubeletFailureMode(pod *core.Pod) bool { + if pod == nil { + return false + } + if pod.Status.Phase == core.PodPending { + return true + } + for _, cs := range pod.Status.ContainerStatuses { + if cs.State.Waiting == nil { + continue + } + if _, hit := kubeletDrivenWaitingReasons[cs.State.Waiting.Reason]; hit { + return true + } + } + for _, cs := range pod.Status.InitContainerStatuses { + if cs.State.Waiting == nil { + continue + } + if _, hit := kubeletDrivenWaitingReasons[cs.State.Waiting.Reason]; hit { + return true + } + } + return false +} + func (w *worker) isPodStarted(ctx context.Context, host *api.Host) bool { if pod, err := w.c.kube.Pod().Get(ctx, host); err == nil { return k8s.PodHasAllContainersStarted(pod) diff --git a/pkg/controller/chi/worker-status-helpers_test.go b/pkg/controller/chi/worker-status-helpers_test.go new file mode 100644 index 000000000..cb4d7391b --- /dev/null +++ b/pkg/controller/chi/worker-status-helpers_test.go @@ -0,0 +1,201 @@ +// Copyright 2019 Altinity Ltd and/or its affiliates. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package chi + +import ( + "testing" + "time" + + "github.com/stretchr/testify/require" + core "k8s.io/api/core/v1" + meta "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// TestPodIsSustainedNotReady covers the pure post-fetch decision used by +// isPodSustainedNotReady. +func TestPodIsSustainedNotReady(t *testing.T) { + now := time.Date(2026, 5, 28, 12, 0, 0, 0, time.UTC) + + withReady := func(status core.ConditionStatus, transitionOffset time.Duration) *core.Pod { + return &core.Pod{ + Status: core.PodStatus{ + Conditions: []core.PodCondition{ + {Type: core.PodReady, Status: status, + LastTransitionTime: meta.NewTime(now.Add(transitionOffset))}, + }, + }, + } + } + + tests := []struct { + name string + pod *core.Pod + threshold time.Duration + expected bool + }{ + { + name: "nil pod — never sustained", + pod: nil, + threshold: 5 * time.Minute, + expected: false, + }, + { + name: "zero threshold — feature disabled, never fires", + pod: withReady(core.ConditionFalse, -30*time.Minute), + threshold: 0, + expected: false, + }, + { + name: "negative threshold — feature disabled, never fires", + pod: withReady(core.ConditionFalse, -30*time.Minute), + threshold: -1 * time.Second, + expected: false, + }, + { + name: "no PodReady condition — early lifecycle, never sustained", + pod: &core.Pod{Status: core.PodStatus{Conditions: []core.PodCondition{ + {Type: core.PodInitialized, Status: core.ConditionTrue, + LastTransitionTime: meta.NewTime(now.Add(-10 * time.Minute))}, + }}}, + threshold: 5 * time.Minute, + expected: false, + }, + { + name: "PodReady=True — not sustained even with old LastTransitionTime", + pod: withReady(core.ConditionTrue, -30*time.Minute), + threshold: 5 * time.Minute, + expected: false, + }, + { + name: "PodReady=False but only 1m ago — under threshold (transient)", + pod: withReady(core.ConditionFalse, -1*time.Minute), + threshold: 5 * time.Minute, + expected: false, + }, + { + name: "PodReady=False for exactly the threshold — fires (>= semantics)", + pod: withReady(core.ConditionFalse, -5*time.Minute), + threshold: 5 * time.Minute, + expected: true, + }, + { + name: "PodReady=False for 26h — the production incident, fires", + pod: withReady(core.ConditionFalse, -26*time.Hour), + threshold: 5 * time.Minute, + expected: true, + }, + { + name: "PodReady=Unknown for 10m — treated as not-ready, fires", + pod: withReady(core.ConditionUnknown, -10*time.Minute), + threshold: 5 * time.Minute, + expected: true, + }, + { + name: "PodReady=False but LastTransitionTime is zero — conservative, don't fire", + pod: &core.Pod{Status: core.PodStatus{Conditions: []core.PodCondition{{Type: core.PodReady, Status: core.ConditionFalse}}}}, + threshold: 5 * time.Minute, + expected: false, + }, + { + name: "multiple PodReady entries — use first match", + pod: &core.Pod{Status: core.PodStatus{Conditions: []core.PodCondition{ + {Type: core.PodReady, Status: core.ConditionFalse, + LastTransitionTime: meta.NewTime(now.Add(-10 * time.Minute))}, + {Type: core.PodReady, Status: core.ConditionTrue, + LastTransitionTime: meta.NewTime(now)}, + }}}, + threshold: 5 * time.Minute, + expected: true, + }, + { + name: "PodScheduled present alongside PodReady=False — still fires on Ready", + pod: &core.Pod{Status: core.PodStatus{Conditions: []core.PodCondition{ + {Type: core.PodScheduled, Status: core.ConditionTrue, + LastTransitionTime: meta.NewTime(now.Add(-1 * time.Hour))}, + {Type: core.PodReady, Status: core.ConditionFalse, + LastTransitionTime: meta.NewTime(now.Add(-10 * time.Minute))}, + }}}, + threshold: 5 * time.Minute, + expected: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + require.Equal(t, tc.expected, podIsSustainedNotReady(tc.pod, tc.threshold, now)) + }) + } +} + +// TestPodIsInKubeletFailureMode locks in the kubelet-recovery filter: any pod whose +// failure mode is already being handled by kubelet (image pull errors, crash loops, +// pending, etc.) must NOT trigger the stuck-host recovery path. +func TestPodIsInKubeletFailureMode(t *testing.T) { + waitingContainer := func(reason string) *core.Pod { + return &core.Pod{Status: core.PodStatus{ + Phase: core.PodRunning, + ContainerStatuses: []core.ContainerStatus{ + {Name: "clickhouse", State: core.ContainerState{ + Waiting: &core.ContainerStateWaiting{Reason: reason}, + }}, + }, + }} + } + waitingInit := func(reason string) *core.Pod { + return &core.Pod{Status: core.PodStatus{ + Phase: core.PodRunning, + InitContainerStatuses: []core.ContainerStatus{ + {Name: "init", State: core.ContainerState{ + Waiting: &core.ContainerStateWaiting{Reason: reason}, + }}, + }, + }} + } + + tests := []struct { + name string + pod *core.Pod + expected bool + }{ + {"nil pod", nil, false}, + {"no statuses, running phase", &core.Pod{Status: core.PodStatus{Phase: core.PodRunning}}, false}, + {"Pending phase — scheduler/kubelet handling", &core.Pod{Status: core.PodStatus{Phase: core.PodPending}}, true}, + {"ImagePullBackOff — kubelet handling", waitingContainer("ImagePullBackOff"), true}, + {"ErrImagePull — kubelet handling", waitingContainer("ErrImagePull"), true}, + {"InvalidImageName — kubelet handling", waitingContainer("InvalidImageName"), true}, + {"CrashLoopBackOff — kubelet handling", waitingContainer("CrashLoopBackOff"), true}, + {"CreateContainerError — kubelet handling", waitingContainer("CreateContainerError"), true}, + {"RunContainerError — kubelet handling", waitingContainer("RunContainerError"), true}, + {"ContainerCannotRun — kubelet handling", waitingContainer("ContainerCannotRun"), true}, + {"CreateContainerConfigError — kubelet handling", waitingContainer("CreateContainerConfigError"), true}, + {"init container in ImagePullBackOff — kubelet handling", waitingInit("ImagePullBackOff"), true}, + {"ContainerCreating — transient, not kubelet failure", waitingContainer("ContainerCreating"), false}, + {"PodInitializing — transient, not kubelet failure", waitingContainer("PodInitializing"), false}, + {"running container, no waiting state", &core.Pod{Status: core.PodStatus{ + Phase: core.PodRunning, + ContainerStatuses: []core.ContainerStatus{ + {Name: "clickhouse", Ready: true, State: core.ContainerState{ + Running: &core.ContainerStateRunning{}, + }}, + }, + }}, false}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + require.Equal(t, tc.expected, podIsInKubeletFailureMode(tc.pod)) + }) + } +} diff --git a/pkg/controller/chi/worker.go b/pkg/controller/chi/worker.go index caf251699..77e1eed29 100644 --- a/pkg/controller/chi/worker.go +++ b/pkg/controller/chi/worker.go @@ -27,6 +27,7 @@ import ( log "github.com/altinity/clickhouse-operator/pkg/announcer" api "github.com/altinity/clickhouse-operator/pkg/apis/clickhouse.altinity.com/v1" "github.com/altinity/clickhouse-operator/pkg/apis/common/types" + "github.com/altinity/clickhouse-operator/pkg/chop" "github.com/altinity/clickhouse-operator/pkg/controller/chi/metrics" "github.com/altinity/clickhouse-operator/pkg/controller/common" a "github.com/altinity/clickhouse-operator/pkg/controller/common/announcer" @@ -193,6 +194,17 @@ func (w *worker) shouldForceRestartHost(ctx context.Context, host *api.Host) boo w.a.V(1).M(host).F().Info("Host with unknown version and in CrashLoopBackOff should be restarted. It most likely is unable to start due to bad config. Host: %s", host.GetName()) return true + case chop.Config().ShouldRecoverCompletedOnPodNotReady() && + w.isPodSustainedNotReady(ctx, host, chop.Config().CompletedOnPodNotReadyThreshold()): + // Closes the gap where Completed CHIs with a sustained-NotReady host + // were left stuck indefinitely. + threshold := chop.Config().CompletedOnPodNotReadyThreshold() + w.a.V(1).M(host).F(). + WithEvent(host.GetCR(), a.EventActionReconcile, a.EventReasonHostStuckNotReady). + Info("Host pod has been Ready=False past threshold %s — force restart. Host: %s", + threshold, host.GetName()) + return true + default: w.a.V(1).M(host).F().Info("Host force restart is not required. Host: %s", host.GetName()) return false diff --git a/pkg/controller/common/announcer/event-emitter.go b/pkg/controller/common/announcer/event-emitter.go index f7247b6c5..e71547ad1 100644 --- a/pkg/controller/common/announcer/event-emitter.go +++ b/pkg/controller/common/announcer/event-emitter.go @@ -65,6 +65,15 @@ const ( // reconcile was aborted, on observing a recovery signal (e.g. a pod became Ready). EventReasonAutoRecoveryTriggered = "AutoRecoveryTriggered" + // EventReasonStuckHostRecoveryTriggered fires when the operator re-enqueues a + // Completed CHI for reconcile because one of its hosts has been Ready=False for + // longer than the configured threshold. + EventReasonStuckHostRecoveryTriggered = "StuckHostRecoveryTriggered" + + // EventReasonHostStuckNotReady fires when shouldForceRestartHost decides to force + // a host restart because the pod has been Ready=False past the configured threshold. + EventReasonHostStuckNotReady = "HostStuckNotReady" + // EventReasonKeeperUpdateNoEndpointChange fires when the operator observes a referenced // CHK reconcile completing but decides not to trigger a CHI reconcile because the resolved // zookeeper endpoints have not changed.