Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion pkg/apis/clickhouse.altinity.com/v1/type_configuration_chop.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,15 @@ const (
// OnConfigurationChangeRestart means exit the process so the pod restarts with the new config.
OnConfigurationChangeRestart = "restart"

// RecoveryActionNone means do nothing, CHI stays Aborted.
// RecoveryActionNone means do nothing, CHI stays in its current state.
RecoveryActionNone = "none"
// RecoveryActionRetry means re-enqueue CHI for reconcile (default).
RecoveryActionRetry = "retry"

// defaultCompletedOnPodNotReadyThreshold is the minimum time a pod must remain in
// Ready=False before the operator considers the host stuck and re-enqueues a reconcile
defaultCompletedOnPodNotReadyThreshold = 5 * time.Minute

// Default values for ClickHouse user configuration
// 1. user/profile
// 2. user/quota
Expand Down Expand Up @@ -578,6 +582,8 @@ type OperatorConfigReconcileRecovery struct {
type OperatorConfigReconcileRecoveryFrom struct {
// Aborted scope — recovery from Status=Aborted.
Aborted OperatorConfigReconcileRecoveryScope `json:"aborted,omitempty" yaml:"aborted,omitempty"`
// Completed scope — recovery from Status=Completed when a child pod regresses to Ready=False
Completed OperatorConfigReconcileRecoveryCompletedScope `json:"completed,omitempty" yaml:"completed,omitempty"`
// Future: Failed, Broken, etc.
}

Expand All @@ -592,6 +598,21 @@ type OperatorConfigReconcileRecoveryScope struct {
// Future: OnKeeperReady, OnOperatorRestart.
}

// OperatorConfigReconcileRecoveryCompletedScope holds the event→action mappings for the
// Completed scope.
type OperatorConfigReconcileRecoveryCompletedScope struct {
// OnPodNotReady controls reaction when a pod belonging to a Completed CHI flips
// Ready=True → Ready=False and stays NotReady for at least OnPodNotReadyThreshold:
// nil / "retry" (default) — re-enqueue CHI for reconcile so shouldForceRestartHost
// can decide whether to restart the host
// "none" — do nothing, host stays Ready=False until external action
OnPodNotReady *types.String `json:"onPodNotReady,omitempty" yaml:"onPodNotReady,omitempty"`
// OnPodNotReadyThreshold is the minimum duration a pod must remain in Ready=False
// before this scope fires. Accepts any time.ParseDuration string (default "5m"
// when unset, empty, or unparseable).
OnPodNotReadyThreshold *types.String `json:"onPodNotReadyThreshold,omitempty" yaml:"onPodNotReadyThreshold,omitempty"`
}

type OperatorConfigReconcileRuntime struct {
ReconcileCHIsThreadsNumber int `json:"reconcileCHIsThreadsNumber" yaml:"reconcileCHIsThreadsNumber"`
ReconcileShardsThreadsNumber int `json:"reconcileShardsThreadsNumber" yaml:"reconcileShardsThreadsNumber"`
Expand Down Expand Up @@ -1634,6 +1655,35 @@ func (c *OperatorConfig) ShouldRecoverAbortedOnPodReady() bool {
return value == RecoveryActionRetry
}

// ShouldRecoverCompletedOnPodNotReady reports whether the operator should re-enqueue a
// CHI reconcile when a pod belonging to a Completed CHI flips to Ready=False and stays
// there for longer than CompletedOnPodNotReadyThreshold. Default is to retry.
// Backed by reconcile.recovery.from.completed.onPodNotReady config key.
func (c *OperatorConfig) ShouldRecoverCompletedOnPodNotReady() bool {
value := strings.ToLower(c.Reconcile.Recovery.From.Completed.OnPodNotReady.String())
if value == "" {
// Default behavior — retry
return true
}
return value == RecoveryActionRetry
}

// CompletedOnPodNotReadyThreshold returns the minimum duration a pod must remain in
// Ready=False before the Completed recovery scope fires. Falls back to the package
// default (5m) if the config value is unset, empty, or unparseable.
// Backed by reconcile.recovery.from.completed.onPodNotReadyThreshold config key.
func (c *OperatorConfig) CompletedOnPodNotReadyThreshold() time.Duration {
raw := strings.TrimSpace(c.Reconcile.Recovery.From.Completed.OnPodNotReadyThreshold.String())
if raw == "" {
return defaultCompletedOnPodNotReadyThreshold
}
d, err := time.ParseDuration(raw)
if err != nil || d <= 0 {
return defaultCompletedOnPodNotReadyThreshold
}
return d
}

// IsNamespaceWatched returns whether specified namespace is in a list of watched
// TODO unify with GetInformerNamespace
func (c *OperatorConfig) IsNamespaceWatched(namespace string) bool {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package v1

import (
"testing"
"time"

"github.com/stretchr/testify/require"

Expand Down Expand Up @@ -57,3 +58,66 @@ func TestRecoveryActionConstants(t *testing.T) {
require.Equal(t, "none", RecoveryActionNone)
require.Equal(t, "retry", RecoveryActionRetry)
}

// TestShouldRecoverCompletedOnPodNotReady verifies the accessor's behavior across the
// full matrix of possible values for reconcile.recovery.from.completed.onPodNotReady.
// Mirrors TestShouldRecoverAbortedOnPodReady so symmetric config keys behave identically.
func TestShouldRecoverCompletedOnPodNotReady(t *testing.T) {
tests := []struct {
name string
onPodNotRdy *types.String
expected bool
}{
{"nil defaults to retry (close the gap by default)", nil, true},
{"empty string defaults to retry", types.NewString(""), true},
{"retry lowercase", types.NewString("retry"), true},
{"Retry mixed case", types.NewString("Retry"), true},
{"RETRY upper case", types.NewString("RETRY"), true},
{"none lowercase — opt-out", types.NewString("none"), false},
{"None mixed case", types.NewString("None"), false},
{"NONE upper case", types.NewString("NONE"), false},
{"unknown value treated as no-retry (fail safe)", types.NewString("bogus"), false},
{"whitespace-only treated as no-retry", types.NewString(" "), false},
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
c := &OperatorConfig{}
c.Reconcile.Recovery.From.Completed.OnPodNotReady = tc.onPodNotRdy
require.Equal(t, tc.expected, c.ShouldRecoverCompletedOnPodNotReady())
})
}
}

// TestCompletedOnPodNotReadyThreshold verifies the threshold parser. Unparseable,
// empty, and non-positive values must fall back to the package default — operators
// who *really* want to disable the safety net should use onPodNotReady=none, not
// pass a malformed duration.
func TestCompletedOnPodNotReadyThreshold(t *testing.T) {
tests := []struct {
name string
raw *types.String
expected time.Duration
}{
{"nil falls back to default", nil, defaultCompletedOnPodNotReadyThreshold},
{"empty falls back to default", types.NewString(""), defaultCompletedOnPodNotReadyThreshold},
{"whitespace falls back to default", types.NewString(" "), defaultCompletedOnPodNotReadyThreshold},
{"unparseable falls back to default", types.NewString("five minutes"), defaultCompletedOnPodNotReadyThreshold},
{"zero falls back to default (don't accidentally disable)",
types.NewString("0s"), defaultCompletedOnPodNotReadyThreshold},
{"negative falls back to default", types.NewString("-30s"), defaultCompletedOnPodNotReadyThreshold},
{"30 seconds — aggressive", types.NewString("30s"), 30 * time.Second},
{"5 minutes — the documented default in string form",
types.NewString("5m"), 5 * time.Minute},
{"1 hour — conservative", types.NewString("1h"), time.Hour},
{"complex duration: 1h30m", types.NewString("1h30m"), 90 * time.Minute},
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
c := &OperatorConfig{}
c.Reconcile.Recovery.From.Completed.OnPodNotReadyThreshold = tc.raw
require.Equal(t, tc.expected, c.CompletedOnPodNotReadyThreshold())
})
}
}
2 changes: 2 additions & 0 deletions pkg/controller/chi/worker-boilerplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ func (w *worker) processReconcilePod(ctx context.Context, cmd *cmd_queue.Reconci
// and re-enqueue the CHI for reconcile. Controlled by config option
// reconcile.recovery.from.aborted.onPodReady (default: retry).
w.recoverAbortedReconcileOnPodReady(ctx, cmd.Old, cmd.New)
// Symmetric path for Ready→NotReady on Completed CHIs.
w.recoverCompletedReconcileOnPodNotReady(ctx, cmd.Old, cmd.New)
return nil
case cmd_queue.ReconcileDelete:
w.a.V(1).M(cmd.Old).F().Info("Delete Pod. %s/%s", cmd.Old.Namespace, cmd.Old.Name)
Expand Down
110 changes: 110 additions & 0 deletions pkg/controller/chi/worker-pod-retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package chi
import (
"context"
"strings"
"time"

core "k8s.io/api/core/v1"

Expand All @@ -28,6 +29,14 @@ import (
"github.com/altinity/clickhouse-operator/pkg/model/k8s"
)

// stuckHostMinDelay floors the deferred-re-enqueue delay so a 1-second flap
// can't produce an immediate reconcile.
const stuckHostMinDelay = 5 * time.Second

// stuckHostExtraDelay buffers threshold against apiserver/informer latency
// so the eventual reconcile observes an up-to-date LastTransitionTime.
const stuckHostExtraDelay = 2 * time.Second

// normalizeTimeAbortReasons enumerates Aborted reasons that cannot recover via
// pod transitions — the spec itself must be edited. Auto-recovery skips these
// to avoid metrics churn on pod-Ready flips that would just re-trigger the same
Expand Down Expand Up @@ -132,3 +141,104 @@ func isPodNotReadyToReadyTransition(oldPod, newPod *core.Pod) bool {
isReadyNow := !k8s.PodHasNotReadyContainers(newPod)
return wasNotReady && isReadyNow
}

// recoverCompletedReconcileOnPodNotReady is the symmetric counterpart of
// recoverAbortedReconcileOnPodReady. It inspects a pod update event and schedules a
// delayed CHI reconcile when a child pod of a Completed CHI transitions Ready → NotReady.
// The delay equals the configured threshold (default 5m), so by the time the reconcile
// fires, shouldForceRestartHost can observe a sustained Ready=False and decide whether
// to restart the host.
func (w *worker) recoverCompletedReconcileOnPodNotReady(ctx context.Context, oldPod, newPod *core.Pod) {
if !chop.Config().ShouldRecoverCompletedOnPodNotReady() {
return
}

if !isPodReadyToNotReadyTransition(oldPod, newPod) {
return
}

// Skip pods that are terminating — the Ready→NotReady flip is normal
// shutdown bookkeeping, not a host regression.
if newPod.GetDeletionTimestamp() != nil && !newPod.GetDeletionTimestamp().IsZero() {
return
}

// Skip pods already in a kubelet-driven failure mode (ImagePullBackOff,
// CrashLoopBackOff, Pending, etc.) — kubelet is handling those and an
// operator-driven StatefulSet rollout would just race it.
if podIsInKubeletFailureMode(newPod) {
return
}

cr, err := w.c.GetCR(&newPod.ObjectMeta)
if err != nil || cr == nil {
return
}

if !shouldTriggerStuckHostRecovery(cr) {
return
}

threshold := chop.Config().CompletedOnPodNotReadyThreshold()
delay := stuckHostScheduleDelay(newPod, threshold, time.Now())

w.a.V(1).M(cr).F().
WithEvent(cr, a.EventActionReconcile, a.EventReasonStuckHostRecoveryTriggered).
Info(
"Stuck-host recovery scheduled: pod %s became NotReady while CHI %s/%s is Completed; "+
"re-enqueue in %s (threshold %s)",
newPod.Name, cr.Namespace, cr.Name, delay.Truncate(time.Second), threshold,
)

scheduled := cr
time.AfterFunc(delay, func() {
w.c.enqueueObject(cmd_queue.NewReconcileCHI(cmd_queue.ReconcileAdd, nil, scheduled))
})
}

// shouldTriggerStuckHostRecovery reports whether the given CHI is a valid stuck-host
// recovery target: status is Completed and the CHI is not being deleted.
func shouldTriggerStuckHostRecovery(cr *api.ClickHouseInstallation) bool {
if cr == nil {
return false
}
status := cr.EnsureStatus()
if status.GetStatus() != api.StatusCompleted {
return false
}
if !cr.GetDeletionTimestamp().IsZero() {
return false
}
return true
}

// isPodReadyToNotReadyTransition reports whether the pod transitioned from "all containers
// ready" to "some container not ready". The dual of isPodNotReadyToReadyTransition.
func isPodReadyToNotReadyTransition(oldPod, newPod *core.Pod) bool {
if oldPod == nil || newPod == nil {
return false
}
wasReady := !k8s.PodHasNotReadyContainers(oldPod)
isNotReadyNow := k8s.PodHasNotReadyContainers(newPod)
return wasReady && isNotReadyNow
}

// stuckHostScheduleDelay computes how long to wait before firing the stuck-host
// re-enqueue. It returns max(threshold − elapsed + extra, minDelay), clamped to
// non-negative.
func stuckHostScheduleDelay(newPod *core.Pod, threshold time.Duration, now time.Time) time.Duration {
elapsed := time.Duration(0)
if newPod != nil {
for _, cond := range newPod.Status.Conditions {
if cond.Type == core.PodReady && !cond.LastTransitionTime.IsZero() {
elapsed = now.Sub(cond.LastTransitionTime.Time)
break
}
}
}
delay := threshold - elapsed + stuckHostExtraDelay
if delay < stuckHostMinDelay {
delay = stuckHostMinDelay
}
return delay
}
Loading
Loading