
Commit 075dccd

only report Progressing for active nw rollouts

Keep pod-based Progressing tied to an actual CNO rollout instead of temporary
unavailability during node reboot churn. Persist the rollout-active state so
Progressing stays true until the rollout is fully available, while still
ignoring plain reboot churn after install.

For machine config status, stop treating generic MCP node convergence as a CNO
rollout signal. Reuse shared mcutil helpers for source-only checks so routine
MCO updates do not flip network Progressing to true.

Signed-off-by: Jamo Luhrsen <jluhrsen@gmail.com>
Co-authored-by: Codex
1 parent 2b7b23c commit 075dccd
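
To make the described state machine concrete, here is a small standalone sketch (not part of the commit) that copies the updateRolloutActive helper introduced below in pod_status.go and walks it through the two cases the message distinguishes; the scenario inputs are hypothetical.

package main

import "fmt"

// Copied from the commit's pod_status.go: once install is complete, only an
// explicit rollout signal may (re)activate the state, and an active state
// persists until the rollout is fully available.
func updateRolloutActive(rolloutActive, installComplete, rolloutStarted, rolloutComplete bool) bool {
    if !installComplete || rolloutStarted {
        rolloutActive = true
    }
    if rolloutActive && rolloutComplete {
        return false
    }
    return rolloutActive
}

func main() {
    // Reboot churn after install: pods are unavailable, but no spec change is
    // pending and the workload is not yet fully available either. The state
    // stays inactive, so the unavailability is not reported as Progressing.
    fmt.Println(updateRolloutActive(false, true, false, false)) // false

    // An actual CNO rollout: the controller is still processing a spec change.
    active := updateRolloutActive(false, true, true, false)
    fmt.Println(active) // true

    // The persisted state keeps Progressing true across syncs until the
    // rollout is fully available, then clears.
    fmt.Println(updateRolloutActive(active, true, false, false)) // true
    fmt.Println(updateRolloutActive(active, true, false, true))  // false
}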

4 files changed: +424 -69 lines changed

pkg/controller/statusmanager/machineconfig_status.go

Lines changed: 2 additions & 20 deletions

@@ -140,18 +140,11 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
     // No degraded pools, so clear degraded status
     status.setNotDegraded(MachineConfig)
 
-    // Now check for progressing and process machine configs
     for role, machineConfigs := range status.renderedMachineConfigs {
         pools, err := status.findMachineConfigPoolsForLabel(mcPools, map[string]string{names.MachineConfigLabelRoleKey: role})
         if err != nil {
             klog.Errorf("failed to get machine config pools for the role %s: %v", role, err)
         }
-
-        progressingPool := status.isAnyMachineConfigPoolProgressing(pools)
-        if progressingPool != "" {
-            status.setProgressing(MachineConfig, "MachineConfig", fmt.Sprintf("%s machine config pool in progressing state", progressingPool))
-            return nil
-        }
         for _, pool := range pools {
             if pool.Spec.Paused {
                 // When a machine config pool is in paused state, then it is expected that mco doesn't process any machine configs for the pool.
@@ -165,7 +158,7 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
             mcSet := sets.Set[string]{}
             mcSet.Insert(machineConfig)
             if mcsBeingRemoved, ok := status.machineConfigsBeingRemoved[role]; ok && mcsBeingRemoved.Has(machineConfig) {
-                removed = mcutil.AreMachineConfigsRemovedFromPool(pool.Status, mcSet)
+                removed = mcutil.AreMachineConfigsRemovedFromPoolSource(pool.Status, mcSet)
                 if removed {
                     status.machineConfigsBeingRemoved[role].Delete(machineConfig)
                     // Delete map entry from status cache if role doesn't have machine configs. By deleting the entry,
@@ -183,7 +176,7 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
                     }
                 }
             } else {
-                added = mcutil.AreMachineConfigsRenderedOnPool(pool.Status, mcSet)
+                added = mcutil.AreMachineConfigsRenderedOnPoolSource(pool.Status, mcSet)
             }
             if !added || !removed {
                 status.setProgressing(MachineConfig, "MachineConfig",
@@ -250,17 +243,6 @@ func (status *StatusManager) isAnyMachineConfigPoolDegraded(pools []mcfgv1.Machi
     return degradedPool
 }
 
-func (status *StatusManager) isAnyMachineConfigPoolProgressing(pools []mcfgv1.MachineConfigPool) string {
-    var progressingPool string
-    for _, pool := range pools {
-        if mcomcfgv1.IsMachineConfigPoolConditionTrue(pool.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) {
-            progressingPool = pool.Name
-            break
-        }
-    }
-    return progressingPool
-}
-
 func (status *StatusManager) findMachineConfigPoolsForLabel(mcPools []mcfgv1.MachineConfigPool, mcLabel labels.Set) ([]mcfgv1.MachineConfigPool, error) {
     var mcps []mcfgv1.MachineConfigPool
     for _, mcPool := range mcPools {
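
Both helper swaps above move the pool checks onto shared mcutil code that is not shown in this excerpt. As a rough illustration of the "source-only" idea, and an assumption about the helpers' shape rather than their actual implementation, such a check can be answered purely from pool.Status.Configuration.Source (the MachineConfigs the rendered config was built from), so MCO node-convergence counters like UpdatedMachineCount never factor in:

package mcutil

import (
    // Import path assumed; older trees vend this group from the MCO repo instead.
    mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
    "k8s.io/apimachinery/pkg/util/sets"
)

// AreMachineConfigsRenderedOnPoolSource (hypothetical sketch) reports whether every
// named MachineConfig is already part of the pool's current rendered configuration
// source. Because it never reads machine counts, routine MCO node updates cannot
// flip the result.
func AreMachineConfigsRenderedOnPoolSource(status mcfgv1.MachineConfigPoolStatus, mcNames sets.Set[string]) bool {
    rendered := sets.New[string]()
    for _, src := range status.Configuration.Source {
        rendered.Insert(src.Name)
    }
    return rendered.IsSuperset(mcNames)
}

// AreMachineConfigsRemovedFromPoolSource (hypothetical sketch) is the inverse:
// true once none of the named MachineConfigs appear in the pool's source any more.
func AreMachineConfigsRemovedFromPoolSource(status mcfgv1.MachineConfigPoolStatus, mcNames sets.Set[string]) bool {
    for _, src := range status.Configuration.Source {
        if mcNames.Has(src.Name) {
            return false
        }
    }
    return true
}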

pkg/controller/statusmanager/pod_status.go

Lines changed: 113 additions & 34 deletions

@@ -38,6 +38,7 @@ type podState struct {
     DaemonsetStates   []daemonsetState
     DeploymentStates  []deploymentState
     StatefulsetStates []statefulsetState
+    InstallComplete   bool
 }
 
 // daemonsetState is the internal state we use to check if a rollout has
@@ -47,6 +48,7 @@ type daemonsetState struct {
 
     LastSeenStatus appsv1.DaemonSetStatus
     LastChangeTime time.Time
+    RolloutActive  bool
 }
 
 // deploymentState is the same as daemonsetState.. but for deployments!
@@ -55,6 +57,7 @@ type deploymentState struct {
 
     LastSeenStatus appsv1.DeploymentStatus
     LastChangeTime time.Time
+    RolloutActive  bool
 }
 
 // statefulsetState is the same as daemonsetState.. but for statefulsets!
@@ -63,6 +66,7 @@ type statefulsetState struct {
 
     LastSeenStatus appsv1.StatefulSetStatus
     LastChangeTime time.Time
+    RolloutActive  bool
 }
 
 // SetFromPods sets the operator Degraded/Progressing/Available status, based on
@@ -79,14 +83,20 @@ func (status *StatusManager) SetFromPods() {
     progressing := []string{}
     hung := []string{}
 
-    daemonsetStates, deploymentStates, statefulsetStates := status.getLastPodState()
+    daemonsetStates, deploymentStates, statefulsetStates, installComplete := status.getLastPodState()
+    if !status.installComplete && installComplete {
+        status.installComplete = true
+    }
 
     if (len(daemonSets) + len(deployments) + len(statefulSets)) == 0 {
         progressing = append(progressing, "Deploying")
     }
 
     for _, ds := range daemonSets {
         dsName := NewClusteredName(ds)
+        dsState, hadState := daemonsetStates[dsName]
+        dsState.RolloutActive = daemonSetRolloutActive(ds, dsState.RolloutActive, status.installComplete)
+        dsRolloutActive := dsState.RolloutActive
 
         dsProgressing := false
 
@@ -97,13 +107,14 @@
             progressing = append(progressing, fmt.Sprintf("DaemonSet %q update is rolling out (%d out of %d updated)", dsName.String(), ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled))
             dsProgressing = true
         } else if ds.Status.NumberUnavailable > 0 {
-            progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not available (awaiting %d nodes)", dsName.String(), ds.Status.NumberUnavailable))
-            dsProgressing = true
-            // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
+            if dsRolloutActive {
+                progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not available (awaiting %d nodes)", dsName.String(), ds.Status.NumberUnavailable))
+                dsProgressing = true
+            }
             if !isNonCritical(ds) {
                 hung = append(hung, status.CheckCrashLoopBackOffPods(dsName, ds.Spec.Selector.MatchLabels, "DaemonSet")...)
             }
-        } else if ds.Status.NumberAvailable == 0 && ds.Status.DesiredNumberScheduled > 0 {
+        } else if ds.Status.NumberAvailable == 0 && dsRolloutActive {
             progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not yet scheduled on any nodes", dsName.String()))
             dsProgressing = true
         } else if ds.Generation > ds.Status.ObservedGeneration {
@@ -120,19 +131,20 @@
         if dsProgressing && !isNonCritical(ds) {
             reachedAvailableLevel = false
 
-            dsState, exists := daemonsetStates[dsName]
-            if !exists || !reflect.DeepEqual(dsState.LastSeenStatus, ds.Status) {
+            if !hadState || !reflect.DeepEqual(dsState.LastSeenStatus, ds.Status) {
                 dsState.LastChangeTime = time.Now()
                 ds.Status.DeepCopyInto(&dsState.LastSeenStatus)
-                daemonsetStates[dsName] = dsState
             }
 
             // Catch hung rollouts
-            if exists && (time.Since(dsState.LastChangeTime)) > ProgressTimeout {
+            if hadState && (time.Since(dsState.LastChangeTime)) > ProgressTimeout {
                 hung = append(hung, fmt.Sprintf("DaemonSet %q rollout is not making progress - last change %s", dsName.String(), dsState.LastChangeTime.Format(time.RFC3339)))
                 empty := ""
                 dsHung = &empty
             }
+        }
+        if dsRolloutActive {
+            daemonsetStates[dsName] = dsState
         } else {
             delete(daemonsetStates, dsName)
         }
@@ -143,6 +155,9 @@
 
     for _, ss := range statefulSets {
         ssName := NewClusteredName(ss)
+        ssState, hadState := statefulsetStates[ssName]
+        ssState.RolloutActive = statefulSetRolloutActive(ss, ssState.RolloutActive, status.installComplete)
+        ssRolloutActive := ssState.RolloutActive
 
         ssProgressing := false
 
@@ -153,13 +168,15 @@
             progressing = append(progressing, fmt.Sprintf("StatefulSet %q update is rolling out (%d out of %d updated)", ssName.String(), ss.Status.UpdatedReplicas, ss.Status.Replicas))
             ssProgressing = true
         } else if ss.Status.ReadyReplicas > 0 && ss.Status.ReadyReplicas < ss.Status.Replicas {
-            progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not available (awaiting %d nodes)", ssName.String(), (ss.Status.Replicas-ss.Status.ReadyReplicas)))
-            ssProgressing = true
+            if ssRolloutActive {
+                progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not available (awaiting %d nodes)", ssName.String(), (ss.Status.Replicas-ss.Status.ReadyReplicas)))
+                ssProgressing = true
+            }
             // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
             if !isNonCritical(ss) {
                 hung = append(hung, status.CheckCrashLoopBackOffPods(ssName, ss.Spec.Selector.MatchLabels, "StatefulSet")...)
             }
-        } else if ss.Status.AvailableReplicas == 0 {
+        } else if ss.Status.AvailableReplicas == 0 && ssRolloutActive {
             progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not yet scheduled on any nodes", ssName.String()))
             ssProgressing = true
         } else if ss.Status.ObservedGeneration < ss.Generation {
@@ -176,19 +193,20 @@
         if ssProgressing && !isNonCritical(ss) {
             reachedAvailableLevel = false
 
-            ssState, exists := statefulsetStates[ssName]
-            if !exists || !reflect.DeepEqual(ssState.LastSeenStatus, ss.Status) {
+            if !hadState || !reflect.DeepEqual(ssState.LastSeenStatus, ss.Status) {
                 ssState.LastChangeTime = time.Now()
                 ss.Status.DeepCopyInto(&ssState.LastSeenStatus)
-                statefulsetStates[ssName] = ssState
             }
 
             // Catch hung rollouts
-            if exists && (time.Since(ssState.LastChangeTime)) > ProgressTimeout {
+            if hadState && (time.Since(ssState.LastChangeTime)) > ProgressTimeout {
                 hung = append(hung, fmt.Sprintf("StatefulSet %q rollout is not making progress - last change %s", ssName.String(), ssState.LastChangeTime.Format(time.RFC3339)))
                 empty := ""
                 ssHung = &empty
             }
+        }
+        if ssRolloutActive {
+            statefulsetStates[ssName] = ssState
         } else {
             delete(statefulsetStates, ssName)
         }
@@ -199,6 +217,9 @@
 
     for _, dep := range deployments {
         depName := NewClusteredName(dep)
+        depState, hadState := deploymentStates[depName]
+        depState.RolloutActive = deploymentRolloutActive(dep, depState.RolloutActive, status.installComplete)
+        depRolloutActive := depState.RolloutActive
         depProgressing := false
 
         if isNonCritical(dep) && dep.Status.UnavailableReplicas > 0 && !status.installComplete {
@@ -208,13 +229,15 @@
             progressing = append(progressing, fmt.Sprintf("Deployment %q update is rolling out (%d out of %d updated)", depName.String(), dep.Status.UpdatedReplicas, dep.Status.Replicas))
             depProgressing = true
         } else if dep.Status.UnavailableReplicas > 0 {
-            progressing = append(progressing, fmt.Sprintf("Deployment %q is not available (awaiting %d nodes)", depName.String(), dep.Status.UnavailableReplicas))
-            depProgressing = true
+            if depRolloutActive {
+                progressing = append(progressing, fmt.Sprintf("Deployment %q is not available (awaiting %d nodes)", depName.String(), dep.Status.UnavailableReplicas))
+                depProgressing = true
+            }
             // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
             if !isNonCritical(dep) {
                 hung = append(hung, status.CheckCrashLoopBackOffPods(depName, dep.Spec.Selector.MatchLabels, "Deployment")...)
             }
-        } else if dep.Status.AvailableReplicas == 0 {
+        } else if dep.Status.AvailableReplicas == 0 && depRolloutActive {
             progressing = append(progressing, fmt.Sprintf("Deployment %q is not yet scheduled on any nodes", depName.String()))
             depProgressing = true
         } else if dep.Status.ObservedGeneration < dep.Generation {
@@ -231,19 +254,20 @@
         if depProgressing && !isNonCritical(dep) {
             reachedAvailableLevel = false
 
-            depState, exists := deploymentStates[depName]
-            if !exists || !reflect.DeepEqual(depState.LastSeenStatus, dep.Status) {
+            if !hadState || !reflect.DeepEqual(depState.LastSeenStatus, dep.Status) {
                 depState.LastChangeTime = time.Now()
                 dep.Status.DeepCopyInto(&depState.LastSeenStatus)
-                deploymentStates[depName] = depState
             }
 
             // Catch hung rollouts
-            if exists && (time.Since(depState.LastChangeTime)) > ProgressTimeout {
+            if hadState && (time.Since(depState.LastChangeTime)) > ProgressTimeout {
                 hung = append(hung, fmt.Sprintf("Deployment %q rollout is not making progress - last change %s", depName.String(), depState.LastChangeTime.Format(time.RFC3339)))
                 empty := ""
                 depHung = &empty
             }
+        }
+        if depRolloutActive {
+            deploymentStates[depName] = depState
         } else {
             delete(deploymentStates, depName)
         }
@@ -253,7 +277,10 @@
     }
 
     status.setNotDegraded(PodDeployment)
-    if err := status.setLastPodState(daemonsetStates, deploymentStates, statefulsetStates); err != nil {
+    if reachedAvailableLevel && len(progressing) == 0 {
+        status.installComplete = true
+    }
+    if err := status.setLastPodState(daemonsetStates, deploymentStates, statefulsetStates, status.installComplete); err != nil {
         log.Printf("Failed to set pod state (continuing): %+v\n", err)
     }
 
@@ -269,21 +296,71 @@
             Status: operv1.ConditionTrue})
     }
 
-    if reachedAvailableLevel && len(progressing) == 0 {
-        status.installComplete = true
-    }
-
     if len(hung) > 0 {
         status.setDegraded(RolloutHung, "RolloutHung", strings.Join(hung, "\n"))
     } else {
         status.setNotDegraded(RolloutHung)
     }
 }
 
+// We only want pod unavailability to count as Progressing when we already know a
+// CNO-managed rollout is in flight. The status snapshots below distinguish:
+// - rollout started: controller still processing a spec change
+// - rollout complete: controller has observed that change and all replicas are healthy
+// If we have neither signal after install, the same "unavailable" counters are
+// treated as ordinary node reboot churn rather than a network rollout.
+func daemonSetRolloutActive(ds *appsv1.DaemonSet, rolloutActive, installComplete bool) bool {
+    rolloutStarted := ds.Generation > ds.Status.ObservedGeneration || ds.Status.UpdatedNumberScheduled < ds.Status.DesiredNumberScheduled
+    rolloutComplete := ds.Status.ObservedGeneration >= expectedGeneration(ds.Generation) &&
+        ds.Status.NumberUnavailable == 0 &&
+        (ds.Status.DesiredNumberScheduled == 0 || ds.Status.UpdatedNumberScheduled >= ds.Status.DesiredNumberScheduled) &&
+        (ds.Status.DesiredNumberScheduled == 0 || ds.Status.NumberAvailable >= ds.Status.DesiredNumberScheduled)
 
+    return updateRolloutActive(rolloutActive, installComplete, rolloutStarted, rolloutComplete)
+}
+
+func statefulSetRolloutActive(ss *appsv1.StatefulSet, rolloutActive, installComplete bool) bool {
+    rolloutStarted := ss.Generation > ss.Status.ObservedGeneration || ss.Status.UpdatedReplicas < ss.Status.Replicas
+    rolloutComplete := ss.Status.ObservedGeneration >= expectedGeneration(ss.Generation) &&
+        ss.Status.UpdatedReplicas >= ss.Status.Replicas &&
+        ss.Status.ReadyReplicas >= ss.Status.Replicas
+
+    return updateRolloutActive(rolloutActive, installComplete, rolloutStarted, rolloutComplete)
+}
+
+func deploymentRolloutActive(dep *appsv1.Deployment, rolloutActive, installComplete bool) bool {
+    rolloutStarted := dep.Generation > dep.Status.ObservedGeneration || dep.Status.UpdatedReplicas < dep.Status.Replicas
+    rolloutComplete := dep.Status.ObservedGeneration >= expectedGeneration(dep.Generation) &&
+        dep.Status.UpdatedReplicas >= dep.Status.Replicas &&
+        dep.Status.UnavailableReplicas == 0 &&
+        (dep.Status.Replicas == 0 || dep.Status.AvailableReplicas >= dep.Status.Replicas)
+
+    return updateRolloutActive(rolloutActive, installComplete, rolloutStarted, rolloutComplete)
+}
+
+// Once install is complete, only explicit rollout signals should reactivate Progressing.
+func updateRolloutActive(rolloutActive, installComplete, rolloutStarted, rolloutComplete bool) bool {
+    if !installComplete || rolloutStarted {
+        rolloutActive = true
+    }
+    if rolloutActive && rolloutComplete {
+        return false
+    }
+    return rolloutActive
+}
+
+// Real workload objects start at generation 1; tests often omit it and leave the zero value.
+func expectedGeneration(generation int64) int64 {
+    if generation > 0 {
+        return generation
+    }
+    return 1
+}
+
 // getLastPodState reads the last-seen daemonset + deployment + statefulset
 // states from the clusteroperator annotation and parses it. On error, it
 // returns an empty state, since this should not block updating operator status.
-func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState, map[ClusteredName]deploymentState, map[ClusteredName]statefulsetState) {
+func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState, map[ClusteredName]deploymentState, map[ClusteredName]statefulsetState, bool) {
     // with maps allocated
     daemonsetStates := map[ClusteredName]daemonsetState{}
     deploymentStates := map[ClusteredName]deploymentState{}
@@ -294,20 +371,20 @@ func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState
     err := status.client.ClientFor("").CRClient().Get(context.TODO(), types.NamespacedName{Name: status.name}, co)
     if err != nil {
         log.Printf("Failed to get ClusterOperator: %v", err)
-        return daemonsetStates, deploymentStates, statefulsetStates
+        return daemonsetStates, deploymentStates, statefulsetStates, false
     }
 
     lsbytes := co.Annotations[lastSeenAnnotation]
     if lsbytes == "" {
-        return daemonsetStates, deploymentStates, statefulsetStates
+        return daemonsetStates, deploymentStates, statefulsetStates, false
     }
 
     out := podState{}
     err = json.Unmarshal([]byte(lsbytes), &out)
     if err != nil {
         // No need to return error; just move on
         log.Printf("failed to unmashal last-seen-status: %v", err)
-        return daemonsetStates, deploymentStates, statefulsetStates
+        return daemonsetStates, deploymentStates, statefulsetStates, false
     }
 
     for _, ds := range out.DaemonsetStates {
@@ -322,18 +399,20 @@
         statefulsetStates[ss.ClusteredName] = ss
     }
 
-    return daemonsetStates, deploymentStates, statefulsetStates
+    return daemonsetStates, deploymentStates, statefulsetStates, out.InstallComplete
 }
 
 func (status *StatusManager) setLastPodState(
     dss map[ClusteredName]daemonsetState,
     deps map[ClusteredName]deploymentState,
-    sss map[ClusteredName]statefulsetState) error {
+    sss map[ClusteredName]statefulsetState,
+    installComplete bool) error {
 
     ps := podState{
         DaemonsetStates:   make([]daemonsetState, 0, len(dss)),
         DeploymentStates:  make([]deploymentState, 0, len(deps)),
         StatefulsetStates: make([]statefulsetState, 0, len(sss)),
+        InstallComplete:   installComplete,
     }
 
     for nsn, ds := range dss {
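
The closing hunks persist the new InstallComplete flag (alongside the per-workload RolloutActive state) through the ClusterOperator's last-seen annotation. Below is a minimal sketch of that JSON round trip, using trimmed stand-ins for the diff's state types; the real types carry full workload statuses and are keyed by ClusteredName.

package main

import (
    "encoding/json"
    "fmt"
    "time"
)

// Trimmed stand-ins for the diff's state types (hypothetical shapes, for illustration only).
type daemonsetState struct {
    LastChangeTime time.Time
    RolloutActive  bool
}

type podState struct {
    DaemonsetStates []daemonsetState
    InstallComplete bool
}

func main() {
    in := podState{
        DaemonsetStates: []daemonsetState{{LastChangeTime: time.Now(), RolloutActive: true}},
        InstallComplete: true,
    }

    // setLastPodState-style: serialize the state for storage as an annotation value.
    raw, err := json.Marshal(in)
    if err != nil {
        panic(err)
    }

    // getLastPodState-style: parse it back on the next sync. InstallComplete survives
    // operator restarts, so post-install reboot churn never re-enters Progressing.
    out := podState{}
    if err := json.Unmarshal(raw, &out); err != nil {
        panic(err)
    }
    fmt.Println(out.InstallComplete, out.DaemonsetStates[0].RolloutActive) // true true
}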
