Commit 727c270

CORENET-6572: only report Progressing for active network rollouts
Keep pod-based Progressing tied to an actual CNO rollout instead of temporary unavailability during node reboot churn. Persist the rollout-active state so Progressing stays true until the rollout is fully available, while still ignoring plain reboot churn after install.

For machine config status, stop treating generic MCP node convergence as a CNO rollout signal. Reuse shared mcutil helpers for source-only checks so routine MCO updates do not flip network Progressing to true.

Signed-off-by: Jamo Luhrsen <[email protected]>
1 parent 2b7b23c commit 727c270

4 files changed

Lines changed: 424 additions & 69 deletions
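
The persistence half of the fix is easiest to see in isolation: the per-workload RolloutActive flags and the new InstallComplete bit live in the podState blob that the status manager round-trips through a ClusterOperator annotation, so an in-flight rollout survives reconciles and operator restarts. Below is a minimal, self-contained sketch of that round-trip; the trimmed-down types, the example names, and the annotation key are illustrative stand-ins, not the operator's exact definitions.

// Sketch of the last-seen-state round-trip this commit extends.
package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed-down stand-ins for the operator's daemonsetState/podState types.
type daemonsetState struct {
	Name          string `json:"name"`
	RolloutActive bool   `json:"rolloutActive"`
}

type podState struct {
	DaemonsetStates []daemonsetState `json:"daemonsetStates"`
	InstallComplete bool             `json:"installComplete"`
}

// Illustrative key; the operator uses its own lastSeenAnnotation constant.
const lastSeenAnnotation = "example.openshift.io/last-seen-state"

func main() {
	annotations := map[string]string{}

	// Write side: persist the state at the end of a reconcile pass.
	saved := podState{
		DaemonsetStates: []daemonsetState{{Name: "example-ns/example-ds", RolloutActive: true}},
		InstallComplete: true,
	}
	b, _ := json.Marshal(saved)
	annotations[lastSeenAnnotation] = string(b)

	// Read side: the next reconcile starts from the persisted state, so a
	// rollout that began earlier stays "active" until it completes.
	var loaded podState
	if err := json.Unmarshal([]byte(annotations[lastSeenAnnotation]), &loaded); err == nil {
		fmt.Printf("installComplete=%v rolloutActive=%v\n",
			loaded.InstallComplete, loaded.DaemonsetStates[0].RolloutActive)
	}
}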

pkg/controller/statusmanager/machineconfig_status.go

Lines changed: 2 additions & 20 deletions
@@ -140,18 +140,11 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
 	// No degraded pools, so clear degraded status
 	status.setNotDegraded(MachineConfig)
 
-	// Now check for progressing and process machine configs
 	for role, machineConfigs := range status.renderedMachineConfigs {
 		pools, err := status.findMachineConfigPoolsForLabel(mcPools, map[string]string{names.MachineConfigLabelRoleKey: role})
 		if err != nil {
 			klog.Errorf("failed to get machine config pools for the role %s: %v", role, err)
 		}
-
-		progressingPool := status.isAnyMachineConfigPoolProgressing(pools)
-		if progressingPool != "" {
-			status.setProgressing(MachineConfig, "MachineConfig", fmt.Sprintf("%s machine config pool in progressing state", progressingPool))
-			return nil
-		}
 		for _, pool := range pools {
 			if pool.Spec.Paused {
 				// When a machine config pool is in paused state, then it is expected that mco doesn't process any machine configs for the pool.
@@ -165,7 +158,7 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
 			mcSet := sets.Set[string]{}
 			mcSet.Insert(machineConfig)
 			if mcsBeingRemoved, ok := status.machineConfigsBeingRemoved[role]; ok && mcsBeingRemoved.Has(machineConfig) {
-				removed = mcutil.AreMachineConfigsRemovedFromPool(pool.Status, mcSet)
+				removed = mcutil.AreMachineConfigsRemovedFromPoolSource(pool.Status, mcSet)
 				if removed {
 					status.machineConfigsBeingRemoved[role].Delete(machineConfig)
 					// Delete map entry from status cache if role doesn't have machine configs. By deleting the entry,
@@ -183,7 +176,7 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
 					}
 				}
 			} else {
-				added = mcutil.AreMachineConfigsRenderedOnPool(pool.Status, mcSet)
+				added = mcutil.AreMachineConfigsRenderedOnPoolSource(pool.Status, mcSet)
 			}
 			if !added || !removed {
 				status.setProgressing(MachineConfig, "MachineConfig",
@@ -250,17 +243,6 @@ func (status *StatusManager) isAnyMachineConfigPoolDegraded(pools []mcfgv1.Machi
 	return degradedPool
 }
 
-func (status *StatusManager) isAnyMachineConfigPoolProgressing(pools []mcfgv1.MachineConfigPool) string {
-	var progressingPool string
-	for _, pool := range pools {
-		if mcomcfgv1.IsMachineConfigPoolConditionTrue(pool.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) {
-			progressingPool = pool.Name
-			break
-		}
-	}
-	return progressingPool
-}
-
 func (status *StatusManager) findMachineConfigPoolsForLabel(mcPools []mcfgv1.MachineConfigPool, mcLabel labels.Set) ([]mcfgv1.MachineConfigPool, error) {
 	var mcps []mcfgv1.MachineConfigPool
 	for _, mcPool := range mcPools {
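
The machine-config half hinges on what "source-only" means: the helpers consult the MachineConfigs listed in the pool's rendered-configuration sources (pool.Status.Configuration.Source) and ignore node-convergence signals entirely. Below is a hedged sketch of checks in that spirit; the helper names mirror the ones in the diff, the API shape is from the MCO MachineConfigPool types, and the real shared mcutil helpers may differ in detail.

// Package mcutilsketch illustrates the "source-only" pool checks; this is a
// sketch, not the actual mcutil code.
package mcutilsketch

import (
	mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
	"k8s.io/apimachinery/pkg/util/sets"
)

// areMachineConfigsRenderedOnPoolSource reports whether every named
// MachineConfig is listed as a source of the pool's current rendered config.
// Node-convergence signals (UpdatedMachineCount, the Updating condition) are
// deliberately ignored, so an MCO-driven node reboot does not register as a
// CNO rollout.
func areMachineConfigsRenderedOnPoolSource(status mcfgv1.MachineConfigPoolStatus, mcNames sets.Set[string]) bool {
	rendered := sets.New[string]()
	for _, ref := range status.Configuration.Source {
		rendered.Insert(ref.Name)
	}
	return rendered.IsSuperset(mcNames)
}

// areMachineConfigsRemovedFromPoolSource is the mirror check: true once none
// of the named configs remain in the rendered-config source list.
func areMachineConfigsRemovedFromPoolSource(status mcfgv1.MachineConfigPoolStatus, mcNames sets.Set[string]) bool {
	for _, ref := range status.Configuration.Source {
		if mcNames.Has(ref.Name) {
			return false
		}
	}
	return true
}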

pkg/controller/statusmanager/pod_status.go

Lines changed: 113 additions & 34 deletions
@@ -38,6 +38,7 @@ type podState struct {
 	DaemonsetStates   []daemonsetState
 	DeploymentStates  []deploymentState
 	StatefulsetStates []statefulsetState
+	InstallComplete   bool
 }
 
 // daemonsetState is the internal state we use to check if a rollout has
@@ -47,6 +48,7 @@ type daemonsetState struct {
 
 	LastSeenStatus appsv1.DaemonSetStatus
 	LastChangeTime time.Time
+	RolloutActive  bool
 }
 
 // deploymentState is the same as daemonsetState.. but for deployments!
@@ -55,6 +57,7 @@ type deploymentState struct {
 
 	LastSeenStatus appsv1.DeploymentStatus
 	LastChangeTime time.Time
+	RolloutActive  bool
 }
 
 // statefulsetState is the same as daemonsetState.. but for statefulsets!
@@ -63,6 +66,7 @@ type statefulsetState struct {
 
 	LastSeenStatus appsv1.StatefulSetStatus
 	LastChangeTime time.Time
+	RolloutActive  bool
 }
 
 // SetFromPods sets the operator Degraded/Progressing/Available status, based on
@@ -79,14 +83,20 @@ func (status *StatusManager) SetFromPods() {
 	progressing := []string{}
 	hung := []string{}
 
-	daemonsetStates, deploymentStates, statefulsetStates := status.getLastPodState()
+	daemonsetStates, deploymentStates, statefulsetStates, installComplete := status.getLastPodState()
+	if !status.installComplete && installComplete {
+		status.installComplete = true
+	}
 
 	if (len(daemonSets) + len(deployments) + len(statefulSets)) == 0 {
 		progressing = append(progressing, "Deploying")
 	}
 
 	for _, ds := range daemonSets {
 		dsName := NewClusteredName(ds)
+		dsState, hadState := daemonsetStates[dsName]
+		dsState.RolloutActive = daemonSetRolloutActive(ds, dsState.RolloutActive, status.installComplete)
+		dsRolloutActive := dsState.RolloutActive
 
 		dsProgressing := false
 
@@ -97,13 +107,14 @@ func (status *StatusManager) SetFromPods() {
 			progressing = append(progressing, fmt.Sprintf("DaemonSet %q update is rolling out (%d out of %d updated)", dsName.String(), ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled))
 			dsProgressing = true
 		} else if ds.Status.NumberUnavailable > 0 {
-			progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not available (awaiting %d nodes)", dsName.String(), ds.Status.NumberUnavailable))
-			dsProgressing = true
-			// Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
+			if dsRolloutActive {
+				progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not available (awaiting %d nodes)", dsName.String(), ds.Status.NumberUnavailable))
+				dsProgressing = true
+			}
 			if !isNonCritical(ds) {
 				hung = append(hung, status.CheckCrashLoopBackOffPods(dsName, ds.Spec.Selector.MatchLabels, "DaemonSet")...)
 			}
-		} else if ds.Status.NumberAvailable == 0 && ds.Status.DesiredNumberScheduled > 0 {
+		} else if ds.Status.NumberAvailable == 0 && dsRolloutActive {
 			progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not yet scheduled on any nodes", dsName.String()))
 			dsProgressing = true
 		} else if ds.Generation > ds.Status.ObservedGeneration {
@@ -120,19 +131,20 @@ func (status *StatusManager) SetFromPods() {
 		if dsProgressing && !isNonCritical(ds) {
 			reachedAvailableLevel = false
 
-			dsState, exists := daemonsetStates[dsName]
-			if !exists || !reflect.DeepEqual(dsState.LastSeenStatus, ds.Status) {
+			if !hadState || !reflect.DeepEqual(dsState.LastSeenStatus, ds.Status) {
 				dsState.LastChangeTime = time.Now()
 				ds.Status.DeepCopyInto(&dsState.LastSeenStatus)
-				daemonsetStates[dsName] = dsState
 			}
 
 			// Catch hung rollouts
-			if exists && (time.Since(dsState.LastChangeTime)) > ProgressTimeout {
+			if hadState && (time.Since(dsState.LastChangeTime)) > ProgressTimeout {
 				hung = append(hung, fmt.Sprintf("DaemonSet %q rollout is not making progress - last change %s", dsName.String(), dsState.LastChangeTime.Format(time.RFC3339)))
 				empty := ""
 				dsHung = &empty
 			}
+		}
+		if dsRolloutActive {
+			daemonsetStates[dsName] = dsState
 		} else {
 			delete(daemonsetStates, dsName)
 		}
@@ -143,6 +155,9 @@ func (status *StatusManager) SetFromPods() {
 
 	for _, ss := range statefulSets {
 		ssName := NewClusteredName(ss)
+		ssState, hadState := statefulsetStates[ssName]
+		ssState.RolloutActive = statefulSetRolloutActive(ss, ssState.RolloutActive, status.installComplete)
+		ssRolloutActive := ssState.RolloutActive
 
 		ssProgressing := false
 
@@ -153,13 +168,15 @@ func (status *StatusManager) SetFromPods() {
 			progressing = append(progressing, fmt.Sprintf("StatefulSet %q update is rolling out (%d out of %d updated)", ssName.String(), ss.Status.UpdatedReplicas, ss.Status.Replicas))
 			ssProgressing = true
 		} else if ss.Status.ReadyReplicas > 0 && ss.Status.ReadyReplicas < ss.Status.Replicas {
-			progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not available (awaiting %d nodes)", ssName.String(), (ss.Status.Replicas-ss.Status.ReadyReplicas)))
-			ssProgressing = true
+			if ssRolloutActive {
+				progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not available (awaiting %d nodes)", ssName.String(), (ss.Status.Replicas-ss.Status.ReadyReplicas)))
+				ssProgressing = true
+			}
 			// Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
 			if !isNonCritical(ss) {
 				hung = append(hung, status.CheckCrashLoopBackOffPods(ssName, ss.Spec.Selector.MatchLabels, "StatefulSet")...)
 			}
-		} else if ss.Status.AvailableReplicas == 0 {
+		} else if ss.Status.AvailableReplicas == 0 && ssRolloutActive {
 			progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not yet scheduled on any nodes", ssName.String()))
 			ssProgressing = true
 		} else if ss.Status.ObservedGeneration < ss.Generation {
@@ -176,19 +193,20 @@ func (status *StatusManager) SetFromPods() {
 		if ssProgressing && !isNonCritical(ss) {
 			reachedAvailableLevel = false
 
-			ssState, exists := statefulsetStates[ssName]
-			if !exists || !reflect.DeepEqual(ssState.LastSeenStatus, ss.Status) {
+			if !hadState || !reflect.DeepEqual(ssState.LastSeenStatus, ss.Status) {
 				ssState.LastChangeTime = time.Now()
 				ss.Status.DeepCopyInto(&ssState.LastSeenStatus)
-				statefulsetStates[ssName] = ssState
 			}
 
 			// Catch hung rollouts
-			if exists && (time.Since(ssState.LastChangeTime)) > ProgressTimeout {
+			if hadState && (time.Since(ssState.LastChangeTime)) > ProgressTimeout {
 				hung = append(hung, fmt.Sprintf("StatefulSet %q rollout is not making progress - last change %s", ssName.String(), ssState.LastChangeTime.Format(time.RFC3339)))
 				empty := ""
 				ssHung = &empty
 			}
+		}
+		if ssRolloutActive {
+			statefulsetStates[ssName] = ssState
 		} else {
 			delete(statefulsetStates, ssName)
 		}
@@ -199,6 +217,9 @@ func (status *StatusManager) SetFromPods() {
 
 	for _, dep := range deployments {
 		depName := NewClusteredName(dep)
+		depState, hadState := deploymentStates[depName]
+		depState.RolloutActive = deploymentRolloutActive(dep, depState.RolloutActive, status.installComplete)
+		depRolloutActive := depState.RolloutActive
 		depProgressing := false
 
 		if isNonCritical(dep) && dep.Status.UnavailableReplicas > 0 && !status.installComplete {
@@ -208,13 +229,15 @@ func (status *StatusManager) SetFromPods() {
 			progressing = append(progressing, fmt.Sprintf("Deployment %q update is rolling out (%d out of %d updated)", depName.String(), dep.Status.UpdatedReplicas, dep.Status.Replicas))
 			depProgressing = true
 		} else if dep.Status.UnavailableReplicas > 0 {
-			progressing = append(progressing, fmt.Sprintf("Deployment %q is not available (awaiting %d nodes)", depName.String(), dep.Status.UnavailableReplicas))
-			depProgressing = true
+			if depRolloutActive {
+				progressing = append(progressing, fmt.Sprintf("Deployment %q is not available (awaiting %d nodes)", depName.String(), dep.Status.UnavailableReplicas))
+				depProgressing = true
+			}
 			// Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
 			if !isNonCritical(dep) {
 				hung = append(hung, status.CheckCrashLoopBackOffPods(depName, dep.Spec.Selector.MatchLabels, "Deployment")...)
 			}
-		} else if dep.Status.AvailableReplicas == 0 {
+		} else if dep.Status.AvailableReplicas == 0 && depRolloutActive {
 			progressing = append(progressing, fmt.Sprintf("Deployment %q is not yet scheduled on any nodes", depName.String()))
 			depProgressing = true
 		} else if dep.Status.ObservedGeneration < dep.Generation {
@@ -231,19 +254,20 @@ func (status *StatusManager) SetFromPods() {
 		if depProgressing && !isNonCritical(dep) {
 			reachedAvailableLevel = false
 
-			depState, exists := deploymentStates[depName]
-			if !exists || !reflect.DeepEqual(depState.LastSeenStatus, dep.Status) {
+			if !hadState || !reflect.DeepEqual(depState.LastSeenStatus, dep.Status) {
 				depState.LastChangeTime = time.Now()
 				dep.Status.DeepCopyInto(&depState.LastSeenStatus)
-				deploymentStates[depName] = depState
 			}
 
 			// Catch hung rollouts
-			if exists && (time.Since(depState.LastChangeTime)) > ProgressTimeout {
+			if hadState && (time.Since(depState.LastChangeTime)) > ProgressTimeout {
 				hung = append(hung, fmt.Sprintf("Deployment %q rollout is not making progress - last change %s", depName.String(), depState.LastChangeTime.Format(time.RFC3339)))
 				empty := ""
 				depHung = &empty
 			}
+		}
+		if depRolloutActive {
+			deploymentStates[depName] = depState
 		} else {
 			delete(deploymentStates, depName)
 		}
@@ -253,7 +277,10 @@ func (status *StatusManager) SetFromPods() {
 	}
 
 	status.setNotDegraded(PodDeployment)
-	if err := status.setLastPodState(daemonsetStates, deploymentStates, statefulsetStates); err != nil {
+	if reachedAvailableLevel && len(progressing) == 0 {
+		status.installComplete = true
+	}
+	if err := status.setLastPodState(daemonsetStates, deploymentStates, statefulsetStates, status.installComplete); err != nil {
 		log.Printf("Failed to set pod state (continuing): %+v\n", err)
 	}
 
@@ -269,21 +296,71 @@ func (status *StatusManager) SetFromPods() {
 			Status: operv1.ConditionTrue})
 	}
 
-	if reachedAvailableLevel && len(progressing) == 0 {
-		status.installComplete = true
-	}
-
 	if len(hung) > 0 {
 		status.setDegraded(RolloutHung, "RolloutHung", strings.Join(hung, "\n"))
 	} else {
 		status.setNotDegraded(RolloutHung)
 	}
 }
 
+// We only want pod unavailability to count as Progressing when we already know a
+// CNO-managed rollout is in flight. The status snapshots below distinguish:
+// - rollout started: controller still processing a spec change
+// - rollout complete: controller has observed that change and all replicas are healthy
+// If we have neither signal after install, the same "unavailable" counters are
+// treated as ordinary node reboot churn rather than a network rollout.
+func daemonSetRolloutActive(ds *appsv1.DaemonSet, rolloutActive, installComplete bool) bool {
+	rolloutStarted := ds.Generation > ds.Status.ObservedGeneration || ds.Status.UpdatedNumberScheduled < ds.Status.DesiredNumberScheduled
+	rolloutComplete := ds.Status.ObservedGeneration >= expectedGeneration(ds.Generation) &&
+		ds.Status.NumberUnavailable == 0 &&
+		(ds.Status.DesiredNumberScheduled == 0 || ds.Status.UpdatedNumberScheduled >= ds.Status.DesiredNumberScheduled) &&
+		(ds.Status.DesiredNumberScheduled == 0 || ds.Status.NumberAvailable >= ds.Status.DesiredNumberScheduled)
+
+	return updateRolloutActive(rolloutActive, installComplete, rolloutStarted, rolloutComplete)
+}
+
+func statefulSetRolloutActive(ss *appsv1.StatefulSet, rolloutActive, installComplete bool) bool {
+	rolloutStarted := ss.Generation > ss.Status.ObservedGeneration || ss.Status.UpdatedReplicas < ss.Status.Replicas
+	rolloutComplete := ss.Status.ObservedGeneration >= expectedGeneration(ss.Generation) &&
+		ss.Status.UpdatedReplicas >= ss.Status.Replicas &&
+		ss.Status.ReadyReplicas >= ss.Status.Replicas
+
+	return updateRolloutActive(rolloutActive, installComplete, rolloutStarted, rolloutComplete)
+}
+
+func deploymentRolloutActive(dep *appsv1.Deployment, rolloutActive, installComplete bool) bool {
+	rolloutStarted := dep.Generation > dep.Status.ObservedGeneration || dep.Status.UpdatedReplicas < dep.Status.Replicas
+	rolloutComplete := dep.Status.ObservedGeneration >= expectedGeneration(dep.Generation) &&
+		dep.Status.UpdatedReplicas >= dep.Status.Replicas &&
+		dep.Status.UnavailableReplicas == 0 &&
+		(dep.Status.Replicas == 0 || dep.Status.AvailableReplicas >= dep.Status.Replicas)
+
+	return updateRolloutActive(rolloutActive, installComplete, rolloutStarted, rolloutComplete)
+}
+
+// Once install is complete, only explicit rollout signals should reactivate Progressing.
+func updateRolloutActive(rolloutActive, installComplete, rolloutStarted, rolloutComplete bool) bool {
+	if !installComplete || rolloutStarted {
+		rolloutActive = true
+	}
+	if rolloutActive && rolloutComplete {
+		return false
+	}
+	return rolloutActive
+}
+
+// Real workload objects start at generation 1; tests often omit it and leave the zero value.
+func expectedGeneration(generation int64) int64 {
+	if generation > 0 {
+		return generation
+	}
+	return 1
+}
+
 // getLastPodState reads the last-seen daemonset + deployment + statefulset
 // states from the clusteroperator annotation and parses it. On error, it
 // returns an empty state, since this should not block updating operator status.
-func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState, map[ClusteredName]deploymentState, map[ClusteredName]statefulsetState) {
+func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState, map[ClusteredName]deploymentState, map[ClusteredName]statefulsetState, bool) {
 	// with maps allocated
 	daemonsetStates := map[ClusteredName]daemonsetState{}
 	deploymentStates := map[ClusteredName]deploymentState{}
@@ -294,20 +371,20 @@ func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState
 	err := status.client.ClientFor("").CRClient().Get(context.TODO(), types.NamespacedName{Name: status.name}, co)
 	if err != nil {
 		log.Printf("Failed to get ClusterOperator: %v", err)
-		return daemonsetStates, deploymentStates, statefulsetStates
+		return daemonsetStates, deploymentStates, statefulsetStates, false
 	}
 
 	lsbytes := co.Annotations[lastSeenAnnotation]
 	if lsbytes == "" {
-		return daemonsetStates, deploymentStates, statefulsetStates
+		return daemonsetStates, deploymentStates, statefulsetStates, false
 	}
 
 	out := podState{}
 	err = json.Unmarshal([]byte(lsbytes), &out)
 	if err != nil {
 		// No need to return error; just move on
 		log.Printf("failed to unmashal last-seen-status: %v", err)
-		return daemonsetStates, deploymentStates, statefulsetStates
+		return daemonsetStates, deploymentStates, statefulsetStates, false
 	}
 
 	for _, ds := range out.DaemonsetStates {
@@ -322,18 +399,20 @@ func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState
 		statefulsetStates[ss.ClusteredName] = ss
 	}
 
-	return daemonsetStates, deploymentStates, statefulsetStates
+	return daemonsetStates, deploymentStates, statefulsetStates, out.InstallComplete
 }
 
 func (status *StatusManager) setLastPodState(
 	dss map[ClusteredName]daemonsetState,
 	deps map[ClusteredName]deploymentState,
-	sss map[ClusteredName]statefulsetState) error {
+	sss map[ClusteredName]statefulsetState,
+	installComplete bool) error {
 
 	ps := podState{
 		DaemonsetStates:   make([]daemonsetState, 0, len(dss)),
 		DeploymentStates:  make([]deploymentState, 0, len(deps)),
 		StatefulsetStates: make([]statefulsetState, 0, len(sss)),
+		InstallComplete:   installComplete,
 	}
 
 	for nsn, ds := range dss {

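The state machine that decides when unavailability counts as Progressing is the pure updateRolloutActive function above, which makes it easy to spot-check. The snippet below copies that function verbatim from this commit and walks it through the scenarios the commit message calls out; the case names and expected values are our illustration, not tests from the repo.

// Table-driven spot-check of the rollout-active state machine.
package main

import "fmt"

// Copied verbatim from this commit.
func updateRolloutActive(rolloutActive, installComplete, rolloutStarted, rolloutComplete bool) bool {
	if !installComplete || rolloutStarted {
		rolloutActive = true
	}
	if rolloutActive && rolloutComplete {
		return false
	}
	return rolloutActive
}

func main() {
	cases := []struct {
		name                                                 string
		active, installComplete, started, complete, expected bool
	}{
		// During install, everything counts as an active rollout.
		{"install in progress", false, false, false, false, true},
		// Post-install reboot churn: no started/complete signal, stays inactive.
		{"reboot churn after install", false, true, false, false, false},
		// A spec change activates the rollout...
		{"rollout starts", false, true, true, false, true},
		// ...and the persisted flag keeps it active across reconciles...
		{"rollout still converging", true, true, false, false, true},
		// ...until the workload is fully available again.
		{"rollout completes", true, true, false, true, false},
	}
	for _, c := range cases {
		got := updateRolloutActive(c.active, c.installComplete, c.started, c.complete)
		fmt.Printf("%-28s got=%v want=%v\n", c.name, got, c.expected)
	}
}

The "rollout still converging" case is the reason RolloutActive is persisted at all: rolloutStarted goes false as soon as the controller observes the new generation, and only the stored flag keeps Progressing true until rolloutComplete.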