Skip to content

Commit da9acf2

Browse files
committed
only report Progressing for active network rollouts
Keep pod-based Progressing tied to an actual CNO rollout instead of temporary unavailability during node reboot churn. Persist the rollout generation in status manager state so Progressing stays true until the rollout is both observed and fully available. For machine config status, stop treating generic MCP node convergence as a CNO rollout signal. Check whether the CNO machine config is still present in the pool's rendered source list so routine MCO updates do not flip network Progressing to true. Signed-off-by: Jamo Luhrsen <jluhrsen@gmail.com> Co-Authored-by: Claude Code and Codex
1 parent 2b7b23c commit da9acf2

File tree

3 files changed

+246
-50
lines changed

3 files changed

+246
-50
lines changed

pkg/controller/statusmanager/machineconfig_status.go

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import (
99
configv1 "github.com/openshift/api/config/v1"
1010
mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
1111
"github.com/openshift/cluster-network-operator/pkg/names"
12-
mcutil "github.com/openshift/cluster-network-operator/pkg/util/machineconfig"
1312
mcomcfgv1 "github.com/openshift/machine-config-operator/pkg/apihelpers"
1413
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1514
"k8s.io/apimachinery/pkg/labels"
@@ -140,18 +139,11 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
140139
// No degraded pools, so clear degraded status
141140
status.setNotDegraded(MachineConfig)
142141

143-
// Now check for progressing and process machine configs
144142
for role, machineConfigs := range status.renderedMachineConfigs {
145143
pools, err := status.findMachineConfigPoolsForLabel(mcPools, map[string]string{names.MachineConfigLabelRoleKey: role})
146144
if err != nil {
147145
klog.Errorf("failed to get machine config pools for the role %s: %v", role, err)
148146
}
149-
150-
progressingPool := status.isAnyMachineConfigPoolProgressing(pools)
151-
if progressingPool != "" {
152-
status.setProgressing(MachineConfig, "MachineConfig", fmt.Sprintf("%s machine config pool in progressing state", progressingPool))
153-
return nil
154-
}
155147
for _, pool := range pools {
156148
if pool.Spec.Paused {
157149
// When a machine config pool is in paused state, then it is expected that mco doesn't process any machine configs for the pool.
@@ -165,7 +157,7 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
165157
mcSet := sets.Set[string]{}
166158
mcSet.Insert(machineConfig)
167159
if mcsBeingRemoved, ok := status.machineConfigsBeingRemoved[role]; ok && mcsBeingRemoved.Has(machineConfig) {
168-
removed = mcutil.AreMachineConfigsRemovedFromPool(pool.Status, mcSet)
160+
removed = areMachineConfigsRemovedFromPoolSource(pool.Status, mcSet)
169161
if removed {
170162
status.machineConfigsBeingRemoved[role].Delete(machineConfig)
171163
// Delete map entry from status cache if role doesn't have machine configs. By deleting the entry,
@@ -183,7 +175,7 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo
183175
}
184176
}
185177
} else {
186-
added = mcutil.AreMachineConfigsRenderedOnPool(pool.Status, mcSet)
178+
added = areMachineConfigsRenderedOnPoolSource(pool.Status, mcSet)
187179
}
188180
if !added || !removed {
189181
status.setProgressing(MachineConfig, "MachineConfig",
@@ -239,6 +231,22 @@ func (status *StatusManager) setLastRenderedMachineConfigState(renderedMachineCo
239231
return status.setAnnotation(context.TODO(), co, renderedMachineConfigAnnotation, &anno)
240232
}
241233

234+
func areMachineConfigsRenderedOnPoolSource(status mcfgv1.MachineConfigPoolStatus, machineConfigs sets.Set[string]) bool {
235+
sourceNames := sets.New[string]()
236+
for _, source := range status.Configuration.Source {
237+
sourceNames.Insert(source.Name)
238+
}
239+
return sourceNames.IsSuperset(machineConfigs)
240+
}
241+
242+
func areMachineConfigsRemovedFromPoolSource(status mcfgv1.MachineConfigPoolStatus, machineConfigs sets.Set[string]) bool {
243+
sourceNames := sets.New[string]()
244+
for _, source := range status.Configuration.Source {
245+
sourceNames.Insert(source.Name)
246+
}
247+
return !sourceNames.HasAny(machineConfigs.UnsortedList()...)
248+
}
249+
242250
func (status *StatusManager) isAnyMachineConfigPoolDegraded(pools []mcfgv1.MachineConfigPool) string {
243251
var degradedPool string
244252
for _, pool := range pools {
@@ -250,17 +258,6 @@ func (status *StatusManager) isAnyMachineConfigPoolDegraded(pools []mcfgv1.Machi
250258
return degradedPool
251259
}
252260

253-
func (status *StatusManager) isAnyMachineConfigPoolProgressing(pools []mcfgv1.MachineConfigPool) string {
254-
var progressingPool string
255-
for _, pool := range pools {
256-
if mcomcfgv1.IsMachineConfigPoolConditionTrue(pool.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) {
257-
progressingPool = pool.Name
258-
break
259-
}
260-
}
261-
return progressingPool
262-
}
263-
264261
func (status *StatusManager) findMachineConfigPoolsForLabel(mcPools []mcfgv1.MachineConfigPool, mcLabel labels.Set) ([]mcfgv1.MachineConfigPool, error) {
265262
var mcps []mcfgv1.MachineConfigPool
266263
for _, mcPool := range mcPools {

pkg/controller/statusmanager/pod_status.go

Lines changed: 91 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -45,24 +45,27 @@ type podState struct {
4545
type daemonsetState struct {
4646
ClusteredName
4747

48-
LastSeenStatus appsv1.DaemonSetStatus
49-
LastChangeTime time.Time
48+
LastSeenStatus appsv1.DaemonSetStatus
49+
LastChangeTime time.Time
50+
RolloutGeneration int64
5051
}
5152

5253
// deploymentState is the same as daemonsetState.. but for deployments!
5354
type deploymentState struct {
5455
ClusteredName
5556

56-
LastSeenStatus appsv1.DeploymentStatus
57-
LastChangeTime time.Time
57+
LastSeenStatus appsv1.DeploymentStatus
58+
LastChangeTime time.Time
59+
RolloutGeneration int64
5860
}
5961

6062
// statefulsetState is the same as daemonsetState.. but for statefulsets!
6163
type statefulsetState struct {
6264
ClusteredName
6365

64-
LastSeenStatus appsv1.StatefulSetStatus
65-
LastChangeTime time.Time
66+
LastSeenStatus appsv1.StatefulSetStatus
67+
LastChangeTime time.Time
68+
RolloutGeneration int64
6669
}
6770

6871
// SetFromPods sets the operator Degraded/Progressing/Available status, based on
@@ -87,6 +90,24 @@ func (status *StatusManager) SetFromPods() {
8790

8891
for _, ds := range daemonSets {
8992
dsName := NewClusteredName(ds)
93+
dsState, exists := daemonsetStates[dsName]
94+
currentRolloutGeneration := rolloutGeneration(ds.Generation, ds.Status.ObservedGeneration)
95+
if exists && dsState.RolloutGeneration == 0 {
96+
dsState.RolloutGeneration = currentRolloutGeneration
97+
}
98+
if !status.installComplete || ds.Generation > ds.Status.ObservedGeneration || ds.Status.UpdatedNumberScheduled < ds.Status.DesiredNumberScheduled {
99+
if dsState.RolloutGeneration < currentRolloutGeneration {
100+
dsState.RolloutGeneration = currentRolloutGeneration
101+
}
102+
}
103+
if dsState.RolloutGeneration != 0 &&
104+
ds.Status.ObservedGeneration >= dsState.RolloutGeneration &&
105+
ds.Status.NumberUnavailable == 0 &&
106+
(ds.Status.DesiredNumberScheduled == 0 || ds.Status.UpdatedNumberScheduled >= ds.Status.DesiredNumberScheduled) &&
107+
(ds.Status.DesiredNumberScheduled == 0 || ds.Status.NumberAvailable >= ds.Status.DesiredNumberScheduled) {
108+
dsState.RolloutGeneration = 0
109+
}
110+
dsRolloutActive := dsState.RolloutGeneration != 0
90111

91112
dsProgressing := false
92113

@@ -97,13 +118,14 @@ func (status *StatusManager) SetFromPods() {
97118
progressing = append(progressing, fmt.Sprintf("DaemonSet %q update is rolling out (%d out of %d updated)", dsName.String(), ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled))
98119
dsProgressing = true
99120
} else if ds.Status.NumberUnavailable > 0 {
100-
progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not available (awaiting %d nodes)", dsName.String(), ds.Status.NumberUnavailable))
101-
dsProgressing = true
102-
// Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
121+
if dsRolloutActive {
122+
progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not available (awaiting %d nodes)", dsName.String(), ds.Status.NumberUnavailable))
123+
dsProgressing = true
124+
}
103125
if !isNonCritical(ds) {
104126
hung = append(hung, status.CheckCrashLoopBackOffPods(dsName, ds.Spec.Selector.MatchLabels, "DaemonSet")...)
105127
}
106-
} else if ds.Status.NumberAvailable == 0 && ds.Status.DesiredNumberScheduled > 0 {
128+
} else if ds.Status.NumberAvailable == 0 && dsRolloutActive {
107129
progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not yet scheduled on any nodes", dsName.String()))
108130
dsProgressing = true
109131
} else if ds.Generation > ds.Status.ObservedGeneration {
@@ -120,12 +142,11 @@ func (status *StatusManager) SetFromPods() {
120142
if dsProgressing && !isNonCritical(ds) {
121143
reachedAvailableLevel = false
122144

123-
dsState, exists := daemonsetStates[dsName]
124145
if !exists || !reflect.DeepEqual(dsState.LastSeenStatus, ds.Status) {
125146
dsState.LastChangeTime = time.Now()
126147
ds.Status.DeepCopyInto(&dsState.LastSeenStatus)
127-
daemonsetStates[dsName] = dsState
128148
}
149+
daemonsetStates[dsName] = dsState
129150

130151
// Catch hung rollouts
131152
if exists && (time.Since(dsState.LastChangeTime)) > ProgressTimeout {
@@ -143,6 +164,23 @@ func (status *StatusManager) SetFromPods() {
143164

144165
for _, ss := range statefulSets {
145166
ssName := NewClusteredName(ss)
167+
ssState, exists := statefulsetStates[ssName]
168+
currentRolloutGeneration := rolloutGeneration(ss.Generation, ss.Status.ObservedGeneration)
169+
if exists && ssState.RolloutGeneration == 0 {
170+
ssState.RolloutGeneration = currentRolloutGeneration
171+
}
172+
if !status.installComplete || ss.Generation > ss.Status.ObservedGeneration || ss.Status.UpdatedReplicas < ss.Status.Replicas {
173+
if ssState.RolloutGeneration < currentRolloutGeneration {
174+
ssState.RolloutGeneration = currentRolloutGeneration
175+
}
176+
}
177+
if ssState.RolloutGeneration != 0 &&
178+
ss.Status.ObservedGeneration >= ssState.RolloutGeneration &&
179+
ss.Status.UpdatedReplicas >= ss.Status.Replicas &&
180+
ss.Status.ReadyReplicas >= ss.Status.Replicas {
181+
ssState.RolloutGeneration = 0
182+
}
183+
ssRolloutActive := ssState.RolloutGeneration != 0
146184

147185
ssProgressing := false
148186

@@ -153,13 +191,15 @@ func (status *StatusManager) SetFromPods() {
153191
progressing = append(progressing, fmt.Sprintf("StatefulSet %q update is rolling out (%d out of %d updated)", ssName.String(), ss.Status.UpdatedReplicas, ss.Status.Replicas))
154192
ssProgressing = true
155193
} else if ss.Status.ReadyReplicas > 0 && ss.Status.ReadyReplicas < ss.Status.Replicas {
156-
progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not available (awaiting %d nodes)", ssName.String(), (ss.Status.Replicas-ss.Status.ReadyReplicas)))
157-
ssProgressing = true
194+
if ssRolloutActive {
195+
progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not available (awaiting %d nodes)", ssName.String(), (ss.Status.Replicas-ss.Status.ReadyReplicas)))
196+
ssProgressing = true
197+
}
158198
// Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
159199
if !isNonCritical(ss) {
160200
hung = append(hung, status.CheckCrashLoopBackOffPods(ssName, ss.Spec.Selector.MatchLabels, "StatefulSet")...)
161201
}
162-
} else if ss.Status.AvailableReplicas == 0 {
202+
} else if ss.Status.AvailableReplicas == 0 && ssRolloutActive {
163203
progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not yet scheduled on any nodes", ssName.String()))
164204
ssProgressing = true
165205
} else if ss.Status.ObservedGeneration < ss.Generation {
@@ -176,12 +216,11 @@ func (status *StatusManager) SetFromPods() {
176216
if ssProgressing && !isNonCritical(ss) {
177217
reachedAvailableLevel = false
178218

179-
ssState, exists := statefulsetStates[ssName]
180219
if !exists || !reflect.DeepEqual(ssState.LastSeenStatus, ss.Status) {
181220
ssState.LastChangeTime = time.Now()
182221
ss.Status.DeepCopyInto(&ssState.LastSeenStatus)
183-
statefulsetStates[ssName] = ssState
184222
}
223+
statefulsetStates[ssName] = ssState
185224

186225
// Catch hung rollouts
187226
if exists && (time.Since(ssState.LastChangeTime)) > ProgressTimeout {
@@ -199,6 +238,24 @@ func (status *StatusManager) SetFromPods() {
199238

200239
for _, dep := range deployments {
201240
depName := NewClusteredName(dep)
241+
depState, exists := deploymentStates[depName]
242+
currentRolloutGeneration := rolloutGeneration(dep.Generation, dep.Status.ObservedGeneration)
243+
if exists && depState.RolloutGeneration == 0 {
244+
depState.RolloutGeneration = currentRolloutGeneration
245+
}
246+
if !status.installComplete || dep.Generation > dep.Status.ObservedGeneration || dep.Status.UpdatedReplicas < dep.Status.Replicas {
247+
if depState.RolloutGeneration < currentRolloutGeneration {
248+
depState.RolloutGeneration = currentRolloutGeneration
249+
}
250+
}
251+
if depState.RolloutGeneration != 0 &&
252+
dep.Status.ObservedGeneration >= depState.RolloutGeneration &&
253+
dep.Status.UpdatedReplicas >= dep.Status.Replicas &&
254+
dep.Status.UnavailableReplicas == 0 &&
255+
(dep.Status.Replicas == 0 || dep.Status.AvailableReplicas >= dep.Status.Replicas) {
256+
depState.RolloutGeneration = 0
257+
}
258+
depRolloutActive := depState.RolloutGeneration != 0
202259
depProgressing := false
203260

204261
if isNonCritical(dep) && dep.Status.UnavailableReplicas > 0 && !status.installComplete {
@@ -208,13 +265,15 @@ func (status *StatusManager) SetFromPods() {
208265
progressing = append(progressing, fmt.Sprintf("Deployment %q update is rolling out (%d out of %d updated)", depName.String(), dep.Status.UpdatedReplicas, dep.Status.Replicas))
209266
depProgressing = true
210267
} else if dep.Status.UnavailableReplicas > 0 {
211-
progressing = append(progressing, fmt.Sprintf("Deployment %q is not available (awaiting %d nodes)", depName.String(), dep.Status.UnavailableReplicas))
212-
depProgressing = true
268+
if depRolloutActive {
269+
progressing = append(progressing, fmt.Sprintf("Deployment %q is not available (awaiting %d nodes)", depName.String(), dep.Status.UnavailableReplicas))
270+
depProgressing = true
271+
}
213272
// Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
214273
if !isNonCritical(dep) {
215274
hung = append(hung, status.CheckCrashLoopBackOffPods(depName, dep.Spec.Selector.MatchLabels, "Deployment")...)
216275
}
217-
} else if dep.Status.AvailableReplicas == 0 {
276+
} else if dep.Status.AvailableReplicas == 0 && depRolloutActive {
218277
progressing = append(progressing, fmt.Sprintf("Deployment %q is not yet scheduled on any nodes", depName.String()))
219278
depProgressing = true
220279
} else if dep.Status.ObservedGeneration < dep.Generation {
@@ -231,12 +290,11 @@ func (status *StatusManager) SetFromPods() {
231290
if depProgressing && !isNonCritical(dep) {
232291
reachedAvailableLevel = false
233292

234-
depState, exists := deploymentStates[depName]
235293
if !exists || !reflect.DeepEqual(depState.LastSeenStatus, dep.Status) {
236294
depState.LastChangeTime = time.Now()
237295
dep.Status.DeepCopyInto(&depState.LastSeenStatus)
238-
deploymentStates[depName] = depState
239296
}
297+
deploymentStates[depName] = depState
240298

241299
// Catch hung rollouts
242300
if exists && (time.Since(depState.LastChangeTime)) > ProgressTimeout {
@@ -280,6 +338,17 @@ func (status *StatusManager) SetFromPods() {
280338
}
281339
}
282340

341+
// rolloutGeneration picks the generation to track for an in-flight rollout:
// the spec generation when set, else the observed generation, else 1 so a
// zero-valued object still yields a non-zero (active) rollout marker.
func rolloutGeneration(generation, observedGeneration int64) int64 {
	if generation > 0 {
		return generation
	}
	if observedGeneration > 0 {
		return observedGeneration
	}
	return 1
}
351+
283352
// getLastPodState reads the last-seen daemonset + deployment + statefulset
284353
// states from the clusteroperator annotation and parses it. On error, it
285354
// returns an empty state, since this should not block updating operator status.

0 commit comments

Comments
 (0)