@@ -38,6 +38,7 @@ type podState struct {
3838 DaemonsetStates []daemonsetState
3939 DeploymentStates []deploymentState
4040 StatefulsetStates []statefulsetState
41+ InstallComplete bool
4142}
4243
4344// daemonsetState is the internal state we use to check if a rollout has
@@ -47,6 +48,7 @@ type daemonsetState struct {
4748
4849 LastSeenStatus appsv1.DaemonSetStatus
4950 LastChangeTime time.Time
51+ RolloutActive bool
5052}
5153
5254// deploymentState is the same as daemonsetState.. but for deployments!
@@ -55,6 +57,7 @@ type deploymentState struct {
5557
5658 LastSeenStatus appsv1.DeploymentStatus
5759 LastChangeTime time.Time
60+ RolloutActive bool
5861}
5962
6063// statefulsetState is the same as daemonsetState.. but for statefulsets!
@@ -63,6 +66,7 @@ type statefulsetState struct {
6366
6467 LastSeenStatus appsv1.StatefulSetStatus
6568 LastChangeTime time.Time
69+ RolloutActive bool
6670}
6771
6872// SetFromPods sets the operator Degraded/Progressing/Available status, based on
@@ -79,14 +83,20 @@ func (status *StatusManager) SetFromPods() {
7983 progressing := []string {}
8084 hung := []string {}
8185
82- daemonsetStates , deploymentStates , statefulsetStates := status .getLastPodState ()
86+ daemonsetStates , deploymentStates , statefulsetStates , installComplete := status .getLastPodState ()
87+ if ! status .installComplete && installComplete {
88+ status .installComplete = true
89+ }
8390
8491 if (len (daemonSets ) + len (deployments ) + len (statefulSets )) == 0 {
8592 progressing = append (progressing , "Deploying" )
8693 }
8794
8895 for _ , ds := range daemonSets {
8996 dsName := NewClusteredName (ds )
97+ dsState , hadState := daemonsetStates [dsName ]
98+ dsState .RolloutActive = daemonSetRolloutActive (ds , dsState .RolloutActive , status .installComplete )
99+ dsRolloutActive := dsState .RolloutActive
90100
91101 dsProgressing := false
92102
@@ -97,13 +107,14 @@ func (status *StatusManager) SetFromPods() {
97107 progressing = append (progressing , fmt .Sprintf ("DaemonSet %q update is rolling out (%d out of %d updated)" , dsName .String (), ds .Status .UpdatedNumberScheduled , ds .Status .DesiredNumberScheduled ))
98108 dsProgressing = true
99109 } else if ds .Status .NumberUnavailable > 0 {
100- progressing = append (progressing , fmt .Sprintf ("DaemonSet %q is not available (awaiting %d nodes)" , dsName .String (), ds .Status .NumberUnavailable ))
101- dsProgressing = true
102- // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
110+ if dsRolloutActive {
111+ progressing = append (progressing , fmt .Sprintf ("DaemonSet %q is not available (awaiting %d nodes)" , dsName .String (), ds .Status .NumberUnavailable ))
112+ dsProgressing = true
113+ }
103114 if ! isNonCritical (ds ) {
104115 hung = append (hung , status .CheckCrashLoopBackOffPods (dsName , ds .Spec .Selector .MatchLabels , "DaemonSet" )... )
105116 }
106- } else if ds .Status .NumberAvailable == 0 && ds . Status . DesiredNumberScheduled > 0 {
117+ } else if ds .Status .NumberAvailable == 0 && dsRolloutActive {
107118 progressing = append (progressing , fmt .Sprintf ("DaemonSet %q is not yet scheduled on any nodes" , dsName .String ()))
108119 dsProgressing = true
109120 } else if ds .Generation > ds .Status .ObservedGeneration {
@@ -120,19 +131,20 @@ func (status *StatusManager) SetFromPods() {
120131 if dsProgressing && ! isNonCritical (ds ) {
121132 reachedAvailableLevel = false
122133
123- dsState , exists := daemonsetStates [dsName ]
124- if ! exists || ! reflect .DeepEqual (dsState .LastSeenStatus , ds .Status ) {
134+ if ! hadState || ! reflect .DeepEqual (dsState .LastSeenStatus , ds .Status ) {
125135 dsState .LastChangeTime = time .Now ()
126136 ds .Status .DeepCopyInto (& dsState .LastSeenStatus )
127- daemonsetStates [dsName ] = dsState
128137 }
129138
130139 // Catch hung rollouts
131- if exists && (time .Since (dsState .LastChangeTime )) > ProgressTimeout {
140+ if hadState && (time .Since (dsState .LastChangeTime )) > ProgressTimeout {
132141 hung = append (hung , fmt .Sprintf ("DaemonSet %q rollout is not making progress - last change %s" , dsName .String (), dsState .LastChangeTime .Format (time .RFC3339 )))
133142 empty := ""
134143 dsHung = & empty
135144 }
145+ }
146+ if dsRolloutActive {
147+ daemonsetStates [dsName ] = dsState
136148 } else {
137149 delete (daemonsetStates , dsName )
138150 }
@@ -143,6 +155,9 @@ func (status *StatusManager) SetFromPods() {
143155
144156 for _ , ss := range statefulSets {
145157 ssName := NewClusteredName (ss )
158+ ssState , hadState := statefulsetStates [ssName ]
159+ ssState .RolloutActive = statefulSetRolloutActive (ss , ssState .RolloutActive , status .installComplete )
160+ ssRolloutActive := ssState .RolloutActive
146161
147162 ssProgressing := false
148163
@@ -153,13 +168,15 @@ func (status *StatusManager) SetFromPods() {
153168 progressing = append (progressing , fmt .Sprintf ("StatefulSet %q update is rolling out (%d out of %d updated)" , ssName .String (), ss .Status .UpdatedReplicas , ss .Status .Replicas ))
154169 ssProgressing = true
155170 } else if ss .Status .ReadyReplicas > 0 && ss .Status .ReadyReplicas < ss .Status .Replicas {
156- progressing = append (progressing , fmt .Sprintf ("StatefulSet %q is not available (awaiting %d nodes)" , ssName .String (), (ss .Status .Replicas - ss .Status .ReadyReplicas )))
157- ssProgressing = true
171+ if ssRolloutActive {
172+ progressing = append (progressing , fmt .Sprintf ("StatefulSet %q is not available (awaiting %d nodes)" , ssName .String (), (ss .Status .Replicas - ss .Status .ReadyReplicas )))
173+ ssProgressing = true
174+ }
158175 // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
159176 if ! isNonCritical (ss ) {
160177 hung = append (hung , status .CheckCrashLoopBackOffPods (ssName , ss .Spec .Selector .MatchLabels , "StatefulSet" )... )
161178 }
162- } else if ss .Status .AvailableReplicas == 0 {
179+ } else if ss .Status .AvailableReplicas == 0 && ssRolloutActive {
163180 progressing = append (progressing , fmt .Sprintf ("StatefulSet %q is not yet scheduled on any nodes" , ssName .String ()))
164181 ssProgressing = true
165182 } else if ss .Status .ObservedGeneration < ss .Generation {
@@ -176,19 +193,20 @@ func (status *StatusManager) SetFromPods() {
176193 if ssProgressing && ! isNonCritical (ss ) {
177194 reachedAvailableLevel = false
178195
179- ssState , exists := statefulsetStates [ssName ]
180- if ! exists || ! reflect .DeepEqual (ssState .LastSeenStatus , ss .Status ) {
196+ if ! hadState || ! reflect .DeepEqual (ssState .LastSeenStatus , ss .Status ) {
181197 ssState .LastChangeTime = time .Now ()
182198 ss .Status .DeepCopyInto (& ssState .LastSeenStatus )
183- statefulsetStates [ssName ] = ssState
184199 }
185200
186201 // Catch hung rollouts
187- if exists && (time .Since (ssState .LastChangeTime )) > ProgressTimeout {
202+ if hadState && (time .Since (ssState .LastChangeTime )) > ProgressTimeout {
188203 hung = append (hung , fmt .Sprintf ("StatefulSet %q rollout is not making progress - last change %s" , ssName .String (), ssState .LastChangeTime .Format (time .RFC3339 )))
189204 empty := ""
190205 ssHung = & empty
191206 }
207+ }
208+ if ssRolloutActive {
209+ statefulsetStates [ssName ] = ssState
192210 } else {
193211 delete (statefulsetStates , ssName )
194212 }
@@ -199,6 +217,9 @@ func (status *StatusManager) SetFromPods() {
199217
200218 for _ , dep := range deployments {
201219 depName := NewClusteredName (dep )
220+ depState , hadState := deploymentStates [depName ]
221+ depState .RolloutActive = deploymentRolloutActive (dep , depState .RolloutActive , status .installComplete )
222+ depRolloutActive := depState .RolloutActive
202223 depProgressing := false
203224
204225 if isNonCritical (dep ) && dep .Status .UnavailableReplicas > 0 && ! status .installComplete {
@@ -208,13 +229,15 @@ func (status *StatusManager) SetFromPods() {
208229 progressing = append (progressing , fmt .Sprintf ("Deployment %q update is rolling out (%d out of %d updated)" , depName .String (), dep .Status .UpdatedReplicas , dep .Status .Replicas ))
209230 depProgressing = true
210231 } else if dep .Status .UnavailableReplicas > 0 {
211- progressing = append (progressing , fmt .Sprintf ("Deployment %q is not available (awaiting %d nodes)" , depName .String (), dep .Status .UnavailableReplicas ))
212- depProgressing = true
232+ if depRolloutActive {
233+ progressing = append (progressing , fmt .Sprintf ("Deployment %q is not available (awaiting %d nodes)" , depName .String (), dep .Status .UnavailableReplicas ))
234+ depProgressing = true
235+ }
213236 // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so.
214237 if ! isNonCritical (dep ) {
215238 hung = append (hung , status .CheckCrashLoopBackOffPods (depName , dep .Spec .Selector .MatchLabels , "Deployment" )... )
216239 }
217- } else if dep .Status .AvailableReplicas == 0 {
240+ } else if dep .Status .AvailableReplicas == 0 && depRolloutActive {
218241 progressing = append (progressing , fmt .Sprintf ("Deployment %q is not yet scheduled on any nodes" , depName .String ()))
219242 depProgressing = true
220243 } else if dep .Status .ObservedGeneration < dep .Generation {
@@ -231,19 +254,20 @@ func (status *StatusManager) SetFromPods() {
231254 if depProgressing && ! isNonCritical (dep ) {
232255 reachedAvailableLevel = false
233256
234- depState , exists := deploymentStates [depName ]
235- if ! exists || ! reflect .DeepEqual (depState .LastSeenStatus , dep .Status ) {
257+ if ! hadState || ! reflect .DeepEqual (depState .LastSeenStatus , dep .Status ) {
236258 depState .LastChangeTime = time .Now ()
237259 dep .Status .DeepCopyInto (& depState .LastSeenStatus )
238- deploymentStates [depName ] = depState
239260 }
240261
241262 // Catch hung rollouts
242- if exists && (time .Since (depState .LastChangeTime )) > ProgressTimeout {
263+ if hadState && (time .Since (depState .LastChangeTime )) > ProgressTimeout {
243264 hung = append (hung , fmt .Sprintf ("Deployment %q rollout is not making progress - last change %s" , depName .String (), depState .LastChangeTime .Format (time .RFC3339 )))
244265 empty := ""
245266 depHung = & empty
246267 }
268+ }
269+ if depRolloutActive {
270+ deploymentStates [depName ] = depState
247271 } else {
248272 delete (deploymentStates , depName )
249273 }
@@ -253,7 +277,10 @@ func (status *StatusManager) SetFromPods() {
253277 }
254278
255279 status .setNotDegraded (PodDeployment )
256- if err := status .setLastPodState (daemonsetStates , deploymentStates , statefulsetStates ); err != nil {
280+ if reachedAvailableLevel && len (progressing ) == 0 {
281+ status .installComplete = true
282+ }
283+ if err := status .setLastPodState (daemonsetStates , deploymentStates , statefulsetStates , status .installComplete ); err != nil {
257284 log .Printf ("Failed to set pod state (continuing): %+v\n " , err )
258285 }
259286
@@ -269,21 +296,71 @@ func (status *StatusManager) SetFromPods() {
269296 Status : operv1 .ConditionTrue })
270297 }
271298
272- if reachedAvailableLevel && len (progressing ) == 0 {
273- status .installComplete = true
274- }
275-
276299 if len (hung ) > 0 {
277300 status .setDegraded (RolloutHung , "RolloutHung" , strings .Join (hung , "\n " ))
278301 } else {
279302 status .setNotDegraded (RolloutHung )
280303 }
281304}
282305
306+ // We only want pod unavailability to count as Progressing when we already know a
307+ // CNO-managed rollout is in flight. The status snapshots below distinguish:
308+ // - rollout started: controller still processing a spec change
309+ // - rollout complete: controller has observed that change and all replicas are healthy
310+ // If we have neither signal after install, the same "unavailable" counters are
311+ // treated as ordinary node reboot churn rather than a network rollout.
312+ func daemonSetRolloutActive (ds * appsv1.DaemonSet , rolloutActive , installComplete bool ) bool {
313+ rolloutStarted := ds .Generation > ds .Status .ObservedGeneration || ds .Status .UpdatedNumberScheduled < ds .Status .DesiredNumberScheduled
314+ rolloutComplete := ds .Status .ObservedGeneration >= expectedGeneration (ds .Generation ) &&
315+ ds .Status .NumberUnavailable == 0 &&
316+ (ds .Status .DesiredNumberScheduled == 0 || ds .Status .UpdatedNumberScheduled >= ds .Status .DesiredNumberScheduled ) &&
317+ (ds .Status .DesiredNumberScheduled == 0 || ds .Status .NumberAvailable >= ds .Status .DesiredNumberScheduled )
318+
319+ return updateRolloutActive (rolloutActive , installComplete , rolloutStarted , rolloutComplete )
320+ }
321+
322+ func statefulSetRolloutActive (ss * appsv1.StatefulSet , rolloutActive , installComplete bool ) bool {
323+ rolloutStarted := ss .Generation > ss .Status .ObservedGeneration || ss .Status .UpdatedReplicas < ss .Status .Replicas
324+ rolloutComplete := ss .Status .ObservedGeneration >= expectedGeneration (ss .Generation ) &&
325+ ss .Status .UpdatedReplicas >= ss .Status .Replicas &&
326+ ss .Status .ReadyReplicas >= ss .Status .Replicas
327+
328+ return updateRolloutActive (rolloutActive , installComplete , rolloutStarted , rolloutComplete )
329+ }
330+
331+ func deploymentRolloutActive (dep * appsv1.Deployment , rolloutActive , installComplete bool ) bool {
332+ rolloutStarted := dep .Generation > dep .Status .ObservedGeneration || dep .Status .UpdatedReplicas < dep .Status .Replicas
333+ rolloutComplete := dep .Status .ObservedGeneration >= expectedGeneration (dep .Generation ) &&
334+ dep .Status .UpdatedReplicas >= dep .Status .Replicas &&
335+ dep .Status .UnavailableReplicas == 0 &&
336+ (dep .Status .Replicas == 0 || dep .Status .AvailableReplicas >= dep .Status .Replicas )
337+
338+ return updateRolloutActive (rolloutActive , installComplete , rolloutStarted , rolloutComplete )
339+ }
340+
341+ // Once install is complete, only explicit rollout signals should reactivate Progressing.
342+ func updateRolloutActive (rolloutActive , installComplete , rolloutStarted , rolloutComplete bool ) bool {
343+ if ! installComplete || rolloutStarted {
344+ rolloutActive = true
345+ }
346+ if rolloutActive && rolloutComplete {
347+ return false
348+ }
349+ return rolloutActive
350+ }
351+
352+ // Real workload objects start at generation 1; tests often omit it and leave the zero value.
353+ func expectedGeneration (generation int64 ) int64 {
354+ if generation > 0 {
355+ return generation
356+ }
357+ return 1
358+ }
359+
283360// getLastPodState reads the last-seen daemonset + deployment + statefulset
284361// states from the clusteroperator annotation and parses it. On error, it
285362// returns an empty state, since this should not block updating operator status.
286- func (status * StatusManager ) getLastPodState () (map [ClusteredName ]daemonsetState , map [ClusteredName ]deploymentState , map [ClusteredName ]statefulsetState ) {
363+ func (status * StatusManager ) getLastPodState () (map [ClusteredName ]daemonsetState , map [ClusteredName ]deploymentState , map [ClusteredName ]statefulsetState , bool ) {
287364 // with maps allocated
288365 daemonsetStates := map [ClusteredName ]daemonsetState {}
289366 deploymentStates := map [ClusteredName ]deploymentState {}
@@ -294,20 +371,20 @@ func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState
294371 err := status .client .ClientFor ("" ).CRClient ().Get (context .TODO (), types.NamespacedName {Name : status .name }, co )
295372 if err != nil {
296373 log .Printf ("Failed to get ClusterOperator: %v" , err )
297- return daemonsetStates , deploymentStates , statefulsetStates
374+ return daemonsetStates , deploymentStates , statefulsetStates , false
298375 }
299376
300377 lsbytes := co .Annotations [lastSeenAnnotation ]
301378 if lsbytes == "" {
302- return daemonsetStates , deploymentStates , statefulsetStates
379+ return daemonsetStates , deploymentStates , statefulsetStates , false
303380 }
304381
305382 out := podState {}
306383 err = json .Unmarshal ([]byte (lsbytes ), & out )
307384 if err != nil {
308385 // No need to return error; just move on
309386 log .Printf ("failed to unmashal last-seen-status: %v" , err )
310- return daemonsetStates , deploymentStates , statefulsetStates
387+ return daemonsetStates , deploymentStates , statefulsetStates , false
311388 }
312389
313390 for _ , ds := range out .DaemonsetStates {
@@ -322,18 +399,20 @@ func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState
322399 statefulsetStates [ss .ClusteredName ] = ss
323400 }
324401
325- return daemonsetStates , deploymentStates , statefulsetStates
402+ return daemonsetStates , deploymentStates , statefulsetStates , out . InstallComplete
326403}
327404
328405func (status * StatusManager ) setLastPodState (
329406 dss map [ClusteredName ]daemonsetState ,
330407 deps map [ClusteredName ]deploymentState ,
331- sss map [ClusteredName ]statefulsetState ) error {
408+ sss map [ClusteredName ]statefulsetState ,
409+ installComplete bool ) error {
332410
333411 ps := podState {
334412 DaemonsetStates : make ([]daemonsetState , 0 , len (dss )),
335413 DeploymentStates : make ([]deploymentState , 0 , len (deps )),
336414 StatefulsetStates : make ([]statefulsetState , 0 , len (sss )),
415+ InstallComplete : installComplete ,
337416 }
338417
339418 for nsn , ds := range dss {
0 commit comments