diff --git a/pkg/controller/statusmanager/machineconfig_status.go b/pkg/controller/statusmanager/machineconfig_status.go index adc6e13495..e68f1f79d2 100644 --- a/pkg/controller/statusmanager/machineconfig_status.go +++ b/pkg/controller/statusmanager/machineconfig_status.go @@ -140,55 +140,43 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo // No degraded pools, so clear degraded status status.setNotDegraded(MachineConfig) - // Now check for progressing and process machine configs for role, machineConfigs := range status.renderedMachineConfigs { pools, err := status.findMachineConfigPoolsForLabel(mcPools, map[string]string{names.MachineConfigLabelRoleKey: role}) if err != nil { klog.Errorf("failed to get machine config pools for the role %s: %v", role, err) } + for _, machineConfig := range machineConfigs.UnsortedList() { + mcSet := sets.New[string](machineConfig) + beingRemoved := status.machineConfigsBeingRemoved[role].Has(machineConfig) - progressingPool := status.isAnyMachineConfigPoolProgressing(pools) - if progressingPool != "" { - status.setProgressing(MachineConfig, "MachineConfig", fmt.Sprintf("%s machine config pool in progressing state", progressingPool)) - return nil - } - for _, pool := range pools { - if pool.Spec.Paused { - // When a machine config pool is in paused state, then it is expected that mco doesn't process any machine configs for the pool. - // so if we report network status as progressing state then it blocks networking upgrade until machine config pool is changed - // into unpaused state. so let's not consider the pool for reporting status. - continue - } - for _, machineConfig := range machineConfigs.UnsortedList() { - added := true - removed := true - mcSet := sets.Set[string]{} - mcSet.Insert(machineConfig) - if mcsBeingRemoved, ok := status.machineConfigsBeingRemoved[role]; ok && mcsBeingRemoved.Has(machineConfig) { - removed = mcutil.AreMachineConfigsRemovedFromPool(pool.Status, mcSet) - if removed { - status.machineConfigsBeingRemoved[role].Delete(machineConfig) - // Delete map entry from status cache if role doesn't have machine configs. By deleting the entry, - // there won't be any unnecessary processing of pools in the reconcile loop when it's not dealing - // with network operator machine configs anymore. - if status.machineConfigsBeingRemoved[role].Len() == 0 { - delete(status.machineConfigsBeingRemoved, role) - } - status.renderedMachineConfigs[role].Delete(machineConfig) - if status.renderedMachineConfigs[role].Len() == 0 { - delete(status.renderedMachineConfigs, role) - } - if err := status.setLastRenderedMachineConfigState(status.renderedMachineConfigs); err != nil { - return fmt.Errorf("failed to update rendered machine config state: %v", err) - } + sawNonPausedPool := false + for _, pool := range pools { + if pool.Spec.Paused { + // When a machine config pool is in paused state, then it is expected that mco doesn't process any machine configs for the pool. + // so if we report network status as progressing state then it blocks networking upgrade until machine config pool is changed + // into unpaused state. so let's not consider the pool for reporting status. + continue + } + sawNonPausedPool = true + + if beingRemoved { + if mcutil.AreMachineConfigsRemovedFromPoolSource(pool.Status, mcSet) { + continue } - } else { - added = mcutil.AreMachineConfigsRenderedOnPool(pool.Status, mcSet) + } else if mcutil.AreMachineConfigsRenderedOnPoolSource(pool.Status, mcSet) { + continue } - if !added || !removed { - status.setProgressing(MachineConfig, "MachineConfig", - fmt.Sprintf("%s machine config pool is still processing %s machine config", pool.Name, machineConfig)) - return nil + + status.setProgressing(MachineConfig, "MachineConfig", + fmt.Sprintf("%s machine config pool is still processing %s machine config", pool.Name, machineConfig)) + return nil + } + + // Wait to prune cached removal state until every non-paused pool for + // this role reflects the updated rendered source. + if beingRemoved && sawNonPausedPool { + if err := status.forgetRemovedMachineConfig(role, machineConfig); err != nil { + return err } } } @@ -197,6 +185,24 @@ func (status *StatusManager) SetFromMachineConfigPool(mcPools []mcfgv1.MachineCo return nil } +func (status *StatusManager) forgetRemovedMachineConfig(role, machineConfig string) error { + status.machineConfigsBeingRemoved[role].Delete(machineConfig) + // Delete map entry from status cache if role doesn't have machine configs. By deleting the entry, + // there won't be any unnecessary processing of pools in the reconcile loop when it's not dealing + // with network operator machine configs anymore. + if status.machineConfigsBeingRemoved[role].Len() == 0 { + delete(status.machineConfigsBeingRemoved, role) + } + status.renderedMachineConfigs[role].Delete(machineConfig) + if status.renderedMachineConfigs[role].Len() == 0 { + delete(status.renderedMachineConfigs, role) + } + if err := status.setLastRenderedMachineConfigState(status.renderedMachineConfigs); err != nil { + return fmt.Errorf("failed to update rendered machine config state: %v", err) + } + return nil +} + func (status *StatusManager) getLastRenderedMachineConfigState() (map[string]sets.Set[string], error) { renderedMachineConfigs := map[string]sets.Set[string]{} co := &configv1.ClusterOperator{ObjectMeta: metav1.ObjectMeta{Name: status.name}} @@ -250,17 +256,6 @@ func (status *StatusManager) isAnyMachineConfigPoolDegraded(pools []mcfgv1.Machi return degradedPool } -func (status *StatusManager) isAnyMachineConfigPoolProgressing(pools []mcfgv1.MachineConfigPool) string { - var progressingPool string - for _, pool := range pools { - if mcomcfgv1.IsMachineConfigPoolConditionTrue(pool.Status.Conditions, mcfgv1.MachineConfigPoolUpdating) { - progressingPool = pool.Name - break - } - } - return progressingPool -} - func (status *StatusManager) findMachineConfigPoolsForLabel(mcPools []mcfgv1.MachineConfigPool, mcLabel labels.Set) ([]mcfgv1.MachineConfigPool, error) { var mcps []mcfgv1.MachineConfigPool for _, mcPool := range mcPools { diff --git a/pkg/controller/statusmanager/pod_status.go b/pkg/controller/statusmanager/pod_status.go index 6ce6eea55e..6e83c910f9 100644 --- a/pkg/controller/statusmanager/pod_status.go +++ b/pkg/controller/statusmanager/pod_status.go @@ -14,6 +14,7 @@ import ( operv1 "github.com/openshift/api/operator/v1" "github.com/openshift/cluster-network-operator/pkg/names" "github.com/openshift/cluster-network-operator/pkg/util" + cohelpers "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" @@ -38,6 +39,7 @@ type podState struct { DaemonsetStates []daemonsetState DeploymentStates []deploymentState StatefulsetStates []statefulsetState + InstallComplete bool } // daemonsetState is the internal state we use to check if a rollout has @@ -80,7 +82,10 @@ func (status *StatusManager) SetFromPods() { hung := []string{} clbo := []string{} - daemonsetStates, deploymentStates, statefulsetStates := status.getLastPodState() + daemonsetStates, deploymentStates, statefulsetStates, installComplete := status.getLastPodState() + if !status.installComplete && installComplete { + status.installComplete = true + } if (len(daemonSets) + len(deployments) + len(statefulSets)) == 0 { progressing = append(progressing, "Deploying") @@ -88,28 +93,28 @@ func (status *StatusManager) SetFromPods() { for _, ds := range daemonSets { dsName := NewClusteredName(ds) + dsState, hadState := daemonsetStates[dsName] + dsRolloutActive := !status.installComplete || ds.Status.UpdatedNumberScheduled < ds.Status.CurrentNumberScheduled dsProgressing := false if isNonCritical(ds) && ds.Status.NumberReady == 0 && !status.installComplete { progressing = append(progressing, fmt.Sprintf("DaemonSet %q is waiting for other operators to become ready", dsName.String())) dsProgressing = true - } else if ds.Status.UpdatedNumberScheduled < ds.Status.DesiredNumberScheduled { - progressing = append(progressing, fmt.Sprintf("DaemonSet %q update is rolling out (%d out of %d updated)", dsName.String(), ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled)) + } else if ds.Status.UpdatedNumberScheduled < ds.Status.CurrentNumberScheduled { + progressing = append(progressing, fmt.Sprintf("DaemonSet %q update is rolling out (%d out of %d updated)", dsName.String(), ds.Status.UpdatedNumberScheduled, ds.Status.CurrentNumberScheduled)) dsProgressing = true } else if ds.Status.NumberUnavailable > 0 { - progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not available (awaiting %d nodes)", dsName.String(), ds.Status.NumberUnavailable)) - dsProgressing = true - // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so. + if dsRolloutActive { + progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not available (awaiting %d nodes)", dsName.String(), ds.Status.NumberUnavailable)) + dsProgressing = true + } if !isNonCritical(ds) { clbo = append(clbo, status.CheckCrashLoopBackOffPods(dsName, ds.Spec.Selector.MatchLabels, "DaemonSet")...) } - } else if ds.Status.NumberAvailable == 0 && ds.Status.DesiredNumberScheduled > 0 { + } else if ds.Status.NumberAvailable == 0 && dsRolloutActive { progressing = append(progressing, fmt.Sprintf("DaemonSet %q is not yet scheduled on any nodes", dsName.String())) dsProgressing = true - } else if ds.Generation > ds.Status.ObservedGeneration { - progressing = append(progressing, fmt.Sprintf("DaemonSet %q update is being processed (generation %d, observed generation %d)", dsName.String(), ds.Generation, ds.Status.ObservedGeneration)) - dsProgressing = true } if ds.Annotations["release.openshift.io/version"] != targetLevel { @@ -121,19 +126,20 @@ func (status *StatusManager) SetFromPods() { if dsProgressing && !isNonCritical(ds) { reachedAvailableLevel = false - dsState, exists := daemonsetStates[dsName] - if !exists || !reflect.DeepEqual(dsState.LastSeenStatus, ds.Status) { + if !hadState || !reflect.DeepEqual(dsState.LastSeenStatus, ds.Status) { dsState.LastChangeTime = time.Now() ds.Status.DeepCopyInto(&dsState.LastSeenStatus) - daemonsetStates[dsName] = dsState } // Catch hung rollouts - if exists && (time.Since(dsState.LastChangeTime)) > ProgressTimeout { + if hadState && (time.Since(dsState.LastChangeTime)) > ProgressTimeout { hung = append(hung, fmt.Sprintf("DaemonSet %q rollout is not making progress - last change %s", dsName.String(), dsState.LastChangeTime.Format(time.RFC3339))) empty := "" dsHung = &empty } + } + if dsProgressing && !isNonCritical(ds) { + daemonsetStates[dsName] = dsState } else { delete(daemonsetStates, dsName) } @@ -144,6 +150,8 @@ func (status *StatusManager) SetFromPods() { for _, ss := range statefulSets { ssName := NewClusteredName(ss) + ssState, hadState := statefulsetStates[ssName] + ssRolloutActive := !status.installComplete || ss.Status.UpdatedReplicas < ss.Status.Replicas ssProgressing := false @@ -154,18 +162,17 @@ func (status *StatusManager) SetFromPods() { progressing = append(progressing, fmt.Sprintf("StatefulSet %q update is rolling out (%d out of %d updated)", ssName.String(), ss.Status.UpdatedReplicas, ss.Status.Replicas)) ssProgressing = true } else if ss.Status.ReadyReplicas > 0 && ss.Status.ReadyReplicas < ss.Status.Replicas { - progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not available (awaiting %d nodes)", ssName.String(), (ss.Status.Replicas-ss.Status.ReadyReplicas))) - ssProgressing = true + if ssRolloutActive { + progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not available (awaiting %d nodes)", ssName.String(), (ss.Status.Replicas-ss.Status.ReadyReplicas))) + ssProgressing = true + } // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so. if !isNonCritical(ss) { clbo = append(clbo, status.CheckCrashLoopBackOffPods(ssName, ss.Spec.Selector.MatchLabels, "StatefulSet")...) } - } else if ss.Status.AvailableReplicas == 0 { + } else if ss.Status.AvailableReplicas == 0 && ssRolloutActive { progressing = append(progressing, fmt.Sprintf("StatefulSet %q is not yet scheduled on any nodes", ssName.String())) ssProgressing = true - } else if ss.Status.ObservedGeneration < ss.Generation { - progressing = append(progressing, fmt.Sprintf("StatefulSet %q update is being processed (generation %d, observed generation %d)", ssName.String(), ss.Generation, ss.Status.ObservedGeneration)) - ssProgressing = true } if ss.Annotations["release.openshift.io/version"] != targetLevel { @@ -177,19 +184,20 @@ func (status *StatusManager) SetFromPods() { if ssProgressing && !isNonCritical(ss) { reachedAvailableLevel = false - ssState, exists := statefulsetStates[ssName] - if !exists || !reflect.DeepEqual(ssState.LastSeenStatus, ss.Status) { + if !hadState || !reflect.DeepEqual(ssState.LastSeenStatus, ss.Status) { ssState.LastChangeTime = time.Now() ss.Status.DeepCopyInto(&ssState.LastSeenStatus) - statefulsetStates[ssName] = ssState } // Catch hung rollouts - if exists && (time.Since(ssState.LastChangeTime)) > ProgressTimeout { + if hadState && (time.Since(ssState.LastChangeTime)) > ProgressTimeout { hung = append(hung, fmt.Sprintf("StatefulSet %q rollout is not making progress - last change %s", ssName.String(), ssState.LastChangeTime.Format(time.RFC3339))) empty := "" ssHung = &empty } + } + if ssProgressing && !isNonCritical(ss) { + statefulsetStates[ssName] = ssState } else { delete(statefulsetStates, ssName) } @@ -200,6 +208,8 @@ func (status *StatusManager) SetFromPods() { for _, dep := range deployments { depName := NewClusteredName(dep) + depState, hadState := deploymentStates[depName] + depRolloutActive := !status.installComplete || dep.Status.UpdatedReplicas < dep.Status.Replicas depProgressing := false if isNonCritical(dep) && dep.Status.UnavailableReplicas > 0 && !status.installComplete { @@ -209,18 +219,17 @@ func (status *StatusManager) SetFromPods() { progressing = append(progressing, fmt.Sprintf("Deployment %q update is rolling out (%d out of %d updated)", depName.String(), dep.Status.UpdatedReplicas, dep.Status.Replicas)) depProgressing = true } else if dep.Status.UnavailableReplicas > 0 { - progressing = append(progressing, fmt.Sprintf("Deployment %q is not available (awaiting %d nodes)", depName.String(), dep.Status.UnavailableReplicas)) - depProgressing = true + if depRolloutActive { + progressing = append(progressing, fmt.Sprintf("Deployment %q is not available (awaiting %d nodes)", depName.String(), dep.Status.UnavailableReplicas)) + depProgressing = true + } // Check for any pods in CrashLoopBackOff state and mark the operator as degraded if so. if !isNonCritical(dep) { clbo = append(clbo, status.CheckCrashLoopBackOffPods(depName, dep.Spec.Selector.MatchLabels, "Deployment")...) } - } else if dep.Status.AvailableReplicas == 0 { + } else if dep.Status.AvailableReplicas == 0 && depRolloutActive { progressing = append(progressing, fmt.Sprintf("Deployment %q is not yet scheduled on any nodes", depName.String())) depProgressing = true - } else if dep.Status.ObservedGeneration < dep.Generation { - progressing = append(progressing, fmt.Sprintf("Deployment %q update is being processed (generation %d, observed generation %d)", depName.String(), dep.Generation, dep.Status.ObservedGeneration)) - depProgressing = true } if dep.Annotations["release.openshift.io/version"] != targetLevel { @@ -232,19 +241,20 @@ func (status *StatusManager) SetFromPods() { if depProgressing && !isNonCritical(dep) { reachedAvailableLevel = false - depState, exists := deploymentStates[depName] - if !exists || !reflect.DeepEqual(depState.LastSeenStatus, dep.Status) { + if !hadState || !reflect.DeepEqual(depState.LastSeenStatus, dep.Status) { depState.LastChangeTime = time.Now() dep.Status.DeepCopyInto(&depState.LastSeenStatus) - deploymentStates[depName] = depState } // Catch hung rollouts - if exists && (time.Since(depState.LastChangeTime)) > ProgressTimeout { + if hadState && (time.Since(depState.LastChangeTime)) > ProgressTimeout { hung = append(hung, fmt.Sprintf("Deployment %q rollout is not making progress - last change %s", depName.String(), depState.LastChangeTime.Format(time.RFC3339))) empty := "" depHung = &empty } + } + if depProgressing && !isNonCritical(dep) { + deploymentStates[depName] = depState } else { delete(deploymentStates, depName) } @@ -254,7 +264,10 @@ func (status *StatusManager) SetFromPods() { } status.setNotDegraded(PodDeployment) - if err := status.setLastPodState(daemonsetStates, deploymentStates, statefulsetStates); err != nil { + if reachedAvailableLevel && len(progressing) == 0 { + status.installComplete = true + } + if err := status.setLastPodState(daemonsetStates, deploymentStates, statefulsetStates, status.installComplete); err != nil { log.Printf("Failed to set pod state (continuing): %+v\n", err) } @@ -270,10 +283,6 @@ func (status *StatusManager) SetFromPods() { Status: operv1.ConditionTrue}) } - if reachedAvailableLevel && len(progressing) == 0 { - status.installComplete = true - } - if len(hung) > 0 { status.setDegraded(RolloutHung, "RolloutHung", strings.Join(hung, "\n")) } else { @@ -290,7 +299,7 @@ func (status *StatusManager) SetFromPods() { // getLastPodState reads the last-seen daemonset + deployment + statefulset // states from the clusteroperator annotation and parses it. On error, it // returns an empty state, since this should not block updating operator status. -func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState, map[ClusteredName]deploymentState, map[ClusteredName]statefulsetState) { +func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState, map[ClusteredName]deploymentState, map[ClusteredName]statefulsetState, bool) { // with maps allocated daemonsetStates := map[ClusteredName]daemonsetState{} deploymentStates := map[ClusteredName]deploymentState{} @@ -301,12 +310,12 @@ func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState err := status.client.ClientFor("").CRClient().Get(context.TODO(), types.NamespacedName{Name: status.name}, co) if err != nil { log.Printf("Failed to get ClusterOperator: %v", err) - return daemonsetStates, deploymentStates, statefulsetStates + return daemonsetStates, deploymentStates, statefulsetStates, false } lsbytes := co.Annotations[lastSeenAnnotation] if lsbytes == "" { - return daemonsetStates, deploymentStates, statefulsetStates + return daemonsetStates, deploymentStates, statefulsetStates, false } out := podState{} @@ -314,7 +323,7 @@ func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState if err != nil { // No need to return error; just move on log.Printf("failed to unmashal last-seen-status: %v", err) - return daemonsetStates, deploymentStates, statefulsetStates + return daemonsetStates, deploymentStates, statefulsetStates, false } for _, ds := range out.DaemonsetStates { @@ -329,18 +338,42 @@ func (status *StatusManager) getLastPodState() (map[ClusteredName]daemonsetState statefulsetStates[ss.ClusteredName] = ss } - return daemonsetStates, deploymentStates, statefulsetStates + installComplete, err := installCompleteFromLastPodState(lsbytes, co.Status.Conditions) + if err != nil { + log.Printf("failed to unmarshal last-seen install-complete state: %v", err) + return daemonsetStates, deploymentStates, statefulsetStates, false + } + + return daemonsetStates, deploymentStates, statefulsetStates, installComplete +} + +func installCompleteFromLastPodState(annotation string, conditions []configv1.ClusterOperatorStatusCondition) (bool, error) { + raw := struct { + InstallComplete *bool `json:"InstallComplete"` + }{} + if err := json.Unmarshal([]byte(annotation), &raw); err != nil { + return false, err + } + if raw.InstallComplete != nil { + return *raw.InstallComplete, nil + } + + // Older annotations predate InstallComplete; fall back to the persisted + // ClusterOperator availability so upgrades do not re-enter bootstrap mode. + return cohelpers.IsStatusConditionTrue(conditions, configv1.OperatorAvailable), nil } func (status *StatusManager) setLastPodState( dss map[ClusteredName]daemonsetState, deps map[ClusteredName]deploymentState, - sss map[ClusteredName]statefulsetState) error { + sss map[ClusteredName]statefulsetState, + installComplete bool) error { ps := podState{ DaemonsetStates: make([]daemonsetState, 0, len(dss)), DeploymentStates: make([]deploymentState, 0, len(deps)), StatefulsetStates: make([]statefulsetState, 0, len(sss)), + InstallComplete: installComplete, } for nsn, ds := range dss { diff --git a/pkg/controller/statusmanager/status_manager_test.go b/pkg/controller/statusmanager/status_manager_test.go index 6fc5ae400e..4ce4977898 100644 --- a/pkg/controller/statusmanager/status_manager_test.go +++ b/pkg/controller/statusmanager/status_manager_test.go @@ -834,6 +834,165 @@ func TestStatusManagerSetFromIPsecConfigs(t *testing.T) { } } +func TestStatusManagerSetFromMachineConfigPoolIgnoresNodeRebootChurn(t *testing.T) { + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}, + Spec: operv1.NetworkSpec{DefaultNetwork: operv1.DefaultNetworkDefinition{ + OVNKubernetesConfig: &operv1.OVNKubernetesConfig{IPsecConfig: &operv1.IPsecConfig{Mode: operv1.IPsecModeFull}}}}} + setOC(t, client, no) + setCO(t, client, "testing") + + masterIPsecMachineConfig := mcfgv1.MachineConfig{ObjectMeta: metav1.ObjectMeta{Name: masterMachineConfigIPsecExtName, + Labels: names.MasterRoleMachineConfigLabel(), + OwnerReferences: networkOwnerRef()}, + Spec: mcfgv1.MachineConfigSpec{Extensions: []string{"ipsec"}}} + if err := status.SetMachineConfigs(t.Context(), []mcfgv1.MachineConfig{masterIPsecMachineConfig}); err != nil { + t.Fatalf("error setting machine configs: %v", err) + } + + masterPool := mcfgv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{Name: "master"}, + Spec: mcfgv1.MachineConfigPoolSpec{MachineConfigSelector: &metav1.LabelSelector{ + MatchLabels: names.MasterRoleMachineConfigLabel(), + }}, + Status: mcfgv1.MachineConfigPoolStatus{ + MachineCount: 3, + ReadyMachineCount: 2, + UpdatedMachineCount: 2, + Configuration: mcfgv1.MachineConfigPoolStatusConfiguration{ + Source: []v1.ObjectReference{{Name: masterMachineConfigIPsecExtName}}, + }, + Conditions: []mcfgv1.MachineConfigPoolCondition{{ + Type: mcfgv1.MachineConfigPoolUpdating, + Status: v1.ConditionTrue, + }}, + }, + } + if err := status.SetFromMachineConfigPool([]mcfgv1.MachineConfigPool{masterPool}); err != nil { + t.Fatalf("error processing machine config pools: %v", err) + } + + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{{ + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }}) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } +} + +func TestStatusManagerSetFromMachineConfigPoolWaitsForAllMatchingPoolsOnRemoval(t *testing.T) { + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}, + Spec: operv1.NetworkSpec{DefaultNetwork: operv1.DefaultNetworkDefinition{ + OVNKubernetesConfig: &operv1.OVNKubernetesConfig{IPsecConfig: &operv1.IPsecConfig{Mode: operv1.IPsecModeFull}}}}} + setOC(t, client, no) + setCO(t, client, "testing") + + workerIPsecMachineConfig := mcfgv1.MachineConfig{ObjectMeta: metav1.ObjectMeta{Name: workerMachineConfigIPsecExtName, + Labels: names.WorkerRoleMachineConfigLabel(), + OwnerReferences: networkOwnerRef()}, + Spec: mcfgv1.MachineConfigSpec{Extensions: []string{"ipsec"}}} + if err := status.SetMachineConfigs(t.Context(), []mcfgv1.MachineConfig{workerIPsecMachineConfig}); err != nil { + t.Fatalf("error setting machine configs: %v", err) + } + + workerPool := mcfgv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{Name: "worker"}, + Spec: mcfgv1.MachineConfigPoolSpec{MachineConfigSelector: &metav1.LabelSelector{ + MatchLabels: names.WorkerRoleMachineConfigLabel(), + }}, + Status: mcfgv1.MachineConfigPoolStatus{ + Configuration: mcfgv1.MachineConfigPoolStatusConfiguration{ + Source: []v1.ObjectReference{{Name: workerMachineConfigIPsecExtName}}, + }, + }, + } + customWorkerPool := mcfgv1.MachineConfigPool{ + ObjectMeta: metav1.ObjectMeta{Name: "worker-custom"}, + Spec: mcfgv1.MachineConfigPoolSpec{MachineConfigSelector: &metav1.LabelSelector{ + MatchLabels: names.WorkerRoleMachineConfigLabel(), + }}, + Status: mcfgv1.MachineConfigPoolStatus{ + Configuration: mcfgv1.MachineConfigPoolStatusConfiguration{ + Source: []v1.ObjectReference{{Name: workerMachineConfigIPsecExtName}}, + }, + }, + } + if err := status.SetFromMachineConfigPool([]mcfgv1.MachineConfigPool{workerPool, customWorkerPool}); err != nil { + t.Fatalf("error processing machine config pools: %v", err) + } + + if err := status.SetMachineConfigs(t.Context(), []mcfgv1.MachineConfig{}); err != nil { + t.Fatalf("error setting machine configs: %v", err) + } + + customWorkerPool.Status.Configuration.Source = nil + if err := status.SetFromMachineConfigPool([]mcfgv1.MachineConfigPool{workerPool, customWorkerPool}); err != nil { + t.Fatalf("error processing machine config pools: %v", err) + } + + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{{ + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionTrue, + }}) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } + if !status.renderedMachineConfigs["worker"].Has(workerMachineConfigIPsecExtName) { + t.Fatalf("expected rendered machine config to stay cached until every matching pool removes it: %#v", status.renderedMachineConfigs) + } + if !status.machineConfigsBeingRemoved["worker"].Has(workerMachineConfigIPsecExtName) { + t.Fatalf("expected machine config removal state to stay cached until every matching pool removes it: %#v", status.machineConfigsBeingRemoved) + } + renderedMachineConfigs, err := status.getLastRenderedMachineConfigState() + if err != nil { + t.Fatalf("error getting rendered machine config state: %v", err) + } + if !renderedMachineConfigs["worker"].Has(workerMachineConfigIPsecExtName) { + t.Fatalf("expected rendered machine config annotation to keep %s until every matching pool removes it: %#v", workerMachineConfigIPsecExtName, renderedMachineConfigs) + } + + workerPool.Status.Configuration.Source = nil + if err := status.SetFromMachineConfigPool([]mcfgv1.MachineConfigPool{workerPool, customWorkerPool}); err != nil { + t.Fatalf("error processing machine config pools: %v", err) + } + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{{ + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }}) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } + if _, ok := status.renderedMachineConfigs["worker"]; ok { + t.Fatalf("expected rendered machine config cache to be cleared after all matching pools remove it: %#v", status.renderedMachineConfigs) + } + if _, ok := status.machineConfigsBeingRemoved["worker"]; ok { + t.Fatalf("expected machine config removal state to be cleared after all matching pools remove it: %#v", status.machineConfigsBeingRemoved) + } + renderedMachineConfigs, err = status.getLastRenderedMachineConfigState() + if err != nil { + t.Fatalf("error getting rendered machine config state: %v", err) + } + if _, ok := renderedMachineConfigs["worker"]; ok { + t.Fatalf("expected rendered machine config annotation to be cleared after all matching pools remove it: %#v", renderedMachineConfigs) + } +} + func TestStatusManagerSetFromDaemonSets(t *testing.T) { client := fake.NewFakeClient() status := New(client, "testing", names.StandAloneClusterName) @@ -1031,8 +1190,9 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { t.Fatalf("Progressing condition unexpectedly missing") } - // Now, bump the generation of one of the daemonsets, and verify - // that we enter Progressing state but otherwise stay Available + // Now, bump the generation of one of the daemonsets. Without the + // generation-based rollout signal, this alone should not enter + // Progressing until the daemonset status starts to reflect rollout work. dsA.Generation = 2 set(t, client, dsA) status.SetFromPods() @@ -1048,7 +1208,7 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { }, { Type: operv1.OperatorStatusTypeProgressing, - Status: operv1.ConditionTrue, + Status: operv1.ConditionFalse, }, { Type: operv1.OperatorStatusTypeUpgradeable, @@ -1158,14 +1318,15 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { } setStatus(t, client, dsA) - t0 := time.Now() - time.Sleep(time.Second / 10) status.SetFromPods() co, oc, err = getStatuses(client, "testing") if err != nil { t.Fatalf("error getting ClusterOperator: %v", err) } + // With the simplified rollout detection logic, once UpdatedNumberScheduled >= CurrentNumberScheduled, + // the rollout is complete. Unavailability after rollout completion is treated as + // reboot churn, not a network rollout, so Progressing should be False. if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ { Type: operv1.OperatorStatusTypeDegraded, @@ -1173,7 +1334,7 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { }, { Type: operv1.OperatorStatusTypeProgressing, - Status: operv1.ConditionTrue, + Status: operv1.ConditionFalse, }, { Type: operv1.OperatorStatusTypeUpgradeable, @@ -1190,80 +1351,15 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { t.Fatalf("unexpected Status.Versions: %#v", co.Status.Versions) } - // See that the last pod state is reasonable + // With the new simplified logic, since the rollout is complete (UpdatedNumberScheduled >= CurrentNumberScheduled), + // the DaemonSet state is not tracked. Verify the state is empty. ps := getLastPodState(t, client, "testing") nsn := ClusteredName{Namespace: "one", Name: "alpha"} - found := false for _, ds := range ps.DaemonsetStates { if ds.ClusteredName == nsn { - found = true - if !ds.LastChangeTime.After(t0) { - t.Fatalf("Expected %s to be after %s", ds.LastChangeTime, t0) - } - if !reflect.DeepEqual(dsA.Status, ds.LastSeenStatus) { - t.Fatal("expected cached status to equal last seen status") - } - - break - } - } - if !found { - t.Fatalf("Didn't find %s in pod state", nsn) - } - - // intermission: set the last-update-time to an hour ago, make sure we - // set degraded (because the rollout is hung) - ps = getLastPodState(t, client, "testing") - for idx, ds := range ps.DaemonsetStates { - if ds.ClusteredName == nsn { - ps.DaemonsetStates[idx].LastChangeTime = time.Now().Add(-time.Hour) - break + t.Fatalf("DaemonSet state should not be tracked when rollout is complete, but found: %#v", ds) } } - setLastPodState(t, client, "testing", ps) - status.SetFromPods() - - // RolloutHung is debounced for 2 min, so advance clock and call again - status.clock.(*testingclock.FakeClock).Step(3 * time.Minute) - status.SetFromPods() - - co, oc, err = getStatuses(client, "testing") - if err != nil { - t.Fatalf("error getting ClusterOperator: %v", err) - } - if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ - { - Type: operv1.OperatorStatusTypeDegraded, - Status: operv1.ConditionTrue, - }, - { - Type: operv1.OperatorStatusTypeProgressing, - Status: operv1.ConditionTrue, - }, - { - Type: operv1.OperatorStatusTypeUpgradeable, - Status: operv1.ConditionTrue, - }, - { - Type: operv1.OperatorStatusTypeAvailable, - Status: operv1.ConditionTrue, - }, - }) { - t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) - } - if len(co.Status.Versions) != 1 { - t.Fatalf("unexpected Status.Versions: %#v", co.Status.Versions) - } - - // check hung annotation is set (also, need to refresh objects since they were updated) - err = client.ClientFor("").CRClient().Get(t.Context(), types.NamespacedName{Namespace: "one", Name: "alpha"}, dsA) - if err != nil { - t.Fatalf("error getting DaemonSet: %v", err) - } - - if _, set := dsA.Annotations[names.RolloutHungAnnotation]; !set { - t.Fatalf("Expected rollout-hung annotation, but was missing") - } // done: numberReady -> 1, numberUnavailable -> 0 dsA.Status = appsv1.DaemonSetStatus{ @@ -1335,7 +1431,8 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { set(t, client, dsNC) status.SetFromPods() - // We should now be Progressing, but not un-Available + // Non-critical daemonsets that are merely unavailable after install should not + // move the operator into Progressing without an actual rollout in flight. co, oc, err = getStatuses(client, "testing") if err != nil { t.Fatalf("error getting ClusterOperator: %v", err) @@ -1347,7 +1444,7 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { }, { Type: operv1.OperatorStatusTypeProgressing, - Status: operv1.ConditionTrue, + Status: operv1.ConditionFalse, }, { Type: operv1.OperatorStatusTypeUpgradeable, @@ -1387,7 +1484,7 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { }, { Type: operv1.OperatorStatusTypeProgressing, - Status: operv1.ConditionTrue, + Status: operv1.ConditionFalse, }, { Type: operv1.OperatorStatusTypeUpgradeable, @@ -1441,6 +1538,96 @@ func TestStatusManagerSetFromDaemonSets(t *testing.T) { } } +func TestStatusManagerIgnoresDaemonSetNodeScaleChurnAfterInstall(t *testing.T) { + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + status.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}} + setOC(t, client, no) + + ds := &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "alpha", Generation: 1, Labels: sl}, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "alpha"}, + }, + }, + Status: appsv1.DaemonSetStatus{ + CurrentNumberScheduled: 1, + DesiredNumberScheduled: 1, + UpdatedNumberScheduled: 1, + NumberAvailable: 1, + NumberReady: 1, + ObservedGeneration: 1, + }, + } + set(t, client, ds) + status.SetFromPods() + + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeDegraded, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeUpgradeable, + Status: operv1.ConditionTrue, + }, + { + Type: operv1.OperatorStatusTypeAvailable, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } + + // Simulate a new node being added after install. Desired increases before + // the daemon pod is scheduled, but this is not a CNO rollout. + ds.Status.DesiredNumberScheduled = 2 + ds.Status.NumberUnavailable = 1 + setStatus(t, client, ds) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeDegraded, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeUpgradeable, + Status: operv1.ConditionTrue, + }, + { + Type: operv1.OperatorStatusTypeAvailable, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } + + ps := getLastPodState(t, client, "testing") + if len(ps.DaemonsetStates) != 0 { + t.Fatalf("expected daemonset rollout state to stay empty during node scale churn: %#v", ps.DaemonsetStates) + } +} + func TestStatusManagerSetFromDeployments(t *testing.T) { client := fake.NewFakeClient() status := New(client, "testing", names.StandAloneClusterName) @@ -1541,7 +1728,7 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { } depB := &appsv1.Deployment{ - ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "beta", Labels: sl}, + ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "beta", Generation: 1, Labels: sl}, Status: appsv1.DeploymentStatus{ Replicas: 1, UpdatedReplicas: 1, @@ -1564,17 +1751,14 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { } set(t, client, ds) - t0 := time.Now() - time.Sleep(time.Second / 10) status.SetFromPods() co, oc, err = getStatuses(client, "testing") if err != nil { t.Fatalf("error getting ClusterOperator: %v", err) } - // We should now be progressing because both Deployments and the DaemonSet exist, - // but depB is still incomplete. We're still Available though because we were - // Available before + // Generation skew by itself no longer starts rollout tracking. Until the + // deployment counters show an update in progress, we stay non-Progressing. if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ { Type: operv1.OperatorStatusTypeDegraded, @@ -1582,7 +1766,7 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { }, { Type: operv1.OperatorStatusTypeProgressing, - Status: operv1.ConditionTrue, + Status: operv1.ConditionFalse, }, { Type: operv1.OperatorStatusTypeUpgradeable, @@ -1600,25 +1784,12 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { } ps := getLastPodState(t, client, "testing") - // see that the pod state is sensible nsn := ClusteredName{Namespace: "one", Name: "beta"} - found := false for _, ds := range ps.DeploymentStates { if ds.ClusteredName == nsn { - found = true - if !ds.LastChangeTime.After(t0) { - t.Fatalf("Expected %s to be after %s", ds.LastChangeTime, t0) - } - if !reflect.DeepEqual(depB.Status, ds.LastSeenStatus) { - t.Fatal("expected cached status to equal last seen status") - } - - break + t.Fatalf("expected deployment rollout state for %s to stay empty before rollout counters change: %#v", nsn, ps.DeploymentStates) } } - if !found { - t.Fatalf("Didn't find %s in pod state", nsn) - } err = client.ClientFor("").CRClient().Get(t.Context(), types.NamespacedName{Namespace: depB.Namespace, Name: depB.Name}, depB) if err != nil { @@ -1631,6 +1802,8 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { depB.Status.UnavailableReplicas = 0 depB.Status.AvailableReplicas = depB.Status.Replicas + t0 := time.Now() + time.Sleep(time.Second / 10) setStatus(t, client, depB) status.SetFromPods() @@ -1663,16 +1836,31 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { t.Fatalf("unexpected Status.Versions: %#v", co.Status.Versions) } - // intermission: set back last-seen times by an hour, see that we mark - // as hung ps = getLastPodState(t, client, "testing") - for idx, ds := range ps.DeploymentStates { + found := false + for _, ds := range ps.DeploymentStates { if ds.ClusteredName == nsn { - ps.DeploymentStates[idx].LastChangeTime = time.Now().Add(-time.Hour) + found = true + if !ds.LastChangeTime.After(t0) { + t.Fatalf("Expected %s to be after %s", ds.LastChangeTime, t0) + } + if !reflect.DeepEqual(depB.Status, ds.LastSeenStatus) { + t.Fatal("expected cached status to equal last seen status") + } + break } } - setLastPodState(t, client, "testing", ps) + if !found { + t.Fatalf("Didn't find %s in pod state", nsn) + } + + depB.Status.UpdatedReplicas = depB.Status.Replicas + depB.Status.UnavailableReplicas = 1 + depB.Status.AvailableReplicas = 0 + depB.Status.ObservedGeneration = depB.Generation + + setStatus(t, client, depB) status.SetFromPods() // RolloutHung is debounced for 2 min, so advance clock and call again @@ -1683,16 +1871,17 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { if err != nil { t.Fatalf("error getting ClusterOperator: %v", err) } - // We should still be Progressing, since nothing else has changed, but - // now we're also Degraded, since rollout has not made any progress + // With the simplified rollout detection logic, once UpdatedReplicas >= Replicas, + // the rollout is complete. Unavailability after rollout completion is treated as + // reboot churn, not a network rollout, so Progressing should be False. if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ { Type: operv1.OperatorStatusTypeDegraded, - Status: operv1.ConditionTrue, + Status: operv1.ConditionFalse, }, { Type: operv1.OperatorStatusTypeProgressing, - Status: operv1.ConditionTrue, + Status: operv1.ConditionFalse, }, { Type: operv1.OperatorStatusTypeUpgradeable, @@ -1718,6 +1907,7 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { depB.Status.UpdatedReplicas = depB.Status.Replicas depB.Status.UnavailableReplicas = 0 depB.Status.AvailableReplicas = depB.Status.Replicas + depB.Status.ObservedGeneration = depB.Generation setStatus(t, client, depB) status.SetFromPods() @@ -1748,6 +1938,379 @@ func TestStatusManagerSetFromDeployments(t *testing.T) { if len(co.Status.Versions) != 1 { t.Fatalf("unexpected Status.Versions: %#v", co.Status.Versions) } + + err = client.ClientFor("").CRClient().Get(t.Context(), types.NamespacedName{Namespace: depB.Namespace, Name: depB.Name}, depB) + if err != nil { + t.Fatalf("error getting Deployment: %v", err) + } + + depB.Status.UnavailableReplicas = 1 + depB.Status.AvailableReplicas = 0 + setStatus(t, client, depB) + status.SetFromPods() + + _, oc, err = getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + // A single unavailable replica without a rollout in flight should not set + // Progressing; this is the reboot-churn case. + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeDegraded, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeUpgradeable, + Status: operv1.ConditionTrue, + }, + { + Type: operv1.OperatorStatusTypeAvailable, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } +} + +func TestStatusManagerRestoresInstallCompleteAfterRestart(t *testing.T) { + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + status.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}} + setOC(t, client, no) + + depA := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "alpha", Generation: 1, Labels: sl}, + Status: appsv1.DeploymentStatus{ + Replicas: 1, + UpdatedReplicas: 1, + AvailableReplicas: 1, + UnavailableReplicas: 0, + ObservedGeneration: 1, + }, + } + set(t, client, depA) + + status.SetFromPods() + + ps := getLastPodState(t, client, "testing") + if !ps.InstallComplete { + t.Fatal("expected installComplete to be persisted once pods are stable") + } + + depNC := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "one", + Name: "non-critical", + Generation: 1, + Annotations: map[string]string{ + names.NonCriticalAnnotation: "", + }, + Labels: sl, + }, + Status: appsv1.DeploymentStatus{ + Replicas: 1, + UpdatedReplicas: 1, + AvailableReplicas: 0, + UnavailableReplicas: 1, + ObservedGeneration: 1, + }, + } + set(t, client, depNC) + + restarted := New(client, "testing", names.StandAloneClusterName) + restarted.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(restarted) + restarted.SetFromPods() + + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeDegraded, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeUpgradeable, + Status: operv1.ConditionTrue, + }, + { + Type: operv1.OperatorStatusTypeAvailable, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } +} + +func TestStatusManagerRestoresInstallCompleteFromLegacyAnnotation(t *testing.T) { + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + status.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}} + setOC(t, client, no) + + depA := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "alpha", Generation: 1, Labels: sl}, + Status: appsv1.DeploymentStatus{ + Replicas: 1, + UpdatedReplicas: 1, + AvailableReplicas: 1, + UnavailableReplicas: 0, + ObservedGeneration: 1, + }, + } + set(t, client, depA) + + status.SetFromPods() + + ps := getLastPodState(t, client, "testing") + legacyPodState := struct { + DaemonsetStates []daemonsetState + DeploymentStates []deploymentState + StatefulsetStates []statefulsetState + }{ + DaemonsetStates: ps.DaemonsetStates, + DeploymentStates: ps.DeploymentStates, + StatefulsetStates: ps.StatefulsetStates, + } + co, err := getCO(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + legacyStateBytes, err := json.Marshal(legacyPodState) + if err != nil { + t.Fatalf("error marshalling legacy pod state: %v", err) + } + co.Annotations[lastSeenAnnotation] = string(legacyStateBytes) + if err := client.ClientFor("").CRClient().Update(t.Context(), co); err != nil { + t.Fatalf("error updating ClusterOperator: %v", err) + } + + depNC := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "one", + Name: "non-critical", + Generation: 1, + Annotations: map[string]string{ + names.NonCriticalAnnotation: "", + }, + Labels: sl, + }, + Status: appsv1.DeploymentStatus{ + Replicas: 1, + UpdatedReplicas: 1, + AvailableReplicas: 0, + UnavailableReplicas: 1, + ObservedGeneration: 1, + }, + } + set(t, client, depNC) + + restarted := New(client, "testing", names.StandAloneClusterName) + restarted.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(restarted) + restarted.SetFromPods() + + if !restarted.installComplete { + t.Fatal("expected installComplete to be restored from ClusterOperator availability when legacy annotation omits it") + } + + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeDegraded, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeUpgradeable, + Status: operv1.ConditionTrue, + }, + { + Type: operv1.OperatorStatusTypeAvailable, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } +} + +func TestStatusManagerRestoresActiveRolloutAfterRestart(t *testing.T) { + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + status.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}} + setOC(t, client, no) + + depA := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "alpha", Generation: 1, Labels: sl}, + Status: appsv1.DeploymentStatus{ + Replicas: 1, + UpdatedReplicas: 1, + AvailableReplicas: 1, + UnavailableReplicas: 0, + ObservedGeneration: 1, + }, + } + set(t, client, depA) + status.SetFromPods() + + depB := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "beta", Generation: 1, Labels: sl}, + Status: appsv1.DeploymentStatus{ + Replicas: 1, + UpdatedReplicas: 0, + AvailableReplicas: 1, + UnavailableReplicas: 0, + ObservedGeneration: 0, + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "beta"}, + }, + }, + } + set(t, client, depB) + status.SetFromPods() + + depB.Status.UpdatedReplicas = depB.Status.Replicas + depB.Status.AvailableReplicas = 0 + depB.Status.UnavailableReplicas = 1 + depB.Status.ObservedGeneration = depB.Generation + setStatus(t, client, depB) + + restarted := New(client, "testing", names.StandAloneClusterName) + restarted.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(restarted) + restarted.SetFromPods() + + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + // With the simplified rollout detection logic, once UpdatedReplicas >= Replicas, + // the rollout is complete. Unavailability after rollout completion is treated as + // reboot churn, not a network rollout, so Progressing should be False. + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeDegraded, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeUpgradeable, + Status: operv1.ConditionTrue, + }, + { + Type: operv1.OperatorStatusTypeAvailable, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } +} + +func TestStatusManagerRestoresStatefulSetActiveRolloutAfterRestart(t *testing.T) { + client := fake.NewFakeClient() + status := New(client, "testing", names.StandAloneClusterName) + status.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(status) + no := &operv1.Network{ObjectMeta: metav1.ObjectMeta{Name: names.OPERATOR_CONFIG}} + setOC(t, client, no) + + ssA := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "alpha", Generation: 1, Labels: sl}, + Status: appsv1.StatefulSetStatus{ + Replicas: 1, + UpdatedReplicas: 1, + ReadyReplicas: 1, + AvailableReplicas: 1, + ObservedGeneration: 1, + }, + } + set(t, client, ssA) + status.SetFromPods() + + ssB := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "one", Name: "beta", Generation: 1, Labels: sl}, + Status: appsv1.StatefulSetStatus{ + Replicas: 2, + UpdatedReplicas: 0, + ReadyReplicas: 2, + AvailableReplicas: 2, + ObservedGeneration: 0, + }, + Spec: appsv1.StatefulSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "beta"}, + }, + }, + } + set(t, client, ssB) + status.SetFromPods() + + ssB.Status.UpdatedReplicas = ssB.Status.Replicas + ssB.Status.ReadyReplicas = ssB.Status.Replicas - 1 + ssB.Status.AvailableReplicas = ssB.Status.Replicas - 1 + ssB.Status.ObservedGeneration = ssB.Generation + setStatus(t, client, ssB) + + restarted := New(client, "testing", names.StandAloneClusterName) + restarted.clock = testingclock.NewFakeClock(time.Now()) + setFakeListers(restarted) + restarted.SetFromPods() + + _, oc, err := getStatuses(client, "testing") + if err != nil { + t.Fatalf("error getting ClusterOperator: %v", err) + } + // With the simplified rollout detection logic, once UpdatedReplicas >= Replicas, + // the rollout is complete. Unready replicas after rollout completion are treated as + // reboot churn, not a network rollout, so Progressing should be False. + if !conditionsInclude(oc.Status.Conditions, []operv1.OperatorCondition{ + { + Type: operv1.OperatorStatusTypeDegraded, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeProgressing, + Status: operv1.ConditionFalse, + }, + { + Type: operv1.OperatorStatusTypeUpgradeable, + Status: operv1.ConditionTrue, + }, + { + Type: operv1.OperatorStatusTypeAvailable, + Status: operv1.ConditionTrue, + }, + }) { + t.Fatalf("unexpected Status.Conditions: %#v", oc.Status.Conditions) + } } func getLastPodState(t *testing.T, client cnoclient.Client, name string) podState { @@ -1947,9 +2510,10 @@ func TestStatusManagerCheckCrashLoopBackOffPods(t *testing.T) { // Check that crashlooping deployments also are detected dep := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Namespace: "three", - Name: "gamma", - Labels: sl, + Namespace: "three", + Name: "gamma", + Generation: 1, + Labels: sl, }, Spec: appsv1.DeploymentSpec{ Selector: &metav1.LabelSelector{ diff --git a/pkg/util/machineconfig/util.go b/pkg/util/machineconfig/util.go index 8566e52751..f1f1c28fb1 100644 --- a/pkg/util/machineconfig/util.go +++ b/pkg/util/machineconfig/util.go @@ -26,21 +26,31 @@ func IsUserDefinedIPsecMachineConfig(machineConfig *mcfgv1.MachineConfig) bool { // AreMachineConfigsRenderedOnPool returns true if machineConfigs are completely rendered on the given machine config // pool status, otherwise returns false. func AreMachineConfigsRenderedOnPool(status mcfgv1.MachineConfigPoolStatus, machineConfigs sets.Set[string]) bool { - checkSource := func(sourceNames sets.Set[string], machineConfigs sets.Set[string]) bool { - return sourceNames.IsSuperset(machineConfigs) - } return status.MachineCount == status.UpdatedMachineCount && - checkSourceInMachineConfigPoolStatus(status, machineConfigs, checkSource) + AreMachineConfigsRenderedOnPoolSource(status, machineConfigs) } // AreMachineConfigsRemovedFromPool returns true if machineConfigs are completely removed on the given machine config // pool status, otherwise returns false. func AreMachineConfigsRemovedFromPool(status mcfgv1.MachineConfigPoolStatus, machineConfigs sets.Set[string]) bool { + return status.MachineCount == status.UpdatedMachineCount && + AreMachineConfigsRemovedFromPoolSource(status, machineConfigs) +} + +// AreMachineConfigsRenderedOnPoolSource returns true if machineConfigs are present in the pool's rendered source list. +func AreMachineConfigsRenderedOnPoolSource(status mcfgv1.MachineConfigPoolStatus, machineConfigs sets.Set[string]) bool { + checkSource := func(sourceNames sets.Set[string], machineConfigs sets.Set[string]) bool { + return sourceNames.IsSuperset(machineConfigs) + } + return checkSourceInMachineConfigPoolStatus(status, machineConfigs, checkSource) +} + +// AreMachineConfigsRemovedFromPoolSource returns true if machineConfigs are absent from the pool's rendered source list. +func AreMachineConfigsRemovedFromPoolSource(status mcfgv1.MachineConfigPoolStatus, machineConfigs sets.Set[string]) bool { checkSource := func(sourceNames sets.Set[string], machineConfigs sets.Set[string]) bool { return !sourceNames.HasAny(machineConfigs.UnsortedList()...) } - return status.MachineCount == status.UpdatedMachineCount && - checkSourceInMachineConfigPoolStatus(status, machineConfigs, checkSource) + return checkSourceInMachineConfigPoolStatus(status, machineConfigs, checkSource) } func checkSourceInMachineConfigPoolStatus(machineConfigStatus mcfgv1.MachineConfigPoolStatus, machineConfigs sets.Set[string],