Skip to content

Commit e2b09f2

Browse files
committed
operator: block upgrades on skew violation
This commit implements upgrade blocking when boot image version skew exceeds acceptable limits, via the ClusterOperator Upgradeable condition.
1 parent c82370a commit e2b09f2

2 files changed

Lines changed: 187 additions & 0 deletions

File tree

pkg/controller/common/constants.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,10 @@ const (
155155

156156
// NodeSizingEnabledEnvPath is the file path for the node sizing enabled environment file
157157
NodeSizingEnabledEnvPath = "/etc/node-sizing-enabled.env"
158+
159+
// Current Boot Image Skew Limits
160+
RHCOSVersionBootImageSkewLimit = "9.2"
161+
OCPVersionBootImageSkewLimit = "4.13.0"
158162
)
159163

160164
// Commonly-used MCO ConfigMap names

pkg/operator/status.go

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@ import (
1212

1313
configv1 "github.com/openshift/api/config/v1"
1414
features "github.com/openshift/api/features"
15+
opv1 "github.com/openshift/api/operator/v1"
1516
cov1helpers "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers"
1617
corev1 "k8s.io/api/core/v1"
1718
"k8s.io/apimachinery/pkg/api/equality"
1819
apierrors "k8s.io/apimachinery/pkg/api/errors"
1920
"k8s.io/apimachinery/pkg/api/meta"
2021
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2122
"k8s.io/apimachinery/pkg/labels"
23+
k8sversion "k8s.io/apimachinery/pkg/util/version"
2224
"k8s.io/klog/v2"
2325
"k8s.io/utils/clock"
2426

@@ -269,6 +271,18 @@ func (optr *Operator) syncUpgradeableStatus(co *configv1.ClusterOperator) error
269271
Reason: asExpectedReason,
270272
}
271273

274+
// Check boot image skew upgradeable guards
275+
skewErrorExists, skewErrorMessage, err := optr.checkBootImageSkewUpgradeableGuard()
276+
if err != nil {
277+
return err
278+
}
279+
280+
if skewErrorExists {
281+
coStatusCondition.Status = configv1.ConditionFalse
282+
coStatusCondition.Reason = "ClusterBootImageSkewError"
283+
coStatusCondition.Message = skewErrorMessage
284+
}
285+
272286
var degraded, interrupted bool
273287
for _, pool := range pools {
274288
interrupted = isPoolStatusConditionTrue(pool, mcfgv1.MachineConfigPoolBuildInterrupted)
@@ -600,3 +614,172 @@ func machineConfigPoolStatus(fgHandler ctrlcommon.FeatureGatesHandler, pool *mcf
600614
// taskFailed returns the failure identifier for the given task, formed by
// appending "Failed" to the task name.
func taskFailed(task string) string {
	const failedSuffix = "Failed"
	return task + failedSuffix
}
617+
618+
// checkBootImageSkewUpgradeableGuard checks if the boot image version is within acceptable limits.
619+
// It returns an error if there is no skew enforcement opinion specified. If one is specified,
620+
// it checks if boot image skew is within the expected limit.
621+
func (optr *Operator) checkBootImageSkewUpgradeableGuard() (bool, string, error) {
622+
// Check if feature gate is enabled
623+
if !optr.fgHandler.Enabled(features.FeatureGateBootImageSkewEnforcement) {
624+
return false, "", nil
625+
}
626+
627+
// Fetch MachineConfiguration
628+
mcop, err := optr.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName)
629+
if err != nil {
630+
if apierrors.IsNotFound(err) {
631+
klog.V(4).Infof("MachineConfiguration not found, skipping boot image skew enforcement")
632+
return false, "", nil
633+
}
634+
return false, "", fmt.Errorf("failed to get MachineConfiguration: %w", err)
635+
}
636+
637+
// Perform boot image skew enforcement based on mode
638+
skewLimitExceeded := false
639+
skewLimitExceededMessage := ""
640+
641+
switch mcop.Status.BootImageSkewEnforcementStatus.Mode {
642+
case opv1.BootImageSkewEnforcementModeStatusAutomatic:
643+
skewLimitExceeded, skewLimitExceededMessage = checkBootImageSkew(
644+
mcop.Status.BootImageSkewEnforcementStatus.Automatic.OCPVersion,
645+
mcop.Status.BootImageSkewEnforcementStatus.Automatic.RHCOSVersion,
646+
)
647+
case opv1.BootImageSkewEnforcementModeStatusManual:
648+
skewLimitExceeded, skewLimitExceededMessage = checkBootImageSkew(
649+
mcop.Status.BootImageSkewEnforcementStatus.Manual.OCPVersion,
650+
mcop.Status.BootImageSkewEnforcementStatus.Manual.RHCOSVersion,
651+
)
652+
case opv1.BootImageSkewEnforcementModeStatusNone:
653+
// TODO: Set a low level prom alert to set scaling risk
654+
// Tracked in https://issues.redhat.com/browse/MCO-2035
655+
klog.V(4).Infof("evaluating boot image skew enforcement: mode set to None")
656+
return false, "", nil
657+
default:
658+
// Sanity check, this should only be possible if status hasn't been populated yet.
659+
return false, "", nil
660+
}
661+
662+
if skewLimitExceeded {
663+
// TODO: Update error message; tracked in https://issues.redhat.com/browse/MCO-2034
664+
return true, fmt.Sprintf("Upgrades have been disabled because %s. To enable upgrades, please update your boot images following the documentation at [TODO: insert link], or disable boot image skew enforcement at [TODO: insert link]", skewLimitExceededMessage), nil
665+
}
666+
667+
return false, "", nil
668+
}
669+
670+
// checkBootImageSkew determines if the cluster's boot images are within acceptable version skew.
671+
// It compares the oldest boot image version (currentOCPVersion, currentRHCOSVersion) against the minimum
672+
// supported version.
673+
// Returns true if the boot image version is older than the minimum, along with an error message.
674+
func checkBootImageSkew(currentOCPVersion, currentRHCOSVersion string) (bool, string) {
675+
676+
if currentOCPVersion != "" {
677+
return checkOCPVersionSkew(currentOCPVersion)
678+
}
679+
680+
if currentRHCOSVersion != "" {
681+
return checkRHCOSVersionSkew(currentRHCOSVersion)
682+
}
683+
684+
// This isn't possible due to API validations; more of a sanity check for safety
685+
klog.Warningf("no boot image versions provided, skipping skew check")
686+
return false, ""
687+
}
688+
689+
// checkOCPVersionSkew compares a version string against the minimum supported version.
690+
// Returns true if the version is below the minimum, along with an error message.
691+
func checkOCPVersionSkew(version string) (bool, string) {
692+
// Parse the boot image version
693+
bootImageVersion, err := k8sversion.ParseGeneric(version)
694+
if err != nil {
695+
klog.Warningf("Failed to parse boot image version %q: %v", version, err)
696+
return false, ""
697+
}
698+
699+
// Parse the minimum supported version
700+
minSupportedVersion, err := k8sversion.ParseGeneric(ctrlcommon.OCPVersionBootImageSkewLimit)
701+
if err != nil {
702+
klog.Errorf("Failed to parse OCPVersionBootImageSkewLimit constant %q: %v", ctrlcommon.OCPVersionBootImageSkewLimit, err)
703+
return false, ""
704+
}
705+
706+
// Check if boot image version is less than the minimum supported version
707+
if bootImageVersion.LessThan(minSupportedVersion) {
708+
return true, fmt.Sprintf("the cluster is using OCP boot image version %s, which is below the minimum required version %s",
709+
version, ctrlcommon.OCPVersionBootImageSkewLimit)
710+
}
711+
712+
klog.V(4).Infof("Boot image version %s meets minimum version requirement (>= %s)",
713+
version, ctrlcommon.OCPVersionBootImageSkewLimit)
714+
return false, ""
715+
}
716+
717+
// checkRHCOSVersionSkew compares an RHCOS version string against the minimum supported version.
718+
// Returns true if the version is below the minimum, along with an error message.
719+
//
720+
// Note: RHCOS versions can either have formatting of [major].[minor].[datestamp(YYYYMMDD)]-[buildnumber] (example:9.6.20251023-0) or the legacy
721+
// format of [major].[minor].[timestamp(YYYYMMDDHHmm)]-[buildnumber] (example: 48.84.202208021106-0). In the modern(or RHEL) formatting, we just
722+
// need to compare [major.minor] against the RHCOS skew limit. In the legacy format, the minor version includes the whole RHEL major/minor
723+
// and only that bit should be used to compare against the RHCOS skew limit.
724+
func checkRHCOSVersionSkew(version string) (bool, string) {
725+
// Split version to extract components
726+
parts := strings.Split(version, ".")
727+
if len(parts) < 3 {
728+
klog.Warningf("Failed to parse RHCOS version %q: expected at least 3 parts", version)
729+
return false, ""
730+
}
731+
732+
major := parts[0]
733+
minor := parts[1]
734+
735+
// Extract timestamp (remove build number suffix if present)
736+
timestampPart := parts[2]
737+
if idx := strings.Index(timestampPart, "-"); idx != -1 {
738+
timestampPart = timestampPart[:idx]
739+
}
740+
741+
var versionToCompare string
742+
743+
// Determine format based on timestamp length
744+
switch len(timestampPart) {
745+
case 8:
746+
// Modern format (YYYYMMDD): compare major.minor directly
747+
versionToCompare = fmt.Sprintf("%s.%s", major, minor)
748+
case 12:
749+
// Legacy format (YYYYMMDDHHmm): minor contains RHEL version (e.g., 84 = RHEL 8.4, 810 = RHEL 8.10)
750+
// First digit is RHEL major, remaining digits are RHEL minor.
751+
if len(minor) >= 2 {
752+
versionToCompare = fmt.Sprintf("%s.%s", minor[:1], minor[1:])
753+
} else {
754+
klog.Warningf("Failed to parse RHCOS legacy version %q: minor version too short", version)
755+
return false, ""
756+
}
757+
default:
758+
klog.Warningf("Failed to parse RHCOS version %q: unexpected timestamp format (length %d)", version, len(timestampPart))
759+
return false, ""
760+
}
761+
762+
// Parse the version to compare
763+
bootImageVersion, err := k8sversion.ParseGeneric(versionToCompare)
764+
if err != nil {
765+
klog.Warningf("Failed to parse RHCOS version %q (extracted %q): %v", version, versionToCompare, err)
766+
return false, ""
767+
}
768+
769+
// Parse the minimum supported version
770+
minSupportedVersion, err := k8sversion.ParseGeneric(ctrlcommon.RHCOSVersionBootImageSkewLimit)
771+
if err != nil {
772+
klog.Errorf("Failed to parse RHCOSVersionBootImageSkewLimit constant %q: %v", ctrlcommon.RHCOSVersionBootImageSkewLimit, err)
773+
return false, ""
774+
}
775+
776+
// Check if boot image version is less than the minimum supported version
777+
if bootImageVersion.LessThan(minSupportedVersion) {
778+
return true, fmt.Sprintf("the cluster is using RHCOS boot image version %s(RHEL version: %s), which is below the minimum required RHEL version %s",
779+
version, versionToCompare, ctrlcommon.RHCOSVersionBootImageSkewLimit)
780+
}
781+
782+
klog.V(4).Infof("RHCOS boot image version %s meets minimum version requirement (>= %s)",
783+
version, ctrlcommon.RHCOSVersionBootImageSkewLimit)
784+
return false, ""
785+
}

0 commit comments

Comments
 (0)