@@ -15,6 +15,7 @@ import (
1515 "github.com/google/go-cmp/cmp"
1616 "github.com/google/go-cmp/cmp/cmpopts"
1717 "github.com/google/uuid"
18+ prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
1819
1920 "k8s.io/apimachinery/pkg/api/equality"
2021 "k8s.io/apimachinery/pkg/api/meta"
@@ -102,6 +103,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
102103 if ! acceptRisks .Equal (optrAvailableUpdates .AcceptRisks ) {
103104 needsConditionalUpdateEval = true
104105 }
106+ // If risks from alerts, conditional updates might be stale for maximally 2 x minimumUpdateCheckInterval
105107 if ! needsConditionalUpdateEval {
106108 klog .V (2 ).Infof ("Available updates were recently retrieved, with less than %s elapsed since %s, will try later." , optr .minimumUpdateCheckInterval , optrAvailableUpdates .LastAttempt .Format (time .RFC3339 ))
107109 return nil
@@ -162,6 +164,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
162164
163165 optrAvailableUpdates .AcceptRisks = acceptRisks
164166 optrAvailableUpdates .ShouldReconcileAcceptRisks = optr .shouldReconcileAcceptRisks
167+ optrAvailableUpdates .AlertGetter = optr .AlertGetter
165168 optrAvailableUpdates .evaluateConditionalUpdates (ctx )
166169
167170 queueKey := optr .queueKey ()
@@ -215,6 +218,8 @@ type availableUpdates struct {
215218
216219 // RiskConditions stores the condition for every risk (name, url, message, matchingRules).
217220 RiskConditions map [string ][]metav1.Condition
221+
222+ AlertGetter AlertGetter
218223}
219224
220225func (u * availableUpdates ) RecentlyAttempted (interval time.Duration ) bool {
@@ -320,6 +325,7 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
320325 ShouldReconcileAcceptRisks : optr .shouldReconcileAcceptRisks ,
321326 AcceptRisks : optr .availableUpdates .AcceptRisks ,
322327 RiskConditions : optr .availableUpdates .RiskConditions ,
328+ AlertGetter : optr .availableUpdates .AlertGetter ,
323329 LastAttempt : optr .availableUpdates .LastAttempt ,
324330 LastSyncOrConfigChange : optr .availableUpdates .LastSyncOrConfigChange ,
325331 Current : * optr .availableUpdates .Current .DeepCopy (),
@@ -512,6 +518,157 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran
512518 }
513519}
514520
521+ type AlertGetter interface {
522+ Get (ctx context.Context ) prometheusv1.AlertsResult
523+ }
524+
525+ func (u * availableUpdates ) evaluateAlertRisks (ctx context.Context ) []configv1.ConditionalUpdateRisk {
526+ if u == nil || u .AlertGetter == nil {
527+ return nil
528+ }
529+ alertsResult := u .AlertGetter .Get (ctx )
530+ return alertsToRisks (alertsResult .Alerts )
531+ }
532+
533+ func alertsToRisks (alerts []prometheusv1.Alert ) []configv1.ConditionalUpdateRisk {
534+ klog .V (2 ).Infof ("Found %d alerts" , len (alerts ))
535+ risks := map [string ]configv1.ConditionalUpdateRisk {}
536+ for _ , alert := range alerts {
537+ var alertName string
538+ if alertName = string (alert .Labels ["alertname" ]); alertName == "" {
539+ continue
540+ }
541+ if alert .State == "pending" {
542+ continue
543+ }
544+
545+ var summary string
546+ if summary = string (alert .Annotations ["summary" ]); summary == "" {
547+ summary = alertName
548+ }
549+ if ! strings .HasSuffix (summary , "." ) {
550+ summary += "."
551+ }
552+
553+ var description string
554+ alertMessage := string (alert .Annotations ["message" ])
555+ alertDescription := string (alert .Annotations ["description" ])
556+ switch {
557+ case alertMessage != "" && alertDescription != "" :
558+ description += " The alert description is: " + alertDescription + " | " + alertMessage
559+ case alertDescription != "" :
560+ description += " The alert description is: " + alertDescription
561+ case alertMessage != "" :
562+ description += " The alert description is: " + alertMessage
563+ default :
564+ description += " The alert has no description."
565+ }
566+
567+ var runbook string
568+ alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound"
569+ if runbook = string (alert .Annotations ["runbook_url" ]); runbook == "" {
570+ runbook = "<alert does not have a runbook_url annotation>"
571+ } else {
572+ alertURL = runbook
573+ }
574+
575+ details := fmt .Sprintf ("%s%s %s" , summary , description , runbook )
576+
577+ severity := string (alert .Labels ["severity" ])
578+ if severity == "critical" {
579+ if alertName == internal .AlertNamePodDisruptionBudgetLimit {
580+ details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
581+ }
582+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
583+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
584+ Status : metav1 .ConditionTrue ,
585+ Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
586+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "suggesting significant cluster issues worth investigating" , details ),
587+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
588+ })
589+ continue
590+ }
591+
592+ if alertName == internal .AlertNamePodDisruptionBudgetAtLimit {
593+ details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
594+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
595+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
596+ Status : metav1 .ConditionTrue ,
597+ Reason : internal .AlertConditionReason (string (alert .State )),
598+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "which might slow node drains" , details ),
599+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
600+ })
601+ continue
602+ }
603+
604+ if internal .HavePullWaiting .Has (alertName ) ||
605+ internal .HaveNodes .Has (alertName ) ||
606+ alertName == internal .AlertNameVirtHandlerDaemonSetRolloutFailing ||
607+ alertName == internal .AlertNameVMCannotBeEvicted {
608+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
609+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
610+ Status : metav1 .ConditionTrue ,
611+ Reason : internal .AlertConditionReason (string (alert .State )),
612+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "which may slow workload redistribution during rolling node updates" , details ),
613+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
614+ })
615+ continue
616+ }
617+
618+ updatePrecheck := string (alert .Labels ["openShiftUpdatePrecheck" ])
619+ if updatePrecheck == "true" {
620+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
621+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
622+ Status : metav1 .ConditionTrue ,
623+ Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
624+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "suggesting issues worth investigating before updating the cluster" , details ),
625+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
626+ })
627+ continue
628+ }
629+ }
630+
631+ klog .V (2 ).Infof ("Got %d risks" , len (risks ))
632+ if len (risks ) == 0 {
633+ return nil
634+ }
635+
636+ var ret []configv1.ConditionalUpdateRisk
637+ var keys []string
638+ for k := range risks {
639+ keys = append (keys , k )
640+ }
641+ sort .Strings (keys )
642+ for _ , k := range keys {
643+ ret = append (ret , risks [k ])
644+ }
645+ return ret
646+ }
647+
648+ func getRisk (risks map [string ]configv1.ConditionalUpdateRisk , riskName , message , url string , condition metav1.Condition ) configv1.ConditionalUpdateRisk {
649+ risk , ok := risks [riskName ]
650+ if ! ok {
651+ return configv1.ConditionalUpdateRisk {
652+ Name : riskName ,
653+ Message : message ,
654+ URL : url ,
655+ // Always as the alert is firing
656+ MatchingRules : []configv1.ClusterCondition {{Type : "Always" }},
657+ Conditions : []metav1.Condition {condition },
658+ }
659+ }
660+
661+ if c := meta .FindStatusCondition (risk .Conditions , condition .Type ); c != nil {
662+ c .Message = fmt .Sprintf ("%s; %s" , c .Message , condition .Message )
663+ if c .LastTransitionTime .After (condition .LastTransitionTime .Time ) {
664+ c .LastTransitionTime = condition .LastTransitionTime
665+ }
666+ meta .SetStatusCondition (& risk .Conditions , * c )
667+ }
668+
669+ return risk
670+ }
671+
515672func (u * availableUpdates ) evaluateConditionalUpdates (ctx context.Context ) {
516673 if u == nil {
517674 return
@@ -521,9 +678,14 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
521678 risks := risksInOrder (riskVersions )
522679 u .RiskConditions = loadRiskConditions (ctx , risks , riskVersions , u .ConditionRegistry )
523680
681+ if u .ShouldReconcileAcceptRisks () {
682+ u .attachAlertRisksToUpdates (u .evaluateAlertRisks (ctx ))
683+ }
684+
524685 if err := sanityCheck (u .ConditionalUpdates ); err != nil {
525686 klog .Errorf ("Sanity check failed which might impact risk evaluation: %v" , err )
526687 }
688+
527689 for i , conditionalUpdate := range u .ConditionalUpdates {
528690 condition := evaluateConditionalUpdate (conditionalUpdate .Risks , u .AcceptRisks , u .ShouldReconcileAcceptRisks , u .RiskConditions )
529691
@@ -576,6 +738,45 @@ func (u *availableUpdates) removeUpdate(image string) {
576738 }
577739}
578740
741+ func (u * availableUpdates ) attachAlertRisksToUpdates (alertRisks []configv1.ConditionalUpdateRisk ) {
742+ if u == nil || len (alertRisks ) == 0 {
743+ return
744+ }
745+ if u .RiskConditions == nil {
746+ u .RiskConditions = map [string ][]metav1.Condition {}
747+ }
748+ for _ , alertRisk := range alertRisks {
749+ u .RiskConditions [alertRisk .Name ] = alertRisk .Conditions
750+ }
751+ var conditionalUpdates []configv1.ConditionalUpdate
752+ for _ , update := range u .Updates {
753+ conditionalUpdates = append (conditionalUpdates , configv1.ConditionalUpdate {
754+ Release : update ,
755+ Risks : alertRisks ,
756+ })
757+ }
758+ u .Updates = nil
759+ for _ , conditionalUpdate := range u .ConditionalUpdates {
760+ for _ , alertRisk := range alertRisks {
761+ var found bool
762+ for _ , risk := range conditionalUpdate .Risks {
763+ if alertRisk .Name == risk .Name {
764+ found = true
765+ break
766+ }
767+ }
768+ if ! found {
769+ conditionalUpdate .Risks = append (conditionalUpdate .Risks , alertRisk )
770+ }
771+ }
772+ conditionalUpdates = append (conditionalUpdates , conditionalUpdate )
773+ }
774+ sort .Slice (conditionalUpdates , func (i , j int ) bool {
775+ return conditionalUpdates [i ].Release .Version < conditionalUpdates [j ].Release .Version
776+ })
777+ u .ConditionalUpdates = conditionalUpdates
778+ }
779+
579780func unknownExposureMessage (risk configv1.ConditionalUpdateRisk , err error ) string {
580781 template := `Could not evaluate exposure to update risk %s (%v)
581782 %s description: %s
@@ -616,10 +817,13 @@ var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
616817func newRecommendedReason (now , want string ) string {
617818 switch {
618819 case now == recommendedReasonRisksNotExposed ||
619- now == recommendedReasonAllExposedRisksAccepted ||
820+ now == recommendedReasonAllExposedRisksAccepted && want != recommendedReasonRisksNotExposed ||
821+ now == recommendedReasonEvaluationFailed && want == recommendedReasonExposed ||
620822 now == want :
621823 return want
622- case want == recommendedReasonRisksNotExposed :
824+ case want == recommendedReasonRisksNotExposed ||
825+ now == recommendedReasonEvaluationFailed && want == recommendedReasonAllExposedRisksAccepted ||
826+ now == recommendedReasonExposed && (want == recommendedReasonAllExposedRisksAccepted || want == recommendedReasonEvaluationFailed ):
623827 return now
624828 default :
625829 return recommendedReasonMultiple
@@ -676,7 +880,6 @@ func evaluateConditionalUpdate(
676880 if len (errorMessages ) > 0 {
677881 recommended .Message = strings .Join (errorMessages , "\n \n " )
678882 }
679-
680883 return recommended
681884}
682885
0 commit comments