
Commit 3b365d0

OTA-1813: Populate risks from alerts
CVO fetches alerts from OpenShift monitoring via its REST API, then filters them by state (such as firing), severity (such as critical), name, or label. The matching alerts are converted into conditional update risks and surfaced in `cv.status`. Moreover, they are attached to every update target, conditional or not, and each target's Recommended condition is evaluated accordingly. CVO caches the returned alerts for a configurable expiration period, which avoids bombarding OpenShift monitoring. In addition, this fixes `newRecommendedReason`, which returns either a combined reason or a new reason that overrides the existing one; the issue stems from `recommendedReasonAllExposedRisksAccepted` in [1]. The unit test covers all the possible combinations.

[1]: #1284
1 parent 0276666 commit 3b365d0

9 files changed: 1206 additions & 13 deletions
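
Before the per-file diffs, a sketch of how the new pieces fit together, using names from this diff; the wiring shown here (how `promQLTarget` and `optr.AlertGetter` get set) is an assumption, not verbatim code from the commit:

```go
// Sketch only: a cached alert getter feeding the available-updates sync.
getter := promql.NewAlertGetter(promQLTarget) // promQLTarget: an assumed, pre-built clusterconditions.PromQLTarget
optr.AlertGetter = getter                     // later copied into availableUpdates by syncAvailableUpdates

// During evaluateConditionalUpdates, firing alerts become update risks:
risks := u.evaluateAlertRisks(ctx)  // Getter.Get, then alertsToRisks
u.attachAlertRisksToUpdates(risks)  // attached to every target, conditional or not
```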

New file (package `promql`)

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
```go
package promql

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/prometheus/client_golang/api"
	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/config"

	"k8s.io/klog/v2"

	"github.com/openshift/cluster-version-operator/pkg/clusterconditions"
)

// Getter returns the current set of alerts known to OpenShift monitoring.
type Getter interface {
	Get(ctx context.Context) prometheusv1.AlertsResult
}

func NewAlertGetter(promQLTarget clusterconditions.PromQLTarget) Getter {
	p := NewPromQL(promQLTarget)
	condition := p.Condition
	v, ok := condition.(*PromQL)
	if !ok {
		panic("invalid condition type")
	}
	return &ocAlertGetter{promQL: v, expiration: 1 * time.Minute}
}

type ocAlertGetter struct {
	promQL *PromQL

	mutex       sync.Mutex
	cached      prometheusv1.AlertsResult
	expiration  time.Duration
	lastRefresh time.Time
}

// Get serves the cached alerts, refreshing them once the expiration has
// elapsed; on a failed refresh it logs and falls back to the stale cache.
func (o *ocAlertGetter) Get(ctx context.Context) prometheusv1.AlertsResult {
	if time.Now().After(o.lastRefresh.Add(o.expiration)) {
		if err := o.refresh(ctx); err != nil {
			klog.Errorf("Failed to refresh alerts, using stale cache instead: %v", err)
		}
	}
	return o.cached
}

func (o *ocAlertGetter) refresh(ctx context.Context) error {
	o.mutex.Lock()
	defer o.mutex.Unlock()

	klog.Info("refresh alerts ...")
	p := o.promQL
	host, err := p.Host(ctx)
	if err != nil {
		return fmt.Errorf("failed to determine thanos IP: %w", err)
	}
	p.url.Host = host
	clientConfig := api.Config{Address: p.url.String()}

	if roundTripper, err := config.NewRoundTripperFromConfig(p.HTTPClientConfig, "cluster-conditions"); err == nil {
		clientConfig.RoundTripper = roundTripper
	} else {
		return fmt.Errorf("creating PromQL round-tripper: %w", err)
	}

	promqlClient, err := api.NewClient(clientConfig)
	if err != nil {
		return fmt.Errorf("creating PromQL client: %w", err)
	}

	client := &statusCodeNotImplementedForPostClient{
		client: promqlClient,
	}

	v1api := prometheusv1.NewAPI(client)

	queryContext := ctx
	if p.QueryTimeout > 0 {
		var cancel context.CancelFunc
		queryContext, cancel = context.WithTimeout(ctx, p.QueryTimeout)
		defer cancel()
	}

	r, err := v1api.Alerts(queryContext)
	if err != nil {
		return fmt.Errorf("failed to get alerts: %w", err)
	}
	o.cached = r
	o.lastRefresh = time.Now()
	klog.Infof("refreshed: %d alerts", len(o.cached.Alerts))
	return nil
}
```
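
Serving the stale cache on a failed refresh trades freshness for resilience while monitoring is briefly unreachable. Because `Getter` is a single-method interface, consumers can also be exercised without a live Thanos endpoint; a hypothetical test double (not part of this commit):

```go
package promql_test

import (
	"context"

	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
	"github.com/prometheus/common/model"
)

// fakeGetter is a hypothetical stub satisfying the Getter interface.
type fakeGetter struct {
	result prometheusv1.AlertsResult
}

// Get returns the canned result without contacting monitoring.
func (f *fakeGetter) Get(_ context.Context) prometheusv1.AlertsResult {
	return f.result
}

// firingAlert builds a minimal firing alert for use in such tests.
func firingAlert(name string) prometheusv1.Alert {
	return prometheusv1.Alert{
		State:  prometheusv1.AlertStateFiring,
		Labels: model.LabelSet{"alertname": model.LabelValue(name)},
	}
}
```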

pkg/cvo/availableupdates.go

Lines changed: 206 additions & 3 deletions
```diff
@@ -15,6 +15,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/google/uuid"
+	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"

 	"k8s.io/apimachinery/pkg/api/equality"
 	"k8s.io/apimachinery/pkg/api/meta"
```
```diff
@@ -102,6 +103,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
 	if !acceptRisks.Equal(optrAvailableUpdates.AcceptRisks) {
 		needsConditionalUpdateEval = true
 	}
+	// If risks come from alerts, conditional updates might be stale for at most 2 x minimumUpdateCheckInterval
 	if !needsConditionalUpdateEval {
 		klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
 		return nil
```
```diff
@@ -162,6 +164,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1

 	optrAvailableUpdates.AcceptRisks = acceptRisks
 	optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks
+	optrAvailableUpdates.AlertGetter = optr.AlertGetter
 	optrAvailableUpdates.evaluateConditionalUpdates(ctx)

 	queueKey := optr.queueKey()
```
```diff
@@ -215,6 +218,8 @@ type availableUpdates struct {

 	// RiskConditions stores the condition for every risk (name, url, message, matchingRules).
 	RiskConditions map[string][]metav1.Condition
+
+	AlertGetter AlertGetter
 }

 func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool {
```
```diff
@@ -320,6 +325,7 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
 		ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks,
 		AcceptRisks:                optr.availableUpdates.AcceptRisks,
 		RiskConditions:             optr.availableUpdates.RiskConditions,
+		AlertGetter:                optr.availableUpdates.AlertGetter,
 		LastAttempt:                optr.availableUpdates.LastAttempt,
 		LastSyncOrConfigChange:     optr.availableUpdates.LastSyncOrConfigChange,
 		Current:                    *optr.availableUpdates.Current.DeepCopy(),
```
```diff
@@ -512,6 +518,157 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran
 	}
 }

+type AlertGetter interface {
+	Get(ctx context.Context) prometheusv1.AlertsResult
+}
+
+func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) []configv1.ConditionalUpdateRisk {
+	if u == nil || u.AlertGetter == nil {
+		return nil
+	}
+	alertsResult := u.AlertGetter.Get(ctx)
+	return alertsToRisks(alertsResult.Alerts)
+}
+
+func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk {
+	klog.V(2).Infof("Found %d alerts", len(alerts))
+	risks := map[string]configv1.ConditionalUpdateRisk{}
+	for _, alert := range alerts {
+		var alertName string
+		if alertName = string(alert.Labels["alertname"]); alertName == "" {
+			continue
+		}
+		if alert.State == "pending" {
+			continue
+		}
+
+		var summary string
+		if summary = string(alert.Annotations["summary"]); summary == "" {
+			summary = alertName
+		}
+		if !strings.HasSuffix(summary, ".") {
+			summary += "."
+		}
+
+		var description string
+		alertMessage := string(alert.Annotations["message"])
+		alertDescription := string(alert.Annotations["description"])
+		switch {
+		case alertMessage != "" && alertDescription != "":
+			description += " The alert description is: " + alertDescription + " | " + alertMessage
+		case alertDescription != "":
+			description += " The alert description is: " + alertDescription
+		case alertMessage != "":
+			description += " The alert description is: " + alertMessage
+		default:
+			description += " The alert has no description."
+		}
+
+		var runbook string
+		alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound"
+		if runbook = string(alert.Annotations["runbook_url"]); runbook == "" {
+			runbook = "<alert does not have a runbook_url annotation>"
+		} else {
+			alertURL = runbook
+		}
+
+		details := fmt.Sprintf("%s%s %s", summary, description, runbook)
+
+		severity := string(alert.Labels["severity"])
+		if severity == "critical" {
+			if alertName == internal.AlertNamePodDisruptionBudgetLimit {
+				details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
+			}
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             fmt.Sprintf("Alert:%s", alert.State),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+
+		if alertName == internal.AlertNamePodDisruptionBudgetAtLimit {
+			details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             internal.AlertConditionReason(string(alert.State)),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+
+		if internal.HavePullWaiting.Has(alertName) ||
+			internal.HaveNodes.Has(alertName) ||
+			alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing ||
+			alertName == internal.AlertNameVMCannotBeEvicted {
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             internal.AlertConditionReason(string(alert.State)),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+
+		updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"])
+		if updatePrecheck == "true" {
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             fmt.Sprintf("Alert:%s", alert.State),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+	}
+
+	klog.V(2).Infof("Got %d risks", len(risks))
+	if len(risks) == 0 {
+		return nil
+	}
+
+	var ret []configv1.ConditionalUpdateRisk
+	var keys []string
+	for k := range risks {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	for _, k := range keys {
+		ret = append(ret, risks[k])
+	}
+	return ret
+}
+
+func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url string, condition metav1.Condition) configv1.ConditionalUpdateRisk {
+	risk, ok := risks[riskName]
+	if !ok {
+		return configv1.ConditionalUpdateRisk{
+			Name:    riskName,
+			Message: message,
+			URL:     url,
+			// Always as the alert is firing
+			MatchingRules: []configv1.ClusterCondition{{Type: "Always"}},
+			Conditions:    []metav1.Condition{condition},
+		}
+	}
+
+	if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil {
+		c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message)
+		if c.LastTransitionTime.After(condition.LastTransitionTime.Time) {
+			c.LastTransitionTime = condition.LastTransitionTime
+		}
+		meta.SetStatusCondition(&risk.Conditions, *c)
+	}
+
+	return risk
+}
+
 func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
 	if u == nil {
 		return
```
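
To make the conversion concrete, a sketch of one input/output pair for `alertsToRisks` (illustrative values; `model` is `github.com/prometheus/common/model`):

```go
// A firing critical alert, as returned by the Prometheus v1 API.
alert := prometheusv1.Alert{
	State:    prometheusv1.AlertStateFiring,
	ActiveAt: time.Now(),
	Labels: model.LabelSet{
		"alertname": "KubeNodeNotReady",
		"severity":  "critical",
	},
	Annotations: model.LabelSet{"summary": "Node is not ready"},
}
risks := alertsToRisks([]prometheusv1.Alert{alert})
// Per the code above, the single resulting risk looks like:
//   Name:          "KubeNodeNotReady"
//   Message:       "Node is not ready."       (the summary, with "." appended)
//   URL:           the runbooks not-found URL (no runbook_url annotation)
//   MatchingRules: [{Type: "Always"}]
//   Conditions[0]: Reason "Alert:firing", Status True
// A "pending" alert, or one without an alertname label, produces no risk.
```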
```diff
@@ -521,9 +678,14 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
 	risks := risksInOrder(riskVersions)
 	u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry)

+	if u.ShouldReconcileAcceptRisks() {
+		u.attachAlertRisksToUpdates(u.evaluateAlertRisks(ctx))
+	}
+
 	if err := sanityCheck(u.ConditionalUpdates); err != nil {
 		klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err)
 	}
+
 	for i, conditionalUpdate := range u.ConditionalUpdates {
 		condition := evaluateConditionalUpdate(conditionalUpdate.Risks, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions)

```
```diff
@@ -576,6 +738,45 @@
 	}
 }

+func (u *availableUpdates) attachAlertRisksToUpdates(alertRisks []configv1.ConditionalUpdateRisk) {
+	if u == nil || len(alertRisks) == 0 {
+		return
+	}
+	if u.RiskConditions == nil {
+		u.RiskConditions = map[string][]metav1.Condition{}
+	}
+	for _, alertRisk := range alertRisks {
+		u.RiskConditions[alertRisk.Name] = alertRisk.Conditions
+	}
+	var conditionalUpdates []configv1.ConditionalUpdate
+	for _, update := range u.Updates {
+		conditionalUpdates = append(conditionalUpdates, configv1.ConditionalUpdate{
+			Release: update,
+			Risks:   alertRisks,
+		})
+	}
+	u.Updates = nil
+	for _, conditionalUpdate := range u.ConditionalUpdates {
+		for _, alertRisk := range alertRisks {
+			var found bool
+			for _, risk := range conditionalUpdate.Risks {
+				if alertRisk.Name == risk.Name {
+					found = true
+					break
+				}
+			}
+			if !found {
+				conditionalUpdate.Risks = append(conditionalUpdate.Risks, alertRisk)
+			}
+		}
+		conditionalUpdates = append(conditionalUpdates, conditionalUpdate)
+	}
+	sort.Slice(conditionalUpdates, func(i, j int) bool {
+		return conditionalUpdates[i].Release.Version < conditionalUpdates[j].Release.Version
+	})
+	u.ConditionalUpdates = conditionalUpdates
+}
+
 func unknownExposureMessage(risk configv1.ConditionalUpdateRisk, err error) string {
 	template := `Could not evaluate exposure to update risk %s (%v)
 %s description: %s
```
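
A sketch of the effect of `attachAlertRisksToUpdates` on a mixed set of targets, continuing the `risks` value from the `alertsToRisks` sketch above (version numbers are illustrative):

```go
u := &availableUpdates{
	// One unconditional and one conditional update target.
	Updates:            []configv1.Release{{Version: "4.17.3"}},
	ConditionalUpdates: []configv1.ConditionalUpdate{{Release: configv1.Release{Version: "4.17.4"}}},
}
u.attachAlertRisksToUpdates(risks)
// Afterwards:
//   u.Updates == nil (the unconditional target became conditional);
//   u.ConditionalUpdates holds 4.17.3 and 4.17.4 (sorted by version),
//   each carrying the "KubeNodeNotReady" risk exactly once;
//   u.RiskConditions["KubeNodeNotReady"] records the risk's conditions.
```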
```diff
@@ -616,10 +817,13 @@ var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
 func newRecommendedReason(now, want string) string {
 	switch {
 	case now == recommendedReasonRisksNotExposed ||
-		now == recommendedReasonAllExposedRisksAccepted ||
+		now == recommendedReasonAllExposedRisksAccepted && want != recommendedReasonRisksNotExposed ||
+		now == recommendedReasonEvaluationFailed && want == recommendedReasonExposed ||
 		now == want:
 		return want
-	case want == recommendedReasonRisksNotExposed:
+	case want == recommendedReasonRisksNotExposed ||
+		now == recommendedReasonEvaluationFailed && want == recommendedReasonAllExposedRisksAccepted ||
+		now == recommendedReasonExposed && (want == recommendedReasonAllExposedRisksAccepted || want == recommendedReasonEvaluationFailed):
 		return now
 	default:
 		return recommendedReasonMultiple
```
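
Read as a table, the fixed combination logic keeps `Exposed` dominant and no longer discards `AllExposedRisksAccepted` when a not-exposed reason arrives. A hypothetical excerpt of a table-driven check against the package's reason constants (the commit's own unit test covers all combinations):

```go
func TestNewRecommendedReasonSketch(t *testing.T) {
	for _, tc := range []struct{ now, want, expected string }{
		// A clean "not exposed" state adopts whatever reason arrives next.
		{recommendedReasonRisksNotExposed, recommendedReasonExposed, recommendedReasonExposed},
		// The fix: accepted risks are kept when a later target reports no exposure.
		{recommendedReasonAllExposedRisksAccepted, recommendedReasonRisksNotExposed, recommendedReasonAllExposedRisksAccepted},
		// Exposed dominates evaluation failures and accepted risks in either order.
		{recommendedReasonEvaluationFailed, recommendedReasonExposed, recommendedReasonExposed},
		{recommendedReasonExposed, recommendedReasonEvaluationFailed, recommendedReasonExposed},
		{recommendedReasonExposed, recommendedReasonAllExposedRisksAccepted, recommendedReasonExposed},
	} {
		if got := newRecommendedReason(tc.now, tc.want); got != tc.expected {
			t.Errorf("newRecommendedReason(%q, %q) = %q, want %q", tc.now, tc.want, got, tc.expected)
		}
	}
}
```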
```diff
@@ -676,7 +880,6 @@ func evaluateConditionalUpdate(
 	if len(errorMessages) > 0 {
 		recommended.Message = strings.Join(errorMessages, "\n\n")
 	}
-
 	return recommended
 }

```