Skip to content

Commit e9c1c39

Browse files
Merge pull request #1367 from wking/generic-risk-interface
OTA-1933: pkg/risk: Refactor alerts into a generic update-risk interface
2 parents 9679c0f + d9fe47b commit e9c1c39

10 files changed

Lines changed: 971 additions & 626 deletions

File tree

pkg/cvo/availableupdates.go

Lines changed: 80 additions & 245 deletions
Large diffs are not rendered by default.

pkg/cvo/availableupdates_test.go

Lines changed: 29 additions & 376 deletions
Large diffs are not rendered by default.

pkg/cvo/cvo.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ import (
4343
"github.com/openshift/cluster-version-operator/lib/resourcebuilder"
4444
"github.com/openshift/cluster-version-operator/lib/validation"
4545
"github.com/openshift/cluster-version-operator/pkg/clusterconditions"
46-
"github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql"
4746
"github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard"
4847
"github.com/openshift/cluster-version-operator/pkg/customsignaturestore"
4948
"github.com/openshift/cluster-version-operator/pkg/cvo/configuration"
@@ -54,6 +53,8 @@ import (
5453
"github.com/openshift/cluster-version-operator/pkg/payload"
5554
"github.com/openshift/cluster-version-operator/pkg/payload/precondition"
5655
preconditioncv "github.com/openshift/cluster-version-operator/pkg/payload/precondition/clusterversion"
56+
"github.com/openshift/cluster-version-operator/pkg/risk"
57+
"github.com/openshift/cluster-version-operator/pkg/risk/alert"
5758
)
5859

5960
const (
@@ -209,7 +210,9 @@ type Operator struct {
209210
// configuration, if enabled, reconciles the ClusterVersionOperator configuration.
210211
configuration *configuration.ClusterVersionOperatorConfiguration
211212

212-
AlertGetter AlertGetter
213+
// risks holds update-risk source (in-cluster alerts, etc.)
214+
// that will be aggregated into conditional update risks.
215+
risks risk.Source
213216
}
214217

215218
// New returns a new cluster version operator.
@@ -270,7 +273,7 @@ func New(
270273
exclude: exclude,
271274
clusterProfile: clusterProfile,
272275
conditionRegistry: standard.NewConditionRegistry(promqlTarget),
273-
AlertGetter: promql.NewAlertGetter(promqlTarget),
276+
risks: alert.New("Alert", promqlTarget),
274277
injectClusterIdIntoPromQL: injectClusterIdIntoPromQL,
275278

276279
requiredFeatureSet: featureSet,

pkg/cvo/cvo_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import (
1616

1717
"github.com/davecgh/go-spew/spew"
1818
"github.com/google/go-cmp/cmp"
19-
"github.com/google/go-cmp/cmp/cmpopts"
2019
"github.com/google/uuid"
2120

2221
appsv1 "k8s.io/api/apps/v1"
@@ -2783,7 +2782,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) {
27832782
}
27842783
}
27852784

2786-
if diff := cmp.Diff(tt.wantUpdates, optr.availableUpdates, cmpopts.IgnoreFields(availableUpdates{}, "ShouldReconcileAcceptRisks")); diff != "" {
2785+
if diff := cmp.Diff(tt.wantUpdates, optr.availableUpdates, availableUpdatesCmpOpts...); diff != "" {
27872786
t.Fatalf("unexpected: %s", diff)
27882787
}
27892788
if (optr.queue.Len() > 0) != (optr.availableUpdates != nil) {

pkg/risk/alert/alert.go

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
package alert
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"sort"
7+
"strings"
8+
9+
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
10+
11+
"k8s.io/apimachinery/pkg/api/meta"
12+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13+
"k8s.io/klog/v2"
14+
15+
configv1 "github.com/openshift/api/config/v1"
16+
17+
"github.com/openshift/cluster-version-operator/pkg/clusterconditions"
18+
"github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql"
19+
"github.com/openshift/cluster-version-operator/pkg/internal"
20+
"github.com/openshift/cluster-version-operator/pkg/risk"
21+
)
22+
23+
func New(name string, promQLTarget clusterconditions.PromQLTarget) risk.Source {
24+
return &alertRisk{
25+
name: name,
26+
getter: promql.NewAlertGetter(promQLTarget),
27+
}
28+
}
29+
30+
type alertRisk struct {
31+
name string
32+
getter promql.Getter
33+
}
34+
35+
func (a *alertRisk) Name() string {
36+
return a.name
37+
}
38+
39+
func (a *alertRisk) Risks(ctx context.Context, versions []string) ([]configv1.ConditionalUpdateRisk, map[string][]string, error) {
40+
alerts := a.getter.Get(ctx)
41+
risks := alertsToRisks(alerts.Alerts)
42+
versionMap := make(map[string][]string, len(risks))
43+
for _, risk := range risks {
44+
versionMap[risk.Name] = versions
45+
}
46+
return risks, versionMap, nil
47+
}
48+
49+
func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk {
50+
klog.V(2).Infof("Found %d alerts", len(alerts))
51+
risks := map[string]configv1.ConditionalUpdateRisk{}
52+
for _, alert := range alerts {
53+
var alertName string
54+
if alertName = string(alert.Labels["alertname"]); alertName == "" {
55+
continue
56+
}
57+
if alert.State == "pending" {
58+
continue
59+
}
60+
61+
var summary string
62+
if summary = string(alert.Annotations["summary"]); summary == "" {
63+
summary = alertName
64+
}
65+
if !strings.HasSuffix(summary, ".") {
66+
summary += "."
67+
}
68+
69+
var description string
70+
alertMessage := string(alert.Annotations["message"])
71+
alertDescription := string(alert.Annotations["description"])
72+
switch {
73+
case alertMessage != "" && alertDescription != "":
74+
description += " The alert description is: " + alertDescription + " | " + alertMessage
75+
case alertDescription != "":
76+
description += " The alert description is: " + alertDescription
77+
case alertMessage != "":
78+
description += " The alert description is: " + alertMessage
79+
default:
80+
description += " The alert has no description."
81+
}
82+
83+
var runbook string
84+
alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound"
85+
if runbook = string(alert.Annotations["runbook_url"]); runbook == "" {
86+
runbook = "<alert does not have a runbook_url annotation>"
87+
} else {
88+
alertURL = runbook
89+
}
90+
91+
details := fmt.Sprintf("%s%s %s", summary, description, runbook)
92+
93+
severity := string(alert.Labels["severity"])
94+
if severity == "critical" {
95+
if alertName == internal.AlertNamePodDisruptionBudgetLimit {
96+
details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
97+
}
98+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
99+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
100+
Status: metav1.ConditionTrue,
101+
Reason: fmt.Sprintf("Alert:%s", alert.State),
102+
Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details),
103+
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
104+
})
105+
continue
106+
}
107+
108+
if alertName == internal.AlertNamePodDisruptionBudgetAtLimit {
109+
details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
110+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
111+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
112+
Status: metav1.ConditionTrue,
113+
Reason: internal.AlertConditionReason(string(alert.State)),
114+
Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details),
115+
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
116+
})
117+
continue
118+
}
119+
120+
if internal.HavePullWaiting.Has(alertName) ||
121+
internal.HaveNodes.Has(alertName) ||
122+
alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing ||
123+
alertName == internal.AlertNameVMCannotBeEvicted {
124+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
125+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
126+
Status: metav1.ConditionTrue,
127+
Reason: internal.AlertConditionReason(string(alert.State)),
128+
Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details),
129+
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
130+
})
131+
continue
132+
}
133+
134+
updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"])
135+
if updatePrecheck == "true" {
136+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
137+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
138+
Status: metav1.ConditionTrue,
139+
Reason: fmt.Sprintf("Alert:%s", alert.State),
140+
Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details),
141+
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
142+
})
143+
continue
144+
}
145+
}
146+
147+
klog.V(2).Infof("Got %d risks", len(risks))
148+
if len(risks) == 0 {
149+
return nil
150+
}
151+
152+
var ret []configv1.ConditionalUpdateRisk
153+
var keys []string
154+
for k := range risks {
155+
keys = append(keys, k)
156+
}
157+
sort.Strings(keys)
158+
for _, k := range keys {
159+
ret = append(ret, risks[k])
160+
}
161+
return ret
162+
}
163+
164+
func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url string, condition metav1.Condition) configv1.ConditionalUpdateRisk {
165+
risk, ok := risks[riskName]
166+
if !ok {
167+
return configv1.ConditionalUpdateRisk{
168+
Name: riskName,
169+
Message: message,
170+
URL: url,
171+
// Always as the alert is firing
172+
MatchingRules: []configv1.ClusterCondition{{Type: "Always"}},
173+
Conditions: []metav1.Condition{condition},
174+
}
175+
}
176+
177+
if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil {
178+
c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message)
179+
if c.LastTransitionTime.After(condition.LastTransitionTime.Time) {
180+
c.LastTransitionTime = condition.LastTransitionTime
181+
}
182+
meta.SetStatusCondition(&risk.Conditions, *c)
183+
}
184+
185+
return risk
186+
}

0 commit comments

Comments
 (0)