Skip to content

Commit a6050bf

Browse files
committed
Make deployments HA-ready with configurable replica count
1 parent a307a6d commit a6050bf

13 files changed

+607
-27
lines changed

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE
181181
$(STANDARD_MANIFEST) ?= helm/cert-manager.yaml
182182
$(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml
183183
$(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml
184-
$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml
184+
$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/high-availability.yaml
185185
HELM_SETTINGS ?=
186186
.PHONY: $(MANIFESTS)
187187
$(MANIFESTS): $(HELM) $(CONFTEST)
@@ -524,8 +524,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i
524524
CATD_NAMESPACE := olmv1-system
525525
.PHONY: wait
526526
wait:
527-
kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s
528-
kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert
527+
kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m
528+
kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert
529529

530530
.PHONY: docker-build
531531
docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.

hack/test/pre-upgrade-setup.sh

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
#!/bin/bash
2+
3+
set -euo pipefail
4+
5+
# Usage text printed when the script is not given exactly three arguments.
help="pre-upgrade-setup.sh is used to create some basic resources
6+
which will later be used in upgrade testing.
7+
8+
Usage:
9+
pre-upgrade-setup.sh [TEST_CATALOG_IMG] [TEST_CLUSTER_CATALOG_NAME] [TEST_CLUSTER_EXTENSION_NAME]
10+
"
11+
12+
if [[ "$#" -ne 3 ]]; then
13+
echo "Illegal number of arguments passed"
14+
echo "${help}"
15+
exit 1
16+
fi
17+
18+
TEST_CATALOG_IMG=$1
19+
TEST_CLUSTER_CATALOG_NAME=$2
20+
TEST_CLUSTER_EXTENSION_NAME=$3
21+
22+
kubectl apply -f - << EOF
23+
apiVersion: olm.operatorframework.io/v1
24+
kind: ClusterCatalog
25+
metadata:
26+
name: ${TEST_CLUSTER_CATALOG_NAME}
27+
spec:
28+
source:
29+
type: Image
30+
image:
31+
ref: ${TEST_CATALOG_IMG}
32+
pollIntervalMinutes: 1440
33+
EOF
34+
35+
kubectl apply -f - <<EOF
36+
apiVersion: v1
37+
kind: ServiceAccount
38+
metadata:
39+
name: upgrade-e2e
40+
namespace: default
41+
EOF
42+
43+
kubectl apply -f - <<EOF
44+
apiVersion: rbac.authorization.k8s.io/v1
45+
kind: ClusterRole
46+
metadata:
47+
name: upgrade-e2e
48+
rules:
49+
- apiGroups:
50+
- ""
51+
resources:
52+
- "configmaps"
53+
- "secrets"
54+
- "services"
55+
- "serviceaccounts"
56+
verbs:
57+
- "create"
58+
- "update"
59+
- "patch"
60+
- "delete"
61+
- "get"
62+
- "list"
63+
- "watch"
64+
- apiGroups:
65+
- "apps"
66+
resources:
67+
- "deployments"
68+
verbs:
69+
- "create"
70+
- "update"
71+
- "patch"
72+
- "delete"
73+
- "get"
74+
- "list"
75+
- "watch"
76+
- apiGroups:
77+
- "apiextensions.k8s.io"
78+
resources:
79+
- "customresourcedefinitions"
80+
verbs:
81+
- "create"
82+
- "update"
83+
- "patch"
84+
- "delete"
85+
- "get"
86+
- "list"
87+
- "watch"
88+
- apiGroups:
89+
- "rbac.authorization.k8s.io"
90+
resources:
91+
- "clusterroles"
92+
- "clusterrolebindings"
93+
- "roles"
94+
- "rolebindings"
95+
verbs:
96+
- "create"
97+
- "update"
98+
- "patch"
99+
- "delete"
100+
- "get"
101+
- "list"
102+
- "watch"
103+
- "bind"
104+
- "escalate"
105+
- apiGroups:
106+
- networking.k8s.io
107+
resources:
108+
- networkpolicies
109+
verbs:
110+
- get
111+
- list
112+
- watch
113+
- create
114+
- update
115+
- patch
116+
- delete
117+
- apiGroups:
118+
- "olm.operatorframework.io"
119+
resources:
120+
- "clusterextensions/finalizers"
121+
verbs:
122+
- "update"
123+
resourceNames:
124+
- "${TEST_CLUSTER_EXTENSION_NAME}"
125+
EOF
126+
127+
kubectl apply -f - <<EOF
128+
apiVersion: rbac.authorization.k8s.io/v1
129+
kind: ClusterRoleBinding
130+
metadata:
131+
name: upgrade-e2e
132+
subjects:
133+
- kind: ServiceAccount
134+
name: upgrade-e2e
135+
namespace: default
136+
roleRef:
137+
apiGroup: rbac.authorization.k8s.io
138+
kind: ClusterRole
139+
name: upgrade-e2e
140+
EOF
141+
142+
kubectl apply -f - << EOF
143+
apiVersion: olm.operatorframework.io/v1
144+
kind: ClusterExtension
145+
metadata:
146+
name: ${TEST_CLUSTER_EXTENSION_NAME}
147+
spec:
148+
namespace: default
149+
serviceAccount:
150+
name: upgrade-e2e
151+
source:
152+
sourceType: Catalog
153+
catalog:
154+
packageName: test
155+
version: 1.0.0
156+
EOF
157+
158+
# Block until the ClusterCatalog reports Serving; the name is quoted so it is passed as a single argument.
kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog "${TEST_CLUSTER_CATALOG_NAME}"
159+
# Block until the ClusterExtension reports Installed; the name is quoted so it is passed as a single argument.
kubectl wait --for=condition=Installed --timeout=5m ClusterExtension "${TEST_CLUSTER_EXTENSION_NAME}"

helm/high-availability.yaml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# High Availability (HA) configuration for OLMv1
2+
# Sets replicas to 2 for both operator-controller and catalogd to enable HA setup
3+
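# This values file is one of the inputs used to generate manifests/experimental-e2e.yaml (see EXPERIMENTAL_E2E_MANIFEST in the Makefile), so that multi-replica deployments are exercised by the experimental e2e tests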
4+
#
5+
# Pod anti-affinity is configured as "preferred" (not "required") to ensure:
6+
# - In multi-node clusters: replicas are scheduled on different nodes for better availability
7+
# - In single-node clusters (like kind): both replicas can still be scheduled on the same node
8+
options:
9+
operatorController:
10+
deployment:
11+
replicas: 2
12+
catalogd:
13+
deployment:
14+
replicas: 2
15+
16+
# Pod anti-affinity configuration to prefer spreading replicas across different nodes
17+
# Uses preferredDuringSchedulingIgnoredDuringExecution (soft constraint) to allow
18+
# scheduling on the same node when necessary (e.g., single-node kind clusters for e2e tests)
19+
deployments:
20+
templateSpec:
21+
affinity:
22+
podAntiAffinity:
23+
preferredDuringSchedulingIgnoredDuringExecution:
24+
- weight: 100
25+
podAffinityTerm:
26+
labelSelector:
27+
matchExpressions:
28+
- key: control-plane
29+
operator: In
30+
values:
31+
- operator-controller-controller-manager
32+
- catalogd-controller-manager
33+
topologyKey: kubernetes.io/hostname

helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ metadata:
1212
namespace: {{ .Values.namespaces.olmv1.name }}
1313
spec:
1414
minReadySeconds: 5
15-
replicas: 1
15+
replicas: {{ .Values.options.catalogd.deployment.replicas }}
1616
strategy:
1717
type: RollingUpdate
1818
rollingUpdate:
19-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
19+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
2020
maxUnavailable: 0 # Never allow pods to be unavailable during updates
2121
selector:
2222
matchLabels:

helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ metadata:
1111
name: operator-controller-controller-manager
1212
namespace: {{ .Values.namespaces.olmv1.name }}
1313
spec:
14-
replicas: 1
14+
replicas: {{ .Values.options.operatorController.deployment.replicas }}
1515
strategy:
1616
type: RollingUpdate
1717
rollingUpdate:
18-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
18+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
1919
maxUnavailable: 0 # Never allow pods to be unavailable during updates
2020
selector:
2121
matchLabels:

helm/olmv1/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ options:
88
enabled: true
99
deployment:
1010
image: quay.io/operator-framework/operator-controller:devel
11+
replicas: 1
1112
extraArguments: []
1213
features:
1314
enabled: []
@@ -19,6 +20,7 @@ options:
1920
enabled: true
2021
deployment:
2122
image: quay.io/operator-framework/catalogd:devel
23+
replicas: 1
2224
extraArguments: []
2325
features:
2426
enabled: []

manifests/experimental-e2e.yaml

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2535,11 +2535,11 @@ metadata:
25352535
namespace: olmv1-system
25362536
spec:
25372537
minReadySeconds: 5
2538-
replicas: 1
2538+
replicas: 2
25392539
strategy:
25402540
type: RollingUpdate
25412541
rollingUpdate:
2542-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
2542+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
25432543
maxUnavailable: 0 # Never allow pods to be unavailable during updates
25442544
selector:
25452545
matchLabels:
@@ -2652,6 +2652,18 @@ spec:
26522652
operator: In
26532653
values:
26542654
- linux
2655+
podAntiAffinity:
2656+
preferredDuringSchedulingIgnoredDuringExecution:
2657+
- podAffinityTerm:
2658+
labelSelector:
2659+
matchExpressions:
2660+
- key: control-plane
2661+
operator: In
2662+
values:
2663+
- operator-controller-controller-manager
2664+
- catalogd-controller-manager
2665+
topologyKey: kubernetes.io/hostname
2666+
weight: 100
26552667
nodeSelector:
26562668
kubernetes.io/os: linux
26572669
node-role.kubernetes.io/control-plane: ""
@@ -2686,11 +2698,11 @@ metadata:
26862698
name: operator-controller-controller-manager
26872699
namespace: olmv1-system
26882700
spec:
2689-
replicas: 1
2701+
replicas: 2
26902702
strategy:
26912703
type: RollingUpdate
26922704
rollingUpdate:
2693-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
2705+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
26942706
maxUnavailable: 0 # Never allow pods to be unavailable during updates
26952707
selector:
26962708
matchLabels:
@@ -2812,6 +2824,18 @@ spec:
28122824
operator: In
28132825
values:
28142826
- linux
2827+
podAntiAffinity:
2828+
preferredDuringSchedulingIgnoredDuringExecution:
2829+
- podAffinityTerm:
2830+
labelSelector:
2831+
matchExpressions:
2832+
- key: control-plane
2833+
operator: In
2834+
values:
2835+
- operator-controller-controller-manager
2836+
- catalogd-controller-manager
2837+
topologyKey: kubernetes.io/hostname
2838+
weight: 100
28152839
nodeSelector:
28162840
kubernetes.io/os: linux
28172841
node-role.kubernetes.io/control-plane: ""

manifests/experimental.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2459,7 +2459,7 @@ spec:
24592459
strategy:
24602460
type: RollingUpdate
24612461
rollingUpdate:
2462-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
2462+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
24632463
maxUnavailable: 0 # Never allow pods to be unavailable during updates
24642464
selector:
24652465
matchLabels:
@@ -2597,7 +2597,7 @@ spec:
25972597
strategy:
25982598
type: RollingUpdate
25992599
rollingUpdate:
2600-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
2600+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
26012601
maxUnavailable: 0 # Never allow pods to be unavailable during updates
26022602
selector:
26032603
matchLabels:

manifests/standard-e2e.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1782,7 +1782,7 @@ spec:
17821782
strategy:
17831783
type: RollingUpdate
17841784
rollingUpdate:
1785-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
1785+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
17861786
maxUnavailable: 0 # Never allow pods to be unavailable during updates
17871787
selector:
17881788
matchLabels:
@@ -1932,7 +1932,7 @@ spec:
19321932
strategy:
19331933
type: RollingUpdate
19341934
rollingUpdate:
1935-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
1935+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
19361936
maxUnavailable: 0 # Never allow pods to be unavailable during updates
19371937
selector:
19381938
matchLabels:

manifests/standard.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1702,7 +1702,7 @@ spec:
17021702
strategy:
17031703
type: RollingUpdate
17041704
rollingUpdate:
1705-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
1705+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
17061706
maxUnavailable: 0 # Never allow pods to be unavailable during updates
17071707
selector:
17081708
matchLabels:
@@ -1839,7 +1839,7 @@ spec:
18391839
strategy:
18401840
type: RollingUpdate
18411841
rollingUpdate:
1842-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
1842+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
18431843
maxUnavailable: 0 # Never allow pods to be unavailable during updates
18441844
selector:
18451845
matchLabels:

0 commit comments

Comments
 (0)