Make deployments HA-ready with configurable replica count

jianzhangbjz · jianzhangbjz · commit a6050bf56432 · 2026-03-23T15:34:19.000+08:00
diff --git a/Makefile b/Makefile
@@ -181,7 +181,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE
 $(STANDARD_MANIFEST) ?= helm/cert-manager.yaml
 $(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml
 $(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml
-$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml
+$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/high-availability.yaml
 HELM_SETTINGS ?=
 .PHONY: $(MANIFESTS)
 $(MANIFESTS): $(HELM) $(CONFTEST)
@@ -524,8 +524,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i
 CATD_NAMESPACE := olmv1-system
 .PHONY: wait
 wait:
-	kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s
-	kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert
+	kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m
+	kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert
 
 .PHONY: docker-build
 docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.
diff --git a/hack/test/pre-upgrade-setup.sh b/hack/test/pre-upgrade-setup.sh
@@ -0,0 +1,159 @@
+#!/bin/bash
+
+set -euo pipefail
+
+help="pre-upgrade-setup.sh is used to create some basic resources
+which will later be used in upgrade testing.
+
+Usage:
+  pre-upgrade-setup.sh [TEST_CATALOG_IMG] [TEST_CLUSTER_CATALOG_NAME] [TEST_CLUSTER_EXTENSION_NAME]
+"
+# Exactly three positional arguments are required; otherwise print the usage to stderr and fail.
+if [[ "$#" -ne 3 ]]; then
+  echo "Illegal number of arguments passed" >&2
+  echo "${help}" >&2
+  exit 1
+fi
+
+TEST_CATALOG_IMG=$1
+TEST_CLUSTER_CATALOG_NAME=$2
+TEST_CLUSTER_EXTENSION_NAME=$3
+
+kubectl apply -f - << EOF
+apiVersion: olm.operatorframework.io/v1
+kind: ClusterCatalog
+metadata:
+  name: ${TEST_CLUSTER_CATALOG_NAME}
+spec:
+  source:
+    type: Image
+    image:
+      ref: ${TEST_CATALOG_IMG}
+      pollIntervalMinutes: 1440
+EOF
+
+kubectl apply -f - <<EOF
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: upgrade-e2e
+  namespace: default
+EOF
+
+kubectl apply -f - <<EOF
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: upgrade-e2e
+rules:
+  - apiGroups:
+    - ""
+    resources:
+    - "configmaps"
+    - "secrets"
+    - "services"
+    - "serviceaccounts"
+    verbs:
+    - "create"
+    - "update"
+    - "patch"
+    - "delete"
+    - "get"
+    - "list"
+    - "watch"
+  - apiGroups:
+    - "apps"
+    resources:
+    - "deployments"
+    verbs:
+    - "create"
+    - "update"
+    - "patch"
+    - "delete"
+    - "get"
+    - "list"
+    - "watch"
+  - apiGroups:
+    - "apiextensions.k8s.io"
+    resources:
+    - "customresourcedefinitions"
+    verbs:
+    - "create"
+    - "update"
+    - "patch"
+    - "delete"
+    - "get"
+    - "list"
+    - "watch"
+  - apiGroups:
+    - "rbac.authorization.k8s.io"
+    resources:
+    - "clusterroles"
+    - "clusterrolebindings"
+    - "roles"
+    - "rolebindings"
+    verbs:
+    - "create"
+    - "update"
+    - "patch"
+    - "delete"
+    - "get"
+    - "list"
+    - "watch"
+    - "bind"
+    - "escalate"
+  - apiGroups:
+    - networking.k8s.io
+    resources:
+    - networkpolicies
+    verbs:
+    - get
+    - list
+    - watch
+    - create
+    - update
+    - patch
+    - delete
+  - apiGroups:
+    - "olm.operatorframework.io"
+    resources:
+    - "clusterextensions/finalizers"
+    verbs:
+    - "update"
+    resourceNames:
+    - "${TEST_CLUSTER_EXTENSION_NAME}"
+EOF
+
+kubectl apply -f - <<EOF
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: upgrade-e2e
+subjects:
+  - kind: ServiceAccount
+    name: upgrade-e2e
+    namespace: default
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: upgrade-e2e
+EOF
+
+kubectl apply -f - << EOF
+apiVersion: olm.operatorframework.io/v1
+kind: ClusterExtension
+metadata:
+  name: ${TEST_CLUSTER_EXTENSION_NAME}
+spec:
+  namespace: default
+  serviceAccount:
+    name: upgrade-e2e
+  source:
+    sourceType: Catalog
+    catalog:
+      packageName: test
+      version: 1.0.0
+EOF
+
+kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog "${TEST_CLUSTER_CATALOG_NAME}"  # block until the catalog reports Serving
+kubectl wait --for=condition=Installed --timeout=5m ClusterExtension "${TEST_CLUSTER_EXTENSION_NAME}"  # block until the extension reports Installed
diff --git a/helm/high-availability.yaml b/helm/high-availability.yaml
@@ -0,0 +1,33 @@
+# High Availability (HA) configuration for OLMv1
+# Sets replicas to 2 for both operator-controller and catalogd to enable HA setup
+# This values file is rendered into manifests/experimental-e2e.yaml to test multi-replica deployments
+#
+# Pod anti-affinity is configured as "preferred" (not "required") to ensure:
+# - In multi-node clusters: replicas are scheduled on different nodes for better availability
+# - In single-node clusters (like kind): both replicas can still be scheduled on the same node
+options:
+  operatorController:
+    deployment:
+      replicas: 2
+  catalogd:
+    deployment:
+      replicas: 2
+
+# Pod anti-affinity: prefer placing operator-controller and catalogd pods (the selector matches both) on different nodes.
+# preferredDuringSchedulingIgnoredDuringExecution is a soft constraint, so pods can still be scheduled
+# on the same node when necessary (e.g., single-node kind clusters for e2e tests).
+deployments:
+  templateSpec:
+    affinity:
+      podAntiAffinity:
+        preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                  - key: control-plane
+                    operator: In
+                    values:
+                      - operator-controller-controller-manager
+                      - catalogd-controller-manager
+              topologyKey: kubernetes.io/hostname
diff --git a/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml b/helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml
@@ -12,11 +12,11 @@ metadata:
   namespace: {{ .Values.namespaces.olmv1.name }}
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: {{ .Values.options.catalogd.deployment.replicas }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1          # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1          # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0    # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
diff --git a/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml b/helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml
@@ -11,11 +11,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: {{ .Values.namespaces.olmv1.name }}
 spec:
-  replicas: 1
+  replicas: {{ .Values.options.operatorController.deployment.replicas }}
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1          # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1          # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0    # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
diff --git a/helm/olmv1/values.yaml b/helm/olmv1/values.yaml
@@ -8,6 +8,7 @@ options:
     enabled: true
     deployment:
       image: quay.io/operator-framework/operator-controller:devel
+      replicas: 1
       extraArguments: []
     features:
       enabled: []
@@ -19,6 +20,7 @@ options:
     enabled: true
     deployment:
       image: quay.io/operator-framework/catalogd:devel
+      replicas: 1
       extraArguments: []
     features:
       enabled: []
diff --git a/manifests/experimental-e2e.yaml b/manifests/experimental-e2e.yaml
@@ -2535,11 +2535,11 @@ metadata:
   namespace: olmv1-system
 spec:
   minReadySeconds: 5
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1          # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1          # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0    # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2652,6 +2652,18 @@ spec:
                     operator: In
                     values:
                       - linux
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - podAffinityTerm:
+                labelSelector:
+                  matchExpressions:
+                    - key: control-plane
+                      operator: In
+                      values:
+                        - operator-controller-controller-manager
+                        - catalogd-controller-manager
+                topologyKey: kubernetes.io/hostname
+              weight: 100
       nodeSelector:
         kubernetes.io/os: linux
         node-role.kubernetes.io/control-plane: ""
@@ -2686,11 +2698,11 @@ metadata:
   name: operator-controller-controller-manager
   namespace: olmv1-system
 spec:
-  replicas: 1
+  replicas: 2
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1          # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1          # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0    # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2812,6 +2824,18 @@ spec:
                     operator: In
                     values:
                       - linux
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - podAffinityTerm:
+                labelSelector:
+                  matchExpressions:
+                    - key: control-plane
+                      operator: In
+                      values:
+                        - operator-controller-controller-manager
+                        - catalogd-controller-manager
+                topologyKey: kubernetes.io/hostname
+              weight: 100
       nodeSelector:
         kubernetes.io/os: linux
         node-role.kubernetes.io/control-plane: ""
diff --git a/manifests/experimental.yaml b/manifests/experimental.yaml
@@ -2459,7 +2459,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1          # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1          # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0    # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -2597,7 +2597,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1          # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1          # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0    # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
diff --git a/manifests/standard-e2e.yaml b/manifests/standard-e2e.yaml
@@ -1782,7 +1782,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1          # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1          # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0    # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -1932,7 +1932,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1          # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1          # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0    # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
diff --git a/manifests/standard.yaml b/manifests/standard.yaml
@@ -1702,7 +1702,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1          # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1          # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0    # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
@@ -1839,7 +1839,7 @@ spec:
   strategy:
     type: RollingUpdate
     rollingUpdate:
-      maxSurge: 1          # Allow temporary 2 pods (1 + 1) for zero-downtime updates
+      maxSurge: 1          # Allow temporary extra pod for zero-downtime updates
       maxUnavailable: 0    # Never allow pods to be unavailable during updates
   selector:
     matchLabels:
diff --git a/test/helpers/helpers.go b/test/helpers/helpers.go
diff --git a/test/upgrade-e2e/post_upgrade_test.go b/test/upgrade-e2e/post_upgrade_test.go
diff --git a/testdata/build-test-registry.sh b/testdata/build-test-registry.sh