Skip to content

Commit acadaca

Browse files
OlivierCazadeclaude
andcommitted
Address code review feedback for observability controller
1. Fix CSV readiness check to handle multiple CSVs during upgrades - Check all matching CSVs instead of just the first one - Prioritize CSVs in "Succeeded" state to avoid checking old versions - Continue checking if status not found instead of failing 2. Return error when markNetworkObservabilityDeployed fails - Previously only logged the error, causing reconciliation to appear successful - Now sets Degraded status and returns error to trigger retry - Prevents duplicate FlowCollector creation attempts 3. Add automatic retry when operator readiness check times out - Previously returned RequeueAfter: 0, causing no retry - Now returns RequeueAfter: 5 minutes to check again later - Ensures operator installation is eventually noticed and FlowCollector is created Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 9548585 commit acadaca

2 files changed

Lines changed: 17 additions & 9 deletions

File tree

pkg/controller/observability/observability_controller.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ const (
4242

4343
checkInterval = 10 * time.Second
4444
checkTimeout = 10 * time.Minute
45+
requeueAfter = 5 * time.Minute // Requeue interval when operator is not ready yet
4546

4647
// NetworkObservabilityDeployed is the condition type that indicates Network Observability was successfully deployed
4748
NetworkObservabilityDeployed = "NetworkObservabilityDeployed"
@@ -156,9 +157,9 @@ func (r *ReconcileObservability) Reconcile(ctx context.Context, req ctrl.Request
156157
klog.Info("Wait for Network Observability to be ready")
157158
if err := r.waitForNetObservOperator(ctx); err != nil {
158159
if err == context.DeadlineExceeded {
159-
klog.Errorf("Timed out waiting for Network Observability Operator to be ready after %v. Stopping reconciliation.", checkTimeout)
160+
klog.Errorf("Timed out waiting for Network Observability Operator to be ready after %v. Will retry in %v.", checkTimeout, requeueAfter)
160161
r.status.SetDegraded(statusmanager.ObservabilityConfig, "OperatorNotReady", fmt.Sprintf("Timed out waiting for Network Observability Operator to be ready after %v", checkTimeout))
161-
return ctrl.Result{RequeueAfter: 0}, nil // Don't requeue
162+
return ctrl.Result{RequeueAfter: requeueAfter}, nil
162163
}
163164
r.status.SetDegraded(statusmanager.ObservabilityConfig, "WaitOperatorError", fmt.Sprintf("Failed waiting for Network Observability Operator: %v", err))
164165
return ctrl.Result{}, err
@@ -181,7 +182,8 @@ func (r *ReconcileObservability) Reconcile(ctx context.Context, req ctrl.Request
181182

182183
// Mark as deployed in Network CR status
183184
if err := r.markNetworkObservabilityDeployed(ctx, &network); err != nil {
184-
klog.Warningf("Failed to update Network Observability deployment status: %v", err)
185+
r.status.SetDegraded(statusmanager.ObservabilityConfig, "UpdateStatusError", fmt.Sprintf("Failed to update Network Observability deployment status: %v", err))
186+
return ctrl.Result{}, err
185187
}
186188

187189
r.status.SetNotDegraded(statusmanager.ObservabilityConfig)
@@ -337,6 +339,8 @@ func (r *ReconcileObservability) waitForNetObservOperator(ctx context.Context) e
337339
}
338340

339341
// Find the netobserv operator CSV
342+
// During upgrades, there can be multiple CSVs. Check all of them and
343+
// prioritize ones in "Succeeded" state to avoid checking the wrong CSV.
340344
for _, csv := range csvs.Items {
341345
name := csv.GetName()
342346
// CSV names are typically like "netobserv-operator.v1.2.3"
@@ -346,11 +350,15 @@ func (r *ReconcileObservability) waitForNetObservOperator(ctx context.Context) e
346350
return false, err
347351
}
348352
if !found {
349-
return false, nil
353+
continue
354+
}
355+
// If we find a CSV in Succeeded state, return true immediately
356+
if phase == "Succeeded" {
357+
return true, nil
350358
}
351-
return phase == "Succeeded", nil
352359
}
353360
}
361+
// If we found matching CSVs but none in Succeeded state, keep waiting
354362
return false, nil
355363
}
356364
return wait.PollUntilContextTimeout(ctx, checkInterval, checkTimeout, true, condition)

pkg/controller/observability/observability_controller_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -928,9 +928,9 @@ func TestReconcile_OperatorNotReady(t *testing.T) {
928928

929929
result, err := r.Reconcile(ctx, req)
930930

931-
// Controller returns no error on timeout, but result.RequeueAfter should be 0
931+
// Controller returns no error on timeout, but should requeue after 5 minutes
932932
g.Expect(err).NotTo(HaveOccurred())
933-
g.Expect(result.RequeueAfter).To(BeZero())
933+
g.Expect(result.RequeueAfter).To(Equal(5 * time.Minute))
934934
}
935935

936936
// TestReconcile_FlowCollectorDeleted tests that reconciliation recreates
@@ -1147,13 +1147,13 @@ func TestReconcile_RecoveryAfterOperatorBecomesReady(t *testing.T) {
11471147

11481148
req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "cluster"}}
11491149

1150-
// First reconciliation should timeout (returns no error but RequeueAfter=0)
1150+
// First reconciliation should timeout (returns no error but RequeueAfter=5 minutes)
11511151
ctx1, cancel1 := context.WithTimeout(context.Background(), 1*time.Second)
11521152
defer cancel1()
11531153

11541154
result1, err1 := r.Reconcile(ctx1, req)
11551155
g.Expect(err1).NotTo(HaveOccurred())
1156-
g.Expect(result1.RequeueAfter).To(BeZero())
1156+
g.Expect(result1.RequeueAfter).To(Equal(5 * time.Minute))
11571157

11581158
// Update CSV to Succeeded phase
11591159
_ = unstructured.SetNestedField(csv.Object, "Succeeded", "status", "phase")

0 commit comments

Comments
 (0)