Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 83 additions & 1 deletion test/extended/two_node/tnf_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
peerNode = nodes.Items[randomIndex]
// Select the remaining index
targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)]

// Log concise pcs and etcd status after every test (pass or fail) via SSH.
// Complements deferDiagnosticsOnFailure which gathers verbose diagnostics only on failure.
g.DeferCleanup(func() {
logFinalClusterStatus([]corev1.Node{peerNode, targetNode})
})
})

g.It("should recover from graceful node shutdown with etcd member re-addition", func() {
Expand Down Expand Up @@ -671,12 +677,16 @@ func setupMinimalTestEnvironment(oc *exutil.CLI, nodeA, nodeB *corev1.Node) (c h
}

sshConfig := exutil.GetHypervisorConfig()
if sshConfig == nil {
err = fmt.Errorf("failed to parse hypervisor config")
return
}
c.HypervisorConfig.IP = sshConfig.HypervisorIP
c.HypervisorConfig.User = sshConfig.SSHUser
c.HypervisorConfig.PrivateKeyPath = sshConfig.PrivateKeyPath

// Validate that the private key file exists
if _, err = os.Stat(c.HypervisorConfig.PrivateKeyPath); os.IsNotExist(err) {
if _, err = os.Stat(c.HypervisorConfig.PrivateKeyPath); err != nil {
return
}

Expand Down Expand Up @@ -778,6 +788,78 @@ func restartVms(dataPair []vmNodePair, c hypervisorExtendedConfig) {
}
}

// logFinalClusterStatus logs pcs status and etcd member list via SSH after every test
// (pass or fail). Uses the hypervisor SSH path because the Kubernetes API may not be
// available after a recovery test. Errors are logged but never fail the test.
func logFinalClusterStatus(nodes []corev1.Node) {
if !exutil.HasHypervisorConfig() {
return
}

sshConfig := exutil.GetHypervisorConfig()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: HasHypervisorConfig() only checks that the config string is non-empty — if the JSON is malformed, GetHypervisorConfig() returns nil and the next line panics. Same gap exists in setupMinimalTestEnvironment (line 673), but a panic inside DeferCleanup is uglier since it fails the test.

Suggested change
sshConfig := exutil.GetHypervisorConfig()
sshConfig := exutil.GetHypervisorConfig()
if sshConfig == nil {
framework.Logf("Skipping final cluster status: failed to parse hypervisor config")
return
}

if sshConfig == nil {
framework.Logf("Skipping final cluster status: failed to parse hypervisor config")
return
}
hypervisorConfig := core.SSHConfig{
IP: sshConfig.HypervisorIP,
User: sshConfig.SSHUser,
PrivateKeyPath: sshConfig.PrivateKeyPath,
}

if _, err := os.Stat(hypervisorConfig.PrivateKeyPath); err != nil {
framework.Logf("Skipping final cluster status: cannot access private key at %s: %v", hypervisorConfig.PrivateKeyPath, err)
return
}

knownHostsPath, err := core.PrepareLocalKnownHostsFile(&hypervisorConfig)
if err != nil {
framework.Logf("Skipping final cluster status: failed to prepare known hosts: %v", err)
return
}

framework.Logf("========== FINAL CLUSTER STATUS ==========")

for _, node := range nodes {
nodeIP := utils.GetNodeInternalIP(&node)
if nodeIP == "" {
framework.Logf("Skipping node %s: no internal IP", node.Name)
continue
}

remoteKnownHostsPath, err := core.PrepareRemoteKnownHostsFile(nodeIP, &hypervisorConfig, knownHostsPath)
if err != nil {
framework.Logf("Failed to prepare remote known hosts for node %s: %v", node.Name, err)
continue
}

// pcs status
pcsOutput, pcsStderr, pcsErr := services.PcsStatus(nodeIP, &hypervisorConfig, knownHostsPath, remoteKnownHostsPath)
if pcsErr != nil {
framework.Logf("Failed to get pcs status from node %s: %v\nstdout: %s\nstderr: %s", node.Name, pcsErr, pcsOutput, pcsStderr)
} else {
framework.Logf("pcs status from node %s:\n%s", node.Name, pcsOutput)
}

// etcd member list via SSH (-w table is the etcdctl v3 flag for table output)
etcdOutput, etcdStderr, etcdErr := core.ExecuteRemoteSSHCommand(nodeIP,
"sudo podman exec etcd etcdctl member list -w table",
&hypervisorConfig, knownHostsPath, remoteKnownHostsPath)
if etcdErr != nil {
framework.Logf("Failed to get etcd member list from node %s: %v\nstdout: %s\nstderr: %s", node.Name, etcdErr, etcdOutput, etcdStderr)
} else {
framework.Logf("etcd member list from node %s:\n%s", node.Name, etcdOutput)
}

// Only need one successful node for cluster-wide status
if pcsErr == nil && etcdErr == nil {
break
}
}

framework.Logf("========== END FINAL CLUSTER STATUS ==========")
}

// deferDiagnosticsOnFailure registers a DeferCleanup handler that gathers diagnostic
// information when the current test spec fails. This should be called early in test
// setup to ensure diagnostics are collected on any failure.
Expand Down