Skip to content
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions test/extended/two_node/tnf_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
peerNode = nodes.Items[randomIndex]
// Select the remaining index
targetNode = nodes.Items[(randomIndex+1)%len(nodes.Items)]

// Log concise pcs and etcd status after every test (pass or fail) via SSH.
// Complements deferDiagnosticsOnFailure, which gathers verbose diagnostics only on failure.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: a reader seeing both this and deferDiagnosticsOnFailure in the It blocks may wonder about redundancy. Consider clarifying:

Suggested change
// Log final pcs and etcd status after every test (pass or fail) via SSH
// Log concise pcs and etcd status after every test (pass or fail) via SSH.
// Complements deferDiagnosticsOnFailure which gathers verbose diagnostics only on failure.

g.DeferCleanup(func() {
logFinalClusterStatus([]corev1.Node{peerNode, targetNode})
})
})

g.It("should recover from graceful node shutdown with etcd member re-addition", func() {
Expand Down Expand Up @@ -778,6 +783,74 @@ func restartVms(dataPair []vmNodePair, c hypervisorExtendedConfig) {
}
}

// logFinalClusterStatus logs pcs status and etcd member list via SSH after every test
// (pass or fail). Uses the hypervisor SSH path because the Kubernetes API may not be
// available after a recovery test. Errors are logged but never fail the test.
func logFinalClusterStatus(nodes []corev1.Node) {
if !exutil.HasHypervisorConfig() {
return
}

sshConfig := exutil.GetHypervisorConfig()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: HasHypervisorConfig() only checks that the config string is non-empty — if the JSON is malformed, GetHypervisorConfig() returns nil and the next line panics. Same gap exists in setupMinimalTestEnvironment (line 673), but a panic inside DeferCleanup is uglier since it fails the test.

Suggested change
sshConfig := exutil.GetHypervisorConfig()
sshConfig := exutil.GetHypervisorConfig()
if sshConfig == nil {
framework.Logf("Skipping final cluster status: failed to parse hypervisor config")
return
}

hypervisorConfig := core.SSHConfig{
IP: sshConfig.HypervisorIP,
User: sshConfig.SSHUser,
PrivateKeyPath: sshConfig.PrivateKeyPath,
}

if _, err := os.Stat(hypervisorConfig.PrivateKeyPath); os.IsNotExist(err) {
framework.Logf("Skipping final cluster status: private key not found at %s", hypervisorConfig.PrivateKeyPath)
return
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: os.IsNotExist(err) only matches one specific stat error. If os.Stat fails with e.g. EACCES, the guard is false, execution continues, and SSH fails later with a misleading auth error. Checking err != nil is strictly better here.

Suggested change
if _, err := os.Stat(hypervisorConfig.PrivateKeyPath); os.IsNotExist(err) {
framework.Logf("Skipping final cluster status: private key not found at %s", hypervisorConfig.PrivateKeyPath)
return
}
if _, err := os.Stat(hypervisorConfig.PrivateKeyPath); err != nil {
framework.Logf("Skipping final cluster status: cannot access private key at %s: %v", hypervisorConfig.PrivateKeyPath, err)
return
}


knownHostsPath, err := core.PrepareLocalKnownHostsFile(&hypervisorConfig)
if err != nil {
framework.Logf("Skipping final cluster status: failed to prepare known hosts: %v", err)
return
}

framework.Logf("========== FINAL CLUSTER STATUS ==========")

for _, node := range nodes {
nodeIP := utils.GetNodeInternalIP(&node)
if nodeIP == "" {
framework.Logf("Skipping node %s: no internal IP", node.Name)
continue
}

remoteKnownHostsPath, err := core.PrepareRemoteKnownHostsFile(nodeIP, &hypervisorConfig, knownHostsPath)
if err != nil {
framework.Logf("Failed to prepare remote known hosts for node %s: %v", node.Name, err)
continue
}

// pcs status
pcsOutput, pcsStderr, pcsErr := services.PcsStatus(nodeIP, &hypervisorConfig, knownHostsPath, remoteKnownHostsPath)
if pcsErr != nil {
framework.Logf("Failed to get pcs status from node %s: %v\nstdout: %s\nstderr: %s", node.Name, pcsErr, pcsOutput, pcsStderr)
} else {
framework.Logf("pcs status from node %s:\n%s", node.Name, pcsOutput)
}

// etcd member list via SSH (-w table is the etcdctl v3 flag for table output)
etcdOutput, etcdStderr, etcdErr := core.ExecuteRemoteSSHCommand(nodeIP,
"sudo podman exec etcd etcdctl member list -w table",
&hypervisorConfig, knownHostsPath, remoteKnownHostsPath)
if etcdErr != nil {
framework.Logf("Failed to get etcd member list from node %s: %v\nstdout: %s\nstderr: %s", node.Name, etcdErr, etcdOutput, etcdStderr)
} else {
framework.Logf("etcd member list from node %s:\n%s", node.Name, etcdOutput)
}

// Only need one successful node for cluster-wide status
if pcsErr == nil && etcdErr == nil {
break
}
}

framework.Logf("========== END FINAL CLUSTER STATUS ==========")
}

// deferDiagnosticsOnFailure registers a DeferCleanup handler that gathers diagnostic
// information when the current test spec fails. This should be called early in test
// setup to ensure diagnostics are collected on any failure.
Expand Down