Skip to content

Commit e0adae7

Browse files
committed
contest: hw: mctrl: make reboot handling more robust
SSH reboots can get stuck. Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 0514e92 commit e0adae7

File tree

4 files changed

+63
-35
lines changed

4 files changed

+63
-35
lines changed

contest/hw/README.rst

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -240,9 +240,10 @@ machine_state
240240

241241
If the machine is not reserved service SSHs to it every 5min and
242242
checks uptime and kernel version. If SSH fails machine state progresses
243-
HEALTHY -> MISS_ONE -> MISS_TWO. If SSH fails again, power cycle is
244-
issued and machine enters REBOOT_ISSUED, where it stays until it
245-
starts responding to SSH again.
243+
HEALTHY -> MISS_ONE -> MISS_TWO. After three missed checks the machine
244+
is power-cycled via BMC (POWER_CYCLE_ISSUED). If the machine is still
245+
down after power cycle, the miss counter restarts (MISS_ONE) and the
246+
cycle repeats.
246247

247248
If the machine is RESERVED the machine state is just tracking last
248249
refresh time to potentially time out the reservation.
@@ -253,7 +254,7 @@ to reserve a machine not in HEALTHY state we respond with try again.
253254
Refreshed every 5min, and immediately after reservation is released.
254255

255256
- machine ID
256-
- state, enum: RESERVED, HEALTHY, MISS_ONE, MISS_TWO, REBOOT_ISSUED
257+
- state, enum: RESERVED, HEALTHY, MISS_ONE, MISS_TWO, POWER_CYCLE_ISSUED
257258
- last_reservation: CLOSED, TIMEOUT
258259
- reservation_last_refresh
259260
- uptime

contest/hw/lib/health.py

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import enum
66
import subprocess
77
import threading
8+
import time
89

910

1011
class MachineState(enum.Enum):
@@ -14,7 +15,12 @@ class MachineState(enum.Enum):
1415
RESERVED = "RESERVED"
1516
MISS_ONE = "MISS_ONE"
1617
MISS_TWO = "MISS_TWO"
17-
REBOOT_ISSUED = "REBOOT_ISSUED"
18+
SSH_REBOOT_ISSUED = "SSH_REBOOT_ISSUED"
19+
POWER_CYCLE_ISSUED = "POWER_CYCLE_ISSUED"
20+
21+
22+
# How long to wait for an SSH reboot before escalating to power cycle (sec)
23+
SSH_REBOOT_TIMEOUT = 600
1824

1925

2026
class HealthChecker(threading.Thread):
@@ -24,23 +30,18 @@ class HealthChecker(threading.Thread):
2430
State transitions:
2531
HEALTHY -> MISS_ONE (SSH fail)
2632
MISS_ONE -> MISS_TWO (SSH fail again)
27-
MISS_TWO -> REBOOT_ISSUED (SSH fail, triggers power cycle)
28-
REBOOT_ISSUED -> HEALTHY (SSH succeeds)
33+
MISS_TWO -> POWER_CYCLE_ISSUED (SSH fail, BMC power cycle)
34+
SSH_REBOOT_ISSUED -> POWER_CYCLE_ISSUED (>10min and still down)
35+
SSH_REBOOT_ISSUED -> HEALTHY (SSH succeeds)
36+
POWER_CYCLE_ISSUED -> MISS_ONE (SSH fail, restart miss counter)
37+
POWER_CYCLE_ISSUED -> HEALTHY (SSH succeeds)
2938
Any state -> HEALTHY (SSH succeeds)
39+
40+
SSH_REBOOT_ISSUED is set by the reservation manager when it reboots
41+
a machine via SSH after releasing a reservation. The health checker
42+
monitors it and escalates to power cycle if SSH doesn't come back.
3043
"""
3144
def __init__(self, machines, bmc_map, interval=300, lock=None):
32-
"""
33-
Parameters
34-
----------
35-
machines : dict
36-
machine_id -> {'name': str, 'mgmt_ipaddr': str, 'state': MachineState}
37-
bmc_map : dict
38-
machine_id -> BMC instance
39-
interval : int
40-
Seconds between health check rounds
41-
lock : threading.Lock, optional
42-
External lock for machines dict; creates own if not provided
43-
"""
4445
super().__init__(daemon=True)
4546
self.machines = machines
4647
self.bmc_map = bmc_map
@@ -77,28 +78,39 @@ def check_machine(self, machine_id, machine):
7778
alive = self._ssh_check(ipaddr)
7879

7980
with self.lock:
80-
# Re-check in case state changed while we were polling
8181
state = machine['state']
8282
if state == MachineState.RESERVED:
8383
return
8484

8585
if alive:
8686
machine['state'] = MachineState.HEALTHY
87+
machine.pop('ssh_reboot_at', None)
8788
elif state == MachineState.HEALTHY:
8889
machine['state'] = MachineState.MISS_ONE
8990
print(f"Health: {machine['name']} missed one check")
9091
elif state == MachineState.MISS_ONE:
9192
machine['state'] = MachineState.MISS_TWO
9293
print(f"Health: {machine['name']} missed two checks")
9394
elif state == MachineState.MISS_TWO:
94-
machine['state'] = MachineState.REBOOT_ISSUED
95-
print(f"Health: {machine['name']} missed three checks, rebooting")
9695
bmc = self.bmc_map.get(machine_id)
9796
if bmc:
9897
bmc.power_cycle()
99-
elif state == MachineState.REBOOT_ISSUED:
100-
# Still waiting for reboot to take effect
101-
pass
98+
machine['state'] = MachineState.POWER_CYCLE_ISSUED
99+
print(f"Health: {machine['name']} missed three checks, "
100+
"power cycling")
101+
elif state == MachineState.SSH_REBOOT_ISSUED:
102+
elapsed = time.monotonic() - machine.get('ssh_reboot_at', 0)
103+
if elapsed >= SSH_REBOOT_TIMEOUT:
104+
bmc = self.bmc_map.get(machine_id)
105+
if bmc:
106+
bmc.power_cycle()
107+
machine['state'] = MachineState.POWER_CYCLE_ISSUED
108+
machine.pop('ssh_reboot_at', None)
109+
print(f"Health: {machine['name']} SSH reboot timed out, "
110+
"power cycling")
111+
elif state == MachineState.POWER_CYCLE_ISSUED:
112+
machine['state'] = MachineState.MISS_ONE
113+
print(f"Health: {machine['name']} still down after power cycle")
102114

103115
def run(self):
104116
while not self._stop_event.is_set():

contest/hw/lib/reservations.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import datetime
66
import subprocess
77
import threading
8+
import time
89

910
import psycopg2
1011

@@ -156,15 +157,16 @@ def _finish_close(self, reservation_id, machine_ids):
156157
if ipaddr and self._ssh_reboot(ipaddr):
157158
print(f"Reservation: SSH reboot sent to machine {mid} ({ipaddr})")
158159
with self.lock:
159-
machine['state'] = MachineState.REBOOT_ISSUED
160+
machine['state'] = MachineState.SSH_REBOOT_ISSUED
161+
machine['ssh_reboot_at'] = time.monotonic()
160162
else:
161163
# Fall back to BMC power cycle
162164
bmc = self.bmc_map.get(mid)
163165
if bmc:
164166
print(f"Reservation: BMC power cycle for machine {mid}")
165167
bmc.power_cycle()
166168
with self.lock:
167-
machine['state'] = MachineState.REBOOT_ISSUED
169+
machine['state'] = MachineState.POWER_CYCLE_ISSUED
168170

169171
@staticmethod
170172
def _ssh_reboot(ipaddr):

contest/hw/tests/test_machine_control.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
from lib.bmc import BMC
1515
from lib.health import HealthChecker, MachineState
16-
from lib.sol_listener import SOLListener
16+
from lib.sol_listener import SOLCollector as SOLListener
1717
from lib.reservations import ReservationManager
1818

1919

@@ -169,11 +169,12 @@ def test_miss_two_triggers_reboot(self):
169169
mock_run.return_value = mock.Mock(returncode=1)
170170
hc.check_machine(1, machines[1])
171171

172-
self.assertEqual(machines[1]['state'], MachineState.REBOOT_ISSUED)
172+
# SSH unreachable -> power cycle
173+
self.assertEqual(machines[1]['state'], MachineState.POWER_CYCLE_ISSUED)
173174
bmc_map[1].power_cycle.assert_called_once()
174175

175-
def test_reboot_issued_to_healthy(self):
176-
machines = self._make_machines({1: MachineState.REBOOT_ISSUED})
176+
def test_power_cycle_issued_to_healthy(self):
177+
machines = self._make_machines({1: MachineState.POWER_CYCLE_ISSUED})
177178
bmc_map = self._make_bmc_map()
178179
hc = HealthChecker(machines, bmc_map)
179180

@@ -183,6 +184,18 @@ def test_reboot_issued_to_healthy(self):
183184

184185
self.assertEqual(machines[1]['state'], MachineState.HEALTHY)
185186

187+
def test_power_cycle_issued_still_down(self):
188+
machines = self._make_machines({1: MachineState.POWER_CYCLE_ISSUED})
189+
bmc_map = self._make_bmc_map()
190+
hc = HealthChecker(machines, bmc_map)
191+
192+
with mock.patch('subprocess.run') as mock_run:
193+
mock_run.return_value = mock.Mock(returncode=1)
194+
hc.check_machine(1, machines[1])
195+
196+
# Restart miss counter
197+
self.assertEqual(machines[1]['state'], MachineState.MISS_ONE)
198+
186199
def test_reserved_skipped(self):
187200
machines = self._make_machines({1: MachineState.RESERVED})
188201
bmc_map = self._make_bmc_map()
@@ -339,7 +352,7 @@ def test_timeout(self):
339352

340353
mgr.check_timeouts()
341354

342-
self.assertEqual(machines[1]['state'], MachineState.HEALTHY)
355+
self.assertEqual(machines[1]['state'], MachineState.POWER_CYCLE_ISSUED)
343356
self.assertNotIn(42, mgr.active)
344357
bmc_map[1].power_cycle.assert_called_once()
345358

@@ -349,8 +362,8 @@ def test_close(self):
349362

350363
ok, err = mgr.close('test-caller', 42)
351364
self.assertTrue(ok)
352-
self.assertEqual(machines[1]['state'], MachineState.HEALTHY)
353-
self.assertEqual(machines[2]['state'], MachineState.HEALTHY)
365+
self.assertEqual(machines[1]['state'], MachineState.POWER_CYCLE_ISSUED)
366+
self.assertEqual(machines[2]['state'], MachineState.POWER_CYCLE_ISSUED)
354367
bmc_map[1].power_cycle.assert_called_once()
355368
bmc_map[2].power_cycle.assert_called_once()
356369

@@ -491,7 +504,7 @@ def test_reserve_and_close(self):
491504
self.assertEqual(resp.status_code, 200)
492505

493506
self.assertEqual(
494-
self.mc_mod.machines[1]['state'], MachineState.HEALTHY
507+
self.mc_mod.machines[1]['state'], MachineState.POWER_CYCLE_ISSUED
495508
)
496509

497510
def test_reservation_refresh(self):

0 commit comments

Comments
 (0)