55import enum
66import subprocess
77import threading
8+ import time
89
910
1011class MachineState (enum .Enum ):
@@ -14,7 +15,12 @@ class MachineState(enum.Enum):
1415 RESERVED = "RESERVED"
1516 MISS_ONE = "MISS_ONE"
1617 MISS_TWO = "MISS_TWO"
17- REBOOT_ISSUED = "REBOOT_ISSUED"
18+ SSH_REBOOT_ISSUED = "SSH_REBOOT_ISSUED"
19+ POWER_CYCLE_ISSUED = "POWER_CYCLE_ISSUED"
20+
21+
22+ # How long to wait for an SSH reboot before escalating to power cycle (sec)
23+ SSH_REBOOT_TIMEOUT = 600
1824
1925
2026class HealthChecker (threading .Thread ):
@@ -24,23 +30,18 @@ class HealthChecker(threading.Thread):
2430 State transitions:
2531 HEALTHY -> MISS_ONE (SSH fail)
2632 MISS_ONE -> MISS_TWO (SSH fail again)
27- MISS_TWO -> REBOOT_ISSUED (SSH fail, triggers power cycle)
28- REBOOT_ISSUED -> HEALTHY (SSH succeeds)
33+ MISS_TWO -> POWER_CYCLE_ISSUED (SSH fail, BMC power cycle)
34+ SSH_REBOOT_ISSUED -> POWER_CYCLE_ISSUED (SSH fail, >= SSH_REBOOT_TIMEOUT sec elapsed)
35+ SSH_REBOOT_ISSUED -> HEALTHY (SSH succeeds)
36+ POWER_CYCLE_ISSUED -> MISS_ONE (SSH fail, restart miss counter)
37+ POWER_CYCLE_ISSUED -> HEALTHY (SSH succeeds)
2938 Any state except RESERVED -> HEALTHY (SSH succeeds)
39+
40+ SSH_REBOOT_ISSUED is set by the reservation manager when it reboots
41+ a machine via SSH after releasing a reservation. The health checker
42+ monitors it and escalates to power cycle if SSH doesn't come back.
3043 """
3144 def __init__ (self , machines , bmc_map , interval = 300 , lock = None ):
32- """
33- Parameters
34- ----------
35- machines : dict
36- machine_id -> {'name': str, 'mgmt_ipaddr': str, 'state': MachineState}
37- bmc_map : dict
38- machine_id -> BMC instance
39- interval : int
40- Seconds between health check rounds
41- lock : threading.Lock, optional
42- External lock for machines dict; creates own if not provided
43- """
4445 super ().__init__ (daemon = True )
4546 self .machines = machines
4647 self .bmc_map = bmc_map
@@ -77,28 +78,39 @@ def check_machine(self, machine_id, machine):
7778 alive = self ._ssh_check (ipaddr )
7879
7980 with self .lock :
80- # Re-check in case state changed while we were polling
8181 state = machine ['state' ]
8282 if state == MachineState .RESERVED :
8383 return
8484
8585 if alive :
8686 machine ['state' ] = MachineState .HEALTHY
87+ machine .pop ('ssh_reboot_at' , None )
8788 elif state == MachineState .HEALTHY :
8889 machine ['state' ] = MachineState .MISS_ONE
8990 print (f"Health: { machine ['name' ]} missed one check" )
9091 elif state == MachineState .MISS_ONE :
9192 machine ['state' ] = MachineState .MISS_TWO
9293 print (f"Health: { machine ['name' ]} missed two checks" )
9394 elif state == MachineState .MISS_TWO :
94- machine ['state' ] = MachineState .REBOOT_ISSUED
95- print (f"Health: { machine ['name' ]} missed three checks, rebooting" )
9695 bmc = self .bmc_map .get (machine_id )
9796 if bmc :
9897 bmc .power_cycle ()
99- elif state == MachineState .REBOOT_ISSUED :
100- # Still waiting for reboot to take effect
101- pass
98+ machine ['state' ] = MachineState .POWER_CYCLE_ISSUED
99+ print (f"Health: { machine ['name' ]} missed three checks, "
100+ "power cycling" )
101+ elif state == MachineState .SSH_REBOOT_ISSUED :
102+ elapsed = time .monotonic () - machine .get ('ssh_reboot_at' , 0 )
103+ if elapsed >= SSH_REBOOT_TIMEOUT :
104+ bmc = self .bmc_map .get (machine_id )
105+ if bmc :
106+ bmc .power_cycle ()
107+ machine ['state' ] = MachineState .POWER_CYCLE_ISSUED
108+ machine .pop ('ssh_reboot_at' , None )
109+ print (f"Health: { machine ['name' ]} SSH reboot timed out, "
110+ "power cycling" )
111+ elif state == MachineState .POWER_CYCLE_ISSUED :
112+ machine ['state' ] = MachineState .MISS_ONE
113+ print (f"Health: { machine ['name' ]} still down after power cycle" )
102114
103115 def run (self ):
104116 while not self ._stop_event .is_set ():
0 commit comments