Skip to content

Commit bdaf844

Browse files
committed
fix(fitness): modernize types, fix async blocking, and improve error handling
- Replace typing.Optional/List/Dict with native annotations via `from __future__ import annotations`
- Convert subprocess.run to asyncio.create_subprocess_exec in _get_cuda_version to avoid blocking the event loop from async functions
- Add proper exception chaining (`from e` / `from None`) across all modules
- Remove broad `except Exception` catches that silently suppressed registration failures in _ensure_gpu_check_registered and _ensure_system_checks_registered
- Fix memory unit conversion bug in the /proc/meminfo fallback path
- Add zero-division guard in _check_disk_space
- Re-raise RuntimeError in benchmark checks to propagate actual failures
- Update tests to mock asyncio.create_subprocess_exec instead of subprocess.run
1 parent 40c4e24 commit bdaf844

5 files changed

Lines changed: 198 additions & 143 deletions

File tree

runpod/_binary_helpers.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,13 @@
22
Helper utilities for locating package-bundled binaries.
33
"""
44

5+
from __future__ import annotations
6+
57
import os
68
from pathlib import Path
7-
from typing import Optional
89

910

10-
def get_binary_path(binary_name: str) -> Optional[Path]:
11+
def get_binary_path(binary_name: str) -> Path | None:
1112
"""
1213
Locate a binary file within the runpod package.
1314

runpod/serverless/modules/rp_fitness.py

Lines changed: 8 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -8,18 +8,20 @@
88
Fitness checks do NOT run in local development mode or testing mode.
99
"""
1010

11+
from __future__ import annotations
12+
1113
import inspect
1214
import sys
1315
import time
1416
import traceback
15-
from typing import Callable, List
17+
from collections.abc import Callable
1618

1719
from .rp_logger import RunPodLogger
1820

1921
log = RunPodLogger()
2022

2123
# Global registry for fitness check functions, preserves registration order
22-
_fitness_checks: List[Callable] = []
24+
_fitness_checks: list[Callable] = []
2325

2426

2527
def register_fitness_check(func: Callable) -> Callable:
@@ -99,11 +101,7 @@ def _ensure_gpu_check_registered() -> None:
99101

100102
auto_register_gpu_check()
101103
except ImportError:
102-
# GPU fitness module not available
103104
log.debug("GPU fitness check module not found, skipping auto-registration")
104-
except Exception as e:
105-
# Don't fail fitness checks if auto-registration has issues
106-
log.warn(f"Failed to auto-register GPU fitness check: {e}")
107105

108106

109107
def _ensure_system_checks_registered() -> None:
@@ -122,7 +120,9 @@ def _ensure_system_checks_registered() -> None:
122120

123121
# Allow disabling system checks for testing
124122
if os.environ.get("RUNPOD_SKIP_AUTO_SYSTEM_CHECKS", "").lower() == "true":
125-
log.debug("System fitness checks disabled via environment (RUNPOD_SKIP_AUTO_SYSTEM_CHECKS)")
123+
log.debug(
124+
"System fitness checks disabled via environment (RUNPOD_SKIP_AUTO_SYSTEM_CHECKS)"
125+
)
126126
_system_checks_registered = True
127127
return
128128

@@ -133,11 +133,7 @@ def _ensure_system_checks_registered() -> None:
133133

134134
auto_register_system_checks()
135135
except ImportError:
136-
# System fitness module not available
137136
log.debug("System fitness check module not found, skipping auto-registration")
138-
except Exception as e:
139-
# Don't fail fitness checks if auto-registration has issues
140-
log.warn(f"Failed to auto-register system fitness checks: {e}")
141137

142138

143139
async def run_fitness_checks() -> None:
@@ -206,8 +202,7 @@ async def run_fitness_checks() -> None:
206202
full_traceback = traceback.format_exc()
207203

208204
log.error(
209-
f"Fitness check failed: {check_name} | "
210-
f"{error_type}: {error_message}"
205+
f"Fitness check failed: {check_name} | {error_type}: {error_message}"
211206
)
212207
log.debug(f"Traceback:\n{full_traceback}")
213208

runpod/serverless/modules/rp_gpu_fitness.py

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -8,11 +8,13 @@
88
Auto-registers when GPUs are detected, skips silently on CPU-only workers.
99
"""
1010

11+
from __future__ import annotations
12+
1113
import asyncio
1214
import os
1315
import subprocess
1416
from pathlib import Path
15-
from typing import Any, Dict, Optional
17+
from typing import Any
1618

1719
from runpod._binary_helpers import get_binary_path
1820
from .rp_fitness import register_fitness_check
@@ -25,7 +27,7 @@
2527
MAX_ERROR_MESSAGES = int(os.environ.get("RUNPOD_GPU_MAX_ERROR_MESSAGES", "10"))
2628

2729

28-
def _get_gpu_test_binary_path() -> Optional[Path]:
30+
def _get_gpu_test_binary_path() -> Path | None:
2931
"""
3032
Locate gpu_test binary in package.
3133
@@ -35,7 +37,7 @@ def _get_gpu_test_binary_path() -> Optional[Path]:
3537
return get_binary_path("gpu_test")
3638

3739

38-
def _parse_gpu_test_output(output: str) -> Dict[str, Any]:
40+
def _parse_gpu_test_output(output: str) -> dict[str, Any]:
3941
"""
4042
Parse gpu_test binary output and detect success/failure.
4143
@@ -92,9 +94,7 @@ def _parse_gpu_test_output(output: str) -> Dict[str, Any]:
9294
passed_count += 1
9395

9496
# Check for errors
95-
if any(
96-
err in line.lower() for err in ["failed", "error", "cannot", "unable"]
97-
):
97+
if any(err in line.lower() for err in ["failed", "error", "cannot", "unable"]):
9898
result["errors"].append(line)
9999

100100
result["gpu_count"] = passed_count
@@ -105,7 +105,7 @@ def _parse_gpu_test_output(output: str) -> Dict[str, Any]:
105105
return result
106106

107107

108-
async def _run_gpu_test_binary() -> Dict[str, Any]:
108+
async def _run_gpu_test_binary() -> dict[str, Any]:
109109
"""
110110
Execute gpu_test binary and parse output.
111111
@@ -211,11 +211,15 @@ def _run_gpu_test_fallback() -> None:
211211
)
212212

213213
except FileNotFoundError:
214-
raise RuntimeError("nvidia-smi not found. Cannot validate GPU availability.") from None
214+
raise RuntimeError(
215+
"nvidia-smi not found. Cannot validate GPU availability."
216+
) from None
215217
except subprocess.TimeoutExpired:
216218
raise RuntimeError("nvidia-smi timed out") from None
217219
except RuntimeError:
218220
raise
221+
except Exception as e:
222+
raise RuntimeError(f"nvidia-smi fallback check failed: {e}") from e
219223

220224

221225
async def _check_gpu_health() -> None:

0 commit comments

Comments
 (0)