Skip to content

Commit d019b37

Browse files
committed
fix(fitness): address CodeQL and Copilot PR review feedback
- Fix subprocess leak on timeout: kill orphaned processes in _get_cuda_version() and _run_gpu_test_binary() when wait_for times out - Fix pyproject.toml: use find-packages to include all subpackages, not just top-level runpod package - Fix no-op test_respects_env_override: add actual assertion - Fix conftest: also skip GPU auto-registration in test fixtures - Fix unused reader variable in _check_network_connectivity - Fix docstring in auto_register_gpu_check to match actual behavior - Add inline comments on empty except clauses for CodeQL compliance - Annotate global state variables used via global keyword
1 parent bdaf844 commit d019b37

8 files changed

Lines changed: 26 additions & 19 deletions

File tree

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,10 @@ Changelog = "https://github.com/runpod/runpod-python/blob/main/CHANGELOG.md"
4141
"Bug Tracker" = "https://github.com/runpod/runpod-python/issues"
4242

4343

44+
[tool.setuptools.packages.find]
45+
include = ["runpod*"]
46+
4447
[tool.setuptools]
45-
packages = ["runpod"]
4648
include-package-data = true
4749

4850
[tool.setuptools.package-data]

runpod/cli/groups/pod/commands.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,4 +243,4 @@ def sync_pods(source_pod_id, dest_pod_id, source_workspace, dest_workspace):
243243
if 'local_temp_path' in locals():
244244
os.unlink(local_temp_path)
245245
except OSError:
246-
pass
246+
pass # Best-effort cleanup of temp file

runpod/serverless/modules/rp_fitness.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ def clear_fitness_checks() -> None:
6767
_fitness_checks.clear()
6868

6969

70-
_gpu_check_registered = False
71-
_system_checks_registered = False
70+
_gpu_check_registered = False # used via global in _ensure_gpu_check_registered
71+
_system_checks_registered = False # used via global in _ensure_system_checks_registered
7272

7373

7474
def _reset_registration_state() -> None:

runpod/serverless/modules/rp_gpu_fitness.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,7 @@ def _parse_gpu_test_output(output: str) -> dict[str, Any]:
8686
found_gpus = int(line.split()[1])
8787
result["found_gpus"] = found_gpus
8888
except (IndexError, ValueError):
89-
# Line format doesn't match expected "Found N GPUs:" - skip parsing
90-
pass
89+
pass # Line format doesn't match expected "Found N GPUs:" - skip
9190

9291
# Check for success
9392
if "memory allocation test passed" in line.lower():
@@ -163,6 +162,8 @@ async def _run_gpu_test_binary() -> dict[str, Any]:
163162
return result
164163

165164
except asyncio.TimeoutError:
165+
process.kill()
166+
await process.wait()
166167
raise RuntimeError(
167168
f"GPU test binary timed out after {TIMEOUT_SECONDS}s"
168169
) from None
@@ -280,11 +281,8 @@ def auto_register_gpu_check() -> None:
280281
It detects GPU presence via nvidia-smi and registers the check if found.
281282
On CPU-only workers, the check is skipped silently.
282283
283-
The check cannot be disabled when GPUs are present - this is a required
284-
health check for GPU workers.
285-
286284
Environment variables:
287-
- RUNPOD_SKIP_GPU_CHECK: Set to "true" to skip auto-registration (for testing)
285+
- RUNPOD_SKIP_GPU_CHECK: Set to "true" to skip auto-registration
288286
"""
289287
# Allow skipping during tests
290288
if os.environ.get("RUNPOD_SKIP_GPU_CHECK", "").lower() == "true":

runpod/serverless/modules/rp_system_fitness.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ async def _check_network_connectivity() -> None:
166166

167167
try:
168168
start_time = time.perf_counter()
169-
reader, writer = await asyncio.wait_for(
169+
_, writer = await asyncio.wait_for(
170170
asyncio.open_connection(host, port), timeout=NETWORK_CHECK_TIMEOUT
171171
)
172172
elapsed_ms = (time.perf_counter() - start_time) * 1000
@@ -200,6 +200,7 @@ async def _get_cuda_version() -> str | None:
200200
RuntimeError: If CUDA check fails critically
201201
"""
202202
# Try nvcc first
203+
process = None
203204
try:
204205
process = await asyncio.create_subprocess_exec(
205206
"nvcc",
@@ -214,9 +215,13 @@ async def _get_cuda_version() -> str | None:
214215
if "release" in line.lower() or "version" in line.lower():
215216
return line.strip()
216217
except Exception as e:
218+
if process and process.returncode is None:
219+
process.kill()
220+
await process.wait()
217221
log.debug(f"nvcc not available: {e}")
218222

219223
# Fallback: try nvidia-smi and parse CUDA version from output
224+
process = None
220225
try:
221226
process = await asyncio.create_subprocess_exec(
222227
"nvidia-smi",
@@ -234,6 +239,9 @@ async def _get_cuda_version() -> str | None:
234239
return f"CUDA Version: {cuda_version}"
235240
log.debug("nvidia-smi output found but couldn't parse CUDA version")
236241
except Exception as e:
242+
if process and process.returncode is None:
243+
process.kill()
244+
await process.wait()
237245
log.debug(f"nvidia-smi not available: {e}")
238246

239247
return None

tests/test_serverless/test_modules/test_fitness/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ def cleanup_fitness_checks(monkeypatch):
1616
with fitness check framework tests.
1717
"""
1818
monkeypatch.setenv("RUNPOD_SKIP_AUTO_SYSTEM_CHECKS", "true")
19+
monkeypatch.setenv("RUNPOD_SKIP_GPU_CHECK", "true")
1920
_reset_registration_state()
2021
clear_fitness_checks()
2122
yield

tests/test_serverless/test_modules/test_fitness/test_gpu_checks.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,8 @@ def test_respects_env_override(self):
157157
"""Test environment variable override takes precedence."""
158158
with patch("pathlib.Path.exists", return_value=True), \
159159
patch("pathlib.Path.is_file", return_value=True):
160-
# When env var is set and path exists, it should be used
161-
pass
160+
path = _get_gpu_test_binary_path()
161+
assert path == Path("/custom/gpu_test")
162162

163163

164164
# ============================================================================
@@ -297,6 +297,7 @@ async def test_health_check_binary_success(self):
297297
class TestAutoRegistration:
298298
"""Tests for GPU check auto-registration."""
299299

300+
@patch.dict(os.environ, {"RUNPOD_SKIP_GPU_CHECK": ""})
300301
def test_auto_register_gpu_found(self):
301302
"""Test auto-registration when GPU detected."""
302303
with patch("subprocess.run") as mock_run:

tests/test_serverless/test_modules/test_fitness/test_gpu_integration.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@ def mock_gpu_test_binary():
4141
try:
4242
os.unlink(binary_path)
4343
except OSError:
44-
# Best-effort cleanup: ignore if file already deleted or inaccessible
45-
pass
44+
pass # Best-effort cleanup: ignore if file already deleted
4645

4746

4847
@pytest.fixture
@@ -66,8 +65,7 @@ def mock_gpu_test_binary_failure():
6665
try:
6766
os.unlink(binary_path)
6867
except OSError:
69-
# Best-effort cleanup: ignore if file already deleted or inaccessible
70-
pass
68+
pass # Best-effort cleanup: ignore if file already deleted
7169

7270

7371
@pytest.fixture
@@ -97,8 +95,7 @@ def mock_gpu_test_binary_multi_gpu():
9795
try:
9896
os.unlink(binary_path)
9997
except OSError:
100-
# Best-effort cleanup: ignore if file already deleted or inaccessible
101-
pass
98+
pass # Best-effort cleanup: ignore if file already deleted
10299

103100

104101
# ============================================================================

0 commit comments

Comments
 (0)