fix(fitness): address CodeQL and Copilot PR review feedback

deanq · deanq · commit d019b37c29b6 · 2026-03-20T21:51:11.000-07:00
- Fix subprocess leak on timeout: kill and reap (await process.wait())
  still-running child processes in _get_cuda_version() and
  _run_gpu_test_binary() when wait_for times out
- Fix pyproject.toml: use [tool.setuptools.packages.find] with
  include = ["runpod*"] so all subpackages are packaged, not just the
  top-level runpod package
- Fix no-op test_respects_env_override: add actual assertion
- Fix conftest: also skip GPU auto-registration in test fixtures
- Fix unused reader variable in _check_network_connectivity
- Fix docstring in auto_register_gpu_check to match actual behavior
- Add inline comments on empty except clauses for CodeQL compliance
- Annotate global state variables used via global keyword
diff --git a/pyproject.toml b/pyproject.toml
@@ -41,8 +41,10 @@ Changelog = "https://github.com/runpod/runpod-python/blob/main/CHANGELOG.md"
 "Bug Tracker" = "https://github.com/runpod/runpod-python/issues"
 
 
+[tool.setuptools.packages.find]
+include = ["runpod*"]
+
 [tool.setuptools]
-packages = ["runpod"]
 include-package-data = true
 
 [tool.setuptools.package-data]
diff --git a/runpod/cli/groups/pod/commands.py b/runpod/cli/groups/pod/commands.py
@@ -243,4 +243,4 @@ def sync_pods(source_pod_id, dest_pod_id, source_workspace, dest_workspace):
             if 'local_temp_path' in locals():
                 os.unlink(local_temp_path)
         except OSError:
-            pass
+            pass  # Best-effort cleanup of temp file
diff --git a/runpod/serverless/modules/rp_fitness.py b/runpod/serverless/modules/rp_fitness.py
@@ -67,8 +67,8 @@ def clear_fitness_checks() -> None:
     _fitness_checks.clear()
 
 
-_gpu_check_registered = False
-_system_checks_registered = False
+_gpu_check_registered = False  # used via global in _ensure_gpu_check_registered
+_system_checks_registered = False  # used via global in _ensure_system_checks_registered
 
 
 def _reset_registration_state() -> None:
diff --git a/runpod/serverless/modules/rp_gpu_fitness.py b/runpod/serverless/modules/rp_gpu_fitness.py
@@ -86,8 +86,7 @@ def _parse_gpu_test_output(output: str) -> dict[str, Any]:
                 found_gpus = int(line.split()[1])
                 result["found_gpus"] = found_gpus
             except (IndexError, ValueError):
-                # Line format doesn't match expected "Found N GPUs:" - skip parsing
-                pass
+                pass  # Line format doesn't match expected "Found N GPUs:" - skip
 
         # Check for success
         if "memory allocation test passed" in line.lower():
@@ -163,6 +162,8 @@ async def _run_gpu_test_binary() -> dict[str, Any]:
         return result
 
     except asyncio.TimeoutError:
+        process.kill()
+        await process.wait()
         raise RuntimeError(
             f"GPU test binary timed out after {TIMEOUT_SECONDS}s"
         ) from None
@@ -280,11 +281,8 @@ def auto_register_gpu_check() -> None:
     It detects GPU presence via nvidia-smi and registers the check if found.
     On CPU-only workers, the check is skipped silently.
 
-    The check cannot be disabled when GPUs are present - this is a required
-    health check for GPU workers.
-
     Environment variables:
-    - RUNPOD_SKIP_GPU_CHECK: Set to "true" to skip auto-registration (for testing)
+    - RUNPOD_SKIP_GPU_CHECK: Set to "true" to skip auto-registration
     """
     # Allow skipping during tests
     if os.environ.get("RUNPOD_SKIP_GPU_CHECK", "").lower() == "true":
diff --git a/runpod/serverless/modules/rp_system_fitness.py b/runpod/serverless/modules/rp_system_fitness.py
@@ -166,7 +166,7 @@ async def _check_network_connectivity() -> None:
 
     try:
         start_time = time.perf_counter()
-        reader, writer = await asyncio.wait_for(
+        _, writer = await asyncio.wait_for(
             asyncio.open_connection(host, port), timeout=NETWORK_CHECK_TIMEOUT
         )
         elapsed_ms = (time.perf_counter() - start_time) * 1000
@@ -200,6 +200,7 @@ async def _get_cuda_version() -> str | None:
         RuntimeError: If CUDA check fails critically
     """
     # Try nvcc first
+    process = None
     try:
         process = await asyncio.create_subprocess_exec(
             "nvcc",
@@ -214,9 +215,13 @@ async def _get_cuda_version() -> str | None:
                 if "release" in line.lower() or "version" in line.lower():
                     return line.strip()
     except Exception as e:
+        if process and process.returncode is None:
+            process.kill()
+            await process.wait()
         log.debug(f"nvcc not available: {e}")
 
     # Fallback: try nvidia-smi and parse CUDA version from output
+    process = None
     try:
         process = await asyncio.create_subprocess_exec(
             "nvidia-smi",
@@ -234,6 +239,9 @@ async def _get_cuda_version() -> str | None:
                         return f"CUDA Version: {cuda_version}"
             log.debug("nvidia-smi output found but couldn't parse CUDA version")
     except Exception as e:
+        if process and process.returncode is None:
+            process.kill()
+            await process.wait()
         log.debug(f"nvidia-smi not available: {e}")
 
     return None
diff --git a/tests/test_serverless/test_modules/test_fitness/conftest.py b/tests/test_serverless/test_modules/test_fitness/conftest.py
@@ -16,6 +16,7 @@ def cleanup_fitness_checks(monkeypatch):
     with fitness check framework tests.
     """
     monkeypatch.setenv("RUNPOD_SKIP_AUTO_SYSTEM_CHECKS", "true")
+    monkeypatch.setenv("RUNPOD_SKIP_GPU_CHECK", "true")
     _reset_registration_state()
     clear_fitness_checks()
     yield
diff --git a/tests/test_serverless/test_modules/test_fitness/test_gpu_checks.py b/tests/test_serverless/test_modules/test_fitness/test_gpu_checks.py
@@ -157,8 +157,8 @@ def test_respects_env_override(self):
         """Test environment variable override takes precedence."""
         with patch("pathlib.Path.exists", return_value=True), \
              patch("pathlib.Path.is_file", return_value=True):
-            # When env var is set and path exists, it should be used
-            pass
+            path = _get_gpu_test_binary_path()
+            assert path == Path("/custom/gpu_test")
 
 
 # ============================================================================
@@ -297,6 +297,7 @@ async def test_health_check_binary_success(self):
 class TestAutoRegistration:
     """Tests for GPU check auto-registration."""
 
+    @patch.dict(os.environ, {"RUNPOD_SKIP_GPU_CHECK": ""})
     def test_auto_register_gpu_found(self):
         """Test auto-registration when GPU detected."""
         with patch("subprocess.run") as mock_run:
diff --git a/tests/test_serverless/test_modules/test_fitness/test_gpu_integration.py b/tests/test_serverless/test_modules/test_fitness/test_gpu_integration.py
@@ -41,8 +41,7 @@ def mock_gpu_test_binary():
     try:
         os.unlink(binary_path)
     except OSError:
-        # Best-effort cleanup: ignore if file already deleted or inaccessible
-        pass
+        pass  # Best-effort cleanup: ignore if file already deleted
 
 
 @pytest.fixture
@@ -66,8 +65,7 @@ def mock_gpu_test_binary_failure():
     try:
         os.unlink(binary_path)
     except OSError:
-        # Best-effort cleanup: ignore if file already deleted or inaccessible
-        pass
+        pass  # Best-effort cleanup: ignore if file already deleted
 
 
 @pytest.fixture
@@ -97,8 +95,7 @@ def mock_gpu_test_binary_multi_gpu():
     try:
         os.unlink(binary_path)
     except OSError:
-        # Best-effort cleanup: ignore if file already deleted or inaccessible
-        pass
+        pass  # Best-effort cleanup: ignore if file already deleted
 
 
 # ============================================================================