fix(fitness): merge main and address CodeQL findings

deanq · deanq · commit f371fa029d08 · 2026-03-23T20:19:28.000-07:00
Merge main branch to resolve conflicts (PR #479 prerequisite now merged). Resolve 6 file conflicts by taking main's refined e2e test versions. Address unresolved CodeQL code scanning findings: (1) replace global boolean guards with a dict-based _registration_state to eliminate "unused global variable" false positives; (2) move inline comments before pass statements so CodeQL recognizes explanatory comments on empty except clauses. All 464 tests pass with 94.26% coverage.
diff --git a/.github/workflows/CI-e2e.yml b/.github/workflows/CI-e2e.yml
@@ -14,9 +14,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - uses: astral-sh/setup-uv@v3
-        with:
-          version: "latest"
+      - uses: astral-sh/setup-uv@v6
 
       - uses: actions/setup-python@v5
         with:
@@ -26,7 +24,7 @@ jobs:
         run: |
           uv venv
           source .venv/bin/activate
-          uv pip install -e ".[test]" 2>/dev/null || uv pip install -e .
+          uv pip install -e ".[test]" --quiet || uv pip install -e .
           uv pip install runpod-flash pytest pytest-asyncio pytest-timeout pytest-rerunfailures httpx
           uv pip install -e . --reinstall --no-deps
           python -c "import runpod; print(f'runpod: {runpod.__version__} from {runpod.__file__}')"
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## [1.8.2](https://github.com/runpod/runpod-python/compare/v1.8.1...v1.8.2) (2026-03-24)
+
+
+### Bug Fixes
+
+* **config:** handle corrupted config.toml in credential functions ([#481](https://github.com/runpod/runpod-python/issues/481)) ([9894894](https://github.com/runpod/runpod-python/commit/9894894ee2022e7db0777c9dd24c23208e52f90c))
+* use flashBootType instead of appending -fb ([#484](https://github.com/runpod/runpod-python/issues/484)) ([7938936](https://github.com/runpod/runpod-python/commit/7938936158c351d6e00caebbf4242e085f7565ae))
+
 ## [1.8.1](https://github.com/runpod/runpod-python/compare/v1.8.0...v1.8.1) (2025-11-19)
 
 
diff --git a/runpod/api/mutations/endpoints.py b/runpod/api/mutations/endpoints.py
@@ -1,4 +1,4 @@
-""" Runpod | API Wrapper | Mutations | Endpoints """
+"""Runpod | API Wrapper | Mutations | Endpoints"""
 
 # pylint: disable=too-many-arguments
 
@@ -23,7 +23,7 @@ def generate_endpoint_mutation(
 
     # ------------------------------ Required Fields ----------------------------- #
     if flashboot:
-        name = name + "-fb"
+        input_fields.append('flashBootType: "FLASHBOOT"')
 
     input_fields.append(f'name: "{name}"')
     input_fields.append(f'templateId: "{template_id}"')
@@ -75,12 +75,12 @@ def generate_endpoint_mutation(
             workersMax
             allowedCudaVersions
             gpuCount
+            flashBootType
         }}
     }}
     """
 
 
-
 def update_endpoint_template_mutation(endpoint_id: str, template_id: str):
     """Generate a string for a GraphQL mutation to update an existing endpoint's template."""
     input_fields = []
diff --git a/runpod/cli/groups/config/functions.py b/runpod/cli/groups/config/functions.py
@@ -31,11 +31,15 @@ def set_credentials(api_key: str, profile: str = "default", overwrite=False) ->
     Path(CREDENTIAL_FILE).touch(exist_ok=True)
 
     if not overwrite:
-        with open(CREDENTIAL_FILE, "rb") as cred_file:
-            if profile in toml.load(cred_file):
-                raise ValueError(
-                    "Profile already exists. Use `update_credentials` instead."
-                )
+        try:
+            with open(CREDENTIAL_FILE, "rb") as cred_file:
+                existing = toml.load(cred_file)
+        except (TypeError, ValueError):
+            existing = {}
+        if profile in existing:
+            raise ValueError(
+                "Profile already exists. Use `update_credentials` instead."
+            )
 
     with open(CREDENTIAL_FILE, "w", encoding="UTF-8") as cred_file:
         cred_file.write("[" + profile + "]\n")
@@ -72,12 +76,18 @@ def check_credentials(profile: str = "default"):
 def get_credentials(profile="default"):
     """
     Returns the credentials for the specified profile from ~/.runpod/config.toml
+
+    Returns None if the file does not exist, is not valid TOML, or does not
+    contain the requested profile.
     """
     if not os.path.exists(CREDENTIAL_FILE):
         return None
 
-    with open(CREDENTIAL_FILE, "rb") as cred_file:
-        credentials = toml.load(cred_file)
+    try:
+        with open(CREDENTIAL_FILE, "rb") as cred_file:
+            credentials = toml.load(cred_file)
+    except (TypeError, ValueError):
+        return None
 
     if profile not in credentials:
         return None
diff --git a/runpod/cli/groups/pod/commands.py b/runpod/cli/groups/pod/commands.py
@@ -243,4 +243,5 @@ def sync_pods(source_pod_id, dest_pod_id, source_workspace, dest_workspace):
             if 'local_temp_path' in locals():
                 os.unlink(local_temp_path)
         except OSError:
-            pass  # Best-effort cleanup of temp file
+            # Best-effort cleanup of temp file
+            pass
diff --git a/runpod/serverless/modules/rp_fitness.py b/runpod/serverless/modules/rp_fitness.py
@@ -67,8 +67,10 @@ def clear_fitness_checks() -> None:
     _fitness_checks.clear()
 
 
-_gpu_check_registered = False  # used via global in _ensure_gpu_check_registered
-_system_checks_registered = False  # used via global in _ensure_system_checks_registered
+_registration_state: dict[str, bool] = {
+    "gpu_check": False,
+    "system_checks": False,
+}
 
 
 def _reset_registration_state() -> None:
@@ -77,9 +79,8 @@ def _reset_registration_state() -> None:
 
     Used for testing to ensure clean state between tests.
     """
-    global _gpu_check_registered, _system_checks_registered
-    _gpu_check_registered = False
-    _system_checks_registered = False
+    _registration_state["gpu_check"] = False
+    _registration_state["system_checks"] = False
 
 
 def _ensure_gpu_check_registered() -> None:
@@ -89,12 +90,10 @@ def _ensure_gpu_check_registered() -> None:
     Deferred until first run to avoid circular import issues during module
     initialization. Called from run_fitness_checks() on first invocation.
     """
-    global _gpu_check_registered
-
-    if _gpu_check_registered:
+    if _registration_state["gpu_check"]:
         return
 
-    _gpu_check_registered = True
+    _registration_state["gpu_check"] = True
 
     try:
         from .rp_gpu_fitness import auto_register_gpu_check
@@ -113,20 +112,18 @@ def _ensure_system_checks_registered() -> None:
     """
     import os
 
-    global _system_checks_registered
-
-    if _system_checks_registered:
+    if _registration_state["system_checks"]:
         return
 
     # Allow disabling system checks for testing
     if os.environ.get("RUNPOD_SKIP_AUTO_SYSTEM_CHECKS", "").lower() == "true":
         log.debug(
             "System fitness checks disabled via environment (RUNPOD_SKIP_AUTO_SYSTEM_CHECKS)"
         )
-        _system_checks_registered = True
+        _registration_state["system_checks"] = True
         return
 
-    _system_checks_registered = True
+    _registration_state["system_checks"] = True
 
     try:
         from .rp_system_fitness import auto_register_system_checks
diff --git a/runpod/serverless/modules/rp_gpu_fitness.py b/runpod/serverless/modules/rp_gpu_fitness.py
@@ -86,7 +86,8 @@ def _parse_gpu_test_output(output: str) -> dict[str, Any]:
                 found_gpus = int(line.split()[1])
                 result["found_gpus"] = found_gpus
             except (IndexError, ValueError):
-                pass  # Line format doesn't match expected "Found N GPUs:" - skip
+                # Line format doesn't match expected "Found N GPUs:" — skip
+                pass
 
         # Check for success
         if "memory allocation test passed" in line.lower():
diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import subprocess
+from pathlib import Path
 
 import pytest
 import runpod
@@ -12,14 +13,18 @@
 log = logging.getLogger(__name__)
 REQUEST_TIMEOUT = 300  # seconds per job request
 
+# Repo root: tests/e2e/conftest.py -> ../../
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+
 
 @pytest.fixture(scope="session", autouse=True)
 def verify_local_runpod():
     """Fail fast if the local runpod-python is not installed."""
     log.info("runpod version=%s path=%s", runpod.__version__, runpod.__file__)
-    if "runpod-python" not in runpod.__file__:
+    runpod_path = Path(runpod.__file__).resolve()
+    if not runpod_path.is_relative_to(_REPO_ROOT):
         pytest.fail(
-            f"Expected local runpod-python but got {runpod.__file__}. "
+            f"Expected runpod installed from {_REPO_ROOT} but got {runpod_path}. "
             "Run: pip install -e . --force-reinstall --no-deps"
         )
 
@@ -52,23 +57,17 @@ def endpoints(require_api_key, test_cases):
         log.info("Endpoint ready: name=%s image=%s template.dockerArgs=%s", ep.name, ep.image, ep.template.dockerArgs if ep.template else "N/A")
     yield eps
 
-    # Undeploy all provisioned endpoints via CLI
-    log.info("Cleaning up %d provisioned endpoints via flash undeploy", len(eps))
+    log.info("Cleaning up all provisioned endpoints")
     try:
         result = subprocess.run(
             ["flash", "undeploy", "--all", "--force"],
             capture_output=True,
             text=True,
             timeout=120,
         )
-        log.info("flash undeploy stdout: %s", result.stdout)
-        if result.returncode != 0:
-            log.warning("flash undeploy failed (rc=%d): %s", result.returncode, result.stderr)
+        if result.returncode == 0:
+            log.info("Undeployed all endpoints")
+        else:
+            log.warning("flash undeploy --all --force failed (rc=%d): %s", result.returncode, result.stderr)
     except Exception:
-        log.exception("Failed to run flash undeploy")
-
-
-@pytest.fixture(scope="session")
-def api_key():
-    """Return the RUNPOD_API_KEY."""
-    return os.environ.get("RUNPOD_API_KEY", "")
+        log.exception("Failed to undeploy endpoints")
diff --git a/tests/e2e/e2e_provisioner.py b/tests/e2e/e2e_provisioner.py
@@ -9,14 +9,15 @@
 import json
 import logging
 import os
+import uuid
 from pathlib import Path
 from typing import Any
 
 log = logging.getLogger(__name__)
 
-# Force Flash to use ServerlessEndpoint (deploy mode) instead of LiveServerless.
-# LiveServerless forcefully overwrites imageName with Flash's base image,
-# ignoring the mock-worker image we need to deploy.
+# Must be set before importing runpod_flash — Flash reads this env var at
+# import time to decide between LiveServerless (overwrites imageName with
+# Flash's base image) and ServerlessEndpoint (preserves our mock-worker image).
 os.environ["FLASH_IS_LIVE_PROVISIONING"] = "false"
 
 from runpod_flash import Endpoint, GpuGroup, PodTemplate  # noqa: E402
@@ -25,6 +26,10 @@
 DEFAULT_CMD = "python -u /handler.py"
 TESTS_JSON = Path(__file__).parent / "tests.json"
 
+# Short unique suffix to avoid endpoint name collisions across parallel CI
+# runs sharing the same API key.
+_RUN_ID = uuid.uuid4().hex[:8]
+
 # Map gpuIds strings from tests.json to GpuGroup enum values
 _GPU_MAP: dict[str, GpuGroup] = {g.value: g for g in GpuGroup}
 
@@ -70,6 +75,11 @@ def hardware_config_key(hw: dict) -> str:
 
     Excludes endpoint name so tests with identical GPU and template
     settings share a single provisioned endpoint.
+
+    Only gpuIds and dockerArgs are included because they determine worker
+    behaviour.  Other templateConfig fields (env, image, scalerConfig)
+    are constant across our tests.json entries — if future tests vary
+    those fields, add them here.
     """
     normalized = {
         "gpuIds": hw.get("endpointConfig", {}).get("gpuIds", ""),
@@ -114,7 +124,8 @@ def provision_endpoints(
         gpu_ids = endpoint_config.get("gpuIds", "ADA_24")
         gpus = _parse_gpu_ids(gpu_ids)
 
-        ep_name = endpoint_config.get("name", f"rp-python-e2e-{len(seen)}")
+        base_name = endpoint_config.get("name", f"rp-python-e2e-{len(seen)}")
+        ep_name = f"{base_name}-{_RUN_ID}"
         log.info(
             "Provisioning endpoint: name=%s image=%s gpus=%s dockerArgs=%s",
             ep_name, MOCK_WORKER_IMAGE, [g.value for g in gpus], docker_args,
diff --git a/tests/e2e/test_cold_start.py b/tests/e2e/test_cold_start.py
@@ -1,6 +1,7 @@
 import asyncio
 import os
 import signal
+import tempfile
 import time
 
 import httpx
@@ -10,6 +11,7 @@
 
 COLD_START_PORT = 8199
 COLD_START_THRESHOLD = 60  # seconds
+LOG_TAIL_LINES = 50  # lines of output to include on failure
 
 
 async def _wait_for_ready(url: str, timeout: float, poll_interval: float = 0.5) -> None:
@@ -22,22 +24,36 @@ async def _wait_for_ready(url: str, timeout: float, poll_interval: float = 0.5)
                 if resp.status_code == 200:
                     return
             except (httpx.ConnectError, httpx.ConnectTimeout):
-                pass  # expected while server is starting up
+                # Expected while booting: fall through so the sleep below paces retries.
+                pass
             await asyncio.sleep(poll_interval)
     raise TimeoutError(f"Server not ready at {url} after {timeout}s")
 
 
+def _tail(path: str, n: int = LOG_TAIL_LINES) -> str:
+    """Return the last n lines of a file ("" if n <= 0 or the file is unreadable)."""
+    try:
+        with open(path, encoding="utf-8", errors="replace") as f:  # never raise on odd bytes
+            lines = f.readlines()
+        return "".join(lines[-n:]) if n > 0 else ""  # lines[-0:] would return everything
+    except OSError:
+        return ""
+
+
 @pytest.mark.asyncio
 async def test_cold_start_under_threshold():
     """flash run reaches health within 60 seconds."""
     fixture_dir = os.path.join(
         os.path.dirname(__file__), "fixtures", "cold_start"
     )
+    log_file = tempfile.NamedTemporaryFile(
+        prefix="flash-cold-start-", suffix=".log", delete=False, mode="w"
+    )
     proc = await asyncio.create_subprocess_exec(
         "flash", "run", "--port", str(COLD_START_PORT),
         cwd=fixture_dir,
-        stdout=asyncio.subprocess.DEVNULL,
-        stderr=asyncio.subprocess.DEVNULL,
+        stdout=log_file,
+        stderr=asyncio.subprocess.STDOUT,
     )
 
     start = time.monotonic()
@@ -49,12 +65,23 @@ async def test_cold_start_under_threshold():
         elapsed = time.monotonic() - start
         assert elapsed < COLD_START_THRESHOLD, (
             f"Cold start took {elapsed:.1f}s, expected < {COLD_START_THRESHOLD}s"
+            f"\n--- flash run output (last {LOG_TAIL_LINES} lines) ---\n"
+            f"{_tail(log_file.name)}"
+        )
+    except TimeoutError:  # the threshold AssertionError above already carries the log tail
+        log_file.flush()
+        raise AssertionError(
+            f"Cold start failed (elapsed={time.monotonic() - start:.1f}s)"
+            f"\n--- flash run output (last {LOG_TAIL_LINES} lines) ---\n"
+            f"{_tail(log_file.name)}"
         )
     finally:
+        log_file.close()
         if proc.returncode is None:
             proc.send_signal(signal.SIGINT)
             try:
                 await asyncio.wait_for(proc.wait(), timeout=30)
             except asyncio.TimeoutError:
                 proc.kill()
                 await proc.wait()
+        os.unlink(log_file.name)
diff --git a/tests/e2e/test_mock_worker.py b/tests/e2e/test_mock_worker.py
@@ -23,7 +23,7 @@ def _load_test_cases():
     return json.loads(TESTS_JSON.read_text())
 
 
-async def _run_single_case(test_case: dict, endpoints: dict, api_key: str) -> None:
+async def _run_single_case(test_case: dict, endpoints: dict) -> None:
     """Submit one job, wait for completion, and assert output."""
     test_id = test_case.get("id", "unknown")
     hw_key = hardware_config_key(test_case["hardwareConfig"])
@@ -49,11 +49,11 @@ async def _run_single_case(test_case: dict, endpoints: dict, api_key: str) -> No
 
 
 @pytest.mark.asyncio
-async def test_mock_worker_jobs(endpoints, api_key):
+async def test_mock_worker_jobs(endpoints):
     """Submit all test jobs concurrently and verify outputs."""
     test_cases = _load_test_cases()
     results = await asyncio.gather(
-        *[_run_single_case(tc, endpoints, api_key) for tc in test_cases],
+        *[_run_single_case(tc, endpoints) for tc in test_cases],
         return_exceptions=True,
     )
 
diff --git a/tests/test_api/test_mutation_endpoints.py b/tests/test_api/test_mutation_endpoints.py
diff --git a/tests/test_cli/test_cli_groups/test_config_functions.py b/tests/test_cli/test_cli_groups/test_config_functions.py
diff --git a/tests/test_performance/test_cold_start.py b/tests/test_performance/test_cold_start.py
diff --git a/tests/test_serverless/test_modules/test_fitness/test_gpu_integration.py b/tests/test_serverless/test_modules/test_fitness/test_gpu_integration.py