Skip to content

Commit f371fa0

Browse files
committed
fix(fitness): merge main and address CodeQL findings
Merge main branch to resolve conflicts (the PR #479 prerequisite is now merged). Resolve 6 file conflicts by taking main's refined e2e test versions.

Address unresolved CodeQL code scanning findings:
- Replace global boolean guards with a dict-based _registration_state to eliminate "unused global variable" false positives
- Move inline comments before pass statements so CodeQL recognizes explanatory comments on empty except clauses

All 464 tests pass, 94.26% coverage.
2 parents d019b37 + 3a5f8c7 commit f371fa0

15 files changed

Lines changed: 173 additions & 98 deletions

File tree

.github/workflows/CI-e2e.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,7 @@ jobs:
1414
steps:
1515
- uses: actions/checkout@v4
1616

17-
- uses: astral-sh/setup-uv@v3
18-
with:
19-
version: "latest"
17+
- uses: astral-sh/setup-uv@v6
2018

2119
- uses: actions/setup-python@v5
2220
with:
@@ -26,7 +24,7 @@ jobs:
2624
run: |
2725
uv venv
2826
source .venv/bin/activate
29-
uv pip install -e ".[test]" 2>/dev/null || uv pip install -e .
27+
uv pip install -e ".[test]" --quiet || uv pip install -e .
3028
uv pip install runpod-flash pytest pytest-asyncio pytest-timeout pytest-rerunfailures httpx
3129
uv pip install -e . --reinstall --no-deps
3230
python -c "import runpod; print(f'runpod: {runpod.__version__} from {runpod.__file__}')"

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
# Changelog
22

3+
## [1.8.2](https://github.com/runpod/runpod-python/compare/v1.8.1...v1.8.2) (2026-03-24)
4+
5+
6+
### Bug Fixes
7+
8+
* **config:** handle corrupted config.toml in credential functions ([#481](https://github.com/runpod/runpod-python/issues/481)) ([9894894](https://github.com/runpod/runpod-python/commit/9894894ee2022e7db0777c9dd24c23208e52f90c))
9+
* use flashBootType instead of appending -fb ([#484](https://github.com/runpod/runpod-python/issues/484)) ([7938936](https://github.com/runpod/runpod-python/commit/7938936158c351d6e00caebbf4242e085f7565ae))
10+
311
## [1.8.1](https://github.com/runpod/runpod-python/compare/v1.8.0...v1.8.1) (2025-11-19)
412

513

runpod/api/mutations/endpoints.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
""" Runpod | API Wrapper | Mutations | Endpoints """
1+
"""Runpod | API Wrapper | Mutations | Endpoints"""
22

33
# pylint: disable=too-many-arguments
44

@@ -23,7 +23,7 @@ def generate_endpoint_mutation(
2323

2424
# ------------------------------ Required Fields ----------------------------- #
2525
if flashboot:
26-
name = name + "-fb"
26+
input_fields.append('flashBootType: "FLASHBOOT"')
2727

2828
input_fields.append(f'name: "{name}"')
2929
input_fields.append(f'templateId: "{template_id}"')
@@ -75,12 +75,12 @@ def generate_endpoint_mutation(
7575
workersMax
7676
allowedCudaVersions
7777
gpuCount
78+
flashBootType
7879
}}
7980
}}
8081
"""
8182

8283

83-
8484
def update_endpoint_template_mutation(endpoint_id: str, template_id: str):
8585
"""Generate a string for a GraphQL mutation to update an existing endpoint's template."""
8686
input_fields = []

runpod/cli/groups/config/functions.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,15 @@ def set_credentials(api_key: str, profile: str = "default", overwrite=False) ->
3131
Path(CREDENTIAL_FILE).touch(exist_ok=True)
3232

3333
if not overwrite:
34-
with open(CREDENTIAL_FILE, "rb") as cred_file:
35-
if profile in toml.load(cred_file):
36-
raise ValueError(
37-
"Profile already exists. Use `update_credentials` instead."
38-
)
34+
try:
35+
with open(CREDENTIAL_FILE, "rb") as cred_file:
36+
existing = toml.load(cred_file)
37+
except (TypeError, ValueError):
38+
existing = {}
39+
if profile in existing:
40+
raise ValueError(
41+
"Profile already exists. Use `update_credentials` instead."
42+
)
3943

4044
with open(CREDENTIAL_FILE, "w", encoding="UTF-8") as cred_file:
4145
cred_file.write("[" + profile + "]\n")
@@ -72,12 +76,18 @@ def check_credentials(profile: str = "default"):
7276
def get_credentials(profile="default"):
7377
"""
7478
Returns the credentials for the specified profile from ~/.runpod/config.toml
79+
80+
Returns None if the file does not exist, is not valid TOML, or does not
81+
contain the requested profile.
7582
"""
7683
if not os.path.exists(CREDENTIAL_FILE):
7784
return None
7885

79-
with open(CREDENTIAL_FILE, "rb") as cred_file:
80-
credentials = toml.load(cred_file)
86+
try:
87+
with open(CREDENTIAL_FILE, "rb") as cred_file:
88+
credentials = toml.load(cred_file)
89+
except (TypeError, ValueError):
90+
return None
8191

8292
if profile not in credentials:
8393
return None

runpod/cli/groups/pod/commands.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,4 +243,5 @@ def sync_pods(source_pod_id, dest_pod_id, source_workspace, dest_workspace):
243243
if 'local_temp_path' in locals():
244244
os.unlink(local_temp_path)
245245
except OSError:
246-
pass # Best-effort cleanup of temp file
246+
# Best-effort cleanup of temp file
247+
pass

runpod/serverless/modules/rp_fitness.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,10 @@ def clear_fitness_checks() -> None:
6767
_fitness_checks.clear()
6868

6969

70-
_gpu_check_registered = False # used via global in _ensure_gpu_check_registered
71-
_system_checks_registered = False # used via global in _ensure_system_checks_registered
70+
_registration_state: dict[str, bool] = {
71+
"gpu_check": False,
72+
"system_checks": False,
73+
}
7274

7375

7476
def _reset_registration_state() -> None:
@@ -77,9 +79,8 @@ def _reset_registration_state() -> None:
7779
7880
Used for testing to ensure clean state between tests.
7981
"""
80-
global _gpu_check_registered, _system_checks_registered
81-
_gpu_check_registered = False
82-
_system_checks_registered = False
82+
_registration_state["gpu_check"] = False
83+
_registration_state["system_checks"] = False
8384

8485

8586
def _ensure_gpu_check_registered() -> None:
@@ -89,12 +90,10 @@ def _ensure_gpu_check_registered() -> None:
8990
Deferred until first run to avoid circular import issues during module
9091
initialization. Called from run_fitness_checks() on first invocation.
9192
"""
92-
global _gpu_check_registered
93-
94-
if _gpu_check_registered:
93+
if _registration_state["gpu_check"]:
9594
return
9695

97-
_gpu_check_registered = True
96+
_registration_state["gpu_check"] = True
9897

9998
try:
10099
from .rp_gpu_fitness import auto_register_gpu_check
@@ -113,20 +112,18 @@ def _ensure_system_checks_registered() -> None:
113112
"""
114113
import os
115114

116-
global _system_checks_registered
117-
118-
if _system_checks_registered:
115+
if _registration_state["system_checks"]:
119116
return
120117

121118
# Allow disabling system checks for testing
122119
if os.environ.get("RUNPOD_SKIP_AUTO_SYSTEM_CHECKS", "").lower() == "true":
123120
log.debug(
124121
"System fitness checks disabled via environment (RUNPOD_SKIP_AUTO_SYSTEM_CHECKS)"
125122
)
126-
_system_checks_registered = True
123+
_registration_state["system_checks"] = True
127124
return
128125

129-
_system_checks_registered = True
126+
_registration_state["system_checks"] = True
130127

131128
try:
132129
from .rp_system_fitness import auto_register_system_checks

runpod/serverless/modules/rp_gpu_fitness.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ def _parse_gpu_test_output(output: str) -> dict[str, Any]:
8686
found_gpus = int(line.split()[1])
8787
result["found_gpus"] = found_gpus
8888
except (IndexError, ValueError):
89-
pass # Line format doesn't match expected "Found N GPUs:" - skip
89+
# Line format doesn't match expected "Found N GPUs:" — skip
90+
pass
9091

9192
# Check for success
9293
if "memory allocation test passed" in line.lower():

tests/e2e/conftest.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import os
55
import subprocess
6+
from pathlib import Path
67

78
import pytest
89
import runpod
@@ -12,14 +13,18 @@
1213
log = logging.getLogger(__name__)
1314
REQUEST_TIMEOUT = 300 # seconds per job request
1415

16+
# Repo root: tests/e2e/conftest.py -> ../../
17+
_REPO_ROOT = Path(__file__).resolve().parents[2]
18+
1519

1620
@pytest.fixture(scope="session", autouse=True)
1721
def verify_local_runpod():
1822
"""Fail fast if the local runpod-python is not installed."""
1923
log.info("runpod version=%s path=%s", runpod.__version__, runpod.__file__)
20-
if "runpod-python" not in runpod.__file__:
24+
runpod_path = Path(runpod.__file__).resolve()
25+
if not runpod_path.is_relative_to(_REPO_ROOT):
2126
pytest.fail(
22-
f"Expected local runpod-python but got {runpod.__file__}. "
27+
f"Expected runpod installed from {_REPO_ROOT} but got {runpod_path}. "
2328
"Run: pip install -e . --force-reinstall --no-deps"
2429
)
2530

@@ -52,23 +57,17 @@ def endpoints(require_api_key, test_cases):
5257
log.info("Endpoint ready: name=%s image=%s template.dockerArgs=%s", ep.name, ep.image, ep.template.dockerArgs if ep.template else "N/A")
5358
yield eps
5459

55-
# Undeploy all provisioned endpoints via CLI
56-
log.info("Cleaning up %d provisioned endpoints via flash undeploy", len(eps))
60+
log.info("Cleaning up all provisioned endpoints")
5761
try:
5862
result = subprocess.run(
5963
["flash", "undeploy", "--all", "--force"],
6064
capture_output=True,
6165
text=True,
6266
timeout=120,
6367
)
64-
log.info("flash undeploy stdout: %s", result.stdout)
65-
if result.returncode != 0:
66-
log.warning("flash undeploy failed (rc=%d): %s", result.returncode, result.stderr)
68+
if result.returncode == 0:
69+
log.info("Undeployed all endpoints")
70+
else:
71+
log.warning("flash undeploy --all --force failed (rc=%d): %s", result.returncode, result.stderr)
6772
except Exception:
68-
log.exception("Failed to run flash undeploy")
69-
70-
71-
@pytest.fixture(scope="session")
72-
def api_key():
73-
"""Return the RUNPOD_API_KEY."""
74-
return os.environ.get("RUNPOD_API_KEY", "")
73+
log.exception("Failed to undeploy endpoints")

tests/e2e/e2e_provisioner.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,15 @@
99
import json
1010
import logging
1111
import os
12+
import uuid
1213
from pathlib import Path
1314
from typing import Any
1415

1516
log = logging.getLogger(__name__)
1617

17-
# Force Flash to use ServerlessEndpoint (deploy mode) instead of LiveServerless.
18-
# LiveServerless forcefully overwrites imageName with Flash's base image,
19-
# ignoring the mock-worker image we need to deploy.
18+
# Must be set before importing runpod_flash — Flash reads this env var at
19+
# import time to decide between LiveServerless (overwrites imageName with
20+
# Flash's base image) and ServerlessEndpoint (preserves our mock-worker image).
2021
os.environ["FLASH_IS_LIVE_PROVISIONING"] = "false"
2122

2223
from runpod_flash import Endpoint, GpuGroup, PodTemplate # noqa: E402
@@ -25,6 +26,10 @@
2526
DEFAULT_CMD = "python -u /handler.py"
2627
TESTS_JSON = Path(__file__).parent / "tests.json"
2728

29+
# Short unique suffix to avoid endpoint name collisions across parallel CI
30+
# runs sharing the same API key.
31+
_RUN_ID = uuid.uuid4().hex[:8]
32+
2833
# Map gpuIds strings from tests.json to GpuGroup enum values
2934
_GPU_MAP: dict[str, GpuGroup] = {g.value: g for g in GpuGroup}
3035

@@ -70,6 +75,11 @@ def hardware_config_key(hw: dict) -> str:
7075
7176
Excludes endpoint name so tests with identical GPU and template
7277
settings share a single provisioned endpoint.
78+
79+
Only gpuIds and dockerArgs are included because they determine worker
80+
behaviour. Other templateConfig fields (env, image, scalerConfig)
81+
are constant across our tests.json entries — if future tests vary
82+
those fields, add them here.
7383
"""
7484
normalized = {
7585
"gpuIds": hw.get("endpointConfig", {}).get("gpuIds", ""),
@@ -114,7 +124,8 @@ def provision_endpoints(
114124
gpu_ids = endpoint_config.get("gpuIds", "ADA_24")
115125
gpus = _parse_gpu_ids(gpu_ids)
116126

117-
ep_name = endpoint_config.get("name", f"rp-python-e2e-{len(seen)}")
127+
base_name = endpoint_config.get("name", f"rp-python-e2e-{len(seen)}")
128+
ep_name = f"{base_name}-{_RUN_ID}"
118129
log.info(
119130
"Provisioning endpoint: name=%s image=%s gpus=%s dockerArgs=%s",
120131
ep_name, MOCK_WORKER_IMAGE, [g.value for g in gpus], docker_args,

tests/e2e/test_cold_start.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import asyncio
22
import os
33
import signal
4+
import tempfile
45
import time
56

67
import httpx
@@ -10,6 +11,7 @@
1011

1112
COLD_START_PORT = 8199
1213
COLD_START_THRESHOLD = 60 # seconds
14+
LOG_TAIL_LINES = 50 # lines of output to include on failure
1315

1416

1517
async def _wait_for_ready(url: str, timeout: float, poll_interval: float = 0.5) -> None:
@@ -22,22 +24,36 @@ async def _wait_for_ready(url: str, timeout: float, poll_interval: float = 0.5)
2224
if resp.status_code == 200:
2325
return
2426
except (httpx.ConnectError, httpx.ConnectTimeout):
25-
pass # expected while server is starting up
27+
# Expected while server is booting — retry until deadline.
28+
continue
2629
await asyncio.sleep(poll_interval)
2730
raise TimeoutError(f"Server not ready at {url} after {timeout}s")
2831

2932

33+
def _tail(path: str, n: int = LOG_TAIL_LINES) -> str:
    """Return the last ``n`` lines of the file at ``path``.

    This helper is used to attach recent ``flash run`` output to failure
    messages, so it must never raise on its own and hide the real failure.

    Args:
        path: File to read.
        n: Maximum number of trailing lines to return; values <= 0 yield "".

    Returns:
        The trailing lines joined into a single string, or "" if the file
        cannot be opened or read.
    """
    from collections import deque  # local import keeps this helper self-contained

    if n <= 0:
        # Slicing with [-0:] would return the whole file instead of nothing.
        return ""
    try:
        # Decode explicitly and leniently: captured subprocess output may hold
        # bytes that are invalid in the locale encoding, and UnicodeDecodeError
        # is not an OSError, so it would otherwise escape this helper.
        with open(path, encoding="utf-8", errors="replace") as f:
            # A bounded deque keeps only the last n lines in memory.
            return "".join(deque(f, maxlen=n))
    except OSError:
        return ""
41+
42+
3043
@pytest.mark.asyncio
3144
async def test_cold_start_under_threshold():
3245
"""flash run reaches health within 60 seconds."""
3346
fixture_dir = os.path.join(
3447
os.path.dirname(__file__), "fixtures", "cold_start"
3548
)
49+
log_file = tempfile.NamedTemporaryFile(
50+
prefix="flash-cold-start-", suffix=".log", delete=False, mode="w"
51+
)
3652
proc = await asyncio.create_subprocess_exec(
3753
"flash", "run", "--port", str(COLD_START_PORT),
3854
cwd=fixture_dir,
39-
stdout=asyncio.subprocess.DEVNULL,
40-
stderr=asyncio.subprocess.DEVNULL,
55+
stdout=log_file,
56+
stderr=asyncio.subprocess.STDOUT,
4157
)
4258

4359
start = time.monotonic()
@@ -49,12 +65,23 @@ async def test_cold_start_under_threshold():
4965
elapsed = time.monotonic() - start
5066
assert elapsed < COLD_START_THRESHOLD, (
5167
f"Cold start took {elapsed:.1f}s, expected < {COLD_START_THRESHOLD}s"
68+
f"\n--- flash run output (last {LOG_TAIL_LINES} lines) ---\n"
69+
f"{_tail(log_file.name)}"
70+
)
71+
except (TimeoutError, AssertionError):
72+
log_file.flush()
73+
raise AssertionError(
74+
f"Cold start failed (elapsed={time.monotonic() - start:.1f}s)"
75+
f"\n--- flash run output (last {LOG_TAIL_LINES} lines) ---\n"
76+
f"{_tail(log_file.name)}"
5277
)
5378
finally:
79+
log_file.close()
5480
if proc.returncode is None:
5581
proc.send_signal(signal.SIGINT)
5682
try:
5783
await asyncio.wait_for(proc.wait(), timeout=30)
5884
except asyncio.TimeoutError:
5985
proc.kill()
6086
await proc.wait()
87+
os.unlink(log_file.name)

0 commit comments

Comments
 (0)