Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
518af22
Add GitHub Actions CI: lint workflow and runner setup
jarcherNV Mar 11, 2026
d56213b
Run linter on changed files only
jarcherNV Mar 11, 2026
354810b
Add additional pre stage workflows
jarcherNV Mar 11, 2026
3f0d60d
Add real cleanup-dev-squash step, disabled for now
jarcherNV Mar 11, 2026
ff21c37
Add test stage
jarcherNV Mar 12, 2026
4250773
Install pip if it is not available
jarcherNV Mar 17, 2026
49e8139
Enable docker tests and disable slurm test
jarcherNV Mar 17, 2026
ec0682d
Update docker compose test tags
jarcherNV Mar 17, 2026
5a05561
Remove docker compose placeholder text and run the tests
jarcherNV Mar 17, 2026
2cfd5aa
Run tests inside docker containers
jarcherNV Mar 17, 2026
1919eae
Use HF_TOKEN from github secrets
jarcherNV Mar 17, 2026
8a76065
Update test paths
jarcherNV Mar 17, 2026
9162e97
Fix workspace ownership
jarcherNV Mar 17, 2026
f86f5eb
Only use 1 GPU for test workflows
jarcherNV Mar 17, 2026
2264c15
Install both git and git-lfs into the containers
jarcherNV Mar 17, 2026
c5c50eb
Update data paths
jarcherNV Mar 17, 2026
5a74dc5
Update docker compose to actually run integration tests
jarcherNV Mar 17, 2026
fe833ca
Add tests to validate the tutorial
jarcherNV Mar 17, 2026
f834589
Update git logic to grab changed files
jarcherNV Mar 18, 2026
edf4d60
Update docker compose deployment
jarcherNV Mar 18, 2026
f2853ac
Add install step for and switch to docker compose
jarcherNV Mar 18, 2026
2296812
Update service workdirs
jarcherNV Mar 18, 2026
12f4319
Force rebuild instead of reusing old images
jarcherNV Mar 18, 2026
322a901
Make safe folders for post checkout git commands
jarcherNV Mar 18, 2026
27ce72a
Use LOCAL_OUTPUT_PATH instead of GITHUB_WORKSPACE
jarcherNV Mar 18, 2026
e28474e
Run uv sync on grpc
jarcherNV Mar 18, 2026
12db364
Run download_vavam_assets.sh in integration tests
jarcherNV Mar 18, 2026
d195f45
Add controller to smoke test
jarcherNV Mar 18, 2026
1ddbb31
ci: retrigger checks on new runner
jarcherNV Mar 25, 2026
2d4a8eb
Install git-lfs before running git checkout
jarcherNV Mar 25, 2026
291179a
0414 backup
Apr 14, 2026
bfd167d
0417 实现pdm vam双driver
Hyperslip Apr 17, 2026
d954c7e
add web
Hyperslip May 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/scripts/changed-files.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Print the paths that changed between two commits, one path per line.
#
# Usage: changed-files.sh [EVENT_NAME] [BASE_SHA] [HEAD_SHA]
#   EVENT_NAME  Event name; defaults to $GITHUB_EVENT_NAME.
#   BASE_SHA    Commit to diff from. When empty or all zeros, every tracked
#               file is listed instead of a diff.
#   HEAD_SHA    Commit to diff to; defaults to $GITHUB_SHA, then HEAD.
set -euo pipefail

EVENT_NAME="${1:-${GITHUB_EVENT_NAME:-}}"
BASE_SHA="${2:-}"
HEAD_SHA="${3:-${GITHUB_SHA:-HEAD}}"

ZERO_SHA="0000000000000000000000000000000000000000"

# core.quotePath=false makes git print non-ASCII paths verbatim. With the
# default setting they are emitted quoted and octal-escaped, so a caller that
# tests each line with `[[ -e ... ]]` would silently drop those files.
GIT=(git -c core.quotePath=false)

# No usable base commit: report every tracked file.
if [[ -z "${BASE_SHA}" || "${BASE_SHA}" == "${ZERO_SHA}" ]]; then
  "${GIT[@]}" ls-files
  exit 0
fi

if git cat-file -e "${BASE_SHA}^{commit}" 2>/dev/null; then
  if [[ "${EVENT_NAME}" == "pull_request" ]] \
    && git merge-base "${BASE_SHA}" "${HEAD_SHA}" >/dev/null 2>&1; then
    # Diff from the merge base so that files changed only on the base branch
    # after the pull request branched off are not reported.
    "${GIT[@]}" diff --name-only "${BASE_SHA}...${HEAD_SHA}"
  else
    "${GIT[@]}" diff --name-only "${BASE_SHA}" "${HEAD_SHA}"
  fi
  exit 0
fi

echo "Base SHA ${BASE_SHA} is not available; using fallback diff." >&2
# Fall back to the last commit's changes (HEAD~1 is the first parent, the same
# commit as HEAD^1), or to all tracked files when HEAD has no parent.
"${GIT[@]}" diff --name-only HEAD~1 HEAD 2>/dev/null || "${GIT[@]}" ls-files
213 changes: 213 additions & 0 deletions .github/scripts/cleanup-dev-squash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2025 NVIDIA Corporation

"""
Clean up old squash files in the dev/ subdirectory.

Removes .sqsh files older than 7 days via SSH to the remote SLURM filesystem.
This is the GitHub Actions equivalent of GitLab's cleanup-dev-squash job.
"""

from __future__ import annotations

import os
import subprocess
import sys
from datetime import datetime


def log(level: str, message: str) -> None:
    """Print *message* to stdout, prefixed with the local time and *level*.

    The line has the form ``[YYYY-MM-DD HH:MM:SS] [LEVEL] message`` and is
    flushed immediately.
    """
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    line = f"[{stamp}] [{level}] {message}"
    print(line, flush=True)


def _require_env(name: str) -> str | None:
    """Return the stripped value of environment variable *name*.

    When the variable is unset or contains only whitespace, a WARN line is
    logged and ``None`` is returned.
    """
    value = os.getenv(name, "").strip()
    if value:
        return value
    log("WARN", f"{name} is not set")
    return None


def _first_env(names: list[str]) -> tuple[str | None, str | None]:
    """Return the first non-empty env value and the env name used.

    Values are stripped before being tested; ``(None, None)`` is returned when
    none of *names* is set to a non-blank value.
    """
    candidates = ((os.getenv(key, "").strip(), key) for key in names)
    return next((pair for pair in candidates if pair[0]), (None, None))


def make_ssh_cmd(command: str) -> str:
    """Build the local shell command line that runs *command* over SSH.

    The returned string starts ``ssh-agent``, loads the base64-encoded private
    key into it and then invokes ``ssh user@host '<command>'``. It is meant to
    be executed by a shell (``subprocess.run(..., shell=True)``).

    Args:
        command: Remote command text. It is inserted verbatim between single
            quotes that sit inside a double-quoted ``bash -c`` argument, so
            the caller is responsible for escaping it for that position
            (e.g. writing ``'\\''`` for a literal single quote).

    Returns:
        The complete command line as a single string.

    Raises:
        RuntimeError: If the SSH key, user or host is not configured under
            either its primary or its legacy environment variable name.
    """
    # Prefer generic GitHub names; keep legacy GitLab names as fallback.
    ssh_key, key_name = _first_env(
        ["DEV_SQUASH_SSH_KEY_B64", "SLURM_FRONTEND_USER_KEY"]
    )
    ssh_user, user_name = _first_env(["DEV_SQUASH_SSH_USER", "SLURM_FRONTEND_USER"])
    ssh_host, host_name = _first_env(["DEV_SQUASH_SSH_HOST", "SLURM_ORD_HOST"])

    # _require_env is called only for its warning, so the log names every
    # variable that could have supplied the missing setting.
    if not ssh_key:
        _require_env("DEV_SQUASH_SSH_KEY_B64")
        _require_env("SLURM_FRONTEND_USER_KEY")
    if not ssh_user:
        _require_env("DEV_SQUASH_SSH_USER")
        _require_env("SLURM_FRONTEND_USER")
    if not ssh_host:
        _require_env("DEV_SQUASH_SSH_HOST")
        _require_env("SLURM_ORD_HOST")

    if not (ssh_key and ssh_user and ssh_host):
        raise RuntimeError("Missing one or more required SSH environment variables")

    # A base64 blob stored with line wraps would put newlines inside the
    # unquoted `echo` below and split the command apart. Removing whitespace
    # does not change what the key decodes to.
    ssh_key = "".join(ssh_key.split())

    log(
        "INFO",
        f"Using SSH user from {user_name}, host from {host_name}, key from {key_name}",
    )

    # NOTE(review): host key verification is switched off by the options
    # below, and the encoded key becomes part of the command text; confirm
    # both are acceptable for the runner this executes on.
    return (
        "ssh-agent bash -c "
        f'"ssh-add <(echo {ssh_key} | base64 -d) 2>/dev/null && '
        "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "
        f"-o LogLevel=QUIET -n {ssh_user}@{ssh_host} '{command}'\""
    )


def _quote_path_for_remote(path: str) -> str:
    """Quote *path* so it reaches the remote shell as exactly one argument.

    The result is spliced into the command built by ``make_ssh_cmd``, which
    puts it inside single quotes that are themselves inside a double-quoted
    ``bash -c`` argument executed by the local shell. Three quoting layers
    therefore have to be satisfied, innermost first.
    """
    # Remote shell: one single-quoted word; an embedded quote becomes '\''.
    remote_word = "'" + path.replace("'", "'\\''") + "'"
    # bash -c, inside single quotes: each quote must close, escape, reopen.
    embedded = remote_word.replace("'", "'\\''")
    # Local shell, inside double quotes: neutralise what is still special
    # there. Backslash goes first so the escapes added next are not re-escaped.
    for special in ("\\", '"', "$", "`"):
        embedded = embedded.replace(special, "\\" + special)
    return embedded


def _run_remote(command: str, timeout: int) -> subprocess.CompletedProcess[str]:
    """Run *command* through ``make_ssh_cmd`` and capture its text output."""
    return subprocess.run(
        make_ssh_cmd(command),
        shell=True,
        capture_output=True,
        text=True,
        timeout=timeout,
    )


def main() -> int:
    """Delete ``.sqsh`` files older than 7 days from the remote ``dev`` directory.

    Configuration comes from environment variables. The cleanup is skipped
    (exit code 0) when the cache directory or the SSH settings are missing.
    With ``CLEANUP_DEV_SQUASH_DRY_RUN=true`` the candidates are listed but not
    removed.

    Returns:
        0 when the cleanup ran, was skipped, or had nothing to do; 1 when the
        safety check, the remote probe or the deletion failed, a command timed
        out, or an unexpected error occurred.
    """
    # Prefer generic GitHub name; keep legacy name as fallback.
    squash_cache_dir = (
        os.getenv("DEV_SQUASH_CACHE_DIR", "").strip()
        or os.getenv("SQUASH_CACHE_DIR", "").strip()
    )
    if not squash_cache_dir:
        log("WARN", "DEV_SQUASH_CACHE_DIR/SQUASH_CACHE_DIR is not set")
        log("INFO", "Skipping cleanup because cache directory is not configured")
        return 0

    # For testability and safety in GitHub, allow dry-run mode.
    dry_run = (
        os.getenv("CLEANUP_DEV_SQUASH_DRY_RUN", "false").strip().lower() == "true"
    )

    log("INFO", f"DEV_SQUASH_CACHE_DIR={squash_cache_dir}")
    log("INFO", f"Dry run mode: {dry_run}")

    dev_dir = os.path.join(squash_cache_dir, "dev")
    log("INFO", f"Target directory: {dev_dir}")

    if not dev_dir.endswith("dev"):
        log("ERROR", f"Safety check failed: path {dev_dir} does not end with dev")
        return 1

    # If SSH env isn't configured, skip rather than hard-fail. Each setting
    # may come from its primary name or from the legacy fallback name.
    legacy_names = {
        "DEV_SQUASH_SSH_KEY_B64": "SLURM_FRONTEND_USER_KEY",
        "DEV_SQUASH_SSH_USER": "SLURM_FRONTEND_USER",
        "DEV_SQUASH_SSH_HOST": "SLURM_ORD_HOST",
    }
    missing_ssh = [
        primary
        for primary, legacy in legacy_names.items()
        if not (os.getenv(primary, "").strip() or os.getenv(legacy, "").strip())
    ]
    if missing_ssh:
        log("WARN", f"Missing SSH config env vars: {', '.join(missing_ssh)}")
        log("INFO", "Skipping cleanup because remote SSH connection is not configured")
        return 0

    # Shared prefix of every remote `find`; the '\'' sequences are the escaping
    # make_ssh_cmd expects for quotes inside its single-quoted command.
    find_sqsh = f"find {dev_dir} -type f -name '\\''*.sqsh'\\''"

    try:
        result = _run_remote(
            f"[ -d {dev_dir} ] && echo 'exists' || echo 'not_found'", 30
        )
        if "not_found" in result.stdout:
            log(
                "WARN",
                f"Dev directory {dev_dir} not found on remote. Nothing to clean.",
            )
            return 0
        if "exists" not in result.stdout:
            # The probe always prints one of the two words when it runs, so
            # neither one means the remote command never executed. Report that
            # instead of claiming there was nothing to clean.
            log(
                "ERROR",
                f"Could not check {dev_dir} on remote (exit code {result.returncode})",
            )
            if result.stderr.strip():
                log("ERROR", f"stderr: {result.stderr.strip()}")
            return 1

        result = _run_remote(f"{find_sqsh} -mtime +7 2>/dev/null", 60)
        paths_to_delete = [
            p.strip() for p in (result.stdout or "").strip().splitlines() if p.strip()
        ]

        result_all = _run_remote(f"{find_sqsh} 2>/dev/null | wc -l", 60)
        file_count_all = (
            result_all.stdout.strip() if result_all.returncode == 0 else "?"
        )

        if not paths_to_delete:
            log("INFO", "No old squash files to clean up")
            log("INFO", f"Total files: {file_count_all}")
            return 0

        file_count = len(paths_to_delete)
        log(
            "INFO",
            f"Found {file_count} of {file_count_all} squash files older than 7 days",
        )
        log("INFO", "Sample files to be deleted:")
        for path in paths_to_delete[:10]:
            log("INFO", f"  - {os.path.basename(path)}")

        if dry_run:
            log("INFO", "Dry run enabled, skipping deletion")
            return 0

        # File names come from the remote filesystem, so quote them for every
        # shell layer before they become part of a local shell command.
        quoted_paths = " ".join(_quote_path_for_remote(p) for p in paths_to_delete)
        result = _run_remote(f"rm -- {quoted_paths} || true", 300)
        if result.returncode != 0:
            log("ERROR", f"Failed to delete old files (exit code {result.returncode})")
            if result.stderr.strip():
                log("ERROR", f"stderr: {result.stderr.strip()}")
            return 1

        log("INFO", f"Cleanup complete - removed up to {file_count} old dev files")

        result = _run_remote(f"{find_sqsh} -mtime +7 2>/dev/null | wc -l", 60)
        if result.returncode == 0:
            try:
                remaining = int(result.stdout.strip())
            except ValueError:
                # Verification is informational; unreadable output must not
                # turn a completed cleanup into a failure.
                log(
                    "WARN",
                    f"Could not parse verification output: {result.stdout.strip()!r}",
                )
            else:
                if remaining > 0:
                    log("WARN", f"Verification shows {remaining} old files still remain")
                else:
                    log("INFO", "Verification successful - all old files removed")

    except subprocess.TimeoutExpired:
        log("ERROR", "SSH command timed out")
        return 1
    except Exception as exc:  # pragma: no cover
        log("ERROR", f"Unexpected error during cleanup: {exc}")
        return 1

    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
52 changes: 52 additions & 0 deletions .github/workflows/cleanup-dev-squash.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Runs .github/scripts/cleanup-dev-squash.py. The single job below is
# currently switched off by its `if: ${{ false }}` condition.
name: Cleanup Dev Squash

on:
  pull_request:
  schedule:
    # Weekly on Sunday at 07:00 UTC. Adjust as needed.
    - cron: "0 7 * * 0"
  workflow_dispatch:

# The workflow token only needs to read the repository for the checkout step.
permissions:
  contents: read

jobs:
  cleanup-dev-squash:
    # Temporarily disabled:
    # this job requires a dedicated infra runner with access to internal cluster/lustre.
    # Re-enable by replacing the condition below with:
    # github.event_name != 'pull_request' || github.base_ref == github.event.repository.default_branch
    if: ${{ false }}
    runs-on: self-hosted
    # A failure in this job does not fail the workflow run.
    continue-on-error: true
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Install the ssh client only when the runner does not already have one.
      - name: Install SSH client
        shell: bash
        run: |
          set -euo pipefail
          if ! command -v ssh >/dev/null 2>&1; then
            sudo apt-get update
            sudo apt-get install -y openssh-client
          fi

      - name: Cleanup old dev squash files
        shell: bash
        env:
          # Generic, repo-neutral names.
          DEV_SQUASH_CACHE_DIR: ${{ vars.DEV_SQUASH_CACHE_DIR }}
          DEV_SQUASH_SSH_USER: ${{ vars.DEV_SQUASH_SSH_USER }}
          DEV_SQUASH_SSH_HOST: ${{ vars.DEV_SQUASH_SSH_HOST }}
          DEV_SQUASH_SSH_KEY_B64: ${{ secrets.DEV_SQUASH_SSH_KEY_B64 }}
          # Legacy fallback names (optional, for migration).
          SQUASH_CACHE_DIR: ${{ vars.SQUASH_CACHE_DIR }}
          SLURM_FRONTEND_USER: ${{ vars.SLURM_FRONTEND_USER }}
          SLURM_ORD_HOST: ${{ vars.SLURM_ORD_HOST }}
          SLURM_FRONTEND_USER_KEY: ${{ secrets.SLURM_FRONTEND_USER_KEY }}
          # Optional safety mode for first rollout.
          CLEANUP_DEV_SQUASH_DRY_RUN: ${{ vars.CLEANUP_DEV_SQUASH_DRY_RUN }}
        run: |
          set -euo pipefail
          python3 .github/scripts/cleanup-dev-squash.py
69 changes: 69 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Lint: run pre-commit for changed files.
# Skips insert-license to match CI behavior in GitLab.
name: Lint

on:
  push:
    branches: [main, master]
  pull_request:
    # Run on all PRs (matches GitLab merge_request_event for lint)

# Linting only reads the repository, so the token gets read access and nothing more.
permissions:
  contents: read

jobs:
  pre-commit:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Required to diff against base SHA for pre-commit ranges.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install pre-commit
        run: pip install pre-commit

      # One step serves both triggers: pull requests diff base.sha..head.sha,
      # pushes diff event.before..sha. Values reach the script through env
      # rather than being expanded into the script text.
      - name: Run pre-commit on changed files
        shell: bash
        env:
          EVENT_NAME: ${{ github.event_name }}
          BASE_SHA: ${{ github.event.pull_request.base.sha || github.event.before }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
          mapfile -t CHANGED_FILES < <(bash ./.github/scripts/changed-files.sh "${EVENT_NAME}" "${BASE_SHA}" "${HEAD_SHA}")

          # Deleted files still appear in the diff; pre-commit can only check paths that exist.
          EXISTING_FILES=()
          for path in "${CHANGED_FILES[@]}"; do
            if [[ -e "${path}" ]]; then
              EXISTING_FILES+=("${path}")
            fi
          done

          if [[ ${#EXISTING_FILES[@]} -eq 0 ]]; then
            echo "No changed files to lint."
            exit 0
          fi

          SKIP=insert-license pre-commit run -c .pre-commit-config.yaml --files "${EXISTING_FILES[@]}"
Loading