diff --git a/buildkite/bootstrap-amd.sh b/buildkite/bootstrap-amd.sh index d6473642..cabea4a3 100644 --- a/buildkite/bootstrap-amd.sh +++ b/buildkite/bootstrap-amd.sh @@ -27,6 +27,32 @@ if [[ -z "${COV_ENABLED:-}" ]]; then COV_ENABLED=0 fi +# --------------------------------------------------------------------------- +# Helper functions +# --------------------------------------------------------------------------- +fetch_origin_ref() { + local ref="$1" + git fetch --no-tags --depth=50 origin "${ref}:refs/remotes/origin/${ref}" >/dev/null 2>&1 || \ + git fetch --no-tags origin "${ref}:refs/remotes/origin/${ref}" >/dev/null 2>&1 +} + +get_pr_labels() { + if [[ "${BUILDKITE_PULL_REQUEST:-false}" == "false" ]]; then + return 0 + fi + + curl -fsSL "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" 2>/dev/null | \ + jq -r '.labels[].name' 2>/dev/null || true +} + +join_file_diff() { + if [[ -z "${1:-}" ]]; then + return 0 + fi + + printf '%s\n' "$1" | tr -d '\r' | paste -sd'|' - +} + # --------------------------------------------------------------------------- # Git setup: ensure origin/main is available and compute merge base once. # On K8s (blobless clones with --filter=blob:none), origin/main may not be @@ -35,9 +61,14 @@ fi # --------------------------------------------------------------------------- git config --global --add safe.directory "$(pwd)" 2>/dev/null || true +if git rev-parse --is-shallow-repository 2>/dev/null | grep -q "true"; then + echo "Shallow repository detected, deepening history..." + git fetch --no-tags --deepen=50 origin >/dev/null 2>&1 || true +fi + if ! git rev-parse --verify origin/main >/dev/null 2>&1; then echo "origin/main not found, fetching..." 
- git fetch origin main --depth=1 2>/dev/null || git fetch origin main || true + fetch_origin_ref main || true fi if [[ -z "${MERGE_BASE_COMMIT:-}" ]]; then @@ -49,15 +80,11 @@ if [[ -z "${MERGE_BASE_COMMIT:-}" ]]; then fi fi -# --------------------------------------------------------------------------- -# Helper functions -# --------------------------------------------------------------------------- - fail_fast() { DISABLE_LABEL="ci-no-fail-fast" # If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then - PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') + PR_LABELS=$(get_pr_labels) if [[ $PR_LABELS == *"$DISABLE_LABEL"* ]]; then echo false else @@ -72,7 +99,7 @@ check_run_all_label() { RUN_ALL_LABEL="ready-run-all-tests" # If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then - PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') + PR_LABELS=$(get_pr_labels) if [[ $PR_LABELS == *"$RUN_ALL_LABEL"* ]]; then echo true else @@ -107,16 +134,20 @@ upload_pipeline() { # Install minijinja ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI' curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh - source "$HOME/.cargo/env" + TEMPLATE_PATH=".buildkite/test-template-amd.j2" + CARGO_ENV="${CARGO_HOME:-$HOME/.cargo}/env" + if [[ ! 
-f "$CARGO_ENV" ]]; then + echo "Error: Cargo env file not found at $CARGO_ENV" + exit 1 + fi + # shellcheck disable=SC1090 + source "$CARGO_ENV" if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then AMD_MIRROR_HW="amdtentative" - curl -o .buildkite/test-template.j2 \ - "https://raw.githubusercontent.com/vllm-project/ci-infra/$VLLM_CI_BRANCH/buildkite/test-template-amd.j2?$(date +%s)" - else - curl -o .buildkite/test-template.j2 \ - "https://raw.githubusercontent.com/vllm-project/ci-infra/$VLLM_CI_BRANCH/buildkite/test-template-amd.j2?$(date +%s)" fi + curl -fsSL -o "$TEMPLATE_PATH" \ + "https://raw.githubusercontent.com/vllm-project/ci-infra/$VLLM_CI_BRANCH/buildkite/test-template-amd.j2?$(date +%s)" # (WIP) Use pipeline generator instead of jinja template @@ -137,7 +168,7 @@ upload_pipeline() { ( set -x # Output pipeline.yaml with all blank lines removed - minijinja-cli test-template.j2 test-amd.yaml \ + minijinja-cli test-template-amd.j2 test-amd.yaml \ -D branch="$BUILDKITE_BRANCH" \ -D list_file_diff="$LIST_FILE_DIFF" \ -D run_all="$RUN_ALL" \ @@ -160,9 +191,26 @@ upload_pipeline() { # --------------------------------------------------------------------------- # Compute file diff # --------------------------------------------------------------------------- -file_diff=$(get_diff) +if [[ $BUILDKITE_BRANCH == "main" ]] && ! git rev-parse --verify HEAD~1 >/dev/null 2>&1; then + echo "HEAD~1 not available on main, fetching one more commit..." + git fetch --no-tags --deepen=1 origin >/dev/null 2>&1 || true +fi + +diff_unavailable=0 +if [[ $BUILDKITE_BRANCH == "main" ]] && ! 
git rev-parse --verify HEAD~1 >/dev/null 2>&1; then + echo "WARNING: Could not resolve HEAD~1 on main, falling back to run_all=1" + RUN_ALL=1 + diff_unavailable=1 +fi + if [[ $BUILDKITE_BRANCH == "main" ]]; then - file_diff=$(get_diff_main) + if [[ $diff_unavailable -eq 1 ]]; then + file_diff="" + else + file_diff=$(get_diff_main) + fi +else + file_diff=$(get_diff) fi # ---------------------------------------------------------------------- @@ -183,13 +231,13 @@ if [[ "${DOCS_ONLY_DISABLE}" != "1" ]]; then docs_only=0 break fi - done < <(printf '%s\n' "$file_diff" | tr ' ' '\n' | tr -d '\r') + done < <(printf '%s\n' "$file_diff" | tr -d '\r') if [[ "$docs_only" -eq 1 ]]; then buildkite-agent annotate ":memo: CI skipped — docs/Markdown/mkdocs-only changes detected \`\`\` -$(printf '%s\n' "$file_diff" | tr ' ' '\n') +$(printf '%s\n' "$file_diff" | tr -d '\r') \`\`\`" --style "info" || true echo "[docs-only] All changes are docs/**, *.md, or mkdocs.yaml. Exiting before pipeline upload." exit 0 @@ -206,12 +254,9 @@ patterns=( "docker/Dockerfile.rocm_base" "CMakeLists.txt" "requirements/common.txt" - "requirements/cuda.txt" - "requirements/build.txt" - "requirements/test.txt" "requirements/rocm.txt" - "requirements/rocm-build.txt" - "requirements/rocm-test.txt" + "requirements/build/rocm.txt" + "requirements/test/rocm.txt" "setup.py" "csrc/" "cmake/" @@ -219,12 +264,11 @@ patterns=( ignore_patterns=( "csrc/cpu" - "csrc/rocm" - "cmake/hipify.py" "cmake/cpu_extension.cmake" ) -for file in $file_diff; do +while IFS= read -r file; do + [[ -z "$file" ]] && continue # First check if file matches any pattern matches_pattern=0 for pattern in "${patterns[@]}"; do @@ -250,7 +294,7 @@ for file in $file_diff; do break fi fi -done +done < <(printf '%s\n' "$file_diff" | tr -d '\r') # Check for ready-run-all-tests label LABEL_RUN_ALL=$(check_run_all_label) @@ -279,7 +323,7 @@ fi if [[ $RUN_ALL -eq 1 ]]; then LIST_FILE_DIFF="run_all" else - LIST_FILE_DIFF=$(echo "$file_diff" | tr ' ' 
'|') + LIST_FILE_DIFF=$(join_file_diff "$file_diff") fi upload_pipeline diff --git a/buildkite/pipeline_generator/buildkite_step.py b/buildkite/pipeline_generator/buildkite_step.py index 9830a698..e4710a03 100644 --- a/buildkite/pipeline_generator/buildkite_step.py +++ b/buildkite/pipeline_generator/buildkite_step.py @@ -27,7 +27,9 @@ class BuildkiteCommandStep(BaseModel): def to_yaml(self): return { "label": self.label, + "key": self.key, "group": self.group, + "agents": self.agents, "commands": self.commands, "depends_on": self.depends_on, "soft_fail": self.soft_fail, @@ -267,16 +269,16 @@ def convert_group_step_to_buildkite_step( # Create AMD mirror step and its block step if specified/applicable if step.mirror and step.mirror.get("amd"): - amd_block_step = None + amd_step = _create_amd_mirror_step(step, step_commands, step.mirror["amd"]) + # Block step depends on the shared AMD image build. + mirror_build_dep = amd_step.depends_on[0] if amd_step.depends_on else "image-build-amd" amd_block_step = BuildkiteBlockStep( block=f"Run AMD: {step.label}", - depends_on=["image-build-amd"], + depends_on=[mirror_build_dep], key=f"block-amd-{_generate_step_key(step.label)}", ) amd_mirror_steps.append(amd_block_step) - amd_step = _create_amd_mirror_step(step, step_commands, step.mirror["amd"]) - if amd_block_step: - amd_step.depends_on.extend([amd_block_step.key]) + amd_step.depends_on.append(amd_block_step.key) amd_mirror_steps.append(amd_step) buildkite_group_steps.append( @@ -304,6 +306,14 @@ def _step_should_run(step: Step, list_file_diff: List[str]) -> bool: return False global_config = get_global_config() if step.key and step.key.startswith("image-build"): + # The shared AMD image build stays on-demand for non-main branches, + # except on scheduled nightlies where it should run automatically. 
+ if ( + step.key == "image-build-amd" + and global_config["branch"] != "main" + and global_config["nightly"] != "1" + ): + return False return True if global_config["nightly"] == "1": return True @@ -377,6 +387,8 @@ def _create_amd_mirror_step(step: Step, original_commands: List[str], amd: Dict[ DeviceType.AMD_MI355_8: AgentQueue.AMD_MI355_8, } + build_dep = "image-build-amd" + amd_queue = amd_queue_map.get(amd_device) if not amd_queue: raise ValueError(f"Invalid AMD device: {amd_device}. Valid devices: {list(amd_queue_map.keys())}") @@ -384,9 +396,21 @@ def _create_amd_mirror_step(step: Step, original_commands: List[str], amd: Dict[ return BuildkiteCommandStep( label=amd_label, commands=[amd_command_wrapped], - depends_on=["image-build-amd"], + depends_on=[build_dep], agents={"queue": amd_queue}, - env={"DOCKER_BUILDKIT": "1", "VLLM_TEST_COMMANDS": amd_commands_str}, + env={ + "DOCKER_BUILDKIT": "1", + # Agent hooks read DOCKER_IMAGE_NAME before run-amd-test.py starts. + # Keep the hook warmup on ci_base; the runner uses the full image + # only if ci_base or artifact setup fails before tests begin. 
+ "DOCKER_IMAGE_NAME": "rocm/vllm-dev:ci_base", + "VLLM_CI_BASE_IMAGE": "rocm/vllm-dev:ci_base", + "VLLM_CI_FALLBACK_IMAGE": "rocm/vllm-ci:$BUILDKITE_COMMIT", + "VLLM_CI_USE_ARTIFACTS": "1", + "VLLM_CI_ARTIFACT_GLOB": "artifacts/vllm-rocm-install/vllm-rocm-install.tar.gz", + "VLLM_CI_RESULTS_ROOT": "/home/buildkite-agent/huggingface/amd-ci-results", + "VLLM_TEST_COMMANDS": amd_commands_str, + }, priority=200, soft_fail=False, retry=None, diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 1b222968..31c1976a 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -1,4 +1,3 @@ -{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} {% set default_working_dir = "/vllm-workspace/tests" %} {% set list_file_diff = list_file_diff | replace("\r", "") | replace("\n", "|") | split("|") %} @@ -21,39 +20,66 @@ - group: "AMD Tests" depends_on: ~ steps: - - label: "AMD: :docker: build image" + - label: "AMD: :docker: ensure ci_base" depends_on: ~ soft_fail: false commands: - # Handle the introduction of test target in Dockerfile.rocm + - bash .buildkite/scripts/ci-bake-rocm.sh ci-base-rocm-ci-with-deps + key: "amd-ci-base-ready" + env: + DOCKER_BUILDKIT: "1" + CI_BASE_CONTENT_FILES: "requirements/common.txt requirements/rocm.txt requirements/test/rocm.txt docker/Dockerfile.rocm_base tools/install_torchcodec_rocm.sh tests/vllm_test_utils" + CI_BASE_DOCKERFILE: "docker/Dockerfile.rocm" + CI_BASE_DOCKERFILE_STAGES: "base build_rixl build_rocshmem build_deepep mori_base ci_base" + IMAGE_TAG: "rocm/vllm-dev:ci_base" + CI_BASE_IMAGE_TAG: "rocm/vllm-dev:ci_base" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/{{ vllm_ci_branch | default('main') }}/docker/ci-rocm.hcl" + PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950" + REMOTE_VLLM: "1" + VLLM_BRANCH: "$BUILDKITE_COMMIT" +{{ amd_infra_retry(' ') }} + agents: + queue: amd-cpu + + {% if branch != "main" and 
nightly != "1" %} + - block: "Run AMD ROCm image build" + depends_on: "amd-ci-base-ready" + key: "block-amd-build" + {% endif %} + + - label: "AMD: :docker: build image" + {% if branch == "main" or nightly == "1" %} + depends_on: "amd-ci-base-ready" + {% else %} + depends_on: "block-amd-build" + {% endif %} + soft_fail: false + commands: - | - #!/bin/bash - if docker manifest inspect {{ docker_image_amd }} >/dev/null 2>&1; then - echo "Image already exists: {{ docker_image_amd }}" - echo "Skipping AMD image build" - exit 0 + if [[ "${ROCM_CI_ARTIFACT_ONLY:-0}" == "1" ]]; then + echo "ROCM_CI_ARTIFACT_ONLY=1; building ROCm wheel artifact only" + IMAGE_TAG="" bash .buildkite/scripts/ci-bake-rocm.sh test-rocm-ci-with-artifacts + else + bash .buildkite/scripts/ci-bake-rocm.sh test-rocm-ci-with-wheel fi - echo "Image not found, proceeding with build" - - > - docker build - --build-arg REMOTE_VLLM=1 - --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950' - --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT - --tag {{ docker_image_amd }} - -f docker/Dockerfile.rocm - --target test - --progress plain . 
- - "docker push {{ docker_image_amd }}" key: "amd-build" env: DOCKER_BUILDKIT: "1" + VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl" + CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/{{ vllm_ci_branch | default('main') }}/docker/ci-rocm.hcl" + PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950" + CI_BASE_IMAGE: "rocm/vllm-dev:ci_base" + IMAGE_TAG: "rocm/vllm-ci:$BUILDKITE_COMMIT" + REMOTE_VLLM: "1" + VLLM_BRANCH: "$BUILDKITE_COMMIT" {{ amd_infra_retry(' ') }} agents: queue: amd-cpu {% for step in steps %} {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %} - + {% set ns = namespace(blocked=1) %} {% set step_slug = amd_step_slug(step) %} {% set container_timeout_s = (step.timeout_in_minutes * 60 - 60) if step.timeout_in_minutes and step.timeout_in_minutes > 1 else (step.timeout_in_minutes * 60 if step.timeout_in_minutes else none) %} @@ -98,10 +124,30 @@ command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh env: DOCKER_BUILDKIT: "1" + NUM_NODES: "{{ step.num_nodes or 1 }}" + # Agent hooks read DOCKER_IMAGE_NAME before run-amd-test.py starts. + # Keep the hook warmup on ci_base; the runner uses the full image only + # if ci_base or artifact setup fails before tests begin. 
+ DOCKER_IMAGE_NAME: "rocm/vllm-dev:ci_base" + VLLM_CI_BASE_IMAGE: "rocm/vllm-dev:ci_base" + VLLM_CI_FALLBACK_IMAGE: "rocm/vllm-ci:$BUILDKITE_COMMIT" + VLLM_CI_USE_ARTIFACTS: "1" + VLLM_CI_ARTIFACT_GLOB: "artifacts/vllm-rocm-install/vllm-rocm-install.tar.gz" + VLLM_CI_RESULTS_ROOT: "/home/buildkite-agent/huggingface/amd-ci-results" {% if container_timeout_s %} CONTAINER_TIMEOUT_S: "{{ container_timeout_s }}" {% endif %} + {% if step.num_nodes and step.num_nodes >= 2 %} + VLLM_CI_EXECUTION_MODE: "multi-node" + VLLM_NUM_GPUS_PER_NODE: "{{ step.num_gpus or 1 }}" + VLLM_NODE_COMMAND_COUNT: "{{ step.commands | length }}" + {% for node_command in step.commands %} + VLLM_NODE_COMMAND_{{ loop.index0 }}: "(command rocm-smi || true) && export VLLM_TEST_GROUP_NAME={{ step_slug }}-node{{ loop.index0 }} && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ node_command | replace('\"', '\\\"') | safe }}" + {% endfor %} + {% else %} + VLLM_CI_EXECUTION_MODE: "single-node" VLLM_TEST_COMMANDS: "(command rocm-smi || true) && export VLLM_TEST_GROUP_NAME={{ step_slug }} && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ (step.command or (step.commands | join(' && '))) | replace('\"', '\\\"') | safe }}" + {% endif %} priority: 100 {% if step.grade and step.grade == "Blocking" %} soft_fail: false diff --git a/docker/ci-rocm.hcl b/docker/ci-rocm.hcl new file mode 100644 index 00000000..41234a4a --- /dev/null +++ b/docker/ci-rocm.hcl @@ -0,0 +1,337 @@ +# ci-rocm.hcl - CI-specific configuration for vLLM ROCm Docker builds +# +# This file lives in ci-infra repo at docker/ci-rocm.hcl +# Used with: docker buildx bake -f docker/docker-bake-rocm.hcl -f ci-rocm.hcl test-rocm-ci +# +# Registry cache: Docker Hub (rocm/vllm-ci-cache) is used exclusively. 
+# AMD build agents already have Docker Hub credentials (they push the test +# image to rocm/vllm-ci), so no additional credential setup is required. +# ROCm CI does not use a separate remote compiler cache. + +# CI metadata + +variable "BUILDKITE_COMMIT" { + default = "" +} + +variable "BUILDKITE_BUILD_NUMBER" { + default = "" +} + +variable "BUILDKITE_BUILD_ID" { + default = "" +} + +variable "PARENT_COMMIT" { + default = "" +} + +# Merge-base of HEAD with main - provides a more stable cache fallback than +# parent commit for long-lived PRs. Mirrors the VLLM_MERGE_BASE_COMMIT +# pattern used in the shared ci.hcl file. Auto-computed by ci-bake-rocm.sh +# when unset. +variable "VLLM_MERGE_BASE_COMMIT" { + default = "" +} + +# Bridge to vLLM's COMMIT variable for OCI labels +variable "COMMIT" { + default = BUILDKITE_COMMIT +} + +# Image tags (set by CI) + +variable "IMAGE_TAG" { + default = "" +} + +variable "IMAGE_TAG_LATEST" { + default = "" +} + +# ROCm-specific GPU architecture targets + +variable "PYTORCH_ROCM_ARCH" { + default = "gfx90a;gfx942;gfx950" +} + +# Pre-built CI base image (Tier 1). Per-PR builds pull this instead of +# rebuilding RIXL/DeepEP/torchcodec from scratch. The ci_base stage in +# Dockerfile.rocm inherits from base, so CI_BASE_IMAGE only affects the test +# stage and is irrelevant when building --target ci_base itself. +variable "CI_BASE_IMAGE" { + default = "rocm/vllm-dev:ci_base" +} + +# Leave CI_MAX_JOBS empty so the Dockerfile falls back to $(nproc) and uses +# the full builder parallelism. Operators can still override this per build. +variable "CI_MAX_JOBS" { + default = "" +} + +# Upstream dependency commit pins -- extracted from Dockerfile.rocm by +# ci-bake-rocm.sh at build time. Empty defaults are safe: the cache +# functions produce no entries when the variable is empty. 
+variable "RIXL_BRANCH" { + default = "" +} + +variable "UCX_BRANCH" { + default = "" +} + +variable "ROCSHMEM_BRANCH" { + default = "" +} + +variable "DEEPEP_BRANCH" { + default = "" +} + +# Docker Hub registry cache for AMD builds. +# +# A separate repo (rocm/vllm-ci-cache) is used for BuildKit layer cache. +# cache-to uses mode=min to reduce the volume of data pushed. +# NOTE: mode=min still includes all layers referenced by the final image +# manifest, including inherited base layers (~7.25GB ROCm runtime). +# Docker Hub auto-creates the repo on first push. +# +# The final-image cache is exported per-commit and per-branch. Reuse across +# commits comes from importing the parent, merge-base and branch cache refs. +# +# The source-scoped native cache is exported both per-commit and per-branch so +# ROCm extension rebuilds are shareable within the same commit reruns and across +# consecutive commits on the same branch without depending on a single global +# latest tag. + +variable "DOCKERHUB_CACHE_REPO" { + default = "rocm/vllm-ci-cache" +} + +variable "DOCKERHUB_CACHE_TO" { + default = "" +} + +variable "ROCM_CACHE_BRANCH_TAG" { + default = "" +} + +variable "ROCM_CACHE_UPSTREAM_BRANCH_TAG" { + default = "" +} + +# Functions + +function "get_cache_from_rocm" { + params = [] + result = compact([ + # Exact commit hit - fastest cache on re-runs of the same commit + BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT}" : "", + # Parent commit - useful cache for incremental changes + PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${PARENT_COMMIT}" : "", + # Merge-base with main - stable fallback for long-lived or rebased PRs; + # maps to a real main-branch commit whose cache layers are likely warm + VLLM_MERGE_BASE_COMMIT != "" ? 
"type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${VLLM_MERGE_BASE_COMMIT}" : "", + # Import the source-scoped native build cache as well so builds whose + # Python/package layers changed can still reuse compiled ROCm objects. + BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT}" : "", + PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${PARENT_COMMIT}" : "", + VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${VLLM_MERGE_BASE_COMMIT}" : "", + ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "", + ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "", + # Branch-scoped full image cache - fallback when parent-commit cache is evicted + ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "", + ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "", + ]) +} + +function "get_cache_to_rocm" { + params = [] + result = compact([ + # Commit-scoped cache for exact re-runs. + BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT},mode=min" : "", + # Branch-scoped cache so later commits on the same branch can reuse the full + # image layers when the parent-commit cache is evicted. Unlike the old + # rocm-latest tag (which caused duplicate exporter 400s), this is per-branch. + ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=min" : "", + ]) +} + +function "get_cache_from_rocm_csrc" { + params = [] + result = compact([ + BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT}" : "", + PARENT_COMMIT != "" ? 
"type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${PARENT_COMMIT}" : "", + VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${VLLM_MERGE_BASE_COMMIT}" : "", + ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "", + ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "", + ]) +} + +function "get_cache_to_rocm_csrc" { + params = [] + result = compact([ + # Export the exact-commit native cache for same-commit reruns. + BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT},mode=min" : "", + # Export the branch-scoped native cache so later commits on the same branch + # can reuse compiled ROCm objects even when the exact parent cache is absent. + ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=min" : "", + ]) +} + +# Cache functions for upstream dependency stages (RIXL/UCX, ROCShmem, DeepEP). +# These stages are pinned to specific upstream commit hashes, so cache keys use +# those hashes rather than the Buildkite commit. This means the cache persists +# across all vLLM commits as long as the upstream dependency pins don't change. + +function "get_cache_from_rocm_deps" { + params = [] + result = compact([ + RIXL_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH}" : "", + ROCSHMEM_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_BRANCH}" : "", + DEEPEP_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH}" : "", + ]) +} + +function "get_cache_to_rocm_rixl" { + params = [] + result = compact([ + RIXL_BRANCH != "" ? 
"type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH},mode=min" : "", + ]) +} + +function "get_cache_to_rocm_rocshmem" { + params = [] + result = compact([ + ROCSHMEM_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_BRANCH},mode=min" : "", + ]) +} + +function "get_cache_to_rocm_deepep" { + params = [] + result = compact([ + DEEPEP_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH},mode=min" : "", + ]) +} + +# CI targets + +target "_ci-rocm" { + annotations = [ + "manifest:vllm.buildkite.build_number=${BUILDKITE_BUILD_NUMBER}", + "manifest:vllm.buildkite.build_id=${BUILDKITE_BUILD_ID}", + ] + args = { + ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH + CI_BASE_IMAGE = CI_BASE_IMAGE + max_jobs = CI_MAX_JOBS + } +} + +target "test-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm", "_labels"] + target = "test" + cache-from = get_cache_from_rocm() + cache-to = get_cache_to_rocm() + tags = compact([ + IMAGE_TAG, + IMAGE_TAG_LATEST, + ]) + output = ["type=registry"] +} + +# Cache-only target for the source-scoped ROCm native build stage. +# This persists the csrc-build stage in the registry cache even though the +# final test image only consumes it indirectly while packaging the wheel. +target "csrc-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm"] + target = "csrc-build" + cache-from = get_cache_from_rocm_csrc() + cache-to = get_cache_to_rocm_csrc() + output = ["type=cacheonly"] +} + +# Keep wheel export on the same CI graph as the test image build so the +# shared build_vllm/export_vllm stages resolve identically within one bake +# invocation. Without this, export-wheel-rocm uses the plain local target +# args while test-rocm-ci uses CI-only args, which can lead to separate +# cache lineages and inconsistent export_vllm results. 
+target "export-wheel-rocm" { + inherits = ["_common-rocm", "_ci-rocm"] + target = "export_vllm" + cache-from = get_cache_from_rocm() + cache-to = get_cache_to_rocm() + output = ["type=local,dest=./wheel-export"] +} + +# Artifact-only vLLM build. GPU test jobs consume this artifact on top of +# ci_base, avoiding a per-commit multi-GB image push/pull. +group "test-rocm-ci-with-artifacts" { + targets = ["csrc-rocm-ci", "export-wheel-rocm"] +} + +# Full test image + wheel export. Kept for fallback/debugging when a pushed +# per-commit image is useful. +group "test-rocm-ci-with-wheel" { + targets = ["csrc-rocm-ci", "test-rocm-ci", "export-wheel-rocm"] +} + +# Image tag for the ci_base build. The ensure-ci-base step (in both the Jinja +# template and amd.yaml) rebuilds this image when content-hash drift is detected. +variable "CI_BASE_IMAGE_TAG" { + default = "rocm/vllm-dev:ci_base" +} + +# Cache-only targets for upstream dependency stages. These persist each stage +# in the registry cache keyed by its upstream commit hash. When ci_base rebuilds +# (e.g., requirements change), these stages are cache hits if their upstream +# pins haven't changed -- saving ~35min of compilation. +target "rixl-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm"] + target = "build_rixl" + cache-from = get_cache_from_rocm_deps() + cache-to = get_cache_to_rocm_rixl() + output = ["type=cacheonly"] +} + +target "rocshmem-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm"] + target = "build_rocshmem" + cache-from = get_cache_from_rocm_deps() + cache-to = get_cache_to_rocm_rocshmem() + output = ["type=cacheonly"] +} + +target "deepep-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm"] + target = "build_deepep" + cache-from = get_cache_from_rocm_deps() + cache-to = get_cache_to_rocm_deepep() + output = ["type=cacheonly"] +} + +# Builds only the ci_base stage (RIXL, DeepEP, torchcodec, etc.) 
+# Invoked by the ensure-ci-base step when the content hash of ci_base-affecting +# files drifts from the remote image label. Per-PR builds then pull the result +# as CI_BASE_IMAGE instead of rebuilding those slow layers on every commit. +# Uses inline cache metadata on the ci_base image itself instead of exporting a +# separate registry cache artifact. +target "ci-base-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm", "_labels"] + target = "ci_base" + cache-from = concat( + ["type=registry,ref=${CI_BASE_IMAGE_TAG}"], + # Import upstream dependency caches so RIXL/ROCShmem/DeepEP stages + # are cache hits even when ci_base itself needs rebuilding. + get_cache_from_rocm_deps(), + ) + cache-to = ["type=inline"] + tags = [CI_BASE_IMAGE_TAG] + output = ["type=registry"] +} + +# Group for ci_base builds -- exports dependency stage caches alongside the +# ci_base image so future rebuilds can reuse them independently. +group "ci-base-rocm-ci-with-deps" { + targets = ["rixl-rocm-ci", "rocshmem-rocm-ci", "deepep-rocm-ci", "ci-base-rocm-ci"] +}