diff --git a/buildkite/bootstrap-amd.sh b/buildkite/bootstrap-amd.sh
index d6473642..cabea4a3 100644
--- a/buildkite/bootstrap-amd.sh
+++ b/buildkite/bootstrap-amd.sh
@@ -27,6 +27,32 @@ if [[ -z "${COV_ENABLED:-}" ]]; then
     COV_ENABLED=0
 fi
 
+# ---------------------------------------------------------------------------
+# Helper functions
+# ---------------------------------------------------------------------------
+fetch_origin_ref() {  # $1: branch; shallow (depth 50) fetch first, then unlimited
+    local refspec="${1}:refs/remotes/origin/${1}"
+    git fetch --no-tags --depth=50 origin "$refspec" >/dev/null 2>&1 && return 0
+    git fetch --no-tags origin "$refspec" >/dev/null 2>&1
+}
+
+# Print the PR's GitHub label names, one per line (nothing for non-PR builds).
+get_pr_labels() {
+    [[ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]] || return 0
+    # Best effort (errors => no labels); timeouts stop a stalled API call hanging CI.
+    curl -fsSL --connect-timeout 10 --max-time 30 \
+        "https://api.github.com/repos/vllm-project/vllm/pulls/${BUILDKITE_PULL_REQUEST}" 2>/dev/null |
+        jq -r '.labels[].name' 2>/dev/null || true
+}
+
+# Collapse a newline-separated file list into a single '|'-joined line.
+join_file_diff() {
+    local diff_text="${1:-}"
+    # Empty input produces no output at all (not even a newline).
+    [[ -n "$diff_text" ]] || return 0
+    printf '%s\n' "$diff_text" | tr -d '\r' | paste -sd'|' -
+}
+
 # ---------------------------------------------------------------------------
 # Git setup: ensure origin/main is available and compute merge base once.
 # On K8s (blobless clones with --filter=blob:none), origin/main may not be
@@ -35,9 +61,14 @@ fi
 # ---------------------------------------------------------------------------
 git config --global --add safe.directory "$(pwd)" 2>/dev/null || true
 
+if [[ "$(git rev-parse --is-shallow-repository 2>/dev/null)" == "true" ]]; then
+    echo "Shallow repository detected, deepening history..."
+    git fetch --no-tags --deepen=50 origin >/dev/null 2>&1 || true  # best effort
+fi
+
 if ! git rev-parse --verify origin/main >/dev/null 2>&1; then
     echo "origin/main not found, fetching..."
-    git fetch origin main --depth=1 2>/dev/null || git fetch origin main || true
+    fetch_origin_ref main || true
 fi
 
 if [[ -z "${MERGE_BASE_COMMIT:-}" ]]; then
@@ -49,15 +80,11 @@ if [[ -z "${MERGE_BASE_COMMIT:-}" ]]; then
     fi
 fi
 
-# ---------------------------------------------------------------------------
-# Helper functions
-# ---------------------------------------------------------------------------
-
 fail_fast() {
     DISABLE_LABEL="ci-no-fail-fast"
     # If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
     if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
-        PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
+        PR_LABELS=$(get_pr_labels)
         if [[ $PR_LABELS == *"$DISABLE_LABEL"* ]]; then
             echo false
         else
@@ -72,7 +99,7 @@ check_run_all_label() {
     RUN_ALL_LABEL="ready-run-all-tests"
     # If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
     if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
-        PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
+        PR_LABELS=$(get_pr_labels)
         if [[ $PR_LABELS == *"$RUN_ALL_LABEL"* ]]; then
             echo true
         else
@@ -107,16 +134,20 @@ upload_pipeline() {
     # Install minijinja
     ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI'
     curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh
-    source "$HOME/.cargo/env"
+    TEMPLATE_PATH=".buildkite/test-template-amd.j2"
+    CARGO_ENV="${CARGO_HOME:-$HOME/.cargo}/env"
+    if [[ ! -f "$CARGO_ENV" ]]; then
+        echo "Error: Cargo env file not found at $CARGO_ENV"
+        exit 1
+    fi
+    # shellcheck disable=SC1090
+    source "$CARGO_ENV"
 
     if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
         AMD_MIRROR_HW="amdtentative"
-        curl -o .buildkite/test-template.j2 \
-            "https://raw.githubusercontent.com/vllm-project/ci-infra/$VLLM_CI_BRANCH/buildkite/test-template-amd.j2?$(date +%s)"
-    else
-        curl -o .buildkite/test-template.j2 \
-            "https://raw.githubusercontent.com/vllm-project/ci-infra/$VLLM_CI_BRANCH/buildkite/test-template-amd.j2?$(date +%s)"
     fi
+    curl -fsSL -o "$TEMPLATE_PATH" \
+        "https://raw.githubusercontent.com/vllm-project/ci-infra/$VLLM_CI_BRANCH/buildkite/test-template-amd.j2?$(date +%s)"
 
 
     # (WIP) Use pipeline generator instead of jinja template
@@ -137,7 +168,7 @@ upload_pipeline() {
     (
         set -x
         # Output pipeline.yaml with all blank lines removed
-        minijinja-cli test-template.j2 test-amd.yaml \
+        minijinja-cli test-template-amd.j2 test-amd.yaml \
             -D branch="$BUILDKITE_BRANCH" \
             -D list_file_diff="$LIST_FILE_DIFF" \
             -D run_all="$RUN_ALL" \
@@ -160,9 +191,26 @@ upload_pipeline() {
 # ---------------------------------------------------------------------------
 # Compute file diff
 # ---------------------------------------------------------------------------
-file_diff=$(get_diff)
+diff_unavailable=0
+if [[ $BUILDKITE_BRANCH == "main" ]] && ! git rev-parse --verify HEAD~1 >/dev/null 2>&1; then
+    echo "HEAD~1 not available on main, fetching one more commit..."
+    git fetch --no-tags --deepen=1 origin >/dev/null 2>&1 || true
+    # Re-check after the fetch; if the parent is still missing, run everything.
+    if ! git rev-parse --verify HEAD~1 >/dev/null 2>&1; then
+        echo "WARNING: Could not resolve HEAD~1 on main, falling back to run_all=1"
+        RUN_ALL=1
+        diff_unavailable=1
+    fi
+fi
+
 if [[ $BUILDKITE_BRANCH == "main" ]]; then
-    file_diff=$(get_diff_main)
+    if [[ $diff_unavailable -eq 1 ]]; then
+        file_diff=""
+    else
+        file_diff=$(get_diff_main)
+    fi
+else
+    file_diff=$(get_diff)
 fi
 
 # ----------------------------------------------------------------------
@@ -183,13 +231,13 @@ if [[ "${DOCS_ONLY_DISABLE}" != "1" ]]; then
         docs_only=0
         break
       fi
-    done < <(printf '%s\n' "$file_diff" | tr ' ' '\n' | tr -d '\r')
+    done < <(printf '%s\n' "$file_diff" | tr -d '\r')
 
     if [[ "$docs_only" -eq 1 ]]; then
       buildkite-agent annotate ":memo: CI skipped — docs/Markdown/mkdocs-only changes detected
 
 \`\`\`
-$(printf '%s\n' "$file_diff" | tr ' ' '\n')
+$(printf '%s\n' "$file_diff" | tr -d '\r')
 \`\`\`" --style "info" || true
       echo "[docs-only] All changes are docs/**, *.md, or mkdocs.yaml. Exiting before pipeline upload."
       exit 0
@@ -206,12 +254,9 @@ patterns=(
     "docker/Dockerfile.rocm_base"
     "CMakeLists.txt"
     "requirements/common.txt"
-    "requirements/cuda.txt"
-    "requirements/build.txt"
-    "requirements/test.txt"
     "requirements/rocm.txt"
-    "requirements/rocm-build.txt"
-    "requirements/rocm-test.txt"
+    "requirements/build/rocm.txt"
+    "requirements/test/rocm.txt"
     "setup.py"
     "csrc/"
     "cmake/"
@@ -219,12 +264,11 @@ patterns=(
 
 ignore_patterns=(
     "csrc/cpu"
-    "csrc/rocm"
-    "cmake/hipify.py"
     "cmake/cpu_extension.cmake"
 )
 
-for file in $file_diff; do
+while IFS= read -r file; do
+    [[ -z "$file" ]] && continue
     # First check if file matches any pattern
     matches_pattern=0
     for pattern in "${patterns[@]}"; do
@@ -250,7 +294,7 @@ for file in $file_diff; do
             break
         fi
     fi
-done
+done < <(printf '%s\n' "$file_diff" | tr -d '\r')
 
 # Check for ready-run-all-tests label
 LABEL_RUN_ALL=$(check_run_all_label)
@@ -279,7 +323,7 @@ fi
 if [[ $RUN_ALL -eq 1 ]]; then
     LIST_FILE_DIFF="run_all"
 else
-    LIST_FILE_DIFF=$(echo "$file_diff" | tr ' ' '|')
+    LIST_FILE_DIFF=$(join_file_diff "$file_diff")
 fi
 
 upload_pipeline
diff --git a/buildkite/pipeline_generator/buildkite_step.py b/buildkite/pipeline_generator/buildkite_step.py
index 9830a698..e4710a03 100644
--- a/buildkite/pipeline_generator/buildkite_step.py
+++ b/buildkite/pipeline_generator/buildkite_step.py
@@ -27,7 +27,9 @@ class BuildkiteCommandStep(BaseModel):
     def to_yaml(self):
         return {
             "label": self.label,
+            "key": self.key,
             "group": self.group,
+            "agents": self.agents,
             "commands": self.commands,
             "depends_on": self.depends_on,
             "soft_fail": self.soft_fail,
@@ -267,16 +269,16 @@ def convert_group_step_to_buildkite_step(
 
             # Create AMD mirror step and its block step if specified/applicable
             if step.mirror and step.mirror.get("amd"):
-                amd_block_step = None
+                amd_step = _create_amd_mirror_step(step, step_commands, step.mirror["amd"])
+                # Block step depends on the shared AMD image build.
+                mirror_build_dep = amd_step.depends_on[0] if amd_step.depends_on else "image-build-amd"
                 amd_block_step = BuildkiteBlockStep(
                     block=f"Run AMD: {step.label}",
-                    depends_on=["image-build-amd"],
+                    depends_on=[mirror_build_dep],
                     key=f"block-amd-{_generate_step_key(step.label)}",
                 )
                 amd_mirror_steps.append(amd_block_step)
-                amd_step = _create_amd_mirror_step(step, step_commands, step.mirror["amd"])
-                if amd_block_step:
-                    amd_step.depends_on.extend([amd_block_step.key])
+                amd_step.depends_on.append(amd_block_step.key)
                 amd_mirror_steps.append(amd_step)
 
         buildkite_group_steps.append(
@@ -304,6 +306,14 @@ def _step_should_run(step: Step, list_file_diff: List[str]) -> bool:
         return False
     global_config = get_global_config()
     if step.key and step.key.startswith("image-build"):
+        # The shared AMD image build stays on-demand for non-main branches,
+        # except on scheduled nightlies where it should run automatically.
+        if (
+            step.key == "image-build-amd"
+            and global_config["branch"] != "main"
+            and global_config["nightly"] != "1"
+        ):
+            return False
         return True
     if global_config["nightly"] == "1":
         return True
@@ -377,6 +387,8 @@ def _create_amd_mirror_step(step: Step, original_commands: List[str], amd: Dict[
         DeviceType.AMD_MI355_8: AgentQueue.AMD_MI355_8,
     }
 
+    build_dep = "image-build-amd"
+
     amd_queue = amd_queue_map.get(amd_device)
     if not amd_queue:
         raise ValueError(f"Invalid AMD device: {amd_device}. Valid devices: {list(amd_queue_map.keys())}")
@@ -384,9 +396,21 @@ def _create_amd_mirror_step(step: Step, original_commands: List[str], amd: Dict[
     return BuildkiteCommandStep(
         label=amd_label,
         commands=[amd_command_wrapped],
-        depends_on=["image-build-amd"],
+        depends_on=[build_dep],
         agents={"queue": amd_queue},
-        env={"DOCKER_BUILDKIT": "1", "VLLM_TEST_COMMANDS": amd_commands_str},
+        env={
+            "DOCKER_BUILDKIT": "1",
+            # Agent hooks read DOCKER_IMAGE_NAME before run-amd-test.py starts.
+            # Keep the hook warmup on ci_base; the runner uses the full image
+            # only if ci_base or artifact setup fails before tests begin.
+            "DOCKER_IMAGE_NAME": "rocm/vllm-dev:ci_base",
+            "VLLM_CI_BASE_IMAGE": "rocm/vllm-dev:ci_base",
+            "VLLM_CI_FALLBACK_IMAGE": "rocm/vllm-ci:$BUILDKITE_COMMIT",
+            "VLLM_CI_USE_ARTIFACTS": "1",
+            "VLLM_CI_ARTIFACT_GLOB": "artifacts/vllm-rocm-install/vllm-rocm-install.tar.gz",
+            "VLLM_CI_RESULTS_ROOT": "/home/buildkite-agent/huggingface/amd-ci-results",
+            "VLLM_TEST_COMMANDS": amd_commands_str,
+        },
         priority=200,
         soft_fail=False,
         retry=None,
diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2
index 1b222968..31c1976a 100644
--- a/buildkite/test-template-amd.j2
+++ b/buildkite/test-template-amd.j2
@@ -1,4 +1,3 @@
-{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %}
 {% set default_working_dir = "/vllm-workspace/tests" %}
 {% set list_file_diff = list_file_diff | replace("\r", "") | replace("\n", "|") | split("|") %}
 
@@ -21,39 +20,66 @@
   - group: "AMD Tests"
     depends_on: ~
     steps:
-      - label: "AMD: :docker: build image"
+      - label: "AMD: :docker: ensure ci_base"
         depends_on: ~
         soft_fail: false
         commands:
-          # Handle the introduction of test target in Dockerfile.rocm
+          - bash .buildkite/scripts/ci-bake-rocm.sh ci-base-rocm-ci-with-deps
+        key: "amd-ci-base-ready"
+        env:
+          DOCKER_BUILDKIT: "1"
+          CI_BASE_CONTENT_FILES: "requirements/common.txt requirements/rocm.txt requirements/test/rocm.txt docker/Dockerfile.rocm_base tools/install_torchcodec_rocm.sh tests/vllm_test_utils"
+          CI_BASE_DOCKERFILE: "docker/Dockerfile.rocm"
+          CI_BASE_DOCKERFILE_STAGES: "base build_rixl build_rocshmem build_deepep mori_base ci_base"
+          IMAGE_TAG: "rocm/vllm-dev:ci_base"
+          CI_BASE_IMAGE_TAG: "rocm/vllm-dev:ci_base"
+          VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
+          CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/{{ vllm_ci_branch | default('main') }}/docker/ci-rocm.hcl"
+          PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950"
+          REMOTE_VLLM: "1"
+          VLLM_BRANCH: "$BUILDKITE_COMMIT"
+{{ amd_infra_retry('        ') }}
+        agents:
+          queue: amd-cpu
+
+      {% if branch != "main" and nightly != "1" %}
+      - block: "Run AMD ROCm image build"
+        depends_on: "amd-ci-base-ready"
+        key: "block-amd-build"
+      {% endif %}
+
+      - label: "AMD: :docker: build image"
+        {% if branch == "main" or nightly == "1" %}
+        depends_on: "amd-ci-base-ready"
+        {% else %}
+        depends_on: "block-amd-build"
+        {% endif %}
+        soft_fail: false
+        commands:
           - |
-            #!/bin/bash
-            if docker manifest inspect {{ docker_image_amd }} >/dev/null 2>&1; then
-              echo "Image already exists: {{ docker_image_amd }}"
-              echo "Skipping AMD image build"
-              exit 0
+            if [[ "${ROCM_CI_ARTIFACT_ONLY:-0}" == "1" ]]; then
+              echo "ROCM_CI_ARTIFACT_ONLY=1; building ROCm wheel artifact only"
+              IMAGE_TAG="" bash .buildkite/scripts/ci-bake-rocm.sh test-rocm-ci-with-artifacts
+            else
+              bash .buildkite/scripts/ci-bake-rocm.sh test-rocm-ci-with-wheel
             fi
-            echo "Image not found, proceeding with build"
-          - >
-            docker build
-            --build-arg REMOTE_VLLM=1
-            --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
-            --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
-            --tag {{ docker_image_amd }}
-            -f docker/Dockerfile.rocm
-            --target test
-            --progress plain .
-          - "docker push {{ docker_image_amd }}"
         key: "amd-build"
         env:
           DOCKER_BUILDKIT: "1"
+          VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
+          CI_HCL_URL: "https://raw.githubusercontent.com/vllm-project/ci-infra/{{ vllm_ci_branch | default('main') }}/docker/ci-rocm.hcl"
+          PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950"
+          CI_BASE_IMAGE: "rocm/vllm-dev:ci_base"
+          IMAGE_TAG: "rocm/vllm-ci:$BUILDKITE_COMMIT"
+          REMOTE_VLLM: "1"
+          VLLM_BRANCH: "$BUILDKITE_COMMIT"
 {{ amd_infra_retry('        ') }}
         agents:
           queue: amd-cpu
 
     {% for step in steps %}
     {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
-    
+
     {% set ns = namespace(blocked=1) %}
     {% set step_slug = amd_step_slug(step) %}
     {% set container_timeout_s = (step.timeout_in_minutes * 60 - 60) if step.timeout_in_minutes and step.timeout_in_minutes > 1 else (step.timeout_in_minutes * 60 if step.timeout_in_minutes else none) %}
@@ -98,10 +124,30 @@
         command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh
         env:
           DOCKER_BUILDKIT: "1"
+          NUM_NODES: "{{ step.num_nodes or 1 }}"
+          # Agent hooks read DOCKER_IMAGE_NAME before run-amd-test.py starts.
+          # Keep the hook warmup on ci_base; the runner uses the full image only
+          # if ci_base or artifact setup fails before tests begin.
+          DOCKER_IMAGE_NAME: "rocm/vllm-dev:ci_base"
+          VLLM_CI_BASE_IMAGE: "rocm/vllm-dev:ci_base"
+          VLLM_CI_FALLBACK_IMAGE: "rocm/vllm-ci:$BUILDKITE_COMMIT"
+          VLLM_CI_USE_ARTIFACTS: "1"
+          VLLM_CI_ARTIFACT_GLOB: "artifacts/vllm-rocm-install/vllm-rocm-install.tar.gz"
+          VLLM_CI_RESULTS_ROOT: "/home/buildkite-agent/huggingface/amd-ci-results"
           {% if container_timeout_s %}
           CONTAINER_TIMEOUT_S: "{{ container_timeout_s }}"
           {% endif %}
+          {% if step.num_nodes and step.num_nodes >= 2 %}
+          VLLM_CI_EXECUTION_MODE: "multi-node"
+          VLLM_NUM_GPUS_PER_NODE: "{{ step.num_gpus or 1 }}"
+          VLLM_NODE_COMMAND_COUNT: "{{ step.commands | length }}"
+          {% for node_command in step.commands %}
+          VLLM_NODE_COMMAND_{{ loop.index0 }}: "(command rocm-smi || true) && export VLLM_TEST_GROUP_NAME={{ step_slug }}-node{{ loop.index0 }} && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ node_command | replace('\"', '\\\"') | safe }}"
+          {% endfor %}
+          {% else %}
+          VLLM_CI_EXECUTION_MODE: "single-node"
           VLLM_TEST_COMMANDS: "(command rocm-smi || true) && export VLLM_TEST_GROUP_NAME={{ step_slug }} && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ (step.command or (step.commands | join(' && '))) | replace('\"', '\\\"') | safe }}"
+          {% endif %}
         priority: 100
         {% if step.grade and step.grade == "Blocking" %}
         soft_fail: false
diff --git a/docker/ci-rocm.hcl b/docker/ci-rocm.hcl
new file mode 100644
index 00000000..41234a4a
--- /dev/null
+++ b/docker/ci-rocm.hcl
@@ -0,0 +1,337 @@
+# ci-rocm.hcl - CI-specific configuration for vLLM ROCm Docker builds
+#
+# This file lives in ci-infra repo at docker/ci-rocm.hcl
+# Used with: docker buildx bake -f docker/docker-bake-rocm.hcl -f ci-rocm.hcl test-rocm-ci
+#
+# Registry cache: Docker Hub (rocm/vllm-ci-cache) is used exclusively.
+# AMD build agents already have Docker Hub credentials (they push the test
+# image to rocm/vllm-ci), so no additional credential setup is required.
+# ROCm CI does not use a separate remote compiler cache.
+
+# CI metadata
+
+variable "BUILDKITE_COMMIT" {  # keys the per-commit rocm-* / csrc-rocm-* cache refs; also the default for COMMIT
+  default = ""
+}
+
+variable "BUILDKITE_BUILD_NUMBER" {  # recorded as a manifest annotation by the _ci-rocm target
+  default = ""
+}
+
+variable "BUILDKITE_BUILD_ID" {  # recorded as a manifest annotation by the _ci-rocm target
+  default = ""
+}
+
+variable "PARENT_COMMIT" {  # only used to build cache-from refs; empty means no parent-commit import
+  default = ""
+}
+
+# Merge-base of HEAD with main - provides a more stable cache fallback than
+# parent commit for long-lived PRs. Mirrors the VLLM_MERGE_BASE_COMMIT
+# pattern used in the shared ci.hcl file. Auto-computed by ci-bake-rocm.sh
+# when unset.
+variable "VLLM_MERGE_BASE_COMMIT" {
+  default = ""
+}
+
+# Bridge to vLLM's COMMIT variable for OCI labels
+variable "COMMIT" {
+  default = BUILDKITE_COMMIT
+}
+
+# Image tags (set by CI)
+
+variable "IMAGE_TAG" {
+  default = ""
+}
+
+variable "IMAGE_TAG_LATEST" {
+  default = ""
+}
+
+# ROCm-specific GPU architecture targets
+
+variable "PYTORCH_ROCM_ARCH" {
+  default = "gfx90a;gfx942;gfx950"
+}
+
+# Pre-built CI base image (Tier 1). Per-PR builds pull this instead of
+# rebuilding RIXL/DeepEP/torchcodec from scratch. The ci_base stage in
+# Dockerfile.rocm inherits from base, so CI_BASE_IMAGE only affects the test
+# stage and is irrelevant when building --target ci_base itself.
+variable "CI_BASE_IMAGE" {
+  default = "rocm/vllm-dev:ci_base"
+}
+
+# Leave CI_MAX_JOBS empty so the Dockerfile falls back to $(nproc) and uses
+# the full builder parallelism. Operators can still override this per build.
+variable "CI_MAX_JOBS" {
+  default = ""
+}
+
+# Upstream dependency commit pins -- extracted from Dockerfile.rocm by
+# ci-bake-rocm.sh at build time. Empty defaults are safe: the cache
+# functions produce no entries when the variable is empty.
+variable "RIXL_BRANCH" {
+  default = ""
+}
+
+variable "UCX_BRANCH" {
+  default = ""
+}
+
+variable "ROCSHMEM_BRANCH" {
+  default = ""
+}
+
+variable "DEEPEP_BRANCH" {
+  default = ""
+}
+
+# Docker Hub registry cache for AMD builds.
+#
+# A separate repo (rocm/vllm-ci-cache) is used for BuildKit layer cache.
+# cache-to uses mode=min to reduce the volume of data pushed.
+# NOTE: mode=min still includes all layers referenced by the final image
+# manifest, including inherited base layers (~7.25GB ROCm runtime).
+# Docker Hub auto-creates the repo on first push.
+#
+# Final-image cache is exported per-commit and, when ROCM_CACHE_BRANCH_TAG is
+# set, per-branch; imports also try parent, merge-base and upstream-branch refs.
+#
+# The source-scoped native cache is exported both per-commit and per-branch so
+# ROCm extension rebuilds are shareable within the same commit reruns and across
+# consecutive commits on the same branch without depending on a single global
+# latest tag.
+
+variable "DOCKERHUB_CACHE_REPO" {  # repository part of every registry cache ref built in this file
+  default = "rocm/vllm-ci-cache"
+}
+
+variable "DOCKERHUB_CACHE_TO" {  # NOTE(review): not referenced anywhere in this file - confirm it is still needed
+  default = ""
+}
+
+variable "ROCM_CACHE_BRANCH_TAG" {  # suffix of the *-branch-<tag> refs (imported and exported); empty disables them
+  default = ""
+}
+
+variable "ROCM_CACHE_UPSTREAM_BRANCH_TAG" {  # suffix of *-branch-<tag> refs that are imported only, never exported
+  default = ""
+}
+
+# Functions
+
+function "get_cache_from_rocm" {  # cache-from list for the full image; refs whose variable is empty are dropped by compact()
+  params = []
+  result = compact([
+    # Exact commit hit - fastest cache on re-runs of the same commit
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT}" : "",
+    # Parent commit - useful cache for incremental changes
+    PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${PARENT_COMMIT}" : "",
+    # Merge-base with main - stable fallback for long-lived or rebased PRs;
+    # maps to a real main-branch commit whose cache layers are likely warm
+    VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${VLLM_MERGE_BASE_COMMIT}" : "",
+    # Import the source-scoped native build cache as well so builds whose
+    # Python/package layers changed can still reuse compiled ROCm objects.
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT}" : "",
+    PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${PARENT_COMMIT}" : "",
+    VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${VLLM_MERGE_BASE_COMMIT}" : "",
+    ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "",
+    ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "",
+    # Branch-scoped full image cache - fallback when parent-commit cache is evicted
+    ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "",
+    ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "",
+  ])
+}
+
+function "get_cache_to_rocm" {  # cache-to list (mode=min) for the full image: one per-commit ref, one per-branch ref
+  params = []
+  result = compact([
+    # Commit-scoped cache for exact re-runs.
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT},mode=min" : "",
+    # Branch-scoped cache so later commits on the same branch can reuse the full
+    # image layers when the parent-commit cache is evicted. Unlike the old
+    # rocm-latest tag (which caused duplicate exporter 400s), this is per-branch.
+    ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=min" : "",
+  ])
+}
+
+function "get_cache_from_rocm_csrc" {  # cache-from list restricted to the csrc-rocm-* refs; used by the csrc-rocm-ci target
+  params = []
+  result = compact([  # empty-variable entries evaluate to "" and are removed
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT}" : "",
+    PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${PARENT_COMMIT}" : "",
+    VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${VLLM_MERGE_BASE_COMMIT}" : "",
+    ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "",
+    ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "",
+  ])
+}
+
+function "get_cache_to_rocm_csrc" {  # cache-to list (mode=min) for the csrc-rocm-* refs; used by the csrc-rocm-ci target
+  params = []
+  result = compact([
+    # Export the exact-commit native cache for same-commit reruns.
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT},mode=min" : "",
+    # Export the branch-scoped native cache so later commits on the same branch
+    # can reuse compiled ROCm objects even when the exact parent cache is absent.
+    ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=min" : "",
+  ])
+}
+
+# Cache functions for upstream dependency stages (RIXL/UCX, ROCShmem, DeepEP).
+# These stages are pinned to specific upstream commit hashes, so cache keys use
+# those hashes rather than the Buildkite commit. This means the cache persists
+# across all vLLM commits as long as the upstream dependency pins don't change.
+
+function "get_cache_from_rocm_deps" {  # cache-from list covering all three dependency stages at once
+  params = []
+  result = compact([
+    RIXL_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH}" : "",  # key embeds the RIXL and UCX pins
+    ROCSHMEM_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_BRANCH}" : "",
+    DEEPEP_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH}" : "",  # key embeds the DeepEP and ROCShmem pins
+  ])
+}
+
+function "get_cache_to_rocm_rixl" {  # cache-to ref for the RIXL stage; same key as the import side, plus mode=min
+  params = []
+  result = compact([  # yields an empty list (no export) when RIXL_BRANCH is empty
+    RIXL_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH},mode=min" : "",
+  ])
+}
+
+function "get_cache_to_rocm_rocshmem" {  # cache-to ref for the ROCShmem stage; same key as the import side, plus mode=min
+  params = []
+  result = compact([  # yields an empty list (no export) when ROCSHMEM_BRANCH is empty
+    ROCSHMEM_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_BRANCH},mode=min" : "",
+  ])
+}
+
+function "get_cache_to_rocm_deepep" {  # cache-to ref for the DeepEP stage; same key as the import side, plus mode=min
+  params = []
+  result = compact([  # yields an empty list (no export) when DEEPEP_BRANCH is empty
+    DEEPEP_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH},mode=min" : "",
+  ])
+}
+
+# CI targets
+
+target "_ci-rocm" {  # shared mixin: every CI target in this file lists it in inherits
+  annotations = [
+    "manifest:vllm.buildkite.build_number=${BUILDKITE_BUILD_NUMBER}",
+    "manifest:vllm.buildkite.build_id=${BUILDKITE_BUILD_ID}",
+  ]
+  args = {
+    ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH
+    CI_BASE_IMAGE         = CI_BASE_IMAGE
+    max_jobs              = CI_MAX_JOBS  # empty string unless CI_MAX_JOBS is overridden
+  }
+}
+
+target "test-rocm-ci" {  # builds the Dockerfile's test stage and pushes it under the CI image tags
+  inherits   = ["_common-rocm", "_ci-rocm", "_labels"]  # _common-rocm/_labels are not defined here; presumably from docker/docker-bake-rocm.hcl
+  target     = "test"
+  cache-from = get_cache_from_rocm()
+  cache-to   = get_cache_to_rocm()
+  tags = compact([  # either tag may be empty; compact() drops the empty ones
+    IMAGE_TAG,
+    IMAGE_TAG_LATEST,
+  ])
+  output = ["type=registry"]
+}
+
+# Cache-only target for the source-scoped ROCm native build stage.
+# This persists the csrc-build stage in the registry cache even though the
+# final test image only consumes it indirectly while packaging the wheel.
+target "csrc-rocm-ci" {
+  inherits   = ["_common-rocm", "_ci-rocm"]
+  target     = "csrc-build"
+  cache-from = get_cache_from_rocm_csrc()
+  cache-to   = get_cache_to_rocm_csrc()
+  output     = ["type=cacheonly"]  # no image or files are produced; only the cache-to export matters
+}
+
+# Keep wheel export on the same CI graph as the test image build so the
+# shared build_vllm/export_vllm stages resolve identically within one bake
+# invocation. Without this, export-wheel-rocm uses the plain local target
+# args while test-rocm-ci uses CI-only args, which can lead to separate
+# cache lineages and inconsistent export_vllm results.
+target "export-wheel-rocm" {
+  inherits   = ["_common-rocm", "_ci-rocm"]
+  target     = "export_vllm"
+  cache-from = get_cache_from_rocm()
+  cache-to   = get_cache_to_rocm()  # NOTE(review): same refs test-rocm-ci exports to, and both run in test-rocm-ci-with-wheel - confirm intended
+  output     = ["type=local,dest=./wheel-export"]  # stage output is written to the local ./wheel-export directory
+}
+
+# Artifact-only vLLM build. GPU test jobs consume this artifact on top of
+# ci_base, avoiding a per-commit multi-GB image push/pull.
+group "test-rocm-ci-with-artifacts" {
+  targets = ["csrc-rocm-ci", "export-wheel-rocm"]  # cache-only csrc build + local wheel export; test-rocm-ci is not included
+}
+
+# Full test image + wheel export. Kept for fallback/debugging when a pushed
+# per-commit image is useful.
+group "test-rocm-ci-with-wheel" {
+  targets = ["csrc-rocm-ci", "test-rocm-ci", "export-wheel-rocm"]  # the artifact targets plus the registry-pushed test image
+}
+
+# Image tag for the ci_base build. The ensure-ci-base step (in both the Jinja
+# template and amd.yaml) rebuilds this image when content-hash drift is detected.
+variable "CI_BASE_IMAGE_TAG" {
+  default = "rocm/vllm-dev:ci_base"
+}
+
+# Cache-only targets for upstream dependency stages. These persist each stage
+# in the registry cache keyed by its upstream commit hash. When ci_base rebuilds
+# (e.g., requirements change), these stages are cache hits if their upstream
+# pins haven't changed -- saving ~35min of compilation.
+target "rixl-rocm-ci" {  # cache-only build of the build_rixl stage
+  inherits   = ["_common-rocm", "_ci-rocm"]
+  target     = "build_rixl"
+  cache-from = get_cache_from_rocm_deps()  # imports all three dependency caches, not just RIXL's
+  cache-to   = get_cache_to_rocm_rixl()
+  output     = ["type=cacheonly"]
+}
+
+target "rocshmem-rocm-ci" {  # cache-only build of the build_rocshmem stage
+  inherits   = ["_common-rocm", "_ci-rocm"]
+  target     = "build_rocshmem"
+  cache-from = get_cache_from_rocm_deps()  # imports all three dependency caches, not just ROCShmem's
+  cache-to   = get_cache_to_rocm_rocshmem()
+  output     = ["type=cacheonly"]
+}
+
+target "deepep-rocm-ci" {  # cache-only build of the build_deepep stage
+  inherits   = ["_common-rocm", "_ci-rocm"]
+  target     = "build_deepep"
+  cache-from = get_cache_from_rocm_deps()  # imports all three dependency caches, not just DeepEP's
+  cache-to   = get_cache_to_rocm_deepep()
+  output     = ["type=cacheonly"]
+}
+
+# Builds only the ci_base stage (RIXL, DeepEP, torchcodec, etc.)
+# Invoked by the ensure-ci-base step when the content hash of ci_base-affecting
+# files drifts from the remote image label. Per-PR builds then pull the result
+# as CI_BASE_IMAGE instead of rebuilding those slow layers on every commit.
+# Uses inline cache metadata on the ci_base image itself instead of exporting a
+# separate registry cache artifact.
+target "ci-base-rocm-ci" {  # builds the ci_base stage and pushes it as CI_BASE_IMAGE_TAG
+  inherits   = ["_common-rocm", "_ci-rocm", "_labels"]
+  target     = "ci_base"
+  cache-from = concat(
+    ["type=registry,ref=${CI_BASE_IMAGE_TAG}"],  # the previously pushed ci_base image itself
+    # Import upstream dependency caches so RIXL/ROCShmem/DeepEP stages
+    # are cache hits even when ci_base itself needs rebuilding.
+    get_cache_from_rocm_deps(),
+  )
+  cache-to = ["type=inline"]  # cache metadata travels with the pushed image; no separate cache ref
+  tags     = [CI_BASE_IMAGE_TAG]
+  output   = ["type=registry"]
+}
+
+# Group for ci_base builds -- exports dependency stage caches alongside the
+# ci_base image so future rebuilds can reuse them independently.
+group "ci-base-rocm-ci-with-deps" {
+  targets = ["rixl-rocm-ci", "rocshmem-rocm-ci", "deepep-rocm-ci", "ci-base-rocm-ci"]  # three cache-only dependency stages + the pushed ci_base image
+}