From f93b99330bdcc09b095fd18e721ca15be18f085d Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 08:10:45 +0000 Subject: [PATCH 01/27] add Dockerfile.rocm_base build Signed-off-by: tjtanaa --- buildkite/bootstrap-amd.sh | 29 ++++ buildkite/test-template-amd.j2 | 263 ++++++++++++++++++++++++++++++++- 2 files changed, 286 insertions(+), 6 deletions(-) diff --git a/buildkite/bootstrap-amd.sh b/buildkite/bootstrap-amd.sh index ebb527a6..32d0cc1c 100644 --- a/buildkite/bootstrap-amd.sh +++ b/buildkite/bootstrap-amd.sh @@ -53,6 +53,21 @@ check_run_all_label() { fi } +compute_rocm_base_cache_key() { + local DOCKERFILE="docker/Dockerfile.rocm_base" + local CI_PYTHON_VERSION="${ROCM_CI_PYTHON_VERSION:-3.12}" + local CI_PYTORCH_ROCM_ARCH="${ROCM_CI_PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}" + + if [[ ! -f "$DOCKERFILE" ]]; then + echo "unknown" + return + fi + local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16) + local args_string="${CI_PYTHON_VERSION}|${CI_PYTORCH_ROCM_ARCH}" + local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8) + echo "${dockerfile_hash}-${args_hash}" +} + if [[ -z "${COV_ENABLED:-}" ]]; then COV_ENABLED=0 fi @@ -86,6 +101,9 @@ upload_pipeline() { echo "Nightly: $NIGHTLY" echo "AMD Mirror HW: $AMD_MIRROR_HW" + ROCM_BASE_CACHE_KEY=$(compute_rocm_base_cache_key) + echo "ROCm base cache key: $ROCM_BASE_CACHE_KEY" + FAIL_FAST=$(fail_fast) cd .buildkite @@ -103,6 +121,8 @@ upload_pipeline() { -D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \ -D cov_enabled="$COV_ENABLED" \ -D vllm_ci_branch="$VLLM_CI_BRANCH" \ + -D rocm_base_cache_key="$ROCM_BASE_CACHE_KEY" \ + -D rocm_base_changed="$ROCM_BASE_CHANGED" \ | sed '/^[[:space:]]*$/d' \ > pipeline.yaml ) @@ -212,6 +232,15 @@ for file in $file_diff; do fi done +ROCM_BASE_CHANGED=0 +for file in $file_diff; do + if [[ "$file" == "docker/Dockerfile.rocm_base" ]]; then + ROCM_BASE_CHANGED=1 + echo "Dockerfile.rocm_base changed in this PR" + break + fi +done + # Check for ready-run-all-tests label LABEL_RUN_ALL=$(check_run_all_label) if [[ $LABEL_RUN_ALL == true ]]; then diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index cdefb348..5049a492 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -9,6 +9,10 @@ {% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %} {% endif %} {% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} +{% set rocm_base_ecr_image = "public.ecr.aws/q9t5s3a7/vllm-release-repo" %} +{% set rocm_base_ecr_commit_tag = rocm_base_ecr_image ~ ":$BUILDKITE_COMMIT-" ~ rocm_base_cache_key ~ "-rocm-base" %} +{% set rocm_base_ecr_cache_tag = rocm_base_ecr_image ~ ":" ~ rocm_base_cache_key ~ "-rocm-base" %} +{% set rocm_base_ecr_nightly_tag = rocm_base_ecr_image ~ ":latest-rocm-base-nightly" %} {% set default_working_dir = "/vllm-workspace/tests" %} {% set hf_home = "/root/.cache/huggingface" %} {% set hf_home_efs = "/mnt/efs/hf_cache" %} @@ -284,17 +288,177 @@ plugins: - group: "AMD Tests" depends_on: ~ steps: - - label: "AMD: :docker: build image" +{% if branch != "main" %} + - label: "AMD: :git: Check Dockerfile.rocm_base freshness" depends_on: ~ + key: "amd-rocm-base-check" + commands: + - | + set -euo pipefail + echo "--- Checking Dockerfile.rocm_base git history" + git fetch origin main + MAIN_LATEST=$$(git log origin/main -1 --format='%H' -- docker/Dockerfile.rocm_base) + if [ -z "$$MAIN_LATEST" ]; then + echo "No commits found for Dockerfile.rocm_base on origin/main. Skipping check." + exit 0 + fi + echo "Latest commit on main for Dockerfile.rocm_base: $$MAIN_LATEST" + if git merge-base --is-ancestor "$$MAIN_LATEST" HEAD; then + echo "OK: Current branch contains the latest Dockerfile.rocm_base from main" + else + echo "ERROR: Branch does NOT contain the latest Dockerfile.rocm_base from main." + echo "Please rebase or merge main into your branch." + buildkite-agent annotate --style error \ + "Your branch is missing the latest Dockerfile.rocm_base changes from main (commit $$MAIN_LATEST). Please rebase or merge main." \ + --context "rocm-base-freshness" + exit 1 + fi + agents: + queue: amd-cpu + soft_fail: false +{% endif %} + + - label: "AMD: :docker: Build/Reuse ROCm base image" + depends_on: +{% if branch != "main" %} + - "amd-rocm-base-check" +{% else %} + - ~ +{% endif %} + key: "amd-rocm-base-build" + commands: + - | + set -euo pipefail + + CACHE_KEY="{{ rocm_base_cache_key }}" + ECR_CACHE_TAG="{{ rocm_base_ecr_cache_tag }}" + ECR_COMMIT_TAG="{{ rocm_base_ecr_commit_tag }}" + S3_BUCKET="vllm-wheels" + S3_CACHE_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}" + + echo "========================================" + echo "ROCm Base Image Build/Reuse" + echo " Cache Key: $${CACHE_KEY}" + echo " ECR Cache Tag: $${ECR_CACHE_TAG}" + echo " ECR Commit Tag: $${ECR_COMMIT_TAG}" + echo "========================================" + + # Login to ECR + aws ecr-public get-login-password --region us-east-1 | \ + docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 + + # Tier 1: Check ECR cache (fastest) + IMAGE_EXISTS=0 + if docker manifest inspect "$${ECR_CACHE_TAG}" > /dev/null 2>&1; then + IMAGE_EXISTS=1 + echo "ECR cache HIT: $${ECR_CACHE_TAG}" + fi + + if [ "$$IMAGE_EXISTS" -eq 1 ]; then + docker pull "$${ECR_CACHE_TAG}" + docker tag "$${ECR_CACHE_TAG}" "rocm/vllm-dev:base-ci" + else + # Tier 2: Check S3 cache + S3_IMAGE_EXISTS=0 + if aws s3 ls "$${S3_CACHE_PATH}/rocm-base-image.tar.gz" > /dev/null 2>&1; then + S3_IMAGE_EXISTS=1 + fi + + if [ "$$S3_IMAGE_EXISTS" -eq 1 ]; then + echo "S3 cache HIT. Downloading..." + mkdir -p /tmp/rocm-cache + aws s3 cp "$${S3_CACHE_PATH}/rocm-base-image.tar.gz" /tmp/rocm-cache/rocm-base-image.tar.gz + LOAD_OUTPUT=$$(gunzip -c /tmp/rocm-cache/rocm-base-image.tar.gz | docker load) + echo "$$LOAD_OUTPUT" + BASE_TAG=$$(echo "$$LOAD_OUTPUT" | grep "Loaded image:" | sed 's/Loaded image: //') + docker tag "$$BASE_TAG" "rocm/vllm-dev:base-ci" + rm -rf /tmp/rocm-cache + else + echo "CACHE MISS. Building Dockerfile.rocm_base from scratch..." + + DOCKER_BUILDKIT=1 docker buildx build \ + --file docker/Dockerfile.rocm_base \ + --tag "rocm/vllm-dev:base-ci" \ + --build-arg PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" \ + --build-arg PYTHON_VERSION="3.12" \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + --load \ + --progress plain \ + . + + # Upload to S3 for future cache hits + docker save rocm/vllm-dev:base-ci | gzip > /tmp/rocm-base-image.tar.gz + aws s3 cp /tmp/rocm-base-image.tar.gz "$${S3_CACHE_PATH}/rocm-base-image.tar.gz" + rm -f /tmp/rocm-base-image.tar.gz + + # Also upload base wheels to S3 cache + DOCKER_BUILDKIT=1 docker buildx build \ + --file docker/Dockerfile.rocm_base \ + --tag rocm-base-debs:ci \ + --target debs_wheel_release \ + --build-arg PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" \ + --build-arg PYTHON_VERSION="3.12" \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + --load \ + . + mkdir -p artifacts/rocm-base-wheels + cid=$$(docker create rocm-base-debs:ci) + docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/ + docker rm $${cid} + export PYTHON_VERSION=3.12 + export PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" + S3_BUCKET=vllm-wheels .buildkite/scripts/cache-rocm-base-wheels.sh upload + fi + + # Push to ECR (cache tag for future lookups + commit tag for traceability) + docker tag "rocm/vllm-dev:base-ci" "$${ECR_CACHE_TAG}" + docker push "$${ECR_CACHE_TAG}" + fi + + # Always push the commit-specific tag + docker tag "rocm/vllm-dev:base-ci" "$${ECR_COMMIT_TAG}" + docker push "$${ECR_COMMIT_TAG}" + +{% if branch == "main" %} + # On main, also tag as latest nightly + docker tag "rocm/vllm-dev:base-ci" "{{ rocm_base_ecr_nightly_tag }}" + docker push "{{ rocm_base_ecr_nightly_tag }}" + echo "Pushed nightly tag: {{ rocm_base_ecr_nightly_tag }}" +{% endif %} + + echo "Base image ready: rocm/vllm-dev:base-ci" + agents: + queue: amd-cpu + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 + limit: 2 + - exit_status: -10 + limit: 2 + - exit_status: 1 + limit: 1 + + - label: "AMD: :docker: build image" + depends_on: amd-rocm-base-build soft_fail: false commands: - # Handle the introduction of test target in Dockerfile.rocm + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > docker build --build-arg max_jobs=16 --build-arg REMOTE_VLLM=1 --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT + --build-arg BASE_IMAGE=rocm/vllm-dev:base-ci --tag {{ docker_image_amd }} -f docker/Dockerfile.rocm --target test @@ -306,13 +470,13 @@ plugins: DOCKER_BUILDKIT: "1" retry: automatic: - - exit_status: -1 # Agent was lost + - exit_status: -1 limit: 2 - - exit_status: -10 # Agent was lost + - exit_status: -10 limit: 2 - - exit_status: 128 # Git connectivity issues + - exit_status: 128 limit: 2 - - exit_status: 1 # Machine occasionally fail + - exit_status: 1 limit: 1 agents: queue: amd-cpu @@ -378,3 +542,90 @@ plugins: limit: 2 {% endif %} {% endfor %} + +{% if branch == "main" %} + - label: "AMD: :rocket: Build ROCm nightly release image" + depends_on: amd-build + key: "amd-nightly-release-image" + soft_fail: true + agents: + queue: amd-cpu + commands: + - | + set -euo pipefail + aws ecr-public get-login-password --region us-east-1 | \ + docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 + docker pull "{{ rocm_base_ecr_commit_tag }}" + docker tag "{{ rocm_base_ecr_commit_tag }}" rocm/vllm-dev:base-ci + + DOCKER_BUILDKIT=1 docker build \ + --build-arg max_jobs=16 \ + --build-arg BASE_IMAGE=rocm/vllm-dev:base-ci \ + --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$$BUILDKITE_COMMIT-rocm \ + --target vllm-openai \ + --progress plain \ + -f docker/Dockerfile.rocm . + docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$$BUILDKITE_COMMIT-rocm + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 + limit: 2 + + - label: "AMD: :python: Build ROCm nightly wheel" + depends_on: amd-rocm-base-build + key: "amd-nightly-wheel" + soft_fail: true + agents: + queue: amd-cpu + commands: + - | + set -euo pipefail + aws ecr-public get-login-password --region us-east-1 | \ + docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 + docker pull "{{ rocm_base_ecr_commit_tag }}" + docker tag "{{ rocm_base_ecr_commit_tag }}" rocm/vllm-dev:base-ci + + # Download base wheels from S3 cache + export PYTHON_VERSION=3.12 + export PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" + export S3_BUCKET=vllm-wheels + .buildkite/scripts/cache-rocm-base-wheels.sh download + + mkdir -p docker/context/base-wheels + cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/ + + git fetch --tags --force origin + + DOCKER_BUILDKIT=1 docker build \ + --file docker/Dockerfile.rocm \ + --target export_vllm_wheel_release \ + --output type=local,dest=rocm-dist \ + --build-arg BASE_IMAGE=rocm/vllm-dev:base-ci \ + --build-arg ARG_PYTORCH_ROCM_ARCH="gfx90a;gfx942" \ + --build-arg REMOTE_VLLM=0 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + . + + mkdir -p artifacts/rocm-vllm-wheel + cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/ + bash .buildkite/scripts/upload-rocm-wheels.sh + env: + DOCKER_BUILDKIT: "1" + ROCM_UPLOAD_WHEELS: "true" + S3_BUCKET: "vllm-wheels" + retry: + automatic: + - exit_status: -1 + limit: 2 +{% endif %} From a24755a05134a2f15bfdeb755ab05caa8b771540 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 24 Feb 2026 10:06:04 +0000 Subject: [PATCH 02/27] change queue machine and also grep the gpu arch from dockerfile.rocm_base Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index f8458f3c..8a376f93 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -376,10 +376,13 @@ plugins: else echo "CACHE MISS. Building Dockerfile.rocm_base from scratch..." + PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//') + echo "Using PYTORCH_ROCM_ARCH from Dockerfile.rocm_base: $${PYTORCH_ROCM_ARCH}" + DOCKER_BUILDKIT=1 docker buildx build \ --file docker/Dockerfile.rocm_base \ --tag "rocm/vllm-dev:base-ci" \ - --build-arg PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" \ + --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ --build-arg PYTHON_VERSION="3.12" \ --build-arg USE_SCCACHE=1 \ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ @@ -399,7 +402,7 @@ plugins: --file docker/Dockerfile.rocm_base \ --tag rocm-base-debs:ci \ --target debs_wheel_release \ - --build-arg PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" \ + --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ --build-arg PYTHON_VERSION="3.12" \ --build-arg USE_SCCACHE=1 \ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ @@ -412,7 +415,7 @@ plugins: docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/ docker rm $${cid} export PYTHON_VERSION=3.12 - export PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" + export PYTORCH_ROCM_ARCH S3_BUCKET=vllm-wheels .buildkite/scripts/cache-rocm-base-wheels.sh upload fi @@ -479,7 +482,7 @@ plugins: - exit_status: 1 limit: 1 agents: - queue: amd-cpu + queue: cpu_queue_postmerge {% for step in steps %} {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %} @@ -594,7 +597,9 @@ plugins: # Download base wheels from S3 cache export PYTHON_VERSION=3.12 - export PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" + PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//') + echo "Using PYTORCH_ROCM_ARCH from Dockerfile.rocm_base: $${PYTORCH_ROCM_ARCH}" + export PYTORCH_ROCM_ARCH export S3_BUCKET=vllm-wheels .buildkite/scripts/cache-rocm-base-wheels.sh download From a67024841d61947a8e94196d70efae44a8500033 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 24 Feb 2026 10:30:57 +0000 Subject: [PATCH 03/27] change from amd-cpu to cpu_post_merge Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 8a376f93..48b74578 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -437,7 +437,7 @@ plugins: echo "Base image ready: rocm/vllm-dev:base-ci" agents: - queue: amd-cpu + queue: cpu_queue_postmerge env: DOCKER_BUILDKIT: "1" retry: From 4035b5dae1dfe97042a30f85e93ef192d989c9c9 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 24 Feb 2026 10:48:52 +0000 Subject: [PATCH 04/27] automatically extract gpu arch in bootstrap-amd.sh Signed-off-by: tjtanaa --- buildkite/bootstrap-amd.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/buildkite/bootstrap-amd.sh b/buildkite/bootstrap-amd.sh index 32d0cc1c..5295164f 100644 --- a/buildkite/bootstrap-amd.sh +++ b/buildkite/bootstrap-amd.sh @@ -56,7 +56,9 @@ check_run_all_label() { compute_rocm_base_cache_key() { local DOCKERFILE="docker/Dockerfile.rocm_base" local CI_PYTHON_VERSION="${ROCM_CI_PYTHON_VERSION:-3.12}" - local CI_PYTORCH_ROCM_ARCH="${ROCM_CI_PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}" + local DEFAULT_ARCH + DEFAULT_ARCH=$(grep '^ARG PYTORCH_ROCM_ARCH=' "$DOCKERFILE" | sed 's/^ARG PYTORCH_ROCM_ARCH=//') + local CI_PYTORCH_ROCM_ARCH="${ROCM_CI_PYTORCH_ROCM_ARCH:-$DEFAULT_ARCH}" if [[ ! -f "$DOCKERFILE" ]]; then echo "unknown" From 806ecc00762334eb3908571d97be4a3845a061ad Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 24 Feb 2026 11:08:26 +0000 Subject: [PATCH 05/27] extract default python version from dockerfile.rocm_base Signed-off-by: tjtanaa --- buildkite/bootstrap-amd.sh | 4 +++- buildkite/test-template-amd.j2 | 14 +++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/buildkite/bootstrap-amd.sh b/buildkite/bootstrap-amd.sh index 5295164f..e9eb3c83 100644 --- a/buildkite/bootstrap-amd.sh +++ b/buildkite/bootstrap-amd.sh @@ -55,7 +55,9 @@ check_run_all_label() { compute_rocm_base_cache_key() { local DOCKERFILE="docker/Dockerfile.rocm_base" - local CI_PYTHON_VERSION="${ROCM_CI_PYTHON_VERSION:-3.12}" + local DEFAULT_PYTHON + DEFAULT_PYTHON=$(grep '^ARG PYTHON_VERSION=' "$DOCKERFILE" | sed 's/^ARG PYTHON_VERSION=//') + local CI_PYTHON_VERSION="${ROCM_CI_PYTHON_VERSION:-$DEFAULT_PYTHON}" local DEFAULT_ARCH DEFAULT_ARCH=$(grep '^ARG PYTORCH_ROCM_ARCH=' "$DOCKERFILE" | sed 's/^ARG PYTORCH_ROCM_ARCH=//') local CI_PYTORCH_ROCM_ARCH="${ROCM_CI_PYTORCH_ROCM_ARCH:-$DEFAULT_ARCH}" diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 48b74578..26ddc0bb 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -314,7 +314,7 @@ plugins: exit 1 fi agents: - queue: amd-cpu + queue: cpu_queue_postmerge soft_fail: false {% endif %} @@ -378,12 +378,14 @@ plugins: PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//') echo "Using PYTORCH_ROCM_ARCH from Dockerfile.rocm_base: $${PYTORCH_ROCM_ARCH}" + PYTHON_VERSION=$$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//') + echo "Using PYTHON_VERSION from Dockerfile.rocm_base: $${PYTHON_VERSION}" DOCKER_BUILDKIT=1 docker buildx build \ --file docker/Dockerfile.rocm_base \ --tag "rocm/vllm-dev:base-ci" \ --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ - --build-arg PYTHON_VERSION="3.12" \ + --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ --build-arg USE_SCCACHE=1 \ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ --build-arg SCCACHE_REGION_NAME=us-west-2 \ @@ -403,7 +405,7 @@ plugins: --tag rocm-base-debs:ci \ --target debs_wheel_release \ --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ - --build-arg PYTHON_VERSION="3.12" \ + --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ --build-arg USE_SCCACHE=1 \ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ --build-arg SCCACHE_REGION_NAME=us-west-2 \ @@ -414,7 +416,7 @@ plugins: cid=$$(docker create rocm-base-debs:ci) docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/ docker rm $${cid} - export PYTHON_VERSION=3.12 + export PYTHON_VERSION export PYTORCH_ROCM_ARCH S3_BUCKET=vllm-wheels .buildkite/scripts/cache-rocm-base-wheels.sh upload fi @@ -596,7 +598,9 @@ plugins: docker tag "{{ rocm_base_ecr_commit_tag }}" rocm/vllm-dev:base-ci # Download base wheels from S3 cache - export PYTHON_VERSION=3.12 + PYTHON_VERSION=$$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//') + echo "Using PYTHON_VERSION from Dockerfile.rocm_base: $${PYTHON_VERSION}" + export PYTHON_VERSION PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//') echo "Using PYTORCH_ROCM_ARCH from Dockerfile.rocm_base: $${PYTORCH_ROCM_ARCH}" export PYTORCH_ROCM_ARCH From 1b6a4c8d4c846b4dec32c1a87dff69bc8d1e2873 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 24 Feb 2026 13:58:20 +0000 Subject: [PATCH 06/27] push to vllm ecr repo instead Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 26ddc0bb..13f119e8 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -8,7 +8,7 @@ {% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %} {% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %} {% endif %} -{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} +{% set docker_image_amd = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-rocm" %} {% set rocm_base_ecr_image = "public.ecr.aws/q9t5s3a7/vllm-release-repo" %} {% set rocm_base_ecr_commit_tag = rocm_base_ecr_image ~ ":$BUILDKITE_COMMIT-" ~ rocm_base_cache_key ~ "-rocm-base" %} {% set rocm_base_ecr_cache_tag = rocm_base_ecr_image ~ ":" ~ rocm_base_cache_key ~ "-rocm-base" %} From d326d8af1568095b7786cf996bf8bd65e0cfdf2a Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 07:55:00 +0000 Subject: [PATCH 07/27] try amd-cpu again, moving away from cpu_postmerge_queue Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 13f119e8..6871d55a 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -314,7 +314,7 @@ plugins: exit 1 fi agents: - queue: cpu_queue_postmerge + queue: amd-cpu soft_fail: false {% endif %} @@ -342,7 +342,8 @@ plugins: echo " ECR Cache Tag: $${ECR_CACHE_TAG}" echo " ECR Commit Tag: $${ECR_COMMIT_TAG}" echo "========================================" - + + sudo apt-get update && sudo apt-get install -y awscli # Login to ECR aws ecr-public get-login-password --region us-east-1 | \ docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 @@ -437,9 +438,9 @@ plugins: echo "Pushed nightly tag: {{ rocm_base_ecr_nightly_tag }}" {% endif %} - echo "Base image ready: rocm/vllm-dev:base-ci" + echo "Base image ready: $${ECR_COMMIT_TAG}" agents: - queue: cpu_queue_postmerge + queue: amd-cpu env: DOCKER_BUILDKIT: "1" retry: @@ -455,6 +456,7 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: + - "sudo apt-get update && sudo apt-get install -y awscli" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > @@ -484,7 +486,7 @@ plugins: - exit_status: 1 limit: 1 agents: - queue: cpu_queue_postmerge + queue: amd-cpu {% for step in steps %} {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %} From ca9ec6e682f08c5072bf9d69d7e745bc401196fc Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 08:04:16 +0000 Subject: [PATCH 08/27] noninteractive installation Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 6871d55a..b7f82ef7 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -343,7 +343,7 @@ plugins: echo " ECR Commit Tag: $${ECR_COMMIT_TAG}" echo "========================================" - sudo apt-get update && sudo apt-get install -y awscli + sudo DEBIAN_FRONTEND=noninteractive apt-get update && sudo DEBIAN_FRONTEND=noninteractive apt-get install -y awscli # Login to ECR aws ecr-public get-login-password --region us-east-1 | \ docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 @@ -456,7 +456,7 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: - - "sudo apt-get update && sudo apt-get install -y awscli" + - "sudo DEBIAN_FRONTEND=noninteractive apt-get update && sudo DEBIAN_FRONTEND=noninteractive apt-get install -y awscli" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > From 729703f60c9b852d70e1a32e25655623f22feab4 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 08:13:03 +0000 Subject: [PATCH 09/27] change git history check from amd-cpu to cpu_queue_postmerge Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index b7f82ef7..97d07af3 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -314,7 +314,7 @@ plugins: exit 1 fi agents: - queue: amd-cpu + queue: cpu_queue_postmerge soft_fail: false {% endif %} From 0f1980ec010f959475434133984a08424657ec57 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 08:50:46 +0000 Subject: [PATCH 10/27] use amd cpu for ci image Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 97d07af3..f58b92f6 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -8,7 +8,7 @@ {% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %} {% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %} {% endif %} -{% set docker_image_amd = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-rocm" %} +{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} {% set rocm_base_ecr_image = "public.ecr.aws/q9t5s3a7/vllm-release-repo" %} {% set rocm_base_ecr_commit_tag = rocm_base_ecr_image ~ ":$BUILDKITE_COMMIT-" ~ rocm_base_cache_key ~ "-rocm-base" %} {% set rocm_base_ecr_cache_tag = rocm_base_ecr_image ~ ":" ~ rocm_base_cache_key ~ "-rocm-base" %} @@ -440,7 +440,7 @@ plugins: echo "Base image ready: $${ECR_COMMIT_TAG}" agents: - queue: amd-cpu + queue: cpu_queue_postmerge env: DOCKER_BUILDKIT: "1" retry: From 2f874299b7e3a754386c4c428259f335f8cd8816 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 09:15:52 +0000 Subject: [PATCH 11/27] remove sudo Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index f58b92f6..89e42867 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -343,7 +343,7 @@ plugins: echo " ECR Commit Tag: $${ECR_COMMIT_TAG}" echo "========================================" - sudo DEBIAN_FRONTEND=noninteractive apt-get update && sudo DEBIAN_FRONTEND=noninteractive apt-get install -y awscli + DEBIAN_FRONTEND=noninteractive apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y awscli # Login to ECR aws ecr-public get-login-password --region us-east-1 | \ docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 @@ -456,7 +456,7 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: - - "sudo DEBIAN_FRONTEND=noninteractive apt-get update && sudo DEBIAN_FRONTEND=noninteractive apt-get install -y awscli" + - "DEBIAN_FRONTEND=noninteractive apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y awscli" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > From ba73c39441d47ea5e2223c53b04f59b7b17e68a2 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 09:47:26 +0000 Subject: [PATCH 12/27] try enabling sccache when building amd CI image Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 89e42867..a32ef42e 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -343,7 +343,7 @@ plugins: echo " ECR Commit Tag: $${ECR_COMMIT_TAG}" echo "========================================" - DEBIAN_FRONTEND=noninteractive apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y awscli + pip install awscli # Login to ECR aws ecr-public get-login-password --region us-east-1 | \ docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 @@ -456,7 +456,7 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: - - "DEBIAN_FRONTEND=noninteractive apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y awscli" + - "pip install awscli" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > @@ -466,6 +466,10 @@ plugins: --build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950' --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT --build-arg BASE_IMAGE=rocm/vllm-dev:base-ci + --build-arg USE_SCCACHE=1 + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache + --build-arg SCCACHE_REGION_NAME=us-west-2 + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 --tag {{ docker_image_amd }} -f docker/Dockerfile.rocm --target test From 72ea91eda65754f32d49142e82729a0245cf41e8 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 14:23:50 +0000 Subject: [PATCH 13/27] fix the aws cli installation command; fix the main branch; avoid download docker image to speed things up Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 38 +++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index a32ef42e..1bf8dd15 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -333,6 +333,9 @@ plugins: CACHE_KEY="{{ rocm_base_cache_key }}" ECR_CACHE_TAG="{{ rocm_base_ecr_cache_tag }}" ECR_COMMIT_TAG="{{ rocm_base_ecr_commit_tag }}" +{% if branch == "main" %} + ECR_NIGHTLY_TAG="{{ rocm_base_ecr_nightly_tag }}" +{% endif %} S3_BUCKET="vllm-wheels" S3_CACHE_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}" @@ -341,9 +344,12 @@ plugins: echo " Cache Key: $${CACHE_KEY}" echo " ECR Cache Tag: $${ECR_CACHE_TAG}" echo " ECR Commit Tag: $${ECR_COMMIT_TAG}" +{% if branch == "main" %} + echo " ECR Nightly Tag: $${ECR_NIGHTLY_TAG}" +{% endif %} echo "========================================" - pip install awscli + python3 -m pip install awscli # Login to ECR aws ecr-public get-login-password --region us-east-1 | \ docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 @@ -356,8 +362,15 @@ plugins: fi if [ "$$IMAGE_EXISTS" -eq 1 ]; then - docker pull "$${ECR_CACHE_TAG}" - docker tag "$${ECR_CACHE_TAG}" "rocm/vllm-dev:base-ci" + echo "ECR cache HIT: $${ECR_CACHE_TAG}" + # Create commit tag directly from cache tag (no pull needed) + docker buildx imagetools create --tag "$${ECR_COMMIT_TAG}" "$${ECR_CACHE_TAG}" + echo "Tagged $${ECR_CACHE_TAG} as $${ECR_COMMIT_TAG} in ECR (no pull required)" +{% if branch == "main" %} + # On main, also tag as latest nightly (no pull needed) + docker buildx imagetools create --tag "$${ECR_NIGHTLY_TAG}" "$${ECR_CACHE_TAG}" + echo "Tagged $${ECR_CACHE_TAG} as $${ECR_NIGHTLY_TAG} in ECR (no pull required)" +{% endif %} else # Tier 2: Check S3 cache S3_IMAGE_EXISTS=0 @@ -425,18 +438,15 @@ plugins: # Push to ECR (cache tag for future lookups + commit tag for traceability) docker tag "rocm/vllm-dev:base-ci" "$${ECR_CACHE_TAG}" docker push "$${ECR_CACHE_TAG}" - fi - - # Always push the commit-specific tag - docker tag "rocm/vllm-dev:base-ci" "$${ECR_COMMIT_TAG}" - docker push "$${ECR_COMMIT_TAG}" - + docker tag "rocm/vllm-dev:base-ci" "$${ECR_COMMIT_TAG}" + docker push "$${ECR_COMMIT_TAG}" {% if branch == "main" %} - # On main, also tag as latest nightly - docker tag "rocm/vllm-dev:base-ci" "{{ rocm_base_ecr_nightly_tag }}" - docker push "{{ rocm_base_ecr_nightly_tag }}" - echo "Pushed nightly tag: {{ rocm_base_ecr_nightly_tag }}" + # On main, also tag and push as latest nightly + docker tag "rocm/vllm-dev:base-ci" "$${ECR_NIGHTLY_TAG}" + docker push "$${ECR_NIGHTLY_TAG}" + echo "Pushed nightly tag: $${ECR_NIGHTLY_TAG}" {% endif %} + fi echo "Base image ready: $${ECR_COMMIT_TAG}" agents: @@ -456,7 +466,7 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: - - "pip install awscli" + - "python3 -m pip install awscli" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > From ff2db46333f3535dc03cb0417b561b36644c5354 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 17:15:42 +0000 Subject: [PATCH 14/27] try apt install awscli Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 1bf8dd15..c5060ba5 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -466,7 +466,7 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: - - "python3 -m pip install awscli" + - "apt install -y awscli" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > From d55a61c72bb0c4ecbda6908a04412fdc264bed2a Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 27 Feb 2026 03:30:07 +0000 Subject: [PATCH 15/27] don't install awscli Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 1 - 1 file changed, 1 deletion(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index c5060ba5..f7ea0dfc 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -466,7 +466,6 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: - - "apt install -y awscli" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > From 2d23606dc9dcbd060c4a0bcfafdf0a16d468e9d7 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 27 Feb 2026 04:39:55 +0000 Subject: [PATCH 16/27] use smaller agent small_cpu_queue_premerge Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index f7ea0dfc..567b66fe 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -314,7 +314,7 @@ plugins: exit 1 fi agents: - queue: cpu_queue_postmerge + queue: small_cpu_queue_premerge soft_fail: false {% endif %} From 2bb652095929123217eebf121a65b5d980b913cb Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 27 Feb 2026 04:49:04 +0000 Subject: [PATCH 17/27] add to use premerge queue Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 567b66fe..5a3e49ef 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -450,7 +450,11 @@ plugins: echo "Base image ready: $${ECR_COMMIT_TAG}" agents: +{% if branch == "main" %} queue: cpu_queue_postmerge +{% else %} + queue: cpu_queue_premerge +{% endif %} env: DOCKER_BUILDKIT: "1" retry: From 8c61d7fd83302d09ab7c9cf2c31ffb0d1e2f93ee Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 27 Feb 2026 04:58:34 +0000 Subject: [PATCH 18/27] always use cpu_queue_postmerge when need to push to ecr Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 4 ---- 1 file changed, 4 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 5a3e49ef..567b66fe 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -450,11 +450,7 @@ plugins: echo "Base image ready: $${ECR_COMMIT_TAG}" agents: -{% if branch == "main" %} queue: cpu_queue_postmerge -{% else %} - queue: cpu_queue_premerge -{% endif %} env: DOCKER_BUILDKIT: "1" retry: From 4dacfc823277f654333e247f6a56e2f6c29ab708 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 27 Feb 2026 10:18:27 +0000 Subject: [PATCH 19/27] don't use sccache Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 4 ---- 1 file changed, 4 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 567b66fe..dfde2603 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -475,10 +475,6 @@ plugins: --build-arg ARG_PYTORCH_ROCM_ARCH='gfx942;gfx950' --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT --build-arg BASE_IMAGE=rocm/vllm-dev:base-ci - --build-arg USE_SCCACHE=1 - --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache - --build-arg SCCACHE_REGION_NAME=us-west-2 - --build-arg SCCACHE_S3_NO_CREDENTIALS=0 --tag {{ docker_image_amd }} -f docker/Dockerfile.rocm --target test From bcd0474fdfc83d5e39ac07d901330f21da2ee93d Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 27 Feb 2026 16:33:42 +0000 Subject: [PATCH 20/27] fix mising aws cli Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index dfde2603..6eb98957 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -466,6 +466,7 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: + - "pip install awscli" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > From ff49fd8133c2b7eb236827107be39b35fa92516f Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 27 Feb 2026 17:11:07 +0000 Subject: [PATCH 21/27] debug environment Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 6eb98957..9c231772 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -466,7 +466,25 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: - - "pip install awscli" + - "echo '=== Checking Python ==='" + - "which python || echo 'python not found'" + - "which python3 || echo 'python3 not found'" + - "which python2 || echo 'python2 not found'" + - "echo '=== Checking pip ==='" + - "which pip || echo 'pip not found'" + - "which pip3 || echo 'pip3 not found'" + - "echo '=== Checking AWS CLI ==='" + - "which aws || echo 'aws not found'" + - "echo '=== Checking Docker ==='" + - "which docker || echo 'docker not found'" + - "docker --version || echo 'docker not working'" + - "echo '=== Environment ==='" + - "printenv | sort" + - "echo '=== PATH ==='" + - "echo $PATH" + - "echo '=== Available commands ==='" + - "ls /usr/bin | grep -E '(python|aws)'" + - "ls /usr/local/bin | grep -E '(python|aws)' || true" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > From d425ac6f070027b1985a6e07df178bb9ee4d6f54 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 27 Feb 2026 17:16:44 +0000 Subject: [PATCH 22/27] use python3 pip install Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 9c231772..1b3afdd5 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -466,25 +466,12 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: - - "echo '=== Checking Python ==='" - - "which python || echo 'python not found'" - - "which python3 || echo 'python3 not found'" - - "which python2 || echo 'python2 not found'" - - "echo '=== Checking pip ==='" - - "which pip || echo 'pip not found'" - - "which pip3 || echo 'pip3 not found'" - - "echo '=== Checking AWS CLI ==='" - - "which aws || echo 'aws not found'" - - "echo '=== Checking Docker ==='" - - "which docker || echo 'docker not found'" - - "docker --version || echo 'docker not working'" - - "echo '=== Environment ==='" - - "printenv | sort" - - "echo '=== PATH ==='" - - "echo $PATH" - - "echo '=== Available commands ==='" - - "ls /usr/bin | grep -E '(python|aws)'" - - "ls /usr/local/bin | grep -E '(python|aws)' || true" + # Install AWS CLI locally (no sudo needed) + # Install through pip because sudo is not available + - "python3 -m pip install --user awscli" + - "export PATH=$HOME/.local/bin:$PATH" + # Verify installation + - "aws --version" - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > From 746f44f8f84b1b693aadb5cba4a79be779f13310 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 27 Feb 2026 17:40:52 +0000 Subject: [PATCH 23/27] use docker image instead Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 1b3afdd5..c852e7f0 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -466,13 +466,16 @@ plugins: depends_on: amd-rocm-base-build soft_fail: false commands: - # Install AWS CLI locally (no sudo needed) - # Install through pip because sudo is not available - - "python3 -m pip install --user awscli" - - "export PATH=$HOME/.local/bin:$PATH" - # Verify installation - - "aws --version" - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + # Login to ECR using AWS CLI Docker image (no pip/python needed) + - | + docker run --rm \ + -e AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY \ + -e AWS_SESSION_TOKEN \ + -e AWS_DEFAULT_REGION=us-east-1 \ + amazon/aws-cli:latest \ + ecr-public get-login-password --region us-east-1 | \ + docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" - > docker build From 93662a614331c66e89fe7a5a5c89f85a3ee6bcb6 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 28 Feb 2026 14:56:06 +0000 Subject: [PATCH 24/27] pin to a specific aws-cli version Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index c852e7f0..0a0fc3c8 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -473,7 +473,7 @@ plugins: -e AWS_SECRET_ACCESS_KEY \ -e AWS_SESSION_TOKEN \ -e AWS_DEFAULT_REGION=us-east-1 \ - amazon/aws-cli:latest \ + amazon/aws-cli:2.34.0 \ ecr-public get-login-password --region us-east-1 | \ docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" From b0d07f8abb7380ccc5d0f8cc4ea542e9ab3fe8ca Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 4 Mar 2026 07:56:13 +0000 Subject: [PATCH 25/27] remove ci-infra; remove the nightly docker image and wheel releases Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 99 ++-------------------------------- 1 file changed, 4 insertions(+), 95 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index 0a0fc3c8..444dd0e9 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -495,13 +495,13 @@ plugins: DOCKER_BUILDKIT: "1" retry: automatic: - - exit_status: -1 + - exit_status: -1 # Agent was lost limit: 2 - - exit_status: -10 + - exit_status: -10 # Agent was lost limit: 2 - - exit_status: 128 + - exit_status: 128 # Git connectivity issues limit: 2 - - exit_status: 1 + - exit_status: 1 # Machine occasionally fail limit: 1 agents: queue: amd-cpu @@ -567,94 +567,3 @@ plugins: limit: 2 {% endif %} {% endfor %} - -{% if branch == "main" %} - - label: "AMD: :rocket: Build ROCm nightly release image" - depends_on: amd-build - key: "amd-nightly-release-image" - soft_fail: true - agents: - queue: amd-cpu - commands: - - | - set -euo pipefail - aws ecr-public get-login-password --region us-east-1 | \ - docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 - docker pull "{{ rocm_base_ecr_commit_tag }}" - docker tag "{{ rocm_base_ecr_commit_tag }}" rocm/vllm-dev:base-ci - - DOCKER_BUILDKIT=1 docker build \ - --build-arg max_jobs=16 \ - --build-arg BASE_IMAGE=rocm/vllm-dev:base-ci \ - --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' \ - --build-arg USE_SCCACHE=1 \ - --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ - --build-arg SCCACHE_REGION_NAME=us-west-2 \ - --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ - --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$$BUILDKITE_COMMIT-rocm \ - --target vllm-openai \ - --progress plain \ - -f docker/Dockerfile.rocm . - docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$$BUILDKITE_COMMIT-rocm - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 - limit: 2 - - - label: "AMD: :python: Build ROCm nightly wheel" - depends_on: amd-rocm-base-build - key: "amd-nightly-wheel" - soft_fail: true - agents: - queue: amd-cpu - commands: - - | - set -euo pipefail - aws ecr-public get-login-password --region us-east-1 | \ - docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 - docker pull "{{ rocm_base_ecr_commit_tag }}" - docker tag "{{ rocm_base_ecr_commit_tag }}" rocm/vllm-dev:base-ci - - # Download base wheels from S3 cache - PYTHON_VERSION=$$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//') - echo "Using PYTHON_VERSION from Dockerfile.rocm_base: $${PYTHON_VERSION}" - export PYTHON_VERSION - PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//') - echo "Using PYTORCH_ROCM_ARCH from Dockerfile.rocm_base: $${PYTORCH_ROCM_ARCH}" - export PYTORCH_ROCM_ARCH - export S3_BUCKET=vllm-wheels - .buildkite/scripts/cache-rocm-base-wheels.sh download - - mkdir -p docker/context/base-wheels - cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/ - - git fetch --tags --force origin - - DOCKER_BUILDKIT=1 docker build \ - --file docker/Dockerfile.rocm \ - --target export_vllm_wheel_release \ - --output type=local,dest=rocm-dist \ - --build-arg BASE_IMAGE=rocm/vllm-dev:base-ci \ - --build-arg ARG_PYTORCH_ROCM_ARCH="gfx90a;gfx942" \ - --build-arg REMOTE_VLLM=0 \ - --build-arg GIT_REPO_CHECK=1 \ - --build-arg USE_SCCACHE=1 \ - --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ - --build-arg SCCACHE_REGION_NAME=us-west-2 \ - --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ - . - - mkdir -p artifacts/rocm-vllm-wheel - cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/ - bash .buildkite/scripts/upload-rocm-wheels.sh - env: - DOCKER_BUILDKIT: "1" - ROCM_UPLOAD_WHEELS: "true" - S3_BUCKET: "vllm-wheels" - retry: - automatic: - - exit_status: -1 - limit: 2 -{% endif %} From 128249047f31dec92b8cb0374cb0ea9f59ab09fe Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Mon, 23 Mar 2026 15:45:04 +0000 Subject: [PATCH 26/27] remove code related to rocm-base-image.tar.gz and also remove the rocm_base_ecr_commit_tag Signed-off-by: tjtanaa --- buildkite/test-template-amd.j2 | 124 +++++++++++---------------------- 1 file changed, 41 insertions(+), 83 deletions(-) diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index ac4ba552..e4e7d0d6 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -10,7 +10,6 @@ {% endif %} {% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} {% set rocm_base_ecr_image = "public.ecr.aws/q9t5s3a7/vllm-release-repo" %} -{% set rocm_base_ecr_commit_tag = rocm_base_ecr_image ~ ":$BUILDKITE_COMMIT-" ~ rocm_base_cache_key ~ "-rocm-base" %} {% set rocm_base_ecr_cache_tag = rocm_base_ecr_image ~ ":" ~ rocm_base_cache_key ~ "-rocm-base" %} {% set rocm_base_ecr_nightly_tag = rocm_base_ecr_image ~ ":latest-rocm-base-nightly" %} {% set default_working_dir = "/vllm-workspace/tests" %} @@ -332,7 +331,6 @@ plugins: CACHE_KEY="{{ rocm_base_cache_key }}" ECR_CACHE_TAG="{{ rocm_base_ecr_cache_tag }}" - ECR_COMMIT_TAG="{{ rocm_base_ecr_commit_tag }}" {% if branch == "main" %} ECR_NIGHTLY_TAG="{{ rocm_base_ecr_nightly_tag }}" {% endif %} @@ -343,7 +341,6 @@ plugins: echo "ROCm Base Image Build/Reuse" echo " Cache Key: $${CACHE_KEY}" echo " ECR Cache Tag: $${ECR_CACHE_TAG}" - echo " ECR Commit Tag: $${ECR_COMMIT_TAG}" {% if branch == "main" %} echo " ECR Nightly Tag: $${ECR_NIGHTLY_TAG}" {% endif %} @@ -358,97 +355,58 @@ plugins: IMAGE_EXISTS=0 if docker manifest inspect "$${ECR_CACHE_TAG}" > /dev/null 2>&1; then IMAGE_EXISTS=1 - echo "ECR cache HIT: $${ECR_CACHE_TAG}" fi if [ "$$IMAGE_EXISTS" -eq 1 ]; then echo "ECR cache HIT: $${ECR_CACHE_TAG}" - # Create commit tag directly from cache tag (no pull needed) - docker buildx imagetools create --tag "$${ECR_COMMIT_TAG}" "$${ECR_CACHE_TAG}" - echo "Tagged $${ECR_CACHE_TAG} as $${ECR_COMMIT_TAG} in ECR (no pull required)" -{% if branch == "main" %} - # On main, also tag as latest nightly (no pull needed) - docker buildx imagetools create --tag "$${ECR_NIGHTLY_TAG}" "$${ECR_CACHE_TAG}" - echo "Tagged $${ECR_CACHE_TAG} as $${ECR_NIGHTLY_TAG} in ECR (no pull required)" -{% endif %} - else - # Tier 2: Check S3 cache - S3_IMAGE_EXISTS=0 - if aws s3 ls "$${S3_CACHE_PATH}/rocm-base-image.tar.gz" > /dev/null 2>&1; then - S3_IMAGE_EXISTS=1 - fi - - if [ "$$S3_IMAGE_EXISTS" -eq 1 ]; then - echo "S3 cache HIT. Downloading..." - mkdir -p /tmp/rocm-cache - aws s3 cp "$${S3_CACHE_PATH}/rocm-base-image.tar.gz" /tmp/rocm-cache/rocm-base-image.tar.gz - LOAD_OUTPUT=$$(gunzip -c /tmp/rocm-cache/rocm-base-image.tar.gz | docker load) - echo "$$LOAD_OUTPUT" - BASE_TAG=$$(echo "$$LOAD_OUTPUT" | grep "Loaded image:" | sed 's/Loaded image: //') - docker tag "$$BASE_TAG" "rocm/vllm-dev:base-ci" - rm -rf /tmp/rocm-cache - else - echo "CACHE MISS. Building Dockerfile.rocm_base from scratch..." + else + echo "CACHE MISS. Building Dockerfile.rocm_base from scratch..." - PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//') - echo "Using PYTORCH_ROCM_ARCH from Dockerfile.rocm_base: $${PYTORCH_ROCM_ARCH}" - PYTHON_VERSION=$$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//') - echo "Using PYTHON_VERSION from Dockerfile.rocm_base: $${PYTHON_VERSION}" + PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//') + echo "Using PYTORCH_ROCM_ARCH from Dockerfile.rocm_base: $${PYTORCH_ROCM_ARCH}" + PYTHON_VERSION=$$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//') + echo "Using PYTHON_VERSION from Dockerfile.rocm_base: $${PYTHON_VERSION}" - DOCKER_BUILDKIT=1 docker buildx build \ - --file docker/Dockerfile.rocm_base \ - --tag "rocm/vllm-dev:base-ci" \ - --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ - --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ - --build-arg USE_SCCACHE=1 \ - --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ - --build-arg SCCACHE_REGION_NAME=us-west-2 \ - --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ - --load \ - --progress plain \ - . - - # Upload to S3 for future cache hits - docker save rocm/vllm-dev:base-ci | gzip > /tmp/rocm-base-image.tar.gz - aws s3 cp /tmp/rocm-base-image.tar.gz "$${S3_CACHE_PATH}/rocm-base-image.tar.gz" - rm -f /tmp/rocm-base-image.tar.gz - - # Also upload base wheels to S3 cache - DOCKER_BUILDKIT=1 docker buildx build \ - --file docker/Dockerfile.rocm_base \ - --tag rocm-base-debs:ci \ - --target debs_wheel_release \ - --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ - --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ - --build-arg USE_SCCACHE=1 \ - --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ - --build-arg SCCACHE_REGION_NAME=us-west-2 \ - --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ - --load \ - . - mkdir -p artifacts/rocm-base-wheels - cid=$$(docker create rocm-base-debs:ci) - docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/ - docker rm $${cid} - export PYTHON_VERSION - export PYTORCH_ROCM_ARCH - S3_BUCKET=vllm-wheels .buildkite/scripts/cache-rocm-base-wheels.sh upload - fi + DOCKER_BUILDKIT=1 docker buildx build \ + --file docker/Dockerfile.rocm_base \ + --tag "rocm/vllm-dev:base-ci" \ + --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ + --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + --load \ + --progress plain \ + . + # Also upload base wheels to S3 cache + DOCKER_BUILDKIT=1 docker buildx build \ + --file docker/Dockerfile.rocm_base \ + --tag rocm-base-debs:ci \ + --target debs_wheel_release \ + --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ + --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + --load \ + . + mkdir -p artifacts/rocm-base-wheels + cid=$$(docker create rocm-base-debs:ci) + docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/ + docker rm $${cid} + export PYTHON_VERSION + export PYTORCH_ROCM_ARCH + S3_BUCKET=vllm-wheels .buildkite/scripts/cache-rocm-base-wheels.sh upload + # Push to ECR (cache tag for future lookups + commit tag for traceability) docker tag "rocm/vllm-dev:base-ci" "$${ECR_CACHE_TAG}" docker push "$${ECR_CACHE_TAG}" - docker tag "rocm/vllm-dev:base-ci" "$${ECR_COMMIT_TAG}" - docker push "$${ECR_COMMIT_TAG}" -{% if branch == "main" %} - # On main, also tag and push as latest nightly - docker tag "rocm/vllm-dev:base-ci" "$${ECR_NIGHTLY_TAG}" - docker push "$${ECR_NIGHTLY_TAG}" - echo "Pushed nightly tag: $${ECR_NIGHTLY_TAG}" -{% endif %} fi - echo "Base image ready: $${ECR_COMMIT_TAG}" + echo "Base image ready: $${ECR_CACHE_TAG}" agents: queue: cpu_queue_postmerge env: @@ -476,7 +434,7 @@ plugins: amazon/aws-cli:2.34.0 \ ecr-public get-login-password --region us-east-1 | \ docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 - - "docker pull {{ rocm_base_ecr_commit_tag }} && docker tag {{ rocm_base_ecr_commit_tag }} rocm/vllm-dev:base-ci" + - "docker pull {{ rocm_base_ecr_cache_tag }} && docker tag {{ rocm_base_ecr_cache_tag }} rocm/vllm-dev:base-ci" - > docker build --build-arg max_jobs=16 From d0a0a069693a6d49278d5e3d6e5d3298fd648d59 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 25 Mar 2026 10:43:47 +0000 Subject: [PATCH 27/27] remove PYTHON and PYTORCH_ARCH from docker image cache key Signed-off-by: tjtanaa --- buildkite/bootstrap-amd.sh | 10 +--------- buildkite/test-template-amd.j2 | 24 ++---------------------- 2 files changed, 3 insertions(+), 31 deletions(-) diff --git a/buildkite/bootstrap-amd.sh b/buildkite/bootstrap-amd.sh index e9eb3c83..4547adb7 100644 --- a/buildkite/bootstrap-amd.sh +++ b/buildkite/bootstrap-amd.sh @@ -55,21 +55,13 @@ check_run_all_label() { compute_rocm_base_cache_key() { local DOCKERFILE="docker/Dockerfile.rocm_base" - local DEFAULT_PYTHON - DEFAULT_PYTHON=$(grep '^ARG PYTHON_VERSION=' "$DOCKERFILE" | sed 's/^ARG PYTHON_VERSION=//') - local CI_PYTHON_VERSION="${ROCM_CI_PYTHON_VERSION:-$DEFAULT_PYTHON}" - local DEFAULT_ARCH - DEFAULT_ARCH=$(grep '^ARG PYTORCH_ROCM_ARCH=' "$DOCKERFILE" | sed 's/^ARG PYTORCH_ROCM_ARCH=//') - local CI_PYTORCH_ROCM_ARCH="${ROCM_CI_PYTORCH_ROCM_ARCH:-$DEFAULT_ARCH}" if [[ ! -f "$DOCKERFILE" ]]; then echo "unknown" return fi local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16) - local args_string="${CI_PYTHON_VERSION}|${CI_PYTORCH_ROCM_ARCH}" - local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8) - echo "${dockerfile_hash}-${args_hash}" + echo "${dockerfile_hash}" } if [[ -z "${COV_ENABLED:-}" ]]; then diff --git a/buildkite/test-template-amd.j2 b/buildkite/test-template-amd.j2 index e4e7d0d6..bf3f57ac 100644 --- a/buildkite/test-template-amd.j2 +++ b/buildkite/test-template-amd.j2 @@ -11,7 +11,6 @@ {% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} {% set rocm_base_ecr_image = "public.ecr.aws/q9t5s3a7/vllm-release-repo" %} {% set rocm_base_ecr_cache_tag = rocm_base_ecr_image ~ ":" ~ rocm_base_cache_key ~ "-rocm-base" %} -{% set rocm_base_ecr_nightly_tag = rocm_base_ecr_image ~ ":latest-rocm-base-nightly" %} {% set default_working_dir = "/vllm-workspace/tests" %} {% set hf_home = "/root/.cache/huggingface" %} {% set hf_home_efs = "/mnt/efs/hf_cache" %} @@ -331,9 +330,6 @@ plugins: CACHE_KEY="{{ rocm_base_cache_key }}" ECR_CACHE_TAG="{{ rocm_base_ecr_cache_tag }}" -{% if branch == "main" %} - ECR_NIGHTLY_TAG="{{ rocm_base_ecr_nightly_tag }}" -{% endif %} S3_BUCKET="vllm-wheels" S3_CACHE_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}" @@ -341,9 +337,6 @@ plugins: echo "ROCm Base Image Build/Reuse" echo " Cache Key: $${CACHE_KEY}" echo " ECR Cache Tag: $${ECR_CACHE_TAG}" -{% if branch == "main" %} - echo " ECR Nightly Tag: $${ECR_NIGHTLY_TAG}" -{% endif %} echo "========================================" python3 -m pip install awscli @@ -362,16 +355,9 @@ plugins: else echo "CACHE MISS. Building Dockerfile.rocm_base from scratch..." - PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//') - echo "Using PYTORCH_ROCM_ARCH from Dockerfile.rocm_base: $${PYTORCH_ROCM_ARCH}" - PYTHON_VERSION=$$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//') - echo "Using PYTHON_VERSION from Dockerfile.rocm_base: $${PYTHON_VERSION}" - DOCKER_BUILDKIT=1 docker buildx build \ --file docker/Dockerfile.rocm_base \ - --tag "rocm/vllm-dev:base-ci" \ - --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ - --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ + --tag "$${ECR_CACHE_TAG}" \ --build-arg USE_SCCACHE=1 \ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ --build-arg SCCACHE_REGION_NAME=us-west-2 \ @@ -385,8 +371,6 @@ plugins: --file docker/Dockerfile.rocm_base \ --tag rocm-base-debs:ci \ --target debs_wheel_release \ - --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ - --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ --build-arg USE_SCCACHE=1 \ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ --build-arg SCCACHE_REGION_NAME=us-west-2 \ @@ -397,12 +381,8 @@ plugins: cid=$$(docker create rocm-base-debs:ci) docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/ docker rm $${cid} - export PYTHON_VERSION - export PYTORCH_ROCM_ARCH S3_BUCKET=vllm-wheels .buildkite/scripts/cache-rocm-base-wheels.sh upload - - # Push to ECR (cache tag for future lookups + commit tag for traceability) - docker tag "rocm/vllm-dev:base-ci" "$${ECR_CACHE_TAG}" + docker push "$${ECR_CACHE_TAG}" fi