Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
f93b993
add Dockerfile.rocm_base build
tjtanaa Feb 11, 2026
421c526
Merge remote-tracking branch 'origin/main' into rocmnightly
tjtanaa Feb 24, 2026
a24755a
change queue machine and also grep the gpu arch from dockerfile.rocm_…
tjtanaa Feb 24, 2026
a670248
change from amd-cpu to cpu_post_merge
tjtanaa Feb 24, 2026
4035b5d
automatically extract gpu arch in bootstrap-amd.sh
tjtanaa Feb 24, 2026
806ecc0
extract default python version from dockerfile.rocm_base
tjtanaa Feb 24, 2026
1b6a4c8
push to vllm ecr repo instead
tjtanaa Feb 24, 2026
d326d8a
try amd-cpu again, moving away from cpu_postmerge_queue
tjtanaa Feb 26, 2026
ca9ec6e
noninteractive installation
tjtanaa Feb 26, 2026
729703f
change git history check from amd-cpu to cpu_queue_postmerge
tjtanaa Feb 26, 2026
0f1980e
use amd cpu for ci image
tjtanaa Feb 26, 2026
2f87429
remove sudo
tjtanaa Feb 26, 2026
ba73c39
try enabling sccache when building amd CI image
tjtanaa Feb 26, 2026
72ea91e
fix the aws cli installation command; fix the main branch; avoid down…
tjtanaa Feb 26, 2026
ff2db46
try apt install awscli
tjtanaa Feb 26, 2026
d55a61c
don't install awscli
tjtanaa Feb 27, 2026
2d23606
use smaller agent small_cpu_queue_premerge
tjtanaa Feb 27, 2026
2bb6520
add to use premerge queue
tjtanaa Feb 27, 2026
8c61d7f
always use cpu_queue_postmerge when need to push to ecr
tjtanaa Feb 27, 2026
4dacfc8
don't use sccache
tjtanaa Feb 27, 2026
bcd0474
fix mising aws cli
tjtanaa Feb 27, 2026
ff49fd8
debug environment
tjtanaa Feb 27, 2026
d425ac6
use python3 pip install
tjtanaa Feb 27, 2026
746f44f
use docker image instead
tjtanaa Feb 27, 2026
93662a6
pin to a specific aws-cli version
tjtanaa Feb 28, 2026
b0d07f8
remove ci-infra; remove the nightly docker image and wheel releases
tjtanaa Mar 4, 2026
bcdfd4d
Merge remote-tracking branch 'origin/main' into rocmnightly
tjtanaa Mar 10, 2026
9e04cfa
Merge remote-tracking branch 'origin/main' into rocmnightly
tjtanaa Mar 17, 2026
1282490
remove code related to rocm-base-image.tar.gz and also remove the roc…
tjtanaa Mar 23, 2026
d0a0a06
remove PYTHON and PYTORCH_ARCH from docker image cache key
tjtanaa Mar 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions buildkite/bootstrap-amd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,17 @@ check_run_all_label() {
fi
}

#######################################
# Derive the ROCm base image cache key from the content of
# docker/Dockerfile.rocm_base (resolved relative to the current directory).
# Globals:   none
# Arguments: none
# Outputs:   stdout - the first 16 hex characters of the Dockerfile's SHA-256
#            digest, or the literal "unknown" when the Dockerfile is missing
#            or could not be hashed.
#            stderr - a warning when hashing fails.
# Returns:   0 in all cases, so callers capturing the output never abort.
#######################################
compute_rocm_base_cache_key() {
  local DOCKERFILE="docker/Dockerfile.rocm_base"
  local dockerfile_hash

  if [[ ! -f "$DOCKERFILE" ]]; then
    echo "unknown"
    return
  fi

  # Declaration and assignment are split so `local` cannot mask the status of
  # the command substitution; a failed pipeline yields an empty hash.
  dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16) || dockerfile_hash=""

  # An empty or malformed key would be interpolated into an image tag
  # downstream, so fall back to the same sentinel used for a missing file.
  if [[ ! "$dockerfile_hash" =~ ^[0-9a-f]{16}$ ]]; then
    echo "warning: could not hash ${DOCKERFILE}; using cache key 'unknown'" >&2
    echo "unknown"
    return 0
  fi

  echo "${dockerfile_hash}"
}

if [[ -z "${COV_ENABLED:-}" ]]; then
COV_ENABLED=0
fi
Expand Down Expand Up @@ -86,6 +97,9 @@ upload_pipeline() {
echo "Nightly: $NIGHTLY"
echo "AMD Mirror HW: $AMD_MIRROR_HW"

ROCM_BASE_CACHE_KEY=$(compute_rocm_base_cache_key)
echo "ROCm base cache key: $ROCM_BASE_CACHE_KEY"

FAIL_FAST=$(fail_fast)

cd .buildkite
Expand All @@ -103,6 +117,8 @@ upload_pipeline() {
-D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \
-D cov_enabled="$COV_ENABLED" \
-D vllm_ci_branch="$VLLM_CI_BRANCH" \
-D rocm_base_cache_key="$ROCM_BASE_CACHE_KEY" \
-D rocm_base_changed="$ROCM_BASE_CHANGED" \
| sed '/^[[:space:]]*$/d' \
> pipeline.yaml
)
Expand Down Expand Up @@ -212,6 +228,15 @@ for file in $file_diff; do
fi
done

# Set ROCM_BASE_CHANGED to 1 when the change list names the ROCm base
# Dockerfile, otherwise leave it at 0.
ROCM_BASE_CHANGED=0
# $file_diff is deliberately unquoted so each whitespace-separated entry is
# visited on its own (presumably one changed path per word; it is populated
# elsewhere in this script).
for file in $file_diff; do
  case "$file" in
    "docker/Dockerfile.rocm_base")
      ROCM_BASE_CHANGED=1
      echo "Dockerfile.rocm_base changed in this PR"
      break
      ;;
  esac
done

# Check for ready-run-all-tests label
LABEL_RUN_ALL=$(check_run_all_label)
if [[ $LABEL_RUN_ALL == true ]]; then
Expand Down
131 changes: 129 additions & 2 deletions buildkite/test-template-amd.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
{% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %}
{% endif %}
{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %}
{% set rocm_base_ecr_image = "public.ecr.aws/q9t5s3a7/vllm-release-repo" %}
{% set rocm_base_ecr_cache_tag = rocm_base_ecr_image ~ ":" ~ rocm_base_cache_key ~ "-rocm-base" %}
{% set default_working_dir = "/vllm-workspace/tests" %}
{% set hf_home = "/root/.cache/huggingface" %}
{% set hf_home_efs = "/mnt/efs/hf_cache" %}
Expand Down Expand Up @@ -284,17 +286,142 @@ plugins:
- group: "AMD Tests"
depends_on: ~
steps:
- label: "AMD: :docker: build image"
{% if branch != "main" %}
# Step "amd-rocm-base-check" (rendered only when branch != "main").
# Fetches origin/main, finds the latest commit there that touched
# docker/Dockerfile.rocm_base, and verifies that commit is an ancestor of HEAD.
#   - No such commit on origin/main: the step exits 0 (check skipped).
#   - Commit is an ancestor of HEAD: the step passes.
#   - Otherwise: a Buildkite error annotation is posted under the context
#     "rocm-base-freshness" and the step exits 1.
# soft_fail is false, so a failure here is a hard failure of the step.
# NOTE(review): `$$` is presumably Buildkite's escape for a literal `$`, so the
# shell variables are expanded on the agent and not at pipeline upload — confirm.
- label: "AMD: :git: Check Dockerfile.rocm_base freshness"
depends_on: ~
key: "amd-rocm-base-check"
commands:
- |
set -euo pipefail
echo "--- Checking Dockerfile.rocm_base git history"
git fetch origin main
MAIN_LATEST=$$(git log origin/main -1 --format='%H' -- docker/Dockerfile.rocm_base)
if [ -z "$$MAIN_LATEST" ]; then
echo "No commits found for Dockerfile.rocm_base on origin/main. Skipping check."
exit 0
fi
echo "Latest commit on main for Dockerfile.rocm_base: $$MAIN_LATEST"
if git merge-base --is-ancestor "$$MAIN_LATEST" HEAD; then
echo "OK: Current branch contains the latest Dockerfile.rocm_base from main"
else
echo "ERROR: Branch does NOT contain the latest Dockerfile.rocm_base from main."
echo "Please rebase or merge main into your branch."
buildkite-agent annotate --style error \
"Your branch is missing the latest Dockerfile.rocm_base changes from main (commit $$MAIN_LATEST). Please rebase or merge main." \
--context "rocm-base-freshness"
exit 1
fi
agents:
queue: small_cpu_queue_premerge
soft_fail: false
{% endif %}

# Step "amd-rocm-base-build": make sure the ROCm base image for the current
# cache key exists in ECR, building and pushing it only on a cache miss.
# Flow of the command script:
#   1. Install awscli with pip and log Docker in to public.ecr.aws/q9t5s3a7.
#   2. Probe the tag with `docker manifest inspect`; if it exists, reuse it.
#   3. On a miss: build docker/Dockerfile.rocm_base and tag it with the cache tag,
#      build the `debs_wheel_release` target, copy /app/debs out of a created
#      container into artifacts/rocm-base-wheels, run
#      .buildkite/scripts/cache-rocm-base-wheels.sh upload, then push the tag.
# NOTE(review): S3_CACHE_PATH is assigned but not referenced anywhere in this
# step's script — confirm whether it is still needed.
# NOTE(review): awscli is installed unpinned via pip here, while the
# "AMD: :docker: build image" step logs in with the pinned amazon/aws-cli:2.34.0
# image — confirm the difference is intentional.
# NOTE(review): if `docker cp` fails, `set -e` exits before `docker rm`, leaving
# the created container behind — confirm that is acceptable on these agents.
- label: "AMD: :docker: Build/Reuse ROCm base image"
# On non-main branches this step waits for the freshness check. On main the
# template renders a one-element list containing `~` (null).
# NOTE(review): confirm Buildkite accepts a null list item here; the freshness
# step uses the scalar form `depends_on: ~` instead.
depends_on:
{% if branch != "main" %}
- "amd-rocm-base-check"
{% else %}
- ~
{% endif %}
key: "amd-rocm-base-build"
commands:
- |
set -euo pipefail

CACHE_KEY="{{ rocm_base_cache_key }}"
ECR_CACHE_TAG="{{ rocm_base_ecr_cache_tag }}"
S3_BUCKET="vllm-wheels"
S3_CACHE_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"

echo "========================================"
echo "ROCm Base Image Build/Reuse"
echo " Cache Key: $${CACHE_KEY}"
echo " ECR Cache Tag: $${ECR_CACHE_TAG}"
echo "========================================"

python3 -m pip install awscli
# Login to ECR
aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7

# Tier 1: Check ECR cache (fastest)
IMAGE_EXISTS=0
if docker manifest inspect "$${ECR_CACHE_TAG}" > /dev/null 2>&1; then
IMAGE_EXISTS=1
fi

if [ "$$IMAGE_EXISTS" -eq 1 ]; then
echo "ECR cache HIT: $${ECR_CACHE_TAG}"
else
echo "CACHE MISS. Building Dockerfile.rocm_base from scratch..."

DOCKER_BUILDKIT=1 docker buildx build \
--file docker/Dockerfile.rocm_base \
--tag "$${ECR_CACHE_TAG}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--load \
--progress plain \
.

# Also upload base wheels to S3 cache
DOCKER_BUILDKIT=1 docker buildx build \
--file docker/Dockerfile.rocm_base \
--tag rocm-base-debs:ci \
--target debs_wheel_release \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--load \
.
mkdir -p artifacts/rocm-base-wheels
cid=$$(docker create rocm-base-debs:ci)
docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/
docker rm $${cid}
S3_BUCKET=vllm-wheels .buildkite/scripts/cache-rocm-base-wheels.sh upload

docker push "$${ECR_CACHE_TAG}"
fi

echo "Base image ready: $${ECR_CACHE_TAG}"
agents:
queue: cpu_queue_postmerge
env:
DOCKER_BUILDKIT: "1"
# Automatic retries: exit statuses -1 and -10 are retried up to twice each, and
# a plain exit status 1 is retried once.
retry:
automatic:
- exit_status: -1
limit: 2
- exit_status: -10
limit: 2
- exit_status: 1
limit: 1

- label: "AMD: :docker: build image"
depends_on: amd-rocm-base-build
soft_fail: false
commands:
# Handle the introduction of test target in Dockerfile.rocm
# Login to ECR using AWS CLI Docker image (no pip/python needed)
- |
docker run --rm \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-e AWS_SESSION_TOKEN \
-e AWS_DEFAULT_REGION=us-east-1 \
amazon/aws-cli:2.34.0 \
ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
- "docker pull {{ rocm_base_ecr_cache_tag }} && docker tag {{ rocm_base_ecr_cache_tag }} rocm/vllm-dev:base-ci"
- >
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The arch list is still restricted to these three targets because this image is built only for AMD CI.

--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--build-arg BASE_IMAGE=rocm/vllm-dev:base-ci
--tag {{ docker_image_amd }}
-f docker/Dockerfile.rocm
--target test
Expand Down