Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c194db4
preliminary attempt on nightly rocm docker
Feb 17, 2026
0fee22f
preliminary attempt on nightly rocm docker
Feb 17, 2026
b8186cb
fix release branch
tjtanaa Mar 12, 2026
80a178c
Merge remote-tracking branch 'origin/main' into nightly-rocm
tjtanaa Mar 12, 2026
704696f
use the ECR to download docker image instead
tjtanaa Mar 12, 2026
70467a2
resolve 2990518
tjtanaa Mar 12, 2026
97d0acc
setup for mock release
tjtanaa Mar 12, 2026
36c72b2
fix syntax error
tjtanaa Mar 12, 2026
8c9b340
remove redundant docker pull
tjtanaa Mar 12, 2026
5600a99
only download wheels
tjtanaa Mar 12, 2026
af83bb8
add dry run
tjtanaa Mar 12, 2026
63504ae
fix denied adding to an image in the repository with name 'vllm-relea…
tjtanaa Mar 17, 2026
8d24732
fix denied adding to an image in the repository with name 'vllm-relea…
tjtanaa Mar 17, 2026
0e4701f
sync main
tjtanaa Mar 17, 2026
2f1ebf7
add logs wto the tag deletion op for debugging
tjtanaa Mar 17, 2026
92a8422
debug why tag is not delete
tjtanaa Mar 17, 2026
c86facc
do not create new ECR tag for base
tjtanaa Mar 19, 2026
fb27d75
fix base image to always use the tag ECR_IMAGE_TAG
tjtanaa Mar 21, 2026
96434bd
make the PR ready
tjtanaa Mar 23, 2026
fca0e1a
clean up lines
tjtanaa Mar 23, 2026
4695ecc
Merge remote-tracking branch 'origin/main' into nightly-rocm
tjtanaa Mar 23, 2026
57e2078
remove the PYTHON_VERSION and PYTORCH_ROCM_ARCH extraction logic
tjtanaa Mar 23, 2026
d3c6330
make the PR ready for review
tjtanaa Mar 24, 2026
d3fd8fe
remove dry run
tjtanaa Mar 24, 2026
ebaa629
fix comments
tjtanaa Mar 24, 2026
d72adca
Merge remote-tracking branch 'origin/main' into nightly-rocm
tjtanaa Mar 24, 2026
4d942e4
change to use small_cpu_queue_release
tjtanaa Mar 24, 2026
a2f2e43
Merge branch 'main' into nightly-rocm
tjtanaa Mar 26, 2026
1319b1c
Merge branch 'main' into nightly-rocm
tjtanaa Mar 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
941 changes: 467 additions & 474 deletions .buildkite/release-pipeline.yaml

Large diffs are not rendered by default.

7 changes: 3 additions & 4 deletions .buildkite/scripts/annotate-rocm-release.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,13 @@
# Generate Buildkite annotation for ROCm wheel release
set -ex

# Get build configuration from meta-data
# Extract build configuration from Dockerfile.rocm_base (single source of truth)
# Extract ROCm version dynamically from Dockerfile.rocm_base
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
PYTHON_VERSION=$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//')
PYTORCH_ROCM_ARCH=$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//')

# TODO: Enable the nightly build for ROCm
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
if [ -z "${RELEASE_VERSION}" ]; then
Expand Down
12 changes: 8 additions & 4 deletions .buildkite/scripts/cache-rocm-base-wheels.sh
Original file line number Diff line number Diff line change
Expand Up @@ -104,14 +104,18 @@ case "${1:-}" in
echo "Cache key: ${CACHE_KEY}"
echo "Cache path: ${CACHE_PATH}"
echo ""

mkdir -p artifacts/rocm-base-wheels
aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/


# Use sync with include/exclude to only download .whl files
aws s3 sync "${CACHE_PATH}" artifacts/rocm-base-wheels/ \
--exclude "*" \
--include "*.whl"

echo ""
echo "Downloaded wheels:"
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;

WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
echo ""
echo "Total: $WHEEL_COUNT wheels"
Expand Down
91 changes: 91 additions & 0 deletions .buildkite/scripts/cleanup-ecr-rocm-base-tags.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/bin/bash
# Clean up old per-commit rocm-base tags from ECR Public, keeping a rolling
Comment thread
tjtanaa marked this conversation as resolved.
Outdated
# window of the most recent N commits' tags plus the cache key tag.
#
# Usage: cleanup-ecr-rocm-base-tags.sh <ecr-image-ref> [window-size]
# ecr-image-ref: full ECR reference of the base image (cache-key tag to preserve)
# window-size: number of recent commit tags to keep (default 300)
set -euo pipefail

# Positional arguments:
#   $1  full ECR image reference; the part after the last ':' is the cache-key
#       tag, which is never deleted by this script.
#   $2  number of most recent commits whose tags are kept (default 300).
ECR_IMAGE_REF="${1:?Usage: $0 <ecr-image-ref> [window-size]}"
WINDOW_SIZE="${2:-300}"
REPO_NAME="vllm-release-repo"
REGION="us-east-1"
# Number of tags sent per batch-delete-image call.
BATCH_SIZE=100

# The window size is used in integer comparisons and passed to `git log -n`,
# so reject anything that is not a plain non-negative integer up front.
if ! [[ "$WINDOW_SIZE" =~ ^[0-9]+$ ]]; then
  echo "ERROR: window-size must be a non-negative integer, got '$WINDOW_SIZE'" >&2
  exit 1
fi

# Extract the cache key tag (always preserved)
CACHE_TAG="${ECR_IMAGE_REF##*:}"

# Get image digest from the locally-pulled image.
# The `|| true` keeps a failing `docker inspect` from aborting the script
# under `set -e -o pipefail`; an empty digest is handled by the skip below.
DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' "$ECR_IMAGE_REF" \
  | awk -F@ '{print $2}' || true)
if [ -z "$DIGEST" ]; then
  echo "WARNING: Could not get digest for $ECR_IMAGE_REF, skipping cleanup"
  exit 0
fi

# Get all tags for this specific digest from ECR. If the query fails, fall
# back to an empty result so the script ends up in the "no cleanup" path.
IMAGE_DETAIL=$(aws ecr-public describe-images \
  --repository-name "$REPO_NAME" \
  --region "$REGION" \
  --image-ids imageDigest="$DIGEST" \
  --output json 2>/dev/null || echo '{"imageDetails":[]}')

# Extract all -rocm-base tags (excluding the cache key tag)
COMMIT_BASE_TAGS=$(jq -r \
  --arg cache_tag "$CACHE_TAG" \
  '.imageDetails[0].imageTags[]? // empty
   | select(endswith("-rocm-base"))
   | select(. != $cache_tag)' <<< "$IMAGE_DETAIL")

# grep -c exits non-zero when the count is 0, hence the `|| true`.
TAG_COUNT=$(grep -c . <<< "$COMMIT_BASE_TAGS" || true)
echo "Found $TAG_COUNT per-commit rocm-base tags (plus cache key tag: $CACHE_TAG)"

if [ "$TAG_COUNT" -le "$WINDOW_SIZE" ]; then
  echo "Within window ($WINDOW_SIZE), no cleanup needed"
  exit 0
fi

# Get the most recent N commit SHAs from git history.
# `|| true` lets a git failure reach the warning below instead of aborting.
RECENT_COMMITS=$(git log --format=%H -n "$WINDOW_SIZE" 2>/dev/null || true)
if [ -z "$RECENT_COMMITS" ]; then
  echo "WARNING: Could not get git history, skipping cleanup"
  exit 0
fi

# Identify tags to delete: commit SHA not in recent history
TAGS_TO_DELETE=()
KEEP_COUNT=0
DELETE_COUNT=0
while IFS= read -r tag; do
  [[ -n "$tag" ]] || continue
  COMMIT_SHA="${tag%-rocm-base}"
  # Fixed-string, whole-line match fed from a here-string: there is no
  # pipeline here, so `pipefail` cannot turn a successful match into a
  # failure, and the tag text is never interpreted as a regex.
  if grep -qxF -- "$COMMIT_SHA" <<< "$RECENT_COMMITS"; then
    KEEP_COUNT=$((KEEP_COUNT + 1))
  else
    TAGS_TO_DELETE+=("$tag")
    DELETE_COUNT=$((DELETE_COUNT + 1))
  fi
done <<< "$COMMIT_BASE_TAGS"

echo "Keeping $KEEP_COUNT tags (recent commits), deleting $DELETE_COUNT old tags"

if [ "$DELETE_COUNT" -eq 0 ]; then
  echo "Nothing to delete"
  exit 0
fi

# Delete in batches of 100 (ECR batch-delete-image limit).
# The loop runs in the current shell (not a pipeline subshell) so failed
# batches can be counted; stderr from the AWS CLI is left visible so the
# reason for a failure shows up in the build log.
FAILED_BATCHES=0
for ((offset = 0; offset < DELETE_COUNT; offset += BATCH_SIZE)); do
  IMAGE_IDS=()
  for tag in "${TAGS_TO_DELETE[@]:offset:BATCH_SIZE}"; do
    IMAGE_IDS+=("imageTag=$tag")
  done
  if ! aws ecr-public batch-delete-image \
    --repository-name "$REPO_NAME" \
    --region "$REGION" \
    --image-ids "${IMAGE_IDS[@]}"; then
    echo "WARNING: batch-delete failed for some tags"
    FAILED_BATCHES=$((FAILED_BATCHES + 1))
  fi
done

# Cleanup is best-effort: failures are reported but do not fail the step.
if [ "$FAILED_BATCHES" -eq 0 ]; then
  echo "Cleanup complete: deleted $DELETE_COUNT old rocm-base tags, kept $KEEP_COUNT + cache key"
else
  echo "Cleanup finished with $FAILED_BATCHES failed batch(es): attempted to delete $DELETE_COUNT old rocm-base tags, kept $KEEP_COUNT + cache key"
fi
17 changes: 10 additions & 7 deletions .buildkite/scripts/cleanup-nightly-builds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@ set -ex

# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with specified prefix
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX] [REPO]
# Example: cleanup-nightly-builds.sh "nightly-"
# Example: cleanup-nightly-builds.sh "cu130-nightly-"
# Example: cleanup-nightly-builds.sh "nightly-" "vllm/vllm-openai-rocm"

# Get tag prefix from argument, default to "nightly-" if not provided
# Get tag prefix and repo from arguments
TAG_PREFIX="${1:-nightly-}"
REPO="${2:-vllm/vllm-openai}"

echo "Cleaning up tags with prefix: $TAG_PREFIX"
echo "Cleaning up tags with prefix: $TAG_PREFIX in repository: $REPO"

# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
# DockerHub API endpoint for the repository
REPO_API_URL="https://hub.docker.com/v2/repositories/${REPO}/tags"

# Get DockerHub credentials from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
Expand Down Expand Up @@ -70,7 +73,7 @@ delete_tag() {
local tag_name="$1"
echo "Deleting tag: $tag_name"

local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
local delete_url="https://hub.docker.com/v2/repositories/${REPO}/tags/$tag_name"
set +x
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
set -x
Expand Down
55 changes: 55 additions & 0 deletions .buildkite/scripts/push-nightly-builds-rocm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Push ROCm nightly base image and nightly image from ECR
# to Docker Hub as vllm/vllm-openai-rocm:base-nightly and vllm/vllm-openai-rocm:nightly
# and vllm/vllm-openai-rocm:base-nightly-<commit> and vllm/vllm-openai-rocm:nightly-<commit>.
# Run when NIGHTLY=1 after build-rocm-release-image has pushed to ECR.
#
# Local testing (no push to Docker Hub):
# BUILDKITE_COMMIT=<commit-with-rocm-image-in-ecr> DRY_RUN=1 bash .buildkite/scripts/push-nightly-builds-rocm.sh
# Requires: AWS CLI configured (for ECR public login), Docker. For full run: Docker Hub login.

# Strict mode with command tracing:
#   -e           abort on the first failing command
#   -u           treat use of an unset variable as an error
#   -x           echo each command to the build log
#   -o pipefail  a pipeline fails if any stage fails, so a failure of the
#                ECR password lookup is reported even though it is piped
#                into `docker login`
set -euxo pipefail

# Use BUILDKITE_COMMIT from env (required; set to a commit that has ROCm image in ECR for local test)
BUILDKITE_COMMIT="${BUILDKITE_COMMIT:?Set BUILDKITE_COMMIT to the commit SHA that has the ROCm image in ECR (e.g. from a previous release pipeline run)}"
# DRY_RUN=1 stops the script after the local tags are created (no push); defaults to 0.
DRY_RUN="${DRY_RUN:-0}"
Comment thread
tjtanaa marked this conversation as resolved.

# Source tags in ECR for this commit: the base image and the full image.
BASE_ORIG_TAG="${BUILDKITE_COMMIT}-rocm-base"
ORIG_TAG="${BUILDKITE_COMMIT}-rocm"

# Destination tags on Docker Hub: a moving tag plus a per-commit tag for
# each of the two images.
BASE_TAG_NAME="base-nightly"
TAG_NAME="nightly"
BASE_TAG_NAME_COMMIT="${BASE_TAG_NAME}-${BUILDKITE_COMMIT}"
TAG_NAME_COMMIT="${TAG_NAME}-${BUILDKITE_COMMIT}"

# Registry/repository names used by every docker command below.
ecr_registry="public.ecr.aws/q9t5s3a7"
ecr_repo="${ecr_registry}/vllm-release-repo"
hub_repo="vllm/vllm-openai-rocm"

# Parallel lists: hub_tags[i] is created from src_tags[i].
# Order matters only for log readability: base image tags first.
src_tags=("$BASE_ORIG_TAG" "$BASE_ORIG_TAG" "$ORIG_TAG" "$ORIG_TAG")
hub_tags=("$BASE_TAG_NAME" "$BASE_TAG_NAME_COMMIT" "$TAG_NAME" "$TAG_NAME_COMMIT")

echo "Pushing ROCm image from ECR tag $ORIG_TAG to Docker Hub as $TAG_NAME and $TAG_NAME_COMMIT"
if [[ "$DRY_RUN" == "1" ]]; then
  echo "[DRY_RUN] Skipping push to Docker Hub"
fi

# Login to ECR and pull both images built by build-rocm-release-image
aws ecr-public get-login-password --region us-east-1 \
  | docker login --username AWS --password-stdin "$ecr_registry"
for src_tag in "$BASE_ORIG_TAG" "$ORIG_TAG"; do
  docker pull "${ecr_repo}:${src_tag}"
done

# Tag for Docker Hub (base-nightly, base-nightly-<commit>, nightly, nightly-<commit>)
for idx in "${!hub_tags[@]}"; do
  docker tag "${ecr_repo}:${src_tags[idx]}" "${hub_repo}:${hub_tags[idx]}"
done

# In dry-run mode the local tags above are kept, but nothing is pushed.
if [[ "$DRY_RUN" == "1" ]]; then
  echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
  echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
  echo "[DRY_RUN] Local tags created. Exiting without push."
  exit 0
fi

# Push to Docker Hub (docker-login plugin runs before this step in CI)
for hub_tag in "${hub_tags[@]}"; do
  docker push "${hub_repo}:${hub_tag}"
done

echo "Pushed vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
echo "Pushed vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
Loading