Skip to content
66 changes: 65 additions & 1 deletion .buildkite/release-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,6 @@ steps:
- input: "ROCm Wheel Release Build Configuration"
key: input-rocm-config
depends_on: ~
if: build.source == "ui"
fields:
- text: "Python Version"
key: "rocm-python-version"
Expand Down Expand Up @@ -700,6 +699,7 @@ steps:
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--build-arg GIT_REPO_CHECK=1 \
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
--target vllm-openai \
--progress plain \
Expand All @@ -711,3 +711,67 @@ steps:
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"

# TODO: Move to use the unified build-rocm-release-image job by @tjtanaa
- label: ":docker: Build nightly ROCm image - x86_64 - ROCm"
id: build-rocm-nightly-image
depends_on: ~
agents:
queue: cpu_queue_postmerge
timeout_in_minutes: 60
commands:
- |
set -euo pipefail

# Login to ECR
aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7

docker pull rocm/vllm-dev:base

# Extract GPU architectures from Dockerfile.rocm_base (single source of truth)
PYTORCH_ROCM_ARCH=$$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//')

# Build vLLM ROCm nightly image using base image
DOCKER_BUILDKIT=1 docker build \
--build-arg max_jobs=16 \
--build-arg BASE_IMAGE="rocm/vllm-dev:base" \
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
--build-arg USE_SCCACHE=1 \
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
--build-arg SCCACHE_REGION_NAME=us-west-2 \
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
--build-arg GIT_REPO_CHECK=1 \
--tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm \
--target vllm-openai \
--progress plain \
-f docker/Dockerfile.rocm .

# Push to ECR
docker tag rocm/vllm-dev:base public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
env:
DOCKER_BUILDKIT: "1"
S3_BUCKET: "vllm-wheels"

- label: "Publish nightly ROCm image to DockerHub"
id: publish-nightly-rocm-image
depends_on:
- build-rocm-nightly-image
agents:
queue: small_cpu_queue_postmerge
commands:
- "bash .buildkite/scripts/push-nightly-builds-rocm.sh"
# Clean up old nightly builds (keep only last 14)
- "bash .buildkite/scripts/cleanup-nightly-builds.sh nightly- vllm/vllm-openai-rocm"
- "bash .buildkite/scripts/cleanup-nightly-builds.sh base-nightly- vllm/vllm-openai-rocm"
plugins:
- docker-login#v3.0.0:
username: vllmbot
password-env: DOCKERHUB_TOKEN
env:
DOCKER_BUILDKIT: "1"
DOCKERHUB_USERNAME: "vllmbot"
Comment on lines +766 to +777
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

To improve maintainability, it's better to define the ROCm nightly repository name as an environment variable and pass it to the scripts. This avoids hardcoding the same string in multiple places and makes it easier to update in the future. This change is required to work with the suggested changes in push-nightly-builds-rocm.sh.

    commands:
      - "bash .buildkite/scripts/push-nightly-builds-rocm.sh \"$ROCM_NIGHTLY_REPO\""
      # Clean up old nightly builds (keep only last 14)
      - "bash .buildkite/scripts/cleanup-nightly-builds.sh nightly- \"$ROCM_NIGHTLY_REPO\""
    plugins:
      - docker-login#v3.0.0:
          username: vllmbot
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
      DOCKERHUB_USERNAME: "vllmbot"
      ROCM_NIGHTLY_REPO: "vllm/vllm-openai-rocm"

17 changes: 10 additions & 7 deletions .buildkite/scripts/cleanup-nightly-builds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@ set -ex

# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with specified prefix
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX] [REPO]
# Example: cleanup-nightly-builds.sh "nightly-"
# Example: cleanup-nightly-builds.sh "cu130-nightly-"
# Example: cleanup-nightly-builds.sh "nightly-" "vllm/vllm-openai-rocm"

# Get tag prefix from argument, default to "nightly-" if not provided
# Get tag prefix and repo from arguments
TAG_PREFIX="${1:-nightly-}"
REPO="${2:-vllm/vllm-openai}"

echo "Cleaning up tags with prefix: $TAG_PREFIX"
echo "Cleaning up tags with prefix: $TAG_PREFIX in repository: $REPO"

# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
# DockerHub API endpoint for the repository
REPO_API_URL="https://hub.docker.com/v2/repositories/${REPO}/tags"

# Get DockerHub credentials from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
Expand Down Expand Up @@ -70,7 +73,7 @@ delete_tag() {
local tag_name="$1"
echo "Deleting tag: $tag_name"

local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
local delete_url="https://hub.docker.com/v2/repositories/${REPO}/tags/$tag_name"
set +x
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
set -x
Expand Down
55 changes: 55 additions & 0 deletions .buildkite/scripts/push-nightly-builds-rocm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Push ROCm nightly base image and nightly image from ECR
# to Docker Hub as vllm/vllm-openai-rocm:nightly-base and vllm/vllm-openai-rocm:nightly
# and vllm/vllm-openai-rocm:base-nightly-<commit> and vllm/vllm-openai-rocm:nightly-<commit>.
# Run when NIGHTLY=1 after build-rocm-release-image has pushed to ECR.
#
# Local testing (no push to Docker Hub):
# BUILDKITE_COMMIT=<commit-with-rocm-image-in-ecr> DRY_RUN=1 bash .buildkite/scripts/push-nightly-builds-rocm.sh
# Requires: AWS CLI configured (for ECR public login), Docker. For full run: Docker Hub login.

set -ex

# Use BUILDKITE_COMMIT from env (required; set to a commit that has ROCm image in ECR for local test)
BUILDKITE_COMMIT="${BUILDKITE_COMMIT:?Set BUILDKITE_COMMIT to the commit SHA that has the ROCm image in ECR (e.g. from a previous release pipeline run)}"
DRY_RUN="${DRY_RUN:-0}"

BASE_ORIG_TAG="${BUILDKITE_COMMIT}-rocm-base"
ORIG_TAG="${BUILDKITE_COMMIT}-rocm"
BASE_TAG_NAME="base-nightly"
TAG_NAME="nightly"
BASE_TAG_NAME_COMMIT="base-nightly-${BUILDKITE_COMMIT}"
TAG_NAME_COMMIT="nightly-${BUILDKITE_COMMIT}"

echo "Pushing ROCm image from ECR tag $ORIG_TAG to Docker Hub as $TAG_NAME and $TAG_NAME_COMMIT"
[[ "$DRY_RUN" == "1" ]] && echo "[DRY_RUN] Skipping push to Docker Hub"

# Login to ECR and pull the image built by build-rocm-release-image
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$BASE_ORIG_TAG"
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG"

# Tag for Docker Hub (base-nightly and nightly-base, nightly and nightly-<commit>)
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$BASE_ORIG_TAG" vllm/vllm-openai-rocm:"$BASE_TAG_NAME"
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$BASE_ORIG_TAG" vllm/vllm-openai-rocm:"$BASE_TAG_NAME_COMMIT"
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" vllm/vllm-openai-rocm:"$TAG_NAME"
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" vllm/vllm-openai-rocm:"$TAG_NAME_COMMIT"

if [[ "$DRY_RUN" == "1" ]]; then
echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
echo "[DRY_RUN] Local tags created. Exiting without push."
exit 0
fi

# Push to Docker Hub (docker-login plugin runs before this step in CI)
docker push vllm/vllm-openai-rocm:"$BASE_TAG_NAME"
docker push vllm/vllm-openai-rocm:"$BASE_TAG_NAME_COMMIT"
docker push vllm/vllm-openai-rocm:"$TAG_NAME"
docker push vllm/vllm-openai-rocm:"$TAG_NAME_COMMIT"

echo "Pushed vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
echo "Pushed vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
Loading