Skip to content
37 changes: 37 additions & 0 deletions .ci/docker/pytorch-nightly-docker.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# The base image can be overridden with --build-arg BASE_IMAGE=...
ARG BASE_IMAGE=rocm/pytorch-autobuild:base-latest
FROM ${BASE_IMAGE}

# Everything below is cloned and built under /tmp as root.
WORKDIR /tmp
USER root

# CI / ROCm test flags baked into the image environment.
# NOTE(review): presumably read by the PyTorch build and test scripts — confirm.
ENV CI=1 \
    PYTORCH_TEST_WITH_ROCM=1 \
    PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"

# Clone upstream PyTorch (with submodules), cherry-pick one commit fetched from
# the ROCm fork, and build with the CI build script.
#
# - pip runs with --no-cache-dir so its download cache is not stored in this
#   layer.
# - A local git identity is configured because cherry-pick creates a commit.
#   NOTE(review): the user.email value looks like an e-mail-obfuscation
#   placeholder; confirm the intended address.
# - If the build fails, two HIP test targets are cleaned and rebuilt serially
#   with verbose Ninja output to surface the failing command, and the RUN still
#   exits non-zero.
# - .git is removed in this same RUN so the history never lands in a layer.
RUN git clone https://github.com/pytorch/pytorch --recursive \
    && cd pytorch \
    && pip install --no-cache-dir -r requirements.txt \
    && git config --local user.name "AMD AMD" \
    && git config --local user.email "[email protected]" \
    && git remote add rocm https://github.com/ROCm/pytorch.git \
    && git fetch rocm \
    && git cherry-pick 519160d466782f5a62365be051fcb3ef90fa0b00 \
    && if ! .ci/pytorch/build.sh; then \
         echo "PyTorch build failed. Re-running likely failing HIP test targets with serial verbose Ninja output."; \
         if [ -d build ]; then \
           ninja -C build -t clean hip_half_test hip_distributions_test || true; \
           ninja -C build -j1 -v hip_half_test || true; \
           ninja -C build -j1 -v hip_distributions_test || true; \
         else \
           echo "Expected build directory 'build' was not found after failure."; \
         fi; \
         exit 1; \
       fi \
    && rm -rf /tmp/pytorch/.git
# Build and install torchvision from source.
# A shallow clone is sufficient: no step here uses the history, and .git is
# deleted at the end of this same layer.
# NOTE(review): FORCE_CUDA=1 presumably forces the GPU extensions to be built
# even though no device is visible during `docker build` — confirm.
RUN git clone --depth 1 https://github.com/pytorch/vision \
    && cd vision \
    && FORCE_CUDA=1 python setup.py install \
    && rm -rf /tmp/vision/.git
# Build and install torchaudio from source.
# A shallow clone is sufficient: no step here uses the history, and .git is
# deleted at the end of this same layer.
RUN git clone --depth 1 https://github.com/pytorch/audio \
    && cd audio \
    && python setup.py install \
    && rm -rf /tmp/audio/.git
67 changes: 67 additions & 0 deletions .github/scripts/rocm_nightly_debug_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env bash
#
# Rebuild PyTorch from a fresh clone (plus one cherry-picked commit from the
# ROCm fork) so a failing build can be inspected. Logs go to $ARTIFACT_DIR.

set -o errexit -o nounset -o xtrace -o pipefail

# Log directory; may be overridden through the environment.
: "${ARTIFACT_DIR:=/debug-artifacts}"
WORKDIR=/tmp/pytorch
# Commit that is cherry-picked after fetching the ROCm remote.
PATCH_SHA=519160d466782f5a62365be051fcb3ef90fa0b00

mkdir -p "$ARTIFACT_DIR"
# Always start from an empty checkout directory.
rm -rf "$WORKDIR"

git clone --recursive https://github.com/pytorch/pytorch "$WORKDIR"
cd "$WORKDIR"

pip install -r requirements.txt

# cherry-pick creates a commit, so a committer identity is configured first.
# NOTE(review): the user.email value looks like an e-mail-obfuscation
# placeholder; confirm the intended address.
git config --local user.name "AMD AMD"
git config --local user.email "[email protected]"

git remote add rocm https://github.com/ROCm/pytorch.git
git fetch rocm
git cherry-pick "$PATCH_SHA"

# Copy Ninja's log into the artifact directory when the build produced one.
save_ninja_log() {
  if [[ -f build/.ninja_log ]]; then
    cp build/.ninja_log "$ARTIFACT_DIR"/
  fi
}

# Run the CI build script and mirror its combined stdout/stderr into build.log.
# With pipefail set, a failing build fails the pipeline even though tee succeeds.
build_rc=0
.ci/pytorch/build.sh 2>&1 | tee "$ARTIFACT_DIR/build.log" || build_rc=$?

# The Ninja log is kept for both successful and failed builds.
save_ninja_log

# Nothing further to debug when the build succeeded.
if (( build_rc == 0 )); then
  exit 0
fi

echo "PyTorch build failed. Re-running gloo_hip wrappers with verbose output." | tee -a "$ARTIFACT_DIR/build.log"

# Print a message, append it to gloo-debug.log, and fail the script.
abort_gloo_debug() {
  echo "$1" | tee -a "$ARTIFACT_DIR/gloo-debug.log"
  exit 1
}

# Directory searched below for the gloo_hip wrapper scripts.
GLOO_DIR=build/third_party/gloo/gloo/CMakeFiles/gloo_hip.dir
[[ -d "$GLOO_DIR" ]] || abort_gloo_debug "Expected gloo_hip build directory '$GLOO_DIR' was not found."

# Best effort: a failing clean must not abort the debug run.
ninja -C build -t clean gloo_hip || true

# Record the wrapper scripts, sorted, so the list is also kept as an artifact.
wrapper_list="$ARTIFACT_DIR/gloo_wrappers.txt"
find "$GLOO_DIR" -name 'gloo_hip_generated_*.cmake' | sort > "$wrapper_list"
[[ -s "$wrapper_list" ]] || abort_gloo_debug "No gloo_hip wrapper scripts were found."

# Re-run every recorded wrapper script through `cmake -P` with verbose:BOOL=ON,
# appending all output to gloo-debug.log. Stops at the first wrapper that
# fails; the script then exits 1, otherwise 0.
mapfile -t wrapper_scripts < "$ARTIFACT_DIR/gloo_wrappers.txt"

rerun_status=0
for wrapper in "${wrapper_scripts[@]}"; do
  printf '\n===== Re-running %s =====\n' "$wrapper" | tee -a "$ARTIFACT_DIR/gloo-debug.log"

  # generated_file is the wrapper path with its .cmake suffix stripped.
  cmake \
    -D verbose:BOOL=ON \
    -D build_configuration:STRING=RELEASE \
    -D generated_file:STRING="${wrapper%.cmake}" \
    -P "$wrapper" 2>&1 | tee -a "$ARTIFACT_DIR/gloo-debug.log" || {
      rerun_status=1
      break
    }
done

exit "$rerun_status"
160 changes: 160 additions & 0 deletions .github/workflows/pytorch-nightly-docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
name: ROCm Nightly Build and Test

# Triggers: nightly schedule, manual dispatch, reuse from another workflow,
# and pushes to the rocm-nightly-gha branch.
on:
  schedule:
    # Run nightly at 2 AM UTC
    - cron: "0 2 * * *"
  workflow_dispatch:
    inputs:
      rocm_version:
        description: ROCm version to build
        required: false
        type: string
  workflow_call:
    inputs:
      rocm_version:
        required: false
        type: string
  push:
    branches: [rocm-nightly-gha]

# Workflow-wide defaults, visible to every job.
env:
  # Default ROCm version; the "Resolve ROCm version" steps replace it when the
  # rocm_version input is set.
  ROCM_VERSION: "7.2.1"
  PYTHON_VERSION: "3.10"
  # Semicolon-separated list of gfx targets handed to the base image build.
  PYTORCH_ROCM_ARCH: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"
  # Repository name that the image tags are built from.
  DOCKER_REGISTRY: rocm/pytorch-nightly

jobs:
  # Builds the base image and the nightly image, then hands the nightly image
  # to the test jobs as a tarball artifact.
  build:
    name: Build ROCm Nightly Image
    runs-on: linux-pytorch-mi325-1
    outputs:
      full-image: ${{ steps.meta.outputs.full-image }}
    steps:
      - name: Resolve ROCm version
        if: ${{ inputs.rocm_version != '' }}
        env:
          # The input reaches the script through the environment rather than
          # being templated into it, so it is never parsed as shell code.
          REQUESTED_ROCM_VERSION: ${{ inputs.rocm_version }}
        run: |
          # Reject anything that could smuggle extra lines into GITHUB_ENV.
          if [[ ! "$REQUESTED_ROCM_VERSION" =~ ^[A-Za-z0-9._-]+$ ]]; then
            echo "Invalid rocm_version: '$REQUESTED_ROCM_VERSION'" >&2
            exit 1
          fi
          echo "ROCM_VERSION=$REQUESTED_ROCM_VERSION" >> "$GITHUB_ENV"

      - name: Checkout pytorch
        uses: actions/checkout@v6
        with:
          repository: pytorch/pytorch
          ref: main

      # This repository is checked out into a subdirectory of the pytorch tree.
      - name: Checkout nightly workflow files
        uses: actions/checkout@v6
        with:
          path: rocm-nightly-workflow

      - name: Generate image tag
        id: meta
        run: |
          # ROCM_VERSION and DOCKER_REGISTRY come from the workflow-level env.
          tag="$(date +%Y%m%d%H%M%S)-rocm${ROCM_VERSION}"
          echo "full-image=${DOCKER_REGISTRY}:${tag}" >> "$GITHUB_OUTPUT"

      - name: Build base image
        working-directory: .ci/docker
        run: |
          # PYTORCH_ROCM_ARCH is already exported to this step by the
          # workflow-level env, so no templated assignment is needed.
          ./build.sh "pytorch-linux-jammy-rocm${ROCM_VERSION}-py${PYTHON_VERSION}" \
            -t rocm/pytorch-autobuild:base-latest

      - name: Build ROCm Nightly Image
        env:
          FULL_IMAGE: ${{ steps.meta.outputs.full-image }}
        run: |
          # --load makes the result available to the local `docker save` below.
          build_flags=(--load)
          if [[ -n "${CI:-}" ]]; then
            build_flags+=(--progress=plain)
          fi

          docker buildx build \
            "${build_flags[@]}" \
            --build-arg BASE_IMAGE=rocm/pytorch-autobuild:base-latest \
            -t "$FULL_IMAGE" \
            -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \
            rocm-nightly-workflow/.ci/docker

      - name: Save nightly image artifact
        env:
          FULL_IMAGE: ${{ steps.meta.outputs.full-image }}
        run: |
          docker save -o nightly-image.tar "$FULL_IMAGE"

      - name: Upload nightly image artifact
        # NOTE(review): this action reference reads like an e-mail-obfuscated
        # `<action>@<version>` pin; confirm the intended upload-artifact version.
        uses: actions/[email protected]
        with:
          name: rocm-nightly-image
          path: nightly-image.tar
          retention-days: 1
          compression-level: 0

  # Loads the image built above on each runner in the matrix and runs a
  # benchmark against it; only the entry with push_image: true pushes.
  test-push:
    name: ${{ matrix.target.name }}
    needs: build
    strategy:
      fail-fast: false
      matrix:
        target:
          - name: Test and Push ROCm Nightly Image on MI325
            runner: linux-pytorch-mi325-1
            push_image: true
          - name: Test ROCm Nightly Image on MI250
            runner: linux-pytorch-mi250-1
            push_image: false
    runs-on: ${{ matrix.target.runner }}
    timeout-minutes: 300
    env:
      NIGHTLY_IMAGE: ${{ needs.build.outputs.full-image }}
    steps:
      - name: Resolve ROCm version
        if: ${{ inputs.rocm_version != '' }}
        env:
          # The input reaches the script through the environment rather than
          # being templated into it, so it is never parsed as shell code.
          REQUESTED_ROCM_VERSION: ${{ inputs.rocm_version }}
        run: |
          # Reject anything that could smuggle extra lines into GITHUB_ENV.
          if [[ ! "$REQUESTED_ROCM_VERSION" =~ ^[A-Za-z0-9._-]+$ ]]; then
            echo "Invalid rocm_version: '$REQUESTED_ROCM_VERSION'" >&2
            exit 1
          fi
          echo "ROCM_VERSION=$REQUESTED_ROCM_VERSION" >> "$GITHUB_ENV"

      - name: Docker cleanup
        run: |
          docker container prune -f
          docker image prune -f

      - name: Download nightly image artifact
        # NOTE(review): this action reference reads like an e-mail-obfuscated
        # `<action>@<version>` pin; confirm the intended download-artifact version.
        uses: actions/[email protected]
        with:
          name: rocm-nightly-image
          path: nightly-image-artifact

      - name: Load nightly image
        run: docker load -i nightly-image-artifact/nightly-image.tar

      - name: Run unit tests
        run: |
          # `set -euo pipefail` makes the in-container script stop at the first
          # failing command instead of carrying on after a failed clone or cd.
          docker run --rm \
            --device=/dev/kfd \
            --device=/dev/dri \
            --group-add video \
            --network host \
            --cap-add=SYS_PTRACE \
            --security-opt seccomp=unconfined \
            "$NIGHTLY_IMAGE" \
            bash -c "
              set -euo pipefail
              git clone https://github.com/ROCm/pytorch-micro-benchmarking.git /tmp/pytorch-micro-benchmarking
              cd /tmp/pytorch-micro-benchmarking
              python3 micro_benchmarking_pytorch.py --network resnet50
            "

      - name: Log in to Docker Hub
        if: ${{ matrix.target.push_image }}
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      # Publishes the timestamped tag and also moves :latest to the same image.
      - name: Push validated image
        if: ${{ matrix.target.push_image }}
        env:
          FINAL_IMAGE: ${{ needs.build.outputs.full-image }}
          LATEST_IMAGE: ${{ env.DOCKER_REGISTRY }}:latest
        run: |
          docker tag "$FINAL_IMAGE" "$LATEST_IMAGE"
          docker push "$FINAL_IMAGE"
          docker push "$LATEST_IMAGE"
71 changes: 71 additions & 0 deletions .github/workflows/rocm-nightly-debug-build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
name: ROCm Nightly Build Debug

# Triggers: manual dispatch, or a push to rocm-nightly-gha that touches this
# workflow file or the debug build script.
on:
  workflow_dispatch:
    inputs:
      rocm_version:
        description: ROCm version to debug
        required: false
        type: string
  push:
    branches: [rocm-nightly-gha]
    paths:
      - .github/workflows/rocm-nightly-debug-build.yml
      - .github/scripts/rocm_nightly_debug_build.sh

# Workflow-wide defaults, visible to every job.
env:
  # Default ROCm version; the "Resolve ROCm version" step replaces it when the
  # rocm_version input is set.
  ROCM_VERSION: "7.2.1"
  PYTHON_VERSION: "3.10"
  # Semicolon-separated list of gfx targets handed to the build.
  PYTORCH_ROCM_ARCH: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201"

jobs:
  # Builds the base image, runs the debug build script inside it, and uploads
  # whatever the script wrote to debug-artifacts.
  debug-build:
    name: Debug ROCm Nightly Image Build
    runs-on: linux-pytorch-mi325-1
    timeout-minutes: 300
    steps:
      - name: Resolve ROCm version
        if: ${{ inputs.rocm_version != '' }}
        env:
          # The input reaches the script through the environment rather than
          # being templated into it, so it is never parsed as shell code.
          REQUESTED_ROCM_VERSION: ${{ inputs.rocm_version }}
        run: |
          # Reject anything that could smuggle extra lines into GITHUB_ENV.
          if [[ ! "$REQUESTED_ROCM_VERSION" =~ ^[A-Za-z0-9._-]+$ ]]; then
            echo "Invalid rocm_version: '$REQUESTED_ROCM_VERSION'" >&2
            exit 1
          fi
          echo "ROCM_VERSION=$REQUESTED_ROCM_VERSION" >> "$GITHUB_ENV"

      - name: Checkout pytorch
        uses: actions/checkout@v6
        with:
          repository: pytorch/pytorch
          ref: main

      # This repository is checked out into a subdirectory of the pytorch tree.
      - name: Checkout nightly workflow files
        uses: actions/checkout@v6
        with:
          path: rocm-nightly-workflow

      - name: Build base image
        working-directory: .ci/docker
        run: |
          # PYTORCH_ROCM_ARCH is already exported to this step by the
          # workflow-level env, so no templated assignment is needed.
          ./build.sh "pytorch-linux-jammy-rocm${ROCM_VERSION}-py${PYTHON_VERSION}" \
            -t rocm/pytorch-autobuild:base-latest

      - name: Run debug build
        run: |
          mkdir -p debug-artifacts
          # Values come from shell variables (workflow-level env) rather than
          # from expressions templated into the script.
          build_environment="pytorch-linux-jammy-rocm${ROCM_VERSION}-py${PYTHON_VERSION}"
          docker run --rm \
            -e ARTIFACT_DIR=/debug-artifacts \
            -e BUILD_ENVIRONMENT="$build_environment" \
            -e ANACONDA_PYTHON_VERSION="$PYTHON_VERSION" \
            -e CI=1 \
            -e PYTORCH_ROCM_ARCH="$PYTORCH_ROCM_ARCH" \
            -v "$PWD/rocm-nightly-workflow:/workspace/rocm-nightly-workflow" \
            -v "$PWD/debug-artifacts:/debug-artifacts" \
            rocm/pytorch-autobuild:base-latest \
            bash /workspace/rocm-nightly-workflow/.github/scripts/rocm_nightly_debug_build.sh

      # Runs even when the debug build failed, so its logs are still collected.
      - name: Upload debug artifacts
        if: always()
        # NOTE(review): this action reference reads like an e-mail-obfuscated
        # `<action>@<version>` pin; confirm the intended upload-artifact version.
        uses: actions/[email protected]
        with:
          name: rocm-nightly-debug-artifacts
          path: debug-artifacts
          retention-days: 7
          compression-level: 0
Loading