diff --git a/.ci/docker/pytorch-nightly-docker.Dockerfile b/.ci/docker/pytorch-nightly-docker.Dockerfile
new file mode 100644
index 0000000000000..863bb40645907
--- /dev/null
+++ b/.ci/docker/pytorch-nightly-docker.Dockerfile
@@ -0,0 +1,37 @@
+ARG BASE_IMAGE=rocm/pytorch-autobuild:base-latest
+FROM ${BASE_IMAGE}
+WORKDIR /tmp
+USER root
+
+ENV CI=1
+ENV PYTORCH_TEST_WITH_ROCM=1
+ENV PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
+
+RUN git clone https://github.com/pytorch/pytorch --recursive \
+    && cd pytorch \
+    && pip install -r requirements.txt \
+    && git config --local user.name "AMD AMD" \
+    && git config --local user.email "amd@amd.com" \
+    && git remote add rocm https://github.com/ROCm/pytorch.git \
+    && git fetch rocm \
+    && git cherry-pick 519160d466782f5a62365be051fcb3ef90fa0b00 \
+    && if ! .ci/pytorch/build.sh; then \
+         echo "PyTorch build failed. Re-running likely failing HIP test targets with serial verbose Ninja output."; \
+         if [ -d build ]; then \
+           ninja -C build -t clean hip_half_test hip_distributions_test || true; \
+           ninja -C build -j1 -v hip_half_test || true; \
+           ninja -C build -j1 -v hip_distributions_test || true; \
+         else \
+           echo "Expected build directory 'build' was not found after failure."; \
+         fi; \
+         exit 1; \
+       fi \
+    && rm -rf /tmp/pytorch/.git
+RUN git clone https://github.com/pytorch/vision \
+    && cd vision \
+    && FORCE_CUDA=1 python setup.py install \
+    && rm -rf /tmp/vision/.git
+RUN git clone https://github.com/pytorch/audio \
+    && cd audio \
+    && python setup.py install \
+    && rm -rf /tmp/audio/.git
diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile
index ff62e4a934c74..9d797548e3c1d 100644
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@@ -113,9 +113,11 @@ RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
 
 
 # Install ccache/sccache (do this last, so we get priority in PATH)
+ARG SKIP_SCCACHE_INSTALL
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
-RUN bash ./install_cache.sh && rm install_cache.sh
+RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi
+RUN rm install_cache.sh
 
 # Install Open MPI for ROCm
 COPY ./common/install_openmpi.sh install_openmpi.sh
diff --git a/.github/scripts/rocm_nightly_debug_build.sh b/.github/scripts/rocm_nightly_debug_build.sh
new file mode 100644
index 0000000000000..d9cb0d7a7e2cb
--- /dev/null
+++ b/.github/scripts/rocm_nightly_debug_build.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+
+set -euxo pipefail
+
+ARTIFACT_DIR="${ARTIFACT_DIR:-/debug-artifacts}"
+WORKDIR=/tmp/pytorch
+PATCH_SHA=519160d466782f5a62365be051fcb3ef90fa0b00
+LOG_HELPER="${LOG_HELPER:-/workspace/rocm-nightly-workflow/.github/scripts/run_with_log_heartbeat.sh}"
+
+mkdir -p "$ARTIFACT_DIR"
+if ! touch "$ARTIFACT_DIR/.write-test" 2>/dev/null; then
+  echo "Artifact directory '$ARTIFACT_DIR' is not writable by uid $(id -u)." >&2
+  exit 1
+fi
+rm -f "$ARTIFACT_DIR/.write-test"
+rm -rf "$WORKDIR"
+
+git clone https://github.com/pytorch/pytorch --recursive "$WORKDIR"
+cd "$WORKDIR"
+
+pip install -r requirements.txt
+git config --local user.name "AMD AMD"
+git config --local user.email "amd@amd.com"
+git remote add rocm https://github.com/ROCm/pytorch.git
+git fetch rocm
+git cherry-pick "$PATCH_SHA"
+
+if bash "$LOG_HELPER" "$ARTIFACT_DIR/build.log" -- .ci/pytorch/build.sh; then
+  if [[ -f build/.ninja_log ]]; then
+    cp build/.ninja_log "$ARTIFACT_DIR"/
+  fi
+  exit 0
+fi
+
+if [[ -f build/.ninja_log ]]; then
+  cp build/.ninja_log "$ARTIFACT_DIR"/
+fi
+
+echo "PyTorch build failed. Re-running gloo_hip wrappers with verbose output." | tee -a "$ARTIFACT_DIR/build.log"
+
+GLOO_DIR=build/third_party/gloo/gloo/CMakeFiles/gloo_hip.dir
+if [[ ! -d "$GLOO_DIR" ]]; then
+  echo "Expected gloo_hip build directory '$GLOO_DIR' was not found." | tee -a "$ARTIFACT_DIR/gloo-debug.log"
+  exit 1
+fi
+
+ninja -C build -t clean gloo_hip || true
+
+find "$GLOO_DIR" -name 'gloo_hip_generated_*.cmake' | sort > "$ARTIFACT_DIR/gloo_wrappers.txt"
+if [[ ! -s "$ARTIFACT_DIR/gloo_wrappers.txt" ]]; then
+  echo "No gloo_hip wrapper scripts were found." | tee -a "$ARTIFACT_DIR/gloo-debug.log"
+  exit 1
+fi
+
+status=0
+wrapper_index=0
+: > "$ARTIFACT_DIR/gloo_wrapper_logs.txt"
+while IFS= read -r wrapper; do
+  wrapper_index=$((wrapper_index + 1))
+  generated_file="${wrapper%.cmake}"
+  wrapper_log="$ARTIFACT_DIR/gloo-wrapper-$(printf '%03d' "$wrapper_index").log"
+  {
+    echo
+    echo "===== Re-running $wrapper ====="
+  } | tee -a "$ARTIFACT_DIR/gloo-debug.log"
+  printf '%s\t%s\n' "$wrapper" "$(basename "$wrapper_log")" >> "$ARTIFACT_DIR/gloo_wrapper_logs.txt"
+
+  if ! bash "$LOG_HELPER" "$wrapper_log" -- \
+    cmake \
+      -D verbose:BOOL=ON \
+      -D build_configuration:STRING=RELEASE \
+      -D generated_file:STRING="$generated_file" \
+      -P "$wrapper"; then
+    {
+      echo "Wrapper failed. Last 200 lines from $(basename "$wrapper_log"):"
+      tail -n 200 "$wrapper_log" || true
+    } | tee -a "$ARTIFACT_DIR/gloo-debug.log"
+    status=1
+    break
+  fi
+done < "$ARTIFACT_DIR/gloo_wrappers.txt"
+
+exit "$status"
diff --git a/.github/scripts/run_with_log_heartbeat.sh b/.github/scripts/run_with_log_heartbeat.sh
new file mode 100644
index 0000000000000..b4139d6f43dbb
--- /dev/null
+++ b/.github/scripts/run_with_log_heartbeat.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+usage() {
+  echo "Usage: $0 LOG_FILE -- COMMAND [ARGS...]" >&2
+  exit 2
+}
+
+if [[ $# -lt 3 ]]; then
+  usage
+fi
+
+log_file=$1
+shift
+
+if [[ $1 != "--" ]]; then
+  usage
+fi
+shift
+
+heartbeat_seconds="${HEARTBEAT_SECONDS:-300}"
+tail_lines="${TAIL_LINES:-200}"
+check_interval=5
+
+mkdir -p "$(dirname "$log_file")"
+: >"$log_file"
+
+"$@" >"$log_file" 2>&1 &
+cmd_pid=$!
+
+cleanup() {
+  if kill -0 "$cmd_pid" 2>/dev/null; then
+    kill "$cmd_pid" 2>/dev/null || true
+    wait "$cmd_pid" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+command_str=$(printf '%q ' "$@")
+command_str=${command_str% }
+
+next_heartbeat=0
+while kill -0 "$cmd_pid" 2>/dev/null; do
+  now=$(date +%s)
+  if (( now >= next_heartbeat )); then
+    echo "[$(date -u +%FT%TZ)] Command still running: ${command_str}"
+    echo "[$(date -u +%FT%TZ)] Log file: ${log_file} ($(du -h "$log_file" | cut -f1))"
+    next_heartbeat=$((now + heartbeat_seconds))
+  fi
+  sleep "$check_interval"
+done
+
+if wait "$cmd_pid"; then
+  status=0
+else
+  status=$?
+fi
+
+trap - EXIT
+
+if [[ $status -eq 0 ]]; then
+  echo "Command completed successfully. Full log saved to ${log_file}"
+  exit 0
+fi
+
+echo "Command failed with exit code ${status}. Last ${tail_lines} lines from ${log_file}:"
+tail -n "$tail_lines" "$log_file" || true
+exit "$status"
diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml
new file mode 100644
index 0000000000000..7e432249424d4
--- /dev/null
+++ b/.github/workflows/pytorch-nightly-docker.yml
@@ -0,0 +1,161 @@
+name: ROCm Nightly Build and Test
+
+on:
+  schedule:
+    # Run nightly at 2 AM UTC
+    - cron: '0 2 * * *'
+  workflow_dispatch:
+    inputs:
+      rocm_version:
+        description: ROCm version to build
+        required: false
+        type: string
+  workflow_call:
+    inputs:
+      rocm_version:
+        required: false
+        type: string
+  push:
+    branches:
+      - rocm-nightly-gha
+
+env:
+  ROCM_VERSION: '7.2.1'
+  PYTHON_VERSION: '3.10'
+  PYTORCH_ROCM_ARCH: 'gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201'
+  DOCKER_REGISTRY: rocm/pytorch-nightly
+
+jobs:
+  build:
+    name: Build ROCm Nightly Image
+    runs-on: linux-pytorch-mi325-1
+    outputs:
+      full-image: ${{ steps.meta.outputs.full-image }}
+    steps:
+      - name: Resolve ROCm version
+        if: ${{ inputs.rocm_version != '' }}
+        run: echo "ROCM_VERSION=${{ inputs.rocm_version }}" >> "$GITHUB_ENV"
+
+      - name: Checkout pytorch
+        uses: actions/checkout@v6
+        with:
+          repository: pytorch/pytorch
+          ref: main
+
+      - name: Checkout nightly workflow files
+        uses: actions/checkout@v6
+        with:
+          path: rocm-nightly-workflow
+
+      - name: Generate image tag
+        id: meta
+        run: |
+          tag="$(date +%Y%m%d%H%M%S)-rocm${{ env.ROCM_VERSION }}"
+          echo "full-image=${{ env.DOCKER_REGISTRY }}:${tag}" >> "$GITHUB_OUTPUT"
+
+      - name: Build base image
+        working-directory: .ci/docker
+        run: |
+          export SKIP_SCCACHE_INSTALL=1
+          export PYTORCH_ROCM_ARCH="${{ env.PYTORCH_ROCM_ARCH }}"
+          ./build.sh pytorch-linux-jammy-rocm${{ env.ROCM_VERSION }}-py${{ env.PYTHON_VERSION }} \
+            -t rocm/pytorch-autobuild:base-latest
+
+      - name: Build ROCm Nightly Image
+        env:
+          FULL_IMAGE: ${{ steps.meta.outputs.full-image }}
+        run: |
+          build_flags=(--load)
+          if [[ -n "${CI:-}" ]]; then
+            build_flags+=(--progress=plain)
+          fi
+
+          docker buildx build \
+            "${build_flags[@]}" \
+            --build-arg BASE_IMAGE=rocm/pytorch-autobuild:base-latest \
+            -t "$FULL_IMAGE" \
+            -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \
+            rocm-nightly-workflow/.ci/docker
+
+      - name: Save nightly image artifact
+        env:
+          FULL_IMAGE: ${{ steps.meta.outputs.full-image }}
+        run: |
+          docker save -o nightly-image.tar "$FULL_IMAGE"
+
+      - name: Upload nightly image artifact
+        uses: actions/upload-artifact@v4.4.0
+        with:
+          name: rocm-nightly-image
+          path: nightly-image.tar
+          retention-days: 1
+          compression-level: 0
+
+  test-push:
+    name: ${{ matrix.target.name }}
+    needs: build
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          - name: Test and Push ROCm Nightly Image on MI325
+            runner: linux-pytorch-mi325-1
+            push_image: true
+          - name: Test ROCm Nightly Image on MI250
+            runner: linux-pytorch-mi250-1
+            push_image: false
+    runs-on: ${{ matrix.target.runner }}
+    timeout-minutes: 300
+    env:
+      NIGHTLY_IMAGE: ${{ needs.build.outputs.full-image }}
+    steps:
+      - name: Resolve ROCm version
+        if: ${{ inputs.rocm_version != '' }}
+        run: echo "ROCM_VERSION=${{ inputs.rocm_version }}" >> "$GITHUB_ENV"
+
+      - name: Docker cleanup
+        run: |
+          docker container prune -f
+          docker image prune -f
+
+      - name: Download nightly image artifact
+        uses: actions/download-artifact@v4.1.7
+        with:
+          name: rocm-nightly-image
+          path: nightly-image-artifact
+
+      - name: Load nightly image
+        run: docker load -i nightly-image-artifact/nightly-image.tar
+
+      - name: Run unit tests
+        run: |
+          docker run --rm \
+            --device=/dev/kfd \
+            --device=/dev/dri \
+            --group-add video \
+            --network host \
+            --cap-add=SYS_PTRACE \
+            --security-opt seccomp=unconfined \
+            "$NIGHTLY_IMAGE" \
+            bash -c "
+              git clone https://github.com/ROCm/pytorch-micro-benchmarking.git /tmp/pytorch-micro-benchmarking
+              cd /tmp/pytorch-micro-benchmarking
+              python3 micro_benchmarking_pytorch.py --network resnet50
+            "
+
+      - name: Log in to Docker Hub
+        if: ${{ matrix.target.push_image }}
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Push validated image
+        if: ${{ matrix.target.push_image }}
+        env:
+          FINAL_IMAGE: ${{ needs.build.outputs.full-image }}
+          LATEST_IMAGE: ${{ env.DOCKER_REGISTRY }}:latest
+        run: |
+          docker tag "$FINAL_IMAGE" "$LATEST_IMAGE"
+          docker push "$FINAL_IMAGE"
+          docker push "$LATEST_IMAGE"
diff --git a/.github/workflows/rocm-nightly-debug-build.yml b/.github/workflows/rocm-nightly-debug-build.yml
new file mode 100644
index 0000000000000..b4f889a19318d
--- /dev/null
+++ b/.github/workflows/rocm-nightly-debug-build.yml
@@ -0,0 +1,74 @@
+name: ROCm Nightly Build Debug
+
+on:
+  workflow_dispatch:
+    inputs:
+      rocm_version:
+        description: ROCm version to debug
+        required: false
+        type: string
+  push:
+    branches:
+      - rocm-nightly-gha
+    paths:
+      - .github/workflows/rocm-nightly-debug-build.yml
+      - .github/scripts/rocm_nightly_debug_build.sh
+
+env:
+  ROCM_VERSION: '7.2.1'
+  PYTHON_VERSION: '3.10'
+  PYTORCH_ROCM_ARCH: 'gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201'
+
+jobs:
+  debug-build:
+    name: Debug ROCm Nightly Image Build
+    runs-on: linux-pytorch-mi325-1
+    timeout-minutes: 300
+    steps:
+      - name: Resolve ROCm version
+        if: ${{ inputs.rocm_version != '' }}
+        run: echo "ROCM_VERSION=${{ inputs.rocm_version }}" >> "$GITHUB_ENV"
+
+      - name: Checkout pytorch
+        uses: actions/checkout@v6
+        with:
+          repository: pytorch/pytorch
+          ref: main
+
+      - name: Checkout nightly workflow files
+        uses: actions/checkout@v6
+        with:
+          path: rocm-nightly-workflow
+
+      - name: Build base image
+        working-directory: .ci/docker
+        run: |
+          export PYTORCH_ROCM_ARCH="${{ env.PYTORCH_ROCM_ARCH }}"
+          ./build.sh pytorch-linux-jammy-rocm${{ env.ROCM_VERSION }}-py${{ env.PYTHON_VERSION }} \
+            -t rocm/pytorch-autobuild:base-latest
+
+      - name: Run debug build
+        run: |
+          mkdir -p debug-artifacts
+          # The debug image runs as the `jenkins` user, so the bind mount must
+          # be writable even when the host runner uid/gid does not match.
+          chmod 0777 debug-artifacts
+          docker run --rm \
+            -e ARTIFACT_DIR=/debug-artifacts \
+            -e BUILD_ENVIRONMENT="pytorch-linux-jammy-rocm${{ env.ROCM_VERSION }}-py${{ env.PYTHON_VERSION }}" \
+            -e ANACONDA_PYTHON_VERSION="${{ env.PYTHON_VERSION }}" \
+            -e CI=1 \
+            -e PYTORCH_ROCM_ARCH="${{ env.PYTORCH_ROCM_ARCH }}" \
+            -v "$PWD/rocm-nightly-workflow:/workspace/rocm-nightly-workflow" \
+            -v "$PWD/debug-artifacts:/debug-artifacts" \
+            rocm/pytorch-autobuild:base-latest \
+            bash /workspace/rocm-nightly-workflow/.github/scripts/rocm_nightly_debug_build.sh
+
+      - name: Upload debug artifacts
+        if: always()
+        uses: actions/upload-artifact@v4.4.0
+        with:
+          name: rocm-nightly-debug-artifacts
+          path: debug-artifacts
+          retention-days: 7
+          compression-level: 0