From 507fd86e3def4b74506ea3979e32a668461622af Mon Sep 17 00:00:00 2001 From: leo-amd Date: Tue, 31 Mar 2026 16:14:07 +0200 Subject: [PATCH 01/15] First draft --- .ci/docker/pytorch-nightly-docker.Dockerfile | 27 ++++ .github/scripts/send_notification_email.sh | 57 +++++++ .github/workflows/pytorch-nightly-docker.yml | 156 +++++++++++++++++++ 3 files changed, 240 insertions(+) create mode 100644 .ci/docker/pytorch-nightly-docker.Dockerfile create mode 100644 .github/scripts/send_notification_email.sh create mode 100644 .github/workflows/pytorch-nightly-docker.yml diff --git a/.ci/docker/pytorch-nightly-docker.Dockerfile b/.ci/docker/pytorch-nightly-docker.Dockerfile new file mode 100644 index 0000000000000..0f112e8d8e174 --- /dev/null +++ b/.ci/docker/pytorch-nightly-docker.Dockerfile @@ -0,0 +1,27 @@ +ARG BASE_IMAGE=rocm/pytorch-autobuild:base-latest +FROM ${BASE_IMAGE} +WORKDIR /tmp +USER root + +ENV CI=1 +ENV PYTORCH_TEST_WITH_ROCM=1 +ENV PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" + +RUN git clone https://github.com/pytorch/pytorch --recursive \ + && cd pytorch \ + && pip install -r requirements.txt \ + && git config --local user.name "AMD AMD" \ + && git config --local user.email "amd@amd.com" \ + && git remote add rocm https://github.com/ROCm/pytorch.git \ + && git fetch rocm \ + && git cherry-pick 519160d466782f5a62365be051fcb3ef90fa0b00 \ + && .ci/pytorch/build.sh \ + && rm -rf /tmp/pytorch/.git +RUN git clone https://github.com/pytorch/vision \ + && cd vision \ + && FORCE_CUDA=1 pip install . \ + && rm -rf /tmp/vision/.git +RUN git clone https://github.com/pytorch/audio \ + && cd audio \ + && pip install . 
\ + && rm -rf /tmp/audio/.git diff --git a/.github/scripts/send_notification_email.sh b/.github/scripts/send_notification_email.sh new file mode 100644 index 0000000000000..ea0705a45b8c3 --- /dev/null +++ b/.github/scripts/send_notification_email.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +set -euo pipefail + +required_vars=( + ROCM_SMTP_URL + ROCM_SMTP_USERNAME + ROCM_SMTP_PASSWORD + ROCM_EMAIL_FROM + ROCM_EMAIL_TO + EMAIL_SUBJECT + EMAIL_BODY +) + +for var_name in "${required_vars[@]}"; do + if [[ -z "${!var_name:-}" ]]; then + echo "Missing required environment variable: ${var_name}" >&2 + exit 1 + fi +done + +message_file="$(mktemp)" +trap 'rm -f "$message_file"' EXIT + +{ + printf 'From: %s\n' "$ROCM_EMAIL_FROM" + printf 'To: %s\n' "$ROCM_EMAIL_TO" + printf 'Subject: %s\n' "$EMAIL_SUBJECT" + printf 'MIME-Version: 1.0\n' + printf 'Content-Type: text/plain; charset=UTF-8\n' + printf '\n' + printf '%s\n' "$EMAIL_BODY" +} > "$message_file" + +IFS=',' read -r -a recipients <<< "$ROCM_EMAIL_TO" +curl_args=() +for recipient in "${recipients[@]}"; do + recipient="${recipient#"${recipient%%[![:space:]]*}"}" + recipient="${recipient%"${recipient##*[![:space:]]}"}" + if [[ -n "$recipient" ]]; then + curl_args+=(--mail-rcpt "$recipient") + fi +done + +if [[ "${#curl_args[@]}" -eq 0 ]]; then + echo "ROCM_EMAIL_TO did not contain any recipients" >&2 + exit 1 +fi + +curl --silent --show-error --fail --ssl-reqd \ + --url "$ROCM_SMTP_URL" \ + --user "${ROCM_SMTP_USERNAME}:${ROCM_SMTP_PASSWORD}" \ + --mail-from "$ROCM_EMAIL_FROM" \ + "${curl_args[@]}" \ + --upload-file "$message_file" + +echo "Notification email sent to $ROCM_EMAIL_TO" diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml new file mode 100644 index 0000000000000..f7b6c8c83577f --- /dev/null +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -0,0 +1,156 @@ +name: ROCm Nightly Build and Test + +on: + schedule: + # Run nightly at 2 AM UTC + - cron: '0 2 * * *' 
+ workflow_dispatch: + inputs: + rocm_version: + description: ROCm version to build + required: false + type: string + workflow_call: + inputs: + rocm_version: + required: false + type: string + +env: + ROCM_VERSION: '7.2.1' + PYTHON_VERSION: '3.10' + PYTORCH_ROCM_ARCH: 'gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201' + DOCKER_REGISTRY: rocm/pytorch-nightly + +jobs: + build: + runs-on: ubuntu-latest + outputs: + full-image: ${{ steps.meta.outputs.full-image }} + steps: + - name: Resolve ROCm version + if: ${{ inputs.rocm_version != '' }} + run: echo "ROCM_VERSION=${{ inputs.rocm_version }}" >> "$GITHUB_ENV" + + - name: Checkout pytorch + uses: actions/checkout@v4 + with: + repository: pytorch/pytorch + ref: main + + - name: Checkout nightly workflow files + uses: actions/checkout@v4 + with: + path: rocm-nightly-workflow + + - name: Generate image tag + id: meta + run: | + tag="$(date +%Y%m%d%H%M%S)-rocm${{ env.ROCM_VERSION }}" + echo "full-image=${{ env.DOCKER_REGISTRY }}:${tag}" >> "$GITHUB_OUTPUT" + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Build base image + working-directory: .ci/docker + run: | + export PYTORCH_ROCM_ARCH="${{ env.PYTORCH_ROCM_ARCH }}" + ./build.sh pytorch-linux-jammy-rocm${{ env.ROCM_VERSION }}-py${{ env.PYTHON_VERSION }} \ + -t rocm/pytorch-autobuild:base-latest + + - name: Build and push nightly image + uses: docker/build-push-action@v6 + with: + context: rocm-nightly-workflow/.ci/docker + file: rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile + build-args: | + BASE_IMAGE=rocm/pytorch-autobuild:base-latest + push: true + tags: | + ${{ steps.meta.outputs.full-image }} + ${{ env.DOCKER_REGISTRY }}:latest + + test: + needs: build + strategy: + fail-fast: false + matrix: + gpu-arch: + - label: PLACEHOLDER_RUNNER_LABEL_1 + arch: gfx90a + critical: true + 
- label: PLACEHOLDER_RUNNER_LABEL_2 + arch: gfx942 + critical: false + runs-on: ${{ matrix.gpu-arch.label }} + timeout-minutes: 300 + continue-on-error: ${{ !matrix.gpu-arch.critical }} + env: + NIGHTLY_IMAGE: ${{ needs.build.outputs.full-image }} + steps: + - name: Resolve ROCm version + if: ${{ inputs.rocm_version != '' }} + run: echo "ROCM_VERSION=${{ inputs.rocm_version }}" >> "$GITHUB_ENV" + + - name: Docker cleanup + run: | + docker container prune -f + docker image prune -f + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Pull nightly image + run: docker pull "$NIGHTLY_IMAGE" + + - name: Run unit tests + run: | + docker run --rm \ + --device=/dev/kfd \ + --device=/dev/dri \ + --group-add video \ + --network host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + "$NIGHTLY_IMAGE" \ + bash -c " + git clone https://github.com/ROCm/pytorch-micro-benchmarking.git /tmp/pytorch-micro-benchmarking + cd /tmp/pytorch-micro-benchmarking + python3 micro_benchmarking_pytorch.py --network resnet50 + " + + notify: + needs: [build, test] + if: failure() + runs-on: ubuntu-latest + steps: + - name: Resolve ROCm version + if: ${{ inputs.rocm_version != '' }} + run: echo "ROCM_VERSION=${{ inputs.rocm_version }}" >> "$GITHUB_ENV" + + - name: Checkout notification scripts + uses: actions/checkout@v4 + + - name: Send failure email + env: + ROCM_SMTP_URL: ${{ secrets.ROCM_SMTP_URL }} + ROCM_SMTP_USERNAME: ${{ secrets.ROCM_SMTP_USERNAME }} + ROCM_SMTP_PASSWORD: ${{ secrets.ROCM_SMTP_PASSWORD }} + ROCM_EMAIL_FROM: ${{ secrets.ROCM_EMAIL_FROM }} + ROCM_EMAIL_TO: ${{ secrets.ROCM_EMAIL_TO }} + EMAIL_SUBJECT: ROCm Nightly Build Failed - ${{ github.ref_name }} - ROCm ${{ env.ROCM_VERSION }} + EMAIL_BODY: | + The ROCm nightly docker build failed. 
+ + Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + Branch: ${{ github.ref_name }} + ROCm version: ${{ env.ROCM_VERSION }} + run: | + bash .github/scripts/send_notification_email.sh From 59590af79e7fda6b96710a627a31ea93b8a00e73 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Tue, 31 Mar 2026 17:30:09 +0200 Subject: [PATCH 02/15] Pass artifact and test before pushing image --- .github/workflows/pytorch-nightly-docker.yml | 96 +++++++++++--------- 1 file changed, 54 insertions(+), 42 deletions(-) diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml index f7b6c8c83577f..b2cb2ceac0773 100644 --- a/.github/workflows/pytorch-nightly-docker.yml +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -24,6 +24,7 @@ env: jobs: build: + name: Build ROCm Nightly Image runs-on: ubuntu-latest outputs: full-image: ${{ steps.meta.outputs.full-image }} @@ -33,13 +34,13 @@ jobs: run: echo "ROCM_VERSION=${{ inputs.rocm_version }}" >> "$GITHUB_ENV" - name: Checkout pytorch - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: repository: pytorch/pytorch ref: main - name: Checkout nightly workflow files - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: path: rocm-nightly-workflow @@ -49,12 +50,6 @@ jobs: tag="$(date +%Y%m%d%H%M%S)-rocm${{ env.ROCM_VERSION }}" echo "full-image=${{ env.DOCKER_REGISTRY }}:${tag}" >> "$GITHUB_OUTPUT" - - name: Log in to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - name: Build base image working-directory: .ci/docker run: | @@ -62,33 +57,35 @@ jobs: ./build.sh pytorch-linux-jammy-rocm${{ env.ROCM_VERSION }}-py${{ env.PYTHON_VERSION }} \ -t rocm/pytorch-autobuild:base-latest - - name: Build and push nightly image - uses: docker/build-push-action@v6 + - name: Build ROCm Nightly Image + env: + FULL_IMAGE: ${{ steps.meta.outputs.full-image }} + run: | + 
docker build \ + --build-arg BASE_IMAGE=rocm/pytorch-autobuild:base-latest \ + -t "$FULL_IMAGE" \ + -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \ + rocm-nightly-workflow/.ci/docker + + - name: Save nightly image artifact + env: + FULL_IMAGE: ${{ steps.meta.outputs.full-image }} + run: | + docker save -o nightly-image.tar "$FULL_IMAGE" + + - name: Upload nightly image artifact + uses: actions/upload-artifact@v4.4.0 with: - context: rocm-nightly-workflow/.ci/docker - file: rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile - build-args: | - BASE_IMAGE=rocm/pytorch-autobuild:base-latest - push: true - tags: | - ${{ steps.meta.outputs.full-image }} - ${{ env.DOCKER_REGISTRY }}:latest - - test: + name: rocm-nightly-image + path: nightly-image.tar + retention-days: 1 + compression-level: 0 + + test-push: + name: Test and Push ROCm Nightly Image needs: build - strategy: - fail-fast: false - matrix: - gpu-arch: - - label: PLACEHOLDER_RUNNER_LABEL_1 - arch: gfx90a - critical: true - - label: PLACEHOLDER_RUNNER_LABEL_2 - arch: gfx942 - critical: false - runs-on: ${{ matrix.gpu-arch.label }} + runs-on: PLACEHOLDER_RUNNER_LABEL_1 timeout-minutes: 300 - continue-on-error: ${{ !matrix.gpu-arch.critical }} env: NIGHTLY_IMAGE: ${{ needs.build.outputs.full-image }} steps: @@ -101,14 +98,14 @@ jobs: docker container prune -f docker image prune -f - - name: Log in to Docker Hub - uses: docker/login-action@v3 + - name: Download nightly image artifact + uses: actions/download-artifact@v4.1.7 with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} + name: rocm-nightly-image + path: nightly-image-artifact - - name: Pull nightly image - run: docker pull "$NIGHTLY_IMAGE" + - name: Load nightly image + run: docker load -i nightly-image-artifact/nightly-image.tar - name: Run unit tests run: | @@ -126,8 +123,23 @@ jobs: python3 micro_benchmarking_pytorch.py --network resnet50 " + - name: Log in to Docker Hub + uses: 
docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Push validated image + env: + FINAL_IMAGE: ${{ needs.build.outputs.full-image }} + LATEST_IMAGE: ${{ env.DOCKER_REGISTRY }}:latest + run: | + docker tag "$FINAL_IMAGE" "$LATEST_IMAGE" + docker push "$FINAL_IMAGE" + docker push "$LATEST_IMAGE" + notify: - needs: [build, test] + needs: [build, test-push] if: failure() runs-on: ubuntu-latest steps: @@ -136,7 +148,7 @@ jobs: run: echo "ROCM_VERSION=${{ inputs.rocm_version }}" >> "$GITHUB_ENV" - name: Checkout notification scripts - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Send failure email env: @@ -145,9 +157,9 @@ jobs: ROCM_SMTP_PASSWORD: ${{ secrets.ROCM_SMTP_PASSWORD }} ROCM_EMAIL_FROM: ${{ secrets.ROCM_EMAIL_FROM }} ROCM_EMAIL_TO: ${{ secrets.ROCM_EMAIL_TO }} - EMAIL_SUBJECT: ROCm Nightly Build Failed - ${{ github.ref_name }} - ROCm ${{ env.ROCM_VERSION }} + EMAIL_SUBJECT: ROCm Nightly Workflow Failed - ${{ github.ref_name }} - ROCm ${{ env.ROCM_VERSION }} EMAIL_BODY: | - The ROCm nightly docker build failed. + The ROCm nightly docker workflow failed. 
Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} Branch: ${{ github.ref_name }} From ef2aefdbc8e8ec6995a88ce625db8f1c009975d4 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 1 Apr 2026 15:30:48 +0200 Subject: [PATCH 03/15] Matrix --- .github/workflows/pytorch-nightly-docker.yml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml index b2cb2ceac0773..4b4365d870867 100644 --- a/.github/workflows/pytorch-nightly-docker.yml +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -82,9 +82,19 @@ jobs: compression-level: 0 test-push: - name: Test and Push ROCm Nightly Image + name: ${{ matrix.target.name }} needs: build - runs-on: PLACEHOLDER_RUNNER_LABEL_1 + strategy: + fail-fast: false + matrix: + target: + - name: Test and Push ROCm Nightly Image on MI325 + runner: linux-pytorch-mi325-1 + push_image: true + - name: Test ROCm Nightly Image on MI250 + runner: linux-pytorch-mi250-1 + push_image: false + runs-on: ${{ matrix.target.runner }} timeout-minutes: 300 env: NIGHTLY_IMAGE: ${{ needs.build.outputs.full-image }} @@ -124,12 +134,14 @@ jobs: " - name: Log in to Docker Hub + if: ${{ matrix.target.push_image }} uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - name: Push validated image + if: ${{ matrix.target.push_image }} env: FINAL_IMAGE: ${{ needs.build.outputs.full-image }} LATEST_IMAGE: ${{ env.DOCKER_REGISTRY }}:latest From fc2c16d4d390edcfe687ff76a375da1c6eef61f8 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 1 Apr 2026 15:45:50 +0200 Subject: [PATCH 04/15] Remove notifications for now --- .github/scripts/send_notification_email.sh | 57 -------------------- .github/workflows/pytorch-nightly-docker.yml | 29 ---------- 2 files changed, 86 deletions(-) delete mode 100644 .github/scripts/send_notification_email.sh diff --git 
a/.github/scripts/send_notification_email.sh b/.github/scripts/send_notification_email.sh deleted file mode 100644 index ea0705a45b8c3..0000000000000 --- a/.github/scripts/send_notification_email.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -required_vars=( - ROCM_SMTP_URL - ROCM_SMTP_USERNAME - ROCM_SMTP_PASSWORD - ROCM_EMAIL_FROM - ROCM_EMAIL_TO - EMAIL_SUBJECT - EMAIL_BODY -) - -for var_name in "${required_vars[@]}"; do - if [[ -z "${!var_name:-}" ]]; then - echo "Missing required environment variable: ${var_name}" >&2 - exit 1 - fi -done - -message_file="$(mktemp)" -trap 'rm -f "$message_file"' EXIT - -{ - printf 'From: %s\n' "$ROCM_EMAIL_FROM" - printf 'To: %s\n' "$ROCM_EMAIL_TO" - printf 'Subject: %s\n' "$EMAIL_SUBJECT" - printf 'MIME-Version: 1.0\n' - printf 'Content-Type: text/plain; charset=UTF-8\n' - printf '\n' - printf '%s\n' "$EMAIL_BODY" -} > "$message_file" - -IFS=',' read -r -a recipients <<< "$ROCM_EMAIL_TO" -curl_args=() -for recipient in "${recipients[@]}"; do - recipient="${recipient#"${recipient%%[![:space:]]*}"}" - recipient="${recipient%"${recipient##*[![:space:]]}"}" - if [[ -n "$recipient" ]]; then - curl_args+=(--mail-rcpt "$recipient") - fi -done - -if [[ "${#curl_args[@]}" -eq 0 ]]; then - echo "ROCM_EMAIL_TO did not contain any recipients" >&2 - exit 1 -fi - -curl --silent --show-error --fail --ssl-reqd \ - --url "$ROCM_SMTP_URL" \ - --user "${ROCM_SMTP_USERNAME}:${ROCM_SMTP_PASSWORD}" \ - --mail-from "$ROCM_EMAIL_FROM" \ - "${curl_args[@]}" \ - --upload-file "$message_file" - -echo "Notification email sent to $ROCM_EMAIL_TO" diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml index 4b4365d870867..5a021e7812e05 100644 --- a/.github/workflows/pytorch-nightly-docker.yml +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -149,32 +149,3 @@ jobs: docker tag "$FINAL_IMAGE" "$LATEST_IMAGE" docker push "$FINAL_IMAGE" docker push "$LATEST_IMAGE" - - notify: 
- needs: [build, test-push] - if: failure() - runs-on: ubuntu-latest - steps: - - name: Resolve ROCm version - if: ${{ inputs.rocm_version != '' }} - run: echo "ROCM_VERSION=${{ inputs.rocm_version }}" >> "$GITHUB_ENV" - - - name: Checkout notification scripts - uses: actions/checkout@v6 - - - name: Send failure email - env: - ROCM_SMTP_URL: ${{ secrets.ROCM_SMTP_URL }} - ROCM_SMTP_USERNAME: ${{ secrets.ROCM_SMTP_USERNAME }} - ROCM_SMTP_PASSWORD: ${{ secrets.ROCM_SMTP_PASSWORD }} - ROCM_EMAIL_FROM: ${{ secrets.ROCM_EMAIL_FROM }} - ROCM_EMAIL_TO: ${{ secrets.ROCM_EMAIL_TO }} - EMAIL_SUBJECT: ROCm Nightly Workflow Failed - ${{ github.ref_name }} - ROCm ${{ env.ROCM_VERSION }} - EMAIL_BODY: | - The ROCm nightly docker workflow failed. - - Run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - Branch: ${{ github.ref_name }} - ROCm version: ${{ env.ROCM_VERSION }} - run: | - bash .github/scripts/send_notification_email.sh From 2a6a9e6614cfbebfa7dfe3d00ab916b1cbad280c Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 1 Apr 2026 16:02:33 +0200 Subject: [PATCH 05/15] Temporary trigger --- .github/workflows/pytorch-nightly-docker.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml index 5a021e7812e05..53e5a2cb90726 100644 --- a/.github/workflows/pytorch-nightly-docker.yml +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -15,6 +15,9 @@ on: rocm_version: required: false type: string + push: + branches: + - rocm-nightly-gha env: ROCM_VERSION: '7.2.1' From a7223b2db9e38a3607a18066795630e6420a2bb9 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 1 Apr 2026 17:27:04 +0200 Subject: [PATCH 06/15] Self hosted runners smoke test --- .github/workflows/rocm-nightly-build-test.yml | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/rocm-nightly-build-test.yml diff --git 
a/.github/workflows/rocm-nightly-build-test.yml b/.github/workflows/rocm-nightly-build-test.yml new file mode 100644 index 0000000000000..6026eb9169c97 --- /dev/null +++ b/.github/workflows/rocm-nightly-build-test.yml @@ -0,0 +1,33 @@ +name: ROCm Nightly Runner Smoke Test + +on: + push: + branches: + - rocm-nightly-gha + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: true + +permissions: read-all + +jobs: + runner-smoke-test: + name: ${{ matrix.target.name }} + strategy: + fail-fast: false + matrix: + target: + - name: Smoke Test on MI325 + runner: linux-pytorch-mi325-1 + - name: Smoke Test on MI250 + runner: linux-pytorch-mi250-1 + runs-on: ${{ matrix.target.runner }} + timeout-minutes: 10 + steps: + - name: Run rocm-smi + shell: bash + run: | + command -v rocm-smi + rocm-smi From c818f73ca711707fc670b9d6ec75a9ed87a5da3d Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 1 Apr 2026 17:31:38 +0200 Subject: [PATCH 07/15] Delete smokes --- .github/workflows/rocm-nightly-build-test.yml | 33 ------------------- 1 file changed, 33 deletions(-) delete mode 100644 .github/workflows/rocm-nightly-build-test.yml diff --git a/.github/workflows/rocm-nightly-build-test.yml b/.github/workflows/rocm-nightly-build-test.yml deleted file mode 100644 index 6026eb9169c97..0000000000000 --- a/.github/workflows/rocm-nightly-build-test.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: ROCm Nightly Runner Smoke Test - -on: - push: - branches: - - rocm-nightly-gha - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref_name }} - cancel-in-progress: true - -permissions: read-all - -jobs: - runner-smoke-test: - name: ${{ matrix.target.name }} - strategy: - fail-fast: false - matrix: - target: - - name: Smoke Test on MI325 - runner: linux-pytorch-mi325-1 - - name: Smoke Test on MI250 - runner: linux-pytorch-mi250-1 - runs-on: ${{ matrix.target.runner }} - timeout-minutes: 10 - steps: - - name: Run rocm-smi - 
shell: bash - run: | - command -v rocm-smi - rocm-smi From 454a5fa5bd10318ed8cd697e1e75c86b60ccda94 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 2 Apr 2026 13:46:13 +0200 Subject: [PATCH 08/15] Verbose and runner change --- .ci/docker/pytorch-nightly-docker.Dockerfile | 4 +++- .github/workflows/pytorch-nightly-docker.yml | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.ci/docker/pytorch-nightly-docker.Dockerfile b/.ci/docker/pytorch-nightly-docker.Dockerfile index 0f112e8d8e174..fdd30cf2c4546 100644 --- a/.ci/docker/pytorch-nightly-docker.Dockerfile +++ b/.ci/docker/pytorch-nightly-docker.Dockerfile @@ -2,10 +2,12 @@ ARG BASE_IMAGE=rocm/pytorch-autobuild:base-latest FROM ${BASE_IMAGE} WORKDIR /tmp USER root +SHELL ["/bin/bash", "-euxo", "pipefail", "-c"] ENV CI=1 ENV PYTORCH_TEST_WITH_ROCM=1 ENV PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" +ENV VERBOSE=1 RUN git clone https://github.com/pytorch/pytorch --recursive \ && cd pytorch \ @@ -15,7 +17,7 @@ RUN git clone https://github.com/pytorch/pytorch --recursive \ && git remote add rocm https://github.com/ROCm/pytorch.git \ && git fetch rocm \ && git cherry-pick 519160d466782f5a62365be051fcb3ef90fa0b00 \ - && .ci/pytorch/build.sh \ + && VERBOSE="${VERBOSE}" .ci/pytorch/build.sh \ && rm -rf /tmp/pytorch/.git RUN git clone https://github.com/pytorch/vision \ && cd vision \ diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml index 53e5a2cb90726..624f1c6b1833b 100644 --- a/.github/workflows/pytorch-nightly-docker.yml +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -28,7 +28,7 @@ env: jobs: build: name: Build ROCm Nightly Image - runs-on: ubuntu-latest + runs-on: linux-pytorch-mi325-1 outputs: full-image: ${{ steps.meta.outputs.full-image }} steps: @@ -63,8 +63,10 @@ jobs: - name: Build ROCm Nightly Image env: FULL_IMAGE: ${{ steps.meta.outputs.full-image }} + BUILDKIT_PROGRESS: plain run: | docker build \ + --progress=plain \ --build-arg 
BASE_IMAGE=rocm/pytorch-autobuild:base-latest \ -t "$FULL_IMAGE" \ -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \ From 03393e9a469aa02ae29a993e1fe0a035892b9bfb Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 2 Apr 2026 15:47:04 +0200 Subject: [PATCH 09/15] Update Dockerfile --- .ci/docker/pytorch-nightly-docker.Dockerfile | 8 +++----- .github/workflows/pytorch-nightly-docker.yml | 5 +++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.ci/docker/pytorch-nightly-docker.Dockerfile b/.ci/docker/pytorch-nightly-docker.Dockerfile index fdd30cf2c4546..ffa1e94ec55ef 100644 --- a/.ci/docker/pytorch-nightly-docker.Dockerfile +++ b/.ci/docker/pytorch-nightly-docker.Dockerfile @@ -2,12 +2,10 @@ ARG BASE_IMAGE=rocm/pytorch-autobuild:base-latest FROM ${BASE_IMAGE} WORKDIR /tmp USER root -SHELL ["/bin/bash", "-euxo", "pipefail", "-c"] ENV CI=1 ENV PYTORCH_TEST_WITH_ROCM=1 ENV PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" -ENV VERBOSE=1 RUN git clone https://github.com/pytorch/pytorch --recursive \ && cd pytorch \ @@ -17,13 +15,13 @@ RUN git clone https://github.com/pytorch/pytorch --recursive \ && git remote add rocm https://github.com/ROCm/pytorch.git \ && git fetch rocm \ && git cherry-pick 519160d466782f5a62365be051fcb3ef90fa0b00 \ - && VERBOSE="${VERBOSE}" .ci/pytorch/build.sh \ + && .ci/pytorch/build.sh \ && rm -rf /tmp/pytorch/.git RUN git clone https://github.com/pytorch/vision \ && cd vision \ - && FORCE_CUDA=1 pip install . \ + && FORCE_CUDA=1 python setup.py install \ && rm -rf /tmp/vision/.git RUN git clone https://github.com/pytorch/audio \ && cd audio \ - && pip install . 
\ + && python setup.py install \ && rm -rf /tmp/audio/.git diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml index 624f1c6b1833b..a0e051cff3578 100644 --- a/.github/workflows/pytorch-nightly-docker.yml +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -63,10 +63,11 @@ jobs: - name: Build ROCm Nightly Image env: FULL_IMAGE: ${{ steps.meta.outputs.full-image }} - BUILDKIT_PROGRESS: plain + BUILDKIT_PROGRESS: tty + BUILDKIT_TTY_LOG_LINES: 12 run: | docker build \ - --progress=plain \ + --progress=tty \ --build-arg BASE_IMAGE=rocm/pytorch-autobuild:base-latest \ -t "$FULL_IMAGE" \ -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \ From 551548d9a1789875c999ce3723ccf7cdc51ae8a9 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 2 Apr 2026 16:24:09 +0200 Subject: [PATCH 10/15] Remove tty --- .github/workflows/pytorch-nightly-docker.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml index a0e051cff3578..48dec9eb2247e 100644 --- a/.github/workflows/pytorch-nightly-docker.yml +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -63,11 +63,8 @@ jobs: - name: Build ROCm Nightly Image env: FULL_IMAGE: ${{ steps.meta.outputs.full-image }} - BUILDKIT_PROGRESS: tty - BUILDKIT_TTY_LOG_LINES: 12 run: | docker build \ - --progress=tty \ --build-arg BASE_IMAGE=rocm/pytorch-autobuild:base-latest \ -t "$FULL_IMAGE" \ -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \ From e53f83c27fe4e138e58f943e523e42a5992ee63f Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 8 Apr 2026 10:52:20 +0200 Subject: [PATCH 11/15] Use older buildx with better build logging --- .github/workflows/pytorch-nightly-docker.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml index 
48dec9eb2247e..2fa30853be362 100644 --- a/.github/workflows/pytorch-nightly-docker.yml +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -64,7 +64,13 @@ jobs: env: FULL_IMAGE: ${{ steps.meta.outputs.full-image }} run: | - docker build \ + build_flags=(--load) + if [[ -n "${CI:-}" ]]; then + build_flags+=(--progress=plain) + fi + + docker buildx build \ + "${build_flags[@]}" \ --build-arg BASE_IMAGE=rocm/pytorch-autobuild:base-latest \ -t "$FULL_IMAGE" \ -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \ From 2e68950ca0ba2f63fb0c1a2a3f25df52b6659ac2 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Wed, 8 Apr 2026 16:01:55 +0200 Subject: [PATCH 12/15] Verbose image build troubleshooting --- .ci/docker/pytorch-nightly-docker.Dockerfile | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.ci/docker/pytorch-nightly-docker.Dockerfile b/.ci/docker/pytorch-nightly-docker.Dockerfile index ffa1e94ec55ef..863bb40645907 100644 --- a/.ci/docker/pytorch-nightly-docker.Dockerfile +++ b/.ci/docker/pytorch-nightly-docker.Dockerfile @@ -15,7 +15,17 @@ RUN git clone https://github.com/pytorch/pytorch --recursive \ && git remote add rocm https://github.com/ROCm/pytorch.git \ && git fetch rocm \ && git cherry-pick 519160d466782f5a62365be051fcb3ef90fa0b00 \ - && .ci/pytorch/build.sh \ + && if ! .ci/pytorch/build.sh; then \ + echo "PyTorch build failed. 
Re-running likely failing HIP test targets with serial verbose Ninja output."; \ + if [ -d build ]; then \ + ninja -C build -t clean hip_half_test hip_distributions_test || true; \ + ninja -C build -j1 -v hip_half_test || true; \ + ninja -C build -j1 -v hip_distributions_test || true; \ + else \ + echo "Expected build directory 'build' was not found after failure."; \ + fi; \ + exit 1; \ + fi \ && rm -rf /tmp/pytorch/.git RUN git clone https://github.com/pytorch/vision \ && cd vision \ From 1784746800b2fa196519ef2b62a9d8c2775e6ac8 Mon Sep 17 00:00:00 2001 From: leo-amd Date: Thu, 9 Apr 2026 13:55:31 +0200 Subject: [PATCH 13/15] Debug --- .github/scripts/rocm_nightly_debug_build.sh | 67 +++++++++++++++++ .../workflows/rocm-nightly-debug-build.yml | 71 +++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 .github/scripts/rocm_nightly_debug_build.sh create mode 100644 .github/workflows/rocm-nightly-debug-build.yml diff --git a/.github/scripts/rocm_nightly_debug_build.sh b/.github/scripts/rocm_nightly_debug_build.sh new file mode 100644 index 0000000000000..86471110aa553 --- /dev/null +++ b/.github/scripts/rocm_nightly_debug_build.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +ARTIFACT_DIR="${ARTIFACT_DIR:-/debug-artifacts}" +WORKDIR=/tmp/pytorch +PATCH_SHA=519160d466782f5a62365be051fcb3ef90fa0b00 + +mkdir -p "$ARTIFACT_DIR" +rm -rf "$WORKDIR" + +git clone https://github.com/pytorch/pytorch --recursive "$WORKDIR" +cd "$WORKDIR" + +pip install -r requirements.txt +git config --local user.name "AMD AMD" +git config --local user.email "amd@amd.com" +git remote add rocm https://github.com/ROCm/pytorch.git +git fetch rocm +git cherry-pick "$PATCH_SHA" + +if .ci/pytorch/build.sh 2>&1 | tee "$ARTIFACT_DIR/build.log"; then + if [[ -f build/.ninja_log ]]; then + cp build/.ninja_log "$ARTIFACT_DIR"/ + fi + exit 0 +fi + +if [[ -f build/.ninja_log ]]; then + cp build/.ninja_log "$ARTIFACT_DIR"/ +fi + +echo "PyTorch build failed. 
Re-running gloo_hip wrappers with verbose output." | tee -a "$ARTIFACT_DIR/build.log" + +GLOO_DIR=build/third_party/gloo/gloo/CMakeFiles/gloo_hip.dir +if [[ ! -d "$GLOO_DIR" ]]; then + echo "Expected gloo_hip build directory '$GLOO_DIR' was not found." | tee -a "$ARTIFACT_DIR/gloo-debug.log" + exit 1 +fi + +ninja -C build -t clean gloo_hip || true + +find "$GLOO_DIR" -name 'gloo_hip_generated_*.cmake' | sort > "$ARTIFACT_DIR/gloo_wrappers.txt" +if [[ ! -s "$ARTIFACT_DIR/gloo_wrappers.txt" ]]; then + echo "No gloo_hip wrapper scripts were found." | tee -a "$ARTIFACT_DIR/gloo-debug.log" + exit 1 +fi + +status=0 +while IFS= read -r wrapper; do + generated_file="${wrapper%.cmake}" + { + echo + echo "===== Re-running $wrapper =====" + } | tee -a "$ARTIFACT_DIR/gloo-debug.log" + + if ! cmake \ + -D verbose:BOOL=ON \ + -D build_configuration:STRING=RELEASE \ + -D generated_file:STRING="$generated_file" \ + -P "$wrapper" 2>&1 | tee -a "$ARTIFACT_DIR/gloo-debug.log"; then + status=1 + break + fi +done < "$ARTIFACT_DIR/gloo_wrappers.txt" + +exit "$status" diff --git a/.github/workflows/rocm-nightly-debug-build.yml b/.github/workflows/rocm-nightly-debug-build.yml new file mode 100644 index 0000000000000..5482f9a6d3d52 --- /dev/null +++ b/.github/workflows/rocm-nightly-debug-build.yml @@ -0,0 +1,71 @@ +name: ROCm Nightly Build Debug + +on: + workflow_dispatch: + inputs: + rocm_version: + description: ROCm version to debug + required: false + type: string + push: + branches: + - rocm-nightly-gha + paths: + - .github/workflows/rocm-nightly-debug-build.yml + - .github/scripts/rocm_nightly_debug_build.sh + +env: + ROCM_VERSION: '7.2.1' + PYTHON_VERSION: '3.10' + PYTORCH_ROCM_ARCH: 'gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201' + +jobs: + debug-build: + name: Debug ROCm Nightly Image Build + runs-on: linux-pytorch-mi325-1 + timeout-minutes: 300 + steps: + - name: Resolve ROCm version + if: ${{ inputs.rocm_version != '' }} + run: 
echo "ROCM_VERSION=${{ inputs.rocm_version }}" >> "$GITHUB_ENV" + + - name: Checkout pytorch + uses: actions/checkout@v6 + with: + repository: pytorch/pytorch + ref: main + + - name: Checkout nightly workflow files + uses: actions/checkout@v6 + with: + path: rocm-nightly-workflow + + - name: Build base image + working-directory: .ci/docker + run: | + export PYTORCH_ROCM_ARCH="${{ env.PYTORCH_ROCM_ARCH }}" + ./build.sh pytorch-linux-jammy-rocm${{ env.ROCM_VERSION }}-py${{ env.PYTHON_VERSION }} \ + -t rocm/pytorch-autobuild:base-latest + + - name: Run debug build + run: | + mkdir -p debug-artifacts + docker run --rm \ + -e ARTIFACT_DIR=/debug-artifacts \ + -e BUILD_ENVIRONMENT="pytorch-linux-jammy-rocm${{ env.ROCM_VERSION }}-py${{ env.PYTHON_VERSION }}" \ + -e ANACONDA_PYTHON_VERSION="${{ env.PYTHON_VERSION }}" \ + -e CI=1 \ + -e PYTORCH_ROCM_ARCH="${{ env.PYTORCH_ROCM_ARCH }}" \ + -v "$PWD/rocm-nightly-workflow:/workspace/rocm-nightly-workflow" \ + -v "$PWD/debug-artifacts:/debug-artifacts" \ + rocm/pytorch-autobuild:base-latest \ + bash /workspace/rocm-nightly-workflow/.github/scripts/rocm_nightly_debug_build.sh + + - name: Upload debug artifacts + if: always() + uses: actions/upload-artifact@v4.4.0 + with: + name: rocm-nightly-debug-artifacts + path: debug-artifacts + retention-days: 7 + compression-level: 0 From 28cbb199479f38dd5e0aed04ac12d304be579e8b Mon Sep 17 00:00:00 2001 From: leo-automation Date: Thu, 9 Apr 2026 15:29:06 +0200 Subject: [PATCH 14/15] More logging --- .github/scripts/rocm_nightly_debug_build.sh | 23 +++++-- .github/scripts/run_with_log_heartbeat.sh | 69 ++++++++++++++++++++ .github/workflows/pytorch-nightly-docker.yml | 24 +++++-- 3 files changed, 104 insertions(+), 12 deletions(-) create mode 100644 .github/scripts/run_with_log_heartbeat.sh diff --git a/.github/scripts/rocm_nightly_debug_build.sh b/.github/scripts/rocm_nightly_debug_build.sh index 86471110aa553..4d7bc552ac4e3 100644 --- a/.github/scripts/rocm_nightly_debug_build.sh +++ 
b/.github/scripts/rocm_nightly_debug_build.sh @@ -5,6 +5,7 @@ set -euxo pipefail ARTIFACT_DIR="${ARTIFACT_DIR:-/debug-artifacts}" WORKDIR=/tmp/pytorch PATCH_SHA=519160d466782f5a62365be051fcb3ef90fa0b00 +LOG_HELPER="${LOG_HELPER:-/workspace/rocm-nightly-workflow/.github/scripts/run_with_log_heartbeat.sh}" mkdir -p "$ARTIFACT_DIR" rm -rf "$WORKDIR" @@ -19,7 +20,7 @@ git remote add rocm https://github.com/ROCm/pytorch.git git fetch rocm git cherry-pick "$PATCH_SHA" -if .ci/pytorch/build.sh 2>&1 | tee "$ARTIFACT_DIR/build.log"; then +if bash "$LOG_HELPER" "$ARTIFACT_DIR/build.log" -- .ci/pytorch/build.sh; then if [[ -f build/.ninja_log ]]; then cp build/.ninja_log "$ARTIFACT_DIR"/ fi @@ -47,18 +48,28 @@ if [[ ! -s "$ARTIFACT_DIR/gloo_wrappers.txt" ]]; then fi status=0 +wrapper_index=0 # counter used to number the per-wrapper log files (first wrapper is 1) +: > "$ARTIFACT_DIR/gloo_wrapper_logs.txt" # start with an empty wrapper-to-log index file while IFS= read -r wrapper; do + wrapper_index=$((wrapper_index + 1)) generated_file="${wrapper%.cmake}" + wrapper_log="$ARTIFACT_DIR/gloo-wrapper-$(printf '%03d' "$wrapper_index").log" # zero-padded name, e.g. gloo-wrapper-001.log { echo echo "===== Re-running $wrapper =====" } | tee -a "$ARTIFACT_DIR/gloo-debug.log" + printf '%s\t%s\n' "$wrapper" "$(basename "$wrapper_log")" >> "$ARTIFACT_DIR/gloo_wrapper_logs.txt" # index entry: wrapper path, TAB, log file name - if ! cmake \ - -D verbose:BOOL=ON \ - -D build_configuration:STRING=RELEASE \ - -D generated_file:STRING="$generated_file" \ - -P "$wrapper" 2>&1 | tee -a "$ARTIFACT_DIR/gloo-debug.log"; then + if ! bash "$LOG_HELPER" "$wrapper_log" -- \ + cmake \ + -D verbose:BOOL=ON \ + -D build_configuration:STRING=RELEASE \ + -D generated_file:STRING="$generated_file" \ + -P "$wrapper"; then + { + echo "Wrapper failed.
Last 200 lines from $(basename "$wrapper_log"):" + tail -n 200 "$wrapper_log" || true # a failing tail must not abort the script here + } | tee -a "$ARTIFACT_DIR/gloo-debug.log" status=1 break fi diff --git a/.github/scripts/run_with_log_heartbeat.sh b/.github/scripts/run_with_log_heartbeat.sh new file mode 100644 index 0000000000000..b4139d6f43dbb --- /dev/null +++ b/.github/scripts/run_with_log_heartbeat.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +set -euo pipefail + +usage() { + echo "Usage: $0 LOG_FILE -- COMMAND [ARGS...]" >&2 + exit 2 +} + +if [[ $# -lt 3 ]]; then # need at least LOG_FILE, the "--" separator and one command word + usage +fi + +log_file=$1 +shift + +if [[ $1 != "--" ]]; then + usage +fi +shift + +heartbeat_seconds="${HEARTBEAT_SECONDS:-300}" # seconds between progress messages; override with HEARTBEAT_SECONDS +tail_lines="${TAIL_LINES:-200}" # number of log lines echoed when the command fails; override with TAIL_LINES +check_interval=5 # seconds slept between liveness polls of the command + +mkdir -p "$(dirname "$log_file")" +: >"$log_file" # create or truncate the log file + +"$@" >"$log_file" 2>&1 & # run the command in the background with stdout and stderr captured in the log file +cmd_pid=$! + +cleanup() { + if kill -0 "$cmd_pid" 2>/dev/null; then + kill "$cmd_pid" 2>/dev/null || true + wait "$cmd_pid" 2>/dev/null || true + fi +} +trap cleanup EXIT # if this script exits early, kill the command if it is still running + +command_str=$(printf '%q ' "$@") +command_str=${command_str% } # drop the trailing space left by the printf format + +next_heartbeat=0 # 0 makes the first heartbeat print on the first loop iteration +while kill -0 "$cmd_pid" 2>/dev/null; do + now=$(date +%s) + if (( now >= next_heartbeat )); then + echo "[$(date -u +%FT%TZ)] Command still running: ${command_str}" + echo "[$(date -u +%FT%TZ)] Log file: ${log_file} ($(du -h "$log_file" | cut -f1))" + next_heartbeat=$((now + heartbeat_seconds)) + fi + sleep "$check_interval" +done + +if wait "$cmd_pid"; then + status=0 +else + status=$? +fi + +trap - EXIT # the command has been waited for; the cleanup trap is no longer needed + +if [[ $status -eq 0 ]]; then + echo "Command completed successfully. Full log saved to ${log_file}" + exit 0 +fi + +echo "Command failed with exit code ${status}.
Last ${tail_lines} lines from ${log_file}:" +tail -n "$tail_lines" "$log_file" || true +exit "$status" diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml index 2fa30853be362..885554a141341 100644 --- a/.github/workflows/pytorch-nightly-docker.yml +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -64,17 +64,29 @@ jobs: env: FULL_IMAGE: ${{ steps.meta.outputs.full-image }} run: | + mkdir -p build-logs build_flags=(--load) if [[ -n "${CI:-}" ]]; then build_flags+=(--progress=plain) fi - docker buildx build \ - "${build_flags[@]}" \ - --build-arg BASE_IMAGE=rocm/pytorch-autobuild:base-latest \ - -t "$FULL_IMAGE" \ - -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \ - rocm-nightly-workflow/.ci/docker + bash rocm-nightly-workflow/.github/scripts/run_with_log_heartbeat.sh \ + build-logs/nightly-image-build.log -- \ + docker buildx build \ + "${build_flags[@]}" \ + --build-arg BASE_IMAGE=rocm/pytorch-autobuild:base-latest \ + -t "$FULL_IMAGE" \ + -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \ + rocm-nightly-workflow/.ci/docker + + - name: Upload nightly build logs + if: always() + uses: actions/upload-artifact@v4.4.0 + with: + name: rocm-nightly-build-logs + path: build-logs + retention-days: 7 + compression-level: 0 - name: Save nightly image artifact env: From ffdf09aa7c05f704a56546993cfba34dd437d657 Mon Sep 17 00:00:00 2001 From: leo-automation Date: Thu, 9 Apr 2026 17:29:26 +0200 Subject: [PATCH 15/15] Fix permissions and have main job disable sccache --- .ci/docker/ubuntu-rocm/Dockerfile | 4 ++- .github/scripts/rocm_nightly_debug_build.sh | 5 ++++ .github/workflows/pytorch-nightly-docker.yml | 25 ++++++------------- .../workflows/rocm-nightly-debug-build.yml | 3 +++ 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/.ci/docker/ubuntu-rocm/Dockerfile b/.ci/docker/ubuntu-rocm/Dockerfile index ff62e4a934c74..9d797548e3c1d 100644 ---
a/.ci/docker/ubuntu-rocm/Dockerfile +++ b/.ci/docker/ubuntu-rocm/Dockerfile @@ -113,9 +113,11 @@ RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt # Install ccache/sccache (do this last, so we get priority in PATH) +ARG SKIP_SCCACHE_INSTALL COPY ./common/install_cache.sh install_cache.sh ENV PATH /opt/cache/bin:$PATH -RUN bash ./install_cache.sh && rm install_cache.sh +RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi +RUN rm install_cache.sh # Install Open MPI for ROCm COPY ./common/install_openmpi.sh install_openmpi.sh diff --git a/.github/scripts/rocm_nightly_debug_build.sh b/.github/scripts/rocm_nightly_debug_build.sh index 4d7bc552ac4e3..d9cb0d7a7e2cb 100644 --- a/.github/scripts/rocm_nightly_debug_build.sh +++ b/.github/scripts/rocm_nightly_debug_build.sh @@ -8,6 +8,11 @@ PATCH_SHA=519160d466782f5a62365be051fcb3ef90fa0b00 LOG_HELPER="${LOG_HELPER:-/workspace/rocm-nightly-workflow/.github/scripts/run_with_log_heartbeat.sh}" mkdir -p "$ARTIFACT_DIR" +if ! touch "$ARTIFACT_DIR/.write-test" 2>/dev/null; then # probe: fail fast when the artifact directory cannot be written + echo "Artifact directory '$ARTIFACT_DIR' is not writable by uid $(id -u)."
>&2 + exit 1 +fi +rm -f "$ARTIFACT_DIR/.write-test" # remove the probe file created by the writability check rm -rf "$WORKDIR" git clone https://github.com/pytorch/pytorch --recursive "$WORKDIR" diff --git a/.github/workflows/pytorch-nightly-docker.yml b/.github/workflows/pytorch-nightly-docker.yml index 885554a141341..7e432249424d4 100644 --- a/.github/workflows/pytorch-nightly-docker.yml +++ b/.github/workflows/pytorch-nightly-docker.yml @@ -56,6 +56,7 @@ jobs: - name: Build base image working-directory: .ci/docker run: | + export SKIP_SCCACHE_INSTALL=1 # NOTE(review): presumably forwarded by build.sh as the SKIP_SCCACHE_INSTALL build arg that gates install_cache.sh; confirm export PYTORCH_ROCM_ARCH="${{ env.PYTORCH_ROCM_ARCH }}" ./build.sh pytorch-linux-jammy-rocm${{ env.ROCM_VERSION }}-py${{ env.PYTHON_VERSION }} \ -t rocm/pytorch-autobuild:base-latest @@ -64,29 +65,17 @@ jobs: env: FULL_IMAGE: ${{ steps.meta.outputs.full-image }} run: | - mkdir -p build-logs build_flags=(--load) if [[ -n "${CI:-}" ]]; then build_flags+=(--progress=plain) fi - bash rocm-nightly-workflow/.github/scripts/run_with_log_heartbeat.sh \ - build-logs/nightly-image-build.log -- \ - docker buildx build \ - "${build_flags[@]}" \ - --build-arg BASE_IMAGE=rocm/pytorch-autobuild:base-latest \ - -t "$FULL_IMAGE" \ - -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \ - rocm-nightly-workflow/.ci/docker - - - name: Upload nightly build logs - if: always() - uses: actions/upload-artifact@v4.4.0 - with: - name: rocm-nightly-build-logs - path: build-logs - retention-days: 7 - compression-level: 0 + docker buildx build \ + "${build_flags[@]}" \ + --build-arg BASE_IMAGE=rocm/pytorch-autobuild:base-latest \ + -t "$FULL_IMAGE" \ + -f rocm-nightly-workflow/.ci/docker/pytorch-nightly-docker.Dockerfile \ + rocm-nightly-workflow/.ci/docker - name: Save nightly image artifact env: diff --git a/.github/workflows/rocm-nightly-debug-build.yml b/.github/workflows/rocm-nightly-debug-build.yml index 5482f9a6d3d52..b4f889a19318d 100644 --- a/.github/workflows/rocm-nightly-debug-build.yml +++ b/.github/workflows/rocm-nightly-debug-build.yml @@ -50,6 +50,9 @@ jobs: - name:
Run debug build run: | mkdir -p debug-artifacts + # The debug image runs as the `jenkins` user, so the bind mount must + # be writable even when the host runner uid/gid does not match. + chmod 0777 debug-artifacts docker run --rm \ -e ARTIFACT_DIR=/debug-artifacts \ -e BUILD_ENVIRONMENT="pytorch-linux-jammy-rocm${{ env.ROCM_VERSION }}-py${{ env.PYTHON_VERSION }}" \