diff --git a/.azure/docker-build.yml b/.azure/docker-build.yml deleted file mode 100644 index 1eddde94c1..0000000000 --- a/.azure/docker-build.yml +++ /dev/null @@ -1,112 +0,0 @@ -trigger: - tags: - include: ["*"] - branches: - include: ["main"] - paths: - include: - - ".azure/docker-build.yml" - - "dockers/**" - - "requirements.txt" - - "requirements/*.txt" - - "setup.py" - exclude: - - "*.md" - - "**/*.md" - -pr: - branches: - include: ["*"] - paths: - include: - - ".azure/docker-build.yml" - - "dockers/**" - - "requirements.txt" - - "requirements/*.txt" - - "setup.py" - exclude: - - "*.md" - - "**/*.md" - -schedules: - - cron: "0 */2 * * *" - displayName: rebuild dockers for CI every 2 hours - branches: - include: ["main"] - -jobs: - - job: build_push - strategy: - matrix: - "cuda 12.8 | torch 2.8.0 | cudnn FE v1.15.0": - { CUDA_VERSION: "12.8.1", TORCH_VERSION: "2.8.0", TRITON_VERSION: "3.4.0", CUDNN_FRONTEND_VERSION: "1.15.0" } - "cuda 12.8 | torch nightly | cudnn FE v1.15.0": - { CUDA_VERSION: "12.8.1", TORCH_VERSION: "main", TORCH_INSTALL: "source", CUDNN_FRONTEND_VERSION: "1.15.0" } - #'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found - # how much time to give 'run always even if cancelled tasks' before stopping them - cancelTimeoutInMinutes: "2" - timeoutInMinutes: "95" - variables: - UBUNTU_VERSION: "24.04" - PYTHON_VERSION: "3.12" - imageRepository: "pytorchlightning/lightning-thunder" - imageTag: "ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND_VERSION)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}" - pool: "lit-rtx-3090" - workspace: - clean: all - steps: - - bash: | - set -e - echo $imageTag - nvidia-smi - docker image build \ - -t $(imageRepository):$(imageTag) \ - -f "dockers/ubuntu-cuda/Dockerfile" \ - --build-arg UBUNTU_VERSION="$(UBUNTU_VERSION)" \ - --build-arg CUDA_VERSION="$(CUDA_VERSION)" \ - --build-arg CUDNN_FRONTEND_VERSION="v$(CUDNN_FRONTEND_VERSION)" \ - --build-arg PYTHON_VERSION="$(PYTHON_VERSION)" \ - --build-arg TORCH_VERSION="$(TORCH_VERSION)" \ - --build-arg TORCH_INSTALL="$(TORCH_INSTALL)" \ - --build-arg TRITON_VERSION="$(TRITON_VERSION)" \ - . --no-cache - timeoutInMinutes: "95" - displayName: "Build base image" - - - bash: | - docker image build \ - -t $(imageRepository):$(imageTag)-apex \ - -f "dockers/with-apex/Dockerfile" \ - --build-arg BASE_IMAGE_TAG="$(imageTag)" \ - . --no-cache - timeoutInMinutes: "25" - displayName: "Build Apex image" - - - bash: | - docker image build \ - -t $(imageRepository):$(imageTag)-dev \ - -f "dockers/with-dev/Dockerfile" \ - --build-arg BASE_IMAGE_TAG="$(imageTag)-apex" \ - . --no-cache - timeoutInMinutes: "25" - displayName: "Build Dev image" - - - bash: | - docker image ls | grep $(imageRepository) - # drop pt from requirements so not to interfere with the existing one - bash scripts/remove-torch-lines.sh requirements/base.txt - docker run --rm --gpus=all -v .:/workspace $(imageRepository):$(imageTag)-dev \ - bash -c "cd /workspace && ls -lh . && \ - pip install -q . && \ - bash scripts/sanity-check.sh" - timeoutInMinutes: "5" - displayName: "Sanity check" - - - bash: | - set -e - echo $(imageRepository):$(imageTag) - echo $(DOCKERHUB_PAT) | docker login --username $(DOCKERHUB_USER) --password-stdin - docker push $(imageRepository):$(imageTag)-dev - condition: ne(variables['Build.Reason'], 'PullRequest') - timeoutInMinutes: "35" - displayName: "Push base image" diff --git a/.azure/gpu-coverage.yml b/.azure/gpu-coverage.yml deleted file mode 100644 index 4af011c308..0000000000 --- a/.azure/gpu-coverage.yml +++ /dev/null @@ -1,77 +0,0 @@ -trigger: - tags: - include: ["*"] - paths: - include: - - ".azure/gpu-coverage.yml" - - "requirements/coverage.txt" - - "thunder/tests/coverage/**" - branches: - include: - - "main" - - "release/*" - - "refs/tags/*" - -pr: - branches: - include: ["*"] - -jobs: - - job: coverage - strategy: - matrix: - "w/ torch 2.7.1": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev" - # how much time to give 'run always even if cancelled tasks' before stopping them - cancelTimeoutInMinutes: "2" - pool: "lit-rtx-3090" - variables: - DEVICES: $( python -c 'name = "$(Agent.Name)" ; gpus = name.split("_")[-1] if "_" in name else "0"; print(gpus)' ) - TORCH_HOME: "/var/tmp/torch" - PIP_CACHE_DIR: "/var/tmp/pip" - PYTHONHASHSEED: "0" - NCCL_DEBUG: "INFO" - ALLOW_COVERAGE_TRACE: "1" - container: - image: "pytorchlightning/lightning-thunder:$(docker-image)" - options: "--gpus=all --shm-size=16g -v /var/tmp:/var/tmp" - workspace: - clean: all - steps: - - bash: | - echo $(DEVICES) - lspci | egrep 'VGA|3D' - dpkg-query -W -f='${Package} ${Version}\n' libnccl2 libnccl-dev - whereis nvidia - nvidia-smi - which python && which pip - python --version - pip --version - pip list - echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" - displayName: "Image info & NVIDIA" - - - bash: | - set -ex - # drop pt from requirements so not to interfere with the existing one - bash scripts/remove-torch-lines.sh requirements/base.txt - cat requirements/base.txt - - # double check on test requirements - pip install -U -r requirements/base.txt -r requirements/coverage.txt - - # https://docs.codecov.com/docs/codecov-uploader - curl -Os https://uploader.codecov.io/latest/linux/codecov - chmod +x codecov - - # install this package - pip install -e . - displayName: "Install package & ..." - - - bash: bash scripts/sanity-check.sh - displayName: "Sanity check / details" - - - bash: | - PYTHONPATH=$(pwd)/thunder/tests pytest thunder/tests/coverage_tests - timeoutInMinutes: "45" - displayName: "Testing: coverage_tests" diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml deleted file mode 100644 index a2a62ed7c0..0000000000 --- a/.azure/gpu-tests.yml +++ /dev/null @@ -1,198 +0,0 @@ -trigger: - tags: - include: ["*"] - branches: - include: - - "main" - - "release/*" - - "refs/tags/*" - -pr: - branches: - include: ["*"] - -jobs: - - job: testing - strategy: - matrix: - "main w/ torch 2.8.0": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev" - testing: "main" - "ops w/ torch 2.8.0": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev" - testing: "ops" - "grads w/ torch 2.8.0": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev" - testing: "grads" - "distributed w/ torch 2.8.0": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev" - testing: "distributed" - "main w/ torch-nightly": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev" - testing: "main" - "ops w/ torch-nightly": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev" - testing: "ops" - "grads w/ torch-nightly": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev" - testing: "grads" - "distributed w/ torch-nightly": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev" - testing: "distributed" - # how much time to give 'run always even if cancelled tasks' before stopping them - cancelTimeoutInMinutes: "2" - pool: "lit-rtx-3090" - variables: - DEVICES: $( python -c 'name = "$(Agent.Name)" ; gpus = name.split("_")[-1] if "_" in name else "0"; print(gpus)' ) - TORCH_HOME: "/var/tmp/torch" - PIP_CACHE_DIR: "/var/tmp/pip" - PYTHONHASHSEED: "0" - NCCL_DEBUG: "INFO" - CI: "true" - container: - image: "pytorchlightning/lightning-thunder:$(docker-image)" - options: "--gpus=all --shm-size=16g -v /var/tmp:/var/tmp" - workspace: - clean: all - steps: - - bash: | - echo $(DEVICES) - lspci | egrep 'VGA|3D' - dpkg-query -W -f='${Package} ${Version}\n' libnccl2 libnccl-dev - whereis nvidia - nvidia-smi - which python && which pip - python --version - pip --version - pip list - echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" - displayName: "Image info & NVIDIA" - - - bash: | - set -ex - # drop pt from requirements so not to interfere with the existing one - bash scripts/remove-torch-lines.sh requirements/base.txt - cat requirements/base.txt - - # double check on test requirements - pip install -U -r requirements/base.txt -r requirements/test.txt - - # https://docs.codecov.com/docs/codecov-uploader - curl -Os https://uploader.codecov.io/latest/linux/codecov - chmod +x codecov - - # install this package - pip install -e . - displayName: "Install package & ..." - - - bash: bash scripts/sanity-check.sh - displayName: "Sanity check / details" - - - bash: | - set -ex - export CUDA_LAUNCH_BLOCKING=1 - coverage run --source thunder -m \ - pytest thunder/tests/ \ - -m "not standalone" \ - -v --datefmt="%Y%m%d-%H:%M:%S.%f" \ - --random-order-seed=42 \ - --durations=250 \ - --timeout=240 \ - --numprocesses=9 \ - --gpu-mem-limit=2 \ - --ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py \ - --ignore=thunder/tests/test_ops.py --ignore=thunder/tests/test_grad.py - # compile coverage results - python -m coverage report - python -m coverage xml - # upload to codecov - ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ - --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure - condition: eq(variables['testing'], 'main') - timeoutInMinutes: "40" - displayName: "Testing: main" - - - bash: | - set -ex - # these test need to run in single thread as they occurs with CUDA OOM - coverage run --source thunder -m \ - pytest \ - thunder/tests/test_networks.py \ - -m "not standalone" \ - -v --durations=0 \ - --random-order-seed=42 \ - --gpu-mem-limit=6 \ - --numprocesses=3 - # compile coverage results - python -m coverage report - python -m coverage xml - # upload to codecov - ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ - --flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure - condition: eq(variables['testing'], 'main') - timeoutInMinutes: "15" - displayName: "Testing: networks" - - - bash: | - set -ex - coverage run --source thunder -m \ - pytest thunder/tests/test_ops.py \ - -m "not standalone" \ - -v --datefmt="%Y%m%d-%H:%M:%S.%f" \ - --random-order-seed=42 \ - --durations=250 \ - --timeout=240 \ - --gpu-mem-limit=2 \ - --numprocesses=9 - # compile coverage results - python -m coverage report - python -m coverage xml - # upload to codecov - ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ - --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure - condition: eq(variables['testing'], 'ops') - env: - CUDA_LAUNCH_BLOCKING: "1" - timeoutInMinutes: "40" - displayName: "Testing: ops" - - - bash: | - set -ex - coverage run --source thunder -m \ - pytest thunder/tests/test_grad.py \ - -m "not standalone" \ - -v --datefmt="%Y%m%d-%H:%M:%S.%f" \ - --random-order-seed=42 \ - --durations=250 \ - --timeout=240 \ - --gpu-mem-limit=2 \ - --numprocesses=9 - # compile coverage results - python -m coverage report - python -m coverage xml - # upload to codecov - ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ - --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure - condition: eq(variables['testing'], 'grads') - env: - CUDA_LAUNCH_BLOCKING: "1" - timeoutInMinutes: "35" - displayName: "Testing: grads" - - - bash: | - set -ex - # run all found tests in given past as standalone - pytest \ - thunder/tests/distributed \ - -v --durations=0 \ - --random-order-seed=42 - # compile coverage results - # TODO: collect and merge reports - # python -m coverage report - # python -m coverage xml - # # upload to codecov - # ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ - # --flags=gpu,pytest,distributed --name="GPU-coverage" --env=linux,azure - condition: eq(variables['testing'], 'distributed') - timeoutInMinutes: "30" - displayName: "Testing: distributed" diff --git a/.azure/notebook-runs.yml b/.azure/notebook-runs.yml deleted file mode 100644 index b0737f46c4..0000000000 --- a/.azure/notebook-runs.yml +++ /dev/null @@ -1,81 +0,0 @@ -trigger: - tags: - include: ["*"] - branches: - include: - - "main" - - "release/*" - - "refs/tags/*" - -pr: - branches: - include: ["*"] - -jobs: - - job: jupyter - strategy: - matrix: - "notebooks w/ torch 2.8": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_2.8.0-dev" - "notebooks w/ torch-nightly": - docker-image: "ubuntu24.04-cuda12.8.1-cudnn-fe1.15.0-py3.12-pt_main-dev" - # how long to run the job before automatically cancelling - timeoutInMinutes: "45" - # how much time to give 'run always even if cancelled tasks' before stopping them - cancelTimeoutInMinutes: "2" - pool: "lit-rtx-3090" - variables: - DEVICES: $( python -c 'name = "$(Agent.Name)" ; gpus = name.split("_")[-1] if "_" in name else "0"; print(gpus)' ) - TORCH_HOME: "/var/tmp/torch" - PIP_CACHE_DIR: "/var/tmp/pip" - container: - image: "pytorchlightning/lightning-thunder:$(docker-image)" - options: "--gpus=all --shm-size=16g -v /var/tmp:/var/tmp" - workspace: - clean: all - steps: - - bash: | - echo $(DEVICES) - lspci | egrep 'VGA|3D' - whereis nvidia - nvidia-smi - which python && which pip - python --version - pip --version - pip list - echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" - displayName: "Image info & NVIDIA" - - - bash: | - set -ex - # drop pt from requirements so not to interfere with the existing one - bash scripts/remove-torch-lines.sh requirements/base.txt - cat requirements/base.txt - pip install -U -r requirements/notebooks.txt - # install this package - pip install -e . - # double check on test requirements - echo "Install special requirements for notebooks" - displayName: "Install package & ..." - - - bash: | - set -ex - pip list - bash scripts/sanity-check.sh - displayName: "Sanity check / details" - - - bash: | - set -ex - # list all notebooks in this folder - find . -name "*.ipynb" > all.txt - # drop all "./" from beginning of each line - sed -i 's/^\.\///' all.txt - # filter out the ones that are listed in .ignore.ci - grep -Fxv -f .ignore.ci all.txt > ci.txt - # iterate over all listed notebooks and execute them with jupyter - while read -r line; do - echo "Processing $line" - jupyter execute $line --timeout=300 - done <<< $(cat ci.txt) - workingDirectory: "notebooks/" - displayName: "Execute notebooks" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index cd5f35e6b9..59734937c5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -9,7 +9,6 @@ * @mruberry @lantiga @KaelanDt # CI/CD and configs -/.azure/ @KaelanDt @lantiga /.github/ @KaelanDt @lantiga /.lightning/ @KaelanDt @lantiga /dockers/ @KaelanDt @lantiga diff --git a/.github/labeling-config.yml b/.github/labeling-config.yml index f1364b6394..842b992236 100644 --- a/.github/labeling-config.yml +++ b/.github/labeling-config.yml @@ -7,7 +7,6 @@ documentation: "ci": - changed-files: - any-glob-to-any-file: - - .azure/* - .github/* - .github/workflows/* - .lightning/workflows/* @@ -17,7 +16,6 @@ documentation: - changed-files: - any-glob-to-any-file: - dockers/**/* - - .azure/docker-build.yml "install": - changed-files: diff --git a/.github/workflows/ci-checks.yml b/.github/workflows/ci-checks.yml index d847e7c2ed..c25d845ea9 100644 --- a/.github/workflows/ci-checks.yml +++ b/.github/workflows/ci-checks.yml @@ -12,8 +12,6 @@ concurrency: jobs: check-schema: uses: Lightning-AI/utilities/.github/workflows/check-schema.yml@main - with: - azure-dir: ".azure" check-package: uses: Lightning-AI/utilities/.github/workflows/check-package.yml@main