diff --git a/.github/workflows/uccl-build-test-gb10.yml b/.github/workflows/uccl-build-test-gb10.yml index 34bad3c98..d6ab9fc82 100644 --- a/.github/workflows/uccl-build-test-gb10.yml +++ b/.github/workflows/uccl-build-test-gb10.yml @@ -80,10 +80,12 @@ jobs: ./build.sh cu13 ep 3.13 --install 2>&1 | tee build.log + pushd /tmp if ! python -c 'import torch; import uccl.ep'; then echo 'Import of torch and uccl.ep failed.' exit 1 fi + popd echo 'Build and Verification Successful!' EOF diff --git a/MANIFEST.in b/MANIFEST.in index 05c629b7d..821041ec8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,6 @@ include uccl/__init__.py include uccl/lib/*.so -include uccl/*.so \ No newline at end of file +include uccl/*.so +recursive-include ep/python/uccl_ep *.py *.so +recursive-include ep/deep_ep_wrapper/deep_ep *.py +include build_native.sh diff --git a/README.md b/README.md index 6fbc8caaa..db980181c 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,15 @@ git clone https://github.com/uccl-project/uccl.git && cd uccl # Eg, bash build.sh cu12 ep --install bash build.sh [cu12|cu13|roc7|roc6|therock] [all|ccl_rdma|ccl_efa|p2p|ep] \ [py_version] [rocm_index_url] --install + +# Install from source. Use the torch package from your current environment so +# CUDA/ROCm backend detection matches the target platform. +pip install nanobind +pip install . --no-build-isolation +# or (legacy, may be removed in newer setuptools): +python3 setup.py install +# or development, install in editable mode instead: +pip install -e . --no-build-isolation ``` > Note: > - By default, `build.sh cu12` targets CUDA 12.8 and `build.sh roc7` targets ROCm 7.1, but you can also specify `cu13|roc6` to target CUDA 13.0 or ROCm 6.4. 
diff --git a/build.sh b/build.sh index b4b95a4e2..929c1c4aa 100755 --- a/build.sh +++ b/build.sh @@ -341,12 +341,14 @@ if [[ "${SKIP_DOCKER_BUILD:-0}" != "1" ]]; then if [[ "$ARCH" == "aarch64" ]]; then ${CONTAINER_ENGINE} build \ + --network=host \ --platform=linux/arm64 \ $BUILD_ARGS \ -t "$IMAGE_NAME" \ -f "$DOCKERFILE" . else ${CONTAINER_ENGINE} build \ + --network=host \ $BUILD_ARGS \ -t "$IMAGE_NAME" \ -f "$DOCKERFILE" . diff --git a/build_inner.sh b/build_inner.sh index 6b3df2e04..ea263364a 100755 --- a/build_inner.sh +++ b/build_inner.sh @@ -5,6 +5,14 @@ # Invoked by build.sh via docker/podman/apptainer; not intended for direct # execution on the host. # +# Responsibilities (packaging only): +# * Drive ``python -m build`` to invoke setup.py -> ShellBuildExtension, +# which delegates the actual native compilation to ``build_native.sh``. +# * Repair / retag / rename the resulting wheel via auditwheel. +# +# Native compilation lives in ``build_native.sh``; this script does not call +# ``make`` directly. +# # Environment variables consumed (set by build.sh before container launch): # # Required: @@ -21,7 +29,7 @@ # UCCL_RETAG_TO_HOST_GLIBC Retag wheel to host glibc version (default "0") # UCCL_LOCAL_VERSION Local version suffix appended to wheel filename (PEP 440) # -# Build feature flags: +# Build feature flags (forwarded to build_native.sh): # USE_DIETGPU Enable DietGPU compression (default "0") # USE_INTEL_RDMA_NIC Enable Intel RDMA NIC / irdma driver (default "0") # PER_EXPERT_BATCHING Enable per-expert batching (default "0") @@ -31,210 +39,6 @@ set -euo pipefail -######################################################## -# Build helper functions -######################################################## - -# Rename cpython-versioned .so files to .abi3.so for stable ABI compatibility. -# Only applies on Python >= 3.12 where nanobind stable ABI is enabled. 
-rename_to_abi3() { - local dir="$1" - local py_stable_abi_ok - py_stable_abi_ok=$(python3 -c "import sys; print(1 if sys.version_info >= (3, 12) else 0)") - if [[ "$py_stable_abi_ok" != "1" ]]; then - echo "Python < 3.12 detected, skipping abi3 rename (nanobind stable ABI not supported)" - return - fi - for f in "$dir"/*.cpython-*.so; do - if [[ -f "$f" ]]; then - local newname - newname=$(echo "$f" | sed 's/\.cpython-[^.]*-[^.]*-[^.]*\.so/.abi3.so/') - echo "Renaming $(basename "$f") -> $(basename "$newname")" - mv "$f" "$newname" - fi - done -} - -build_rccl_nccl_header() { - # Unlike CUDA, ROCM does not include nccl.h. So we need to build rccl to get nccl.h. - if [[ ! -f "thirdparty/rccl/build/release/include/nccl.h" ]]; then - cd thirdparty/rccl - # Just to get nccl.h, not the whole library - CXX=/opt/rocm/bin/hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF >/dev/null 2>&1 || true - cd ../.. - fi -} - -build_ccl_rdma() { - local TARGET="$1" - local ARCH="$2" - local IS_EFA="$3" - - set -euo pipefail - echo "[container] build_ccl_rdma Target: $TARGET" - - if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then - echo "[container] Building with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" - fi - - if [[ "$TARGET" == cu* ]]; then - cd collective/rdma && make clean && make -j$(nproc) USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} && cd ../../ - TARGET_SO=collective/rdma/libnccl-net-uccl.so - elif [[ "$TARGET" == roc[67] ]]; then - if [[ "$ARCH" == "aarch64" ]]; then - echo "Skipping ROCm build on Arm64 (no ROCm toolchain)." - return - fi - cd collective/rdma && make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm && cd ../../ - TARGET_SO=collective/rdma/librccl-net-uccl.so - elif [[ "$TARGET" == "therock" ]]; then - if [[ "$ARCH" == "aarch64" ]]; then - echo "Skipping ROCm build on Arm64 (no ROCm toolchain)." - return - fi - # Unlike CUDA, ROCM does not include nccl.h. So we need to build rccl to get nccl.h. - if [[ ! 
-f "thirdparty/rccl/build/release/include/nccl.h" ]]; then - cd thirdparty/rccl - # Just to get nccl.h, not the whole library - CXX=hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF -DCMAKE_PREFIX_PATH=$(rocm-sdk path --cmake) -DROCM_PATH=$(rocm-sdk path --root) -DHIP_PLATFORM=amd >/dev/null 2>&1 || true - cd ../.. - fi - cd collective/rdma && make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib && cd ../../ - TARGET_SO=collective/rdma/librccl-net-uccl.so - fi - - echo "[container] Copying RDMA .so to uccl/lib/" - mkdir -p uccl/lib - cp ${TARGET_SO} uccl/lib/ -} - -build_ccl_efa() { - local TARGET="$1" - local ARCH="$2" - local IS_EFA="$3" - - set -euo pipefail - echo "[container] build_ccl_efa Target: $TARGET" - - if [[ "$ARCH" == "aarch64" || "$TARGET" == roc[67] || "$TARGET" == "therock" ]]; then - echo "Skipping EFA build on Arm64 (no EFA installer) or ROCm (no CUDA)." - return - fi - - if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then - echo "[container] Building with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" - fi - - cd collective/efa && make clean && make -j$(nproc) USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} && cd ../../ - - # EFA requires a custom NCCL. - cd thirdparty/nccl-sg - make src.build -j$(nproc) NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90" USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} - cd ../.. 
- - echo "[container] Copying EFA .so to uccl/lib/" - mkdir -p uccl/lib - cp collective/efa/libnccl-net-efa.so uccl/lib/ - cp thirdparty/nccl-sg/build/lib/libnccl.so uccl/lib/libnccl-efa.so -} - -build_p2p() { - local TARGET="$1" - local ARCH="$2" - local IS_EFA="$3" - - set -euo pipefail - echo "[container] build_p2p Target: $TARGET" - - if [[ "${USE_DIETGPU:-0}" == "1" ]]; then - cd thirdparty/dietgpu - if [[ "$TARGET" == cu* ]]; then - cd dietgpu/float - CUDA_GPU_ARCH="sm_$(echo "${TORCH_CUDA_ARCH_LIST:-9.0}" | awk '{print $1}' | sed 's/+PTX//; s/\.//')" - echo "Building dietgpu float for CUDA: $CUDA_GPU_ARCH" - make clean -f Makefile.cuda && make -j$(nproc) -f Makefile.cuda GPU_ARCH=$CUDA_GPU_ARCH - else - rm -rf build/ - python3 setup.py build - cd dietgpu/float - echo $TORCH_CUDA_ARCH_LIST - make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm GPU_ARCH=$TORCH_CUDA_ARCH_LIST - fi - cd ../../../.. - cp thirdparty/dietgpu/dietgpu/float/libdietgpu_float.so uccl/lib - fi - - cd p2p - if [[ "$TARGET" == cu* ]]; then - make clean && make -j$(nproc) - elif [[ "$TARGET" == roc[67] ]]; then - make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm - elif [[ "$TARGET" == "therock" ]]; then - make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib - fi - cd .. 
- - echo "[container] Copying P2P .so, collective.py and utils.py to uccl/" - mkdir -p uccl - mkdir -p uccl/lib - cp p2p/libuccl_p2p.so uccl/lib/ - cp p2p/p2p.*.so uccl/ - cp p2p/collective.py uccl/ - cp p2p/utils.py uccl/ - rename_to_abi3 uccl -} - -build_ep() { - local TARGET="$1" - local ARCH="$2" - local IS_EFA="$3" - - set -euo pipefail - echo "[container] build_ep Target: $TARGET" - - if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then - echo "[container] Building EP with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" - fi - - if [[ "$TARGET" == "roc6" ]]; then - echo "ERROR: EP requires roc7 (ROCm 7) for HIP code transformation; roc6 is not supported." >&2 - exit 1 - elif [[ "$TARGET" == "therock" ]]; then - echo "Skipping GPU-driven build on therock (no GPU-driven support yet)." - elif [[ "$TARGET" == roc[67] || "$TARGET" == cu* ]]; then - cd ep - # This may be needed if you traverse through different git commits - # make clean && rm -r build || true - USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} python3 setup.py build - cd .. - echo "[container] Copying GPU-driven .so to uccl/" - mkdir -p uccl/lib - cp ep/build/**/*.so uccl/ - fi - rename_to_abi3 uccl -} - -build_ukernel() { - local TARGET="$1" - local ARCH="$2" - local IS_EFA="$3" - - set -euo pipefail - echo "[container] build_ukernel Target: $TARGET" - - cd experimental/ukernel - if [[ "$TARGET" == cu* ]]; then - make clean -f Makefile && make -j$(nproc) -f Makefile - elif [[ "$TARGET" == roc[67] ]]; then - make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm - fi - cd ../.. 
- - echo "[container] Copying ukernel .so to uccl/" - mkdir -p uccl/lib # mkdir anyway - cp experimental/ukernel/*ukernel*.so uccl/lib -} - ######################################################## # Main build logic ######################################################## @@ -249,32 +53,6 @@ if [[ "$TARGET" == "therock" ]]; then pip3 install --no-cache-dir rocm[libraries,devel] --index-url ${ROCM_IDX_URL} fi -if [[ "$TARGET" == roc[67] ]]; then - build_rccl_nccl_header -fi - -if [[ "$BUILD_TYPE" == "ccl_rdma" ]]; then - build_ccl_rdma "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "ccl_efa" ]]; then - build_ccl_efa "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "p2p" ]]; then - build_p2p "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "ep" ]]; then - build_ep "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "p2p_ep" ]]; then - build_p2p "$TARGET" "$ARCH" "$IS_EFA" - build_ep "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "ukernel" ]]; then - build_ukernel "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "all" ]]; then - if [[ -n "$IS_EFA" ]]; then - build_ccl_efa "$TARGET" "$ARCH" "$IS_EFA" - else - build_ccl_rdma "$TARGET" "$ARCH" "$IS_EFA" - fi - build_p2p "$TARGET" "$ARCH" "$IS_EFA" - build_ep "$TARGET" "$ARCH" "$IS_EFA" -fi if [[ "$TARGET" == "therock" ]]; then echo " @@ -304,10 +82,11 @@ def initialize(): export PIP_EXTRA_INDEX_URL=${ROCM_IDX_URL} fi -ls -lh uccl/ -ls -lh uccl/lib/ -python3 -m build +# All native build logic lives in ``build_native.sh``, driven by setup.py's +# ShellBuildExtension. ``--no-isolation`` reuses the container's setuptools/wheel. +python3 -m build --wheel --no-isolation +# Restore the original setup.py if we patched it. 
if [[ "$TARGET" == "therock" ]]; then mv ${BACKUP_FN} setup.py fi diff --git a/build_native.sh b/build_native.sh new file mode 100755 index 000000000..7d71872e3 --- /dev/null +++ b/build_native.sh @@ -0,0 +1,337 @@ +#!/bin/bash + +# ----------------------- +# build_native.sh — compile uccl native modules. +# +# Replaces the top-level Makefile. Invoked by setup.py's +# ShellBuildExtension; can also be run standalone for in-tree builds. +# The compile logic is lifted (almost verbatim) from build_inner.sh's +# pre-Makefile incarnation. +# +# Output layout: +# ${UCCL_PY_DIR}/lib/ library .so / .a files (libnccl-net-uccl.so, ...) +# ${UCCL_PY_DIR}/ p2p*.so + collective.py + utils.py +# ${UCCL_EP_DIR}/ ep_cpp*.so (derived from UCCL_PY_DIR, see below) +# +# By default the script writes in-tree (UCCL_PY_DIR=./uccl), matching the +# source layout that the editable install relies on. setup.py overrides +# UCCL_PY_DIR to point at the wheel-staging dir (build_lib/uccl) for +# non-inplace builds. The matching ``uccl.ep`` target is derived +# automatically: +# * source layout (UCCL_PY_DIR == /uccl) -> ./ep/python/uccl_ep +# (matches package_dir) +# * any other UCCL_PY_DIR (wheel staging) -> ${UCCL_PY_DIR}/ep +# so the same script serves all three modes (editable / install / wheel) +# with a single env var from setup.py. +# +# Usage: +# ./build_native.sh [BUILD_TYPE] +# BUILD_TYPE := all (default) | ccl_rdma | ccl_efa | p2p | ep | p2p_ep | ukernel | clean +# When no positional argument is supplied the value of $BUILD_TYPE is used. 
+# +# Environment variables consumed: +# TARGET Build target: cu12, cu13, roc7, roc6, therock (default cu12) +# ARCH Host architecture: x86_64 or aarch64 (default $(uname -m)) +# IS_EFA Non-empty when EFA is detected (swaps ccl_rdma -> ccl_efa) +# BUILD_TYPE Default value when no positional arg is given (default ``all``) +# +# Output staging (set by setup.py during wheel/install builds): +# UCCL_PY_DIR Target dir for the ``uccl`` package (default ./uccl) +# UCCL_EP_DIR Target dir for the ``uccl.ep`` package +# (auto-derived from UCCL_PY_DIR; override only +# if you really know what you're doing) +# +# Feature flags: +# USE_DIETGPU Enable DietGPU compression (default 0) +# USE_INTEL_RDMA_NIC Enable Intel RDMA NIC / irdma driver (default 0) +# TORCH_CUDA_ARCH_LIST CUDA compute capabilities for dietgpu (default 9.0) +# ----------------------- + +set -euo pipefail + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${PROJECT_ROOT}" + +TARGET="${TARGET:-cu12}" +ARCH="${ARCH:-$(uname -m)}" +IS_EFA="${IS_EFA:-}" +BUILD_TYPE="${BUILD_TYPE:-all}" + + +UCCL_PY_DIR="${UCCL_PY_DIR:-${PROJECT_ROOT}/uccl}" + +if [[ -z "${UCCL_EP_DIR:-}" ]]; then + if [[ "$(realpath -m "${UCCL_PY_DIR}")" == "$(realpath -m "${PROJECT_ROOT}/uccl")" ]]; then + # for build mode + UCCL_EP_DIR="${PROJECT_ROOT}/ep/python/uccl_ep" + else + # for install/wheel mode + UCCL_EP_DIR="${UCCL_PY_DIR}/ep" + fi +fi +UCCL_LIB_DIR="${UCCL_PY_DIR}/lib" + +# Positional argument overrides BUILD_TYPE. +if [[ $# -gt 0 ]]; then + BUILD_TYPE="$1" +fi + +mkdir -p "${UCCL_PY_DIR}" "${UCCL_LIB_DIR}" "${UCCL_EP_DIR}" + +######################################################## +# Build helper functions +######################################################## + +# Rename cpython-versioned .so files to .abi3.so for stable ABI compatibility. +# Only applies on Python >= 3.12 where nanobind stable ABI is enabled. 
+rename_to_abi3() { + local dir="$1" + local py_stable_abi_ok + py_stable_abi_ok=$(python3 -c "import sys; print(1 if sys.version_info >= (3, 12) else 0)") + if [[ "$py_stable_abi_ok" != "1" ]]; then + echo "Python < 3.12 detected, skipping abi3 rename (nanobind stable ABI not supported)" + return + fi + for f in "$dir"/*.cpython-*.so; do + if [[ -f "$f" ]]; then + local newname + newname=$(echo "$f" | sed 's/\.cpython-[^.]*-[^.]*-[^.]*\.so/.abi3.so/') + echo "Renaming $(basename "$f") -> $(basename "$newname")" + mv "$f" "$newname" + fi + done +} + +build_rccl_nccl_header() { + # Unlike CUDA, ROCM does not include nccl.h. So we need to build rccl to get nccl.h. + if [[ ! -f "thirdparty/rccl/build/release/include/nccl.h" ]]; then + cd thirdparty/rccl + # Just to get nccl.h, not the whole library + CXX=/opt/rocm/bin/hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF >/dev/null 2>&1 || true + cd ../.. + fi +} + +build_ccl_rdma() { + local TARGET="$1" + local ARCH="$2" + local IS_EFA="$3" + + set -euo pipefail + echo "[container] build_ccl_rdma Target: $TARGET" + + if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then + echo "[container] Building with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" + fi + + if [[ "$TARGET" == cu* ]]; then + cd collective/rdma && make clean && make -j$(nproc) USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} && cd ../../ + TARGET_SO=collective/rdma/libnccl-net-uccl.so + elif [[ "$TARGET" == roc[67] ]]; then + if [[ "$ARCH" == "aarch64" ]]; then + echo "Skipping ROCm build on Arm64 (no ROCm toolchain)." + return + fi + cd collective/rdma && make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm && cd ../../ + TARGET_SO=collective/rdma/librccl-net-uccl.so + elif [[ "$TARGET" == "therock" ]]; then + if [[ "$ARCH" == "aarch64" ]]; then + echo "Skipping ROCm build on Arm64 (no ROCm toolchain)." + return + fi + # Unlike CUDA, ROCM does not include nccl.h. So we need to build rccl to get nccl.h. + if [[ ! 
-f "thirdparty/rccl/build/release/include/nccl.h" ]]; then + cd thirdparty/rccl + # Just to get nccl.h, not the whole library + CXX=hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF -DCMAKE_PREFIX_PATH=$(rocm-sdk path --cmake) -DROCM_PATH=$(rocm-sdk path --root) -DHIP_PLATFORM=amd >/dev/null 2>&1 || true + cd ../.. + fi + cd collective/rdma && make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib && cd ../../ + TARGET_SO=collective/rdma/librccl-net-uccl.so + fi + + echo "[container] Copying RDMA .so to ${UCCL_LIB_DIR}" + mkdir -p "${UCCL_LIB_DIR}" + cp ${TARGET_SO} "${UCCL_LIB_DIR}/" +} + +build_ccl_efa() { + local TARGET="$1" + local ARCH="$2" + local IS_EFA="$3" + + set -euo pipefail + echo "[container] build_ccl_efa Target: $TARGET" + + if [[ "$ARCH" == "aarch64" || "$TARGET" == roc[67] || "$TARGET" == "therock" ]]; then + echo "Skipping EFA build on Arm64 (no EFA installer) or ROCm (no CUDA)." + return + fi + + if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then + echo "[container] Building with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" + fi + + cd collective/efa && make clean && make -j$(nproc) USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} && cd ../../ + + # EFA requires a custom NCCL. + cd thirdparty/nccl-sg + make src.build -j$(nproc) NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90" USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} + cd ../.. 
+ + echo "[container] Copying EFA .so to ${UCCL_LIB_DIR}" + mkdir -p "${UCCL_LIB_DIR}" + cp collective/efa/libnccl-net-efa.so "${UCCL_LIB_DIR}/" + cp thirdparty/nccl-sg/build/lib/libnccl.so "${UCCL_LIB_DIR}/libnccl-efa.so" +} + +build_p2p() { + local TARGET="$1" + local ARCH="$2" + local IS_EFA="$3" + + set -euo pipefail + echo "[container] build_p2p Target: $TARGET" + + if [[ "${USE_DIETGPU:-0}" == "1" ]]; then + cd thirdparty/dietgpu + if [[ "$TARGET" == cu* ]]; then + cd dietgpu/float + CUDA_GPU_ARCH="sm_$(echo "${TORCH_CUDA_ARCH_LIST:-9.0}" | awk '{print $1}' | sed 's/+PTX//; s/\.//')" + echo "Building dietgpu float for CUDA: $CUDA_GPU_ARCH" + make clean -f Makefile.cuda && make -j$(nproc) -f Makefile.cuda GPU_ARCH=$CUDA_GPU_ARCH + else + rm -rf build/ + python3 setup.py build + cd dietgpu/float + echo $TORCH_CUDA_ARCH_LIST + make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm GPU_ARCH=$TORCH_CUDA_ARCH_LIST + fi + cd ../../../.. + cp thirdparty/dietgpu/dietgpu/float/libdietgpu_float.so "${UCCL_LIB_DIR}/" + fi + + cd p2p + if [[ "$TARGET" == cu* ]]; then + make clean && make -j$(nproc) + elif [[ "$TARGET" == roc[67] ]]; then + make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm + elif [[ "$TARGET" == "therock" ]]; then + make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib + fi + cd .. 
+ + echo "[container] Copying P2P .so, collective.py and utils.py to ${UCCL_PY_DIR}" + mkdir -p "${UCCL_PY_DIR}" "${UCCL_LIB_DIR}" + cp p2p/libuccl_p2p.so "${UCCL_LIB_DIR}/" + cp p2p/p2p.*.so "${UCCL_PY_DIR}/" + cp p2p/collective.py "${UCCL_PY_DIR}/" + cp p2p/utils.py "${UCCL_PY_DIR}/" + rename_to_abi3 "${UCCL_PY_DIR}" +} + +build_ep() { + local TARGET="$1" + local ARCH="$2" + local IS_EFA="$3" + + set -euo pipefail + echo "[container] build_ep Target: $TARGET" + + if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then + echo "[container] Building EP with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" + fi + + if [[ "$TARGET" == "roc6" ]]; then + echo "ERROR: EP requires roc7 (ROCm 7) for HIP code transformation; roc6 is not supported." >&2 + exit 1 + elif [[ "$TARGET" == "therock" ]]; then + echo "Skipping GPU-driven build on therock (no GPU-driven support yet)." + elif [[ "$TARGET" == roc[67] || "$TARGET" == cu* ]]; then + cd ep + # This may be needed if you traverse through different git commits + # make clean && rm -r build || true + USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} python3 setup.py build + cd .. + echo "[container] Copying GPU-driven .so to ${UCCL_EP_DIR}" + mkdir -p "${UCCL_EP_DIR}" + cp ep/build/**/*.so "${UCCL_EP_DIR}/" + fi + rename_to_abi3 "${UCCL_EP_DIR}" +} + +build_ukernel() { + local TARGET="$1" + local ARCH="$2" + local IS_EFA="$3" + + set -euo pipefail + echo "[container] build_ukernel Target: $TARGET" + + cd experimental/ukernel + if [[ "$TARGET" == cu* ]]; then + make clean -f Makefile && make -j$(nproc) -f Makefile + elif [[ "$TARGET" == roc[67] ]]; then + make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm + fi + cd ../.. + + echo "[container] Copying ukernel .so to ${UCCL_LIB_DIR}" + mkdir -p "${UCCL_LIB_DIR}" + cp experimental/ukernel/*ukernel*.so "${UCCL_LIB_DIR}/" +} + +clean_all() { + # Best-effort clean of every per-module Makefile flavour; ignore missing + # files / missing toolchains. 
+ for f in Makefile Makefile.rocm Makefile.therock; do + make -C collective/rdma -f "$f" clean 2>/dev/null || true + make -C p2p -f "$f" clean 2>/dev/null || true + make -C experimental/ukernel -f "$f" clean 2>/dev/null || true + done + make -C collective/efa clean 2>/dev/null || true + rm -rf ep/build + rm -f "${UCCL_LIB_DIR}"/*.so "${UCCL_LIB_DIR}"/*.a + rm -f "${UCCL_PY_DIR}"/p2p*.so + rm -f "${UCCL_PY_DIR}"/collective.py "${UCCL_PY_DIR}"/utils.py + rm -f "${UCCL_EP_DIR}"/*.so +} + +######################################################## +# Main build logic +######################################################## + +if [[ "$BUILD_TYPE" == "clean" ]]; then + clean_all + exit 0 +fi + +if [[ "$TARGET" == roc[67] ]]; then + build_rccl_nccl_header +fi + +if [[ "$BUILD_TYPE" == "ccl_rdma" ]]; then + build_ccl_rdma "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "ccl_efa" ]]; then + build_ccl_efa "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "p2p" ]]; then + build_p2p "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "ep" ]]; then + build_ep "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "p2p_ep" ]]; then + build_p2p "$TARGET" "$ARCH" "$IS_EFA" + build_ep "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "ukernel" ]]; then + build_ukernel "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "all" ]]; then + if [[ -n "$IS_EFA" ]]; then + build_ccl_efa "$TARGET" "$ARCH" "$IS_EFA" + else + build_ccl_rdma "$TARGET" "$ARCH" "$IS_EFA" + fi + build_p2p "$TARGET" "$ARCH" "$IS_EFA" + build_ep "$TARGET" "$ARCH" "$IS_EFA" +else + echo "build_native: unknown BUILD_TYPE '$BUILD_TYPE'" >&2 + exit 1 +fi diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda index e45808620..1a99c4268 100644 --- a/docker/Dockerfile.cuda +++ b/docker/Dockerfile.cuda @@ -46,7 +46,7 @@ RUN CUDA_MAJOR=$(nvcc --version | grep -oP 'release \K[0-9]+') && \ # ───────────────────────────────────────────────────────── # Install Python build back-end (for Python ${PY_VER}) # 
───────────────────────────────────────────────────────── -RUN python${PY_VER} -m pip install --no-cache-dir build auditwheel pybind11 nanobind +RUN python${PY_VER} -m pip install --no-cache-dir auditwheel build wheel pybind11 nanobind RUN python${PY_VER} -m pip install --no-cache-dir --upgrade setuptools # ───── Set Python ${PY_VER} as default python3 and python3-config ───── diff --git a/docker/Dockerfile.efa b/docker/Dockerfile.efa index 0beecad1c..81ea2ca78 100644 --- a/docker/Dockerfile.efa +++ b/docker/Dockerfile.efa @@ -70,7 +70,7 @@ RUN CUDA_MAJOR=$(nvcc --version | grep -oP 'release \K[0-9]+') && \ # ───────────────────────────────────────────────────────── # Install Python build back-end (for Python ${PY_VER}) # ───────────────────────────────────────────────────────── -RUN python${PY_VER} -m pip install --no-cache-dir build auditwheel pybind11 nanobind +RUN python${PY_VER} -m pip install --no-cache-dir auditwheel build wheel pybind11 nanobind RUN python${PY_VER} -m pip install --no-cache-dir --upgrade setuptools # ───── Set Python ${PY_VER} as default python3 and python3-config ───── diff --git a/docker/Dockerfile.gh b/docker/Dockerfile.gh index c26538343..8aa5b55b0 100644 --- a/docker/Dockerfile.gh +++ b/docker/Dockerfile.gh @@ -44,7 +44,7 @@ RUN CUDA_MAJOR=$(nvcc --version | grep -oP 'release \K[0-9]+') && \ # ───────────────────────────────────────────────────────── # Install Python build back-end (for Python ${PY_VER}) # ───────────────────────────────────────────────────────── -RUN python${PY_VER} -m pip install --no-cache-dir build auditwheel pybind11 nanobind +RUN python${PY_VER} -m pip install --no-cache-dir auditwheel build wheel pybind11 nanobind RUN python${PY_VER} -m pip install --no-cache-dir --upgrade setuptools # ───── Set Python ${PY_VER} as default python3 and python3-config ───── diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 657212e3a..cc10f92ea 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ 
-42,7 +42,7 @@ RUN python${PY_VER} -m pip install --no-cache-dir --pre torch torchvision \ # ───────────────────────────────────────────────────────── # Install Python build back-end (for Python ${PY_VER}) # ───────────────────────────────────────────────────────── -RUN python${PY_VER} -m pip install --no-cache-dir build auditwheel pybind11 nanobind +RUN python${PY_VER} -m pip install --no-cache-dir auditwheel build wheel pybind11 nanobind RUN python${PY_VER} -m pip install --no-cache-dir --upgrade setuptools diff --git a/ep/Makefile b/ep/Makefile index 6754b4668..9f5b8b9a7 100644 --- a/ep/Makefile +++ b/ep/Makefile @@ -98,8 +98,8 @@ OBJ_CU := $(SRC_CU:.cu=.o) SRC_BIND := src/uccl_ep.cc OBJ_BIND := $(SRC_BIND:.cc=.o) -EP_EXT := ep$(EXT_SUFFIX) -PYTARGET := ep$(EXT_SUFFIX) +EP_EXT := ep_cpp$(EXT_SUFFIX) +PYTARGET := ep_cpp$(EXT_SUFFIX) .PHONY: all py clean @@ -131,17 +131,22 @@ $(EP_EXT): $(OBJ_CPP) $(OBJ_CU) $(OBJ_BIND) $(NB_OBJECTS) py: $(PYTARGET) $(EP_EXT) # Repo uccl package dir (so in-repo runs use the same build) -UCCL_UCCL := $(abspath $(CURDIR)/../uccl/uccl) +UCCL_UCCL := $(abspath $(CURDIR)/../uccl) + +EP_PYTHON_DIR := $(CURDIR)/python/uccl_ep install: $(EP_EXT) - @mkdir -p $(INSTALL_DIR) - @rm -f $(INSTALL_DIR)/ep.cpython-*.so $(INSTALL_DIR)/ep.abi3.so - @cp $(EP_EXT) $(INSTALL_DIR)/ - @echo "Installed $(EP_EXT) -> $(INSTALL_DIR)/" + @mkdir -p $(INSTALL_DIR)/ep + @rm -f $(INSTALL_DIR)/ep/ep_cpp.cpython-*.so $(INSTALL_DIR)/ep/ep_cpp.abi3.so + @cp $(EP_EXT) $(INSTALL_DIR)/ep/ + @cp $(EP_PYTHON_DIR)/*.py $(INSTALL_DIR)/ep/ + @echo "Installed uccl.ep (.so + .py) -> $(INSTALL_DIR)/ep/" @if [ -d "$(UCCL_UCCL)" ]; then \ - rm -f "$(UCCL_UCCL)"/ep.cpython-*.so "$(UCCL_UCCL)"/ep.abi3.so; \ - cp $(EP_EXT) "$(UCCL_UCCL)/"; \ - echo "Installed $(EP_EXT) -> $(UCCL_UCCL)/ (for in-repo runs)"; \ + mkdir -p "$(UCCL_UCCL)/ep"; \ + rm -f "$(UCCL_UCCL)/ep"/ep_cpp.cpython-*.so "$(UCCL_UCCL)/ep"/ep_cpp.abi3.so; \ + cp $(EP_EXT) "$(UCCL_UCCL)/ep/"; \ + cp $(EP_PYTHON_DIR)/*.py 
"$(UCCL_UCCL)/ep/"; \ + echo "Installed uccl.ep (.so + .py) -> $(UCCL_UCCL)/ep/ (for in-repo runs)"; \ fi # Clean all generated files diff --git a/ep/bench/buffer.py b/ep/bench/buffer.py index 9326f1bed..55a49afb9 100644 --- a/ep/bench/buffer.py +++ b/ep/bench/buffer.py @@ -1,38 +1,19 @@ import os from contextlib import nullcontext +from typing import Callable, List, Optional, Tuple, Union + import torch import torch.distributed as dist -from typing import Callable, Tuple, Optional, Union, List - -try: - from uccl import ep -except ImportError as exc: - import sys - - sys.stderr.write("Failed to import uccl.ep\n") - raise - -from uccl.ep import EventHandle, Config -# Support both execution modes: -# 1) As part of the packaged deep_ep_wrapper (symlinked buffer inside a package): uses relative import `.utils`. -# 2) As a standalone benchmark script from the `ep/bench` directory (no package): falls back to plain `utils`. -try: - from .utils import ( - EventOverlap, - check_nvlink_connections, - initialize_uccl, - destroy_uccl, - _fp8_e4m3_dtype, - ) -except ImportError: - from utils import ( - EventOverlap, - check_nvlink_connections, - initialize_uccl, - destroy_uccl, - _fp8_e4m3_dtype, - ) +from uccl.ep import ep_cpp +from uccl.ep import Config, EventHandle +from uccl.ep.utils import ( + EventOverlap, + check_nvlink_connections, + initialize_uccl, + destroy_uccl, + _fp8_e4m3_dtype, +) class Buffer: @@ -93,24 +74,26 @@ def __init__( else: device_index = torch.cuda.current_device() - if hasattr(ep, "get_rdma_buffer"): + if hasattr(ep_cpp, "get_rdma_buffer"): # Allocate outside PyTorch's CUDA allocator so RDMA/IPC sees a raw # cudaMalloc/cudaMallocHost-style allocation instead of a possibly # segmented caching-allocator mapping. 
- scratch_dlpack, rdma_buffer_is_host_allocated = ep.get_rdma_buffer( + scratch_dlpack, rdma_buffer_is_host_allocated = ep_cpp.get_rdma_buffer( num_rdma_bytes, device_index ) self.scratch = torch.utils.dlpack.from_dlpack(scratch_dlpack) else: rdma_buffer_is_host_allocated = False if num_rdma_bytes > 0: - if hasattr(ep, "can_register_rdma_gpu_buffer"): + if hasattr(ep_cpp, "can_register_rdma_gpu_buffer"): rdma_buffer_is_host_allocated = not bool( - ep.can_register_rdma_gpu_buffer(device_index, num_rdma_bytes) + ep_cpp.can_register_rdma_gpu_buffer( + device_index, num_rdma_bytes + ) ) - elif hasattr(ep, "rdma_buffer_should_use_host_alloc"): + elif hasattr(ep_cpp, "rdma_buffer_should_use_host_alloc"): rdma_buffer_is_host_allocated = bool( - ep.rdma_buffer_should_use_host_alloc( + ep_cpp.rdma_buffer_should_use_host_alloc( device_index, num_rdma_bytes ) ) @@ -154,7 +137,7 @@ def __init__( self.low_latency_mode = low_latency_mode self.explicitly_destroy = explicitly_destroy self._next_low_latency_combine_buffer = None - self.runtime = ep.Buffer( + self.runtime = ep_cpp.Buffer( self.rank, self.group_size, num_nvl_bytes, @@ -167,16 +150,12 @@ def __init__( self.runtime.set_rdma_buffer(rdma_buffer_ptr, rdma_buffer_is_host_allocated) # Synchronize device IDs - device_ids = [ - None, - ] * self.group_size + device_ids = [None] * self.group_size local_device_id = self.runtime.get_local_device_id() # print("Before all_gather_object device_ids", local_device_id, flush=True) dist.all_gather_object(device_ids, local_device_id, group) # Synchronize IPC handles - ipc_handles = [ - None, - ] * self.group_size + ipc_handles = [None] * self.group_size local_ipc_handle = self.runtime.get_local_ipc_handle() # print("Before all_gather_object ipc_handles", local_ipc_handle, flush=True) dist.all_gather_object(ipc_handles, local_ipc_handle, group) @@ -217,8 +196,8 @@ def reset_rdma_buffer(self): """ self.runtime.reset_rdma_buffer() - def connect_atomic_buffer(self, proxy: "ep.UcclProxy"): - 
ep.connect_atomic_buffer(proxy, self.runtime) + def connect_atomic_buffer(self, proxy: "ep_cpp.Proxy"): + ep_cpp.connect_atomic_buffer(proxy, self.runtime) def destroy(self): """ @@ -234,7 +213,7 @@ def destroy(self): @staticmethod def is_sm90_compiled(): - return ep.is_sm90_compiled() + return ep_cpp.is_sm90_compiled() @staticmethod def set_num_sms(new_num_sms: int) -> None: @@ -583,7 +562,7 @@ def get_low_latency_rdma_size_hint( Returns: size: the RDMA buffer size recommended. """ - return ep.get_low_latency_rdma_size_hint( + return ep_cpp.get_low_latency_rdma_size_hint( num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts ) diff --git a/ep/bench/rb/Makefile b/ep/bench/rb/Makefile index e3ce43a8d..af7753208 100644 --- a/ep/bench/rb/Makefile +++ b/ep/bench/rb/Makefile @@ -183,8 +183,12 @@ FC_LOAD_LATENCY_TARGET := test_fc_load_latency$(BUILD_SUFFIX) CAS_TARGET := test_cas_throughput$(BUILD_SUFFIX) # EP runtime targets (if EFA is available) +# NOTE: the native extension is named `ep_cpp` so that it can live as +# `uccl.ep.ep_cpp` without shadowing the `uccl.ep` Python package. The +# filename must match the `NB_MODULE(ep_cpp, ...)` declaration in +# ../src/uccl_ep.cc, otherwise Python fails to import it. ifeq ($(HAS_EFA),1) - EP_EXT := ep$(EXT_SUFFIX) + EP_EXT := ep_cpp$(EXT_SUFFIX) endif # Header dependencies @@ -328,9 +332,10 @@ py: $(EP_EXT) @echo "Python extension built: $(EP_EXT)" install: $(EP_EXT) - @mkdir -p $(INSTALL_DIR) - @cp $(EP_EXT) $(INSTALL_DIR)/ - @echo "Installation complete. Module installed to: $(INSTALL_DIR)/$(EP_EXT)" + @mkdir -p $(INSTALL_DIR)/ep + @rm -f $(INSTALL_DIR)/ep/ep_cpp.cpython-*.so $(INSTALL_DIR)/ep/ep_cpp.abi3.so + @cp $(EP_EXT) $(INSTALL_DIR)/ep/ + @echo "Installation complete. Module installed to: $(INSTALL_DIR)/ep/$(EP_EXT)" else $(EP_EXT): @echo "PyTorch not found. Skipping Python extension build." 
diff --git a/ep/bench/rb/benchmark_rdma_fifo.py b/ep/bench/rb/benchmark_rdma_fifo.py index b02eb7772..15bcce2b0 100644 --- a/ep/bench/rb/benchmark_rdma_fifo.py +++ b/ep/bench/rb/benchmark_rdma_fifo.py @@ -24,7 +24,7 @@ import torch.distributed as dist try: - from uccl import ep + from uccl.ep import ep_cpp as ep except ImportError as exc: sys.stderr.write("Failed to import ep\n") raise diff --git a/ep/bench/test_internode.py b/ep/bench/test_internode.py index 5127eac12..b32c02980 100644 --- a/ep/bench/test_internode.py +++ b/ep/bench/test_internode.py @@ -33,7 +33,8 @@ # noinspection PyUnresolvedReferences -from utils import ( + +from uccl.ep.utils import ( # type: ignore[no-redef] init_dist, bench, bench_kineto, @@ -48,7 +49,7 @@ ) # Test compatibility with low latency functions -from buffer import Buffer +from uccl.ep.buffer import Buffer # type: ignore[no-redef] try: from uccl.ep import Config diff --git a/ep/bench/utils.py b/ep/bench/utils.py index 64aae9155..09f8654a8 100644 --- a/ep/bench/utils.py +++ b/ep/bench/utils.py @@ -1,36 +1,21 @@ import inspect -from typing import Any, Optional, Tuple, Union +import glob import os import socket -import torch -import torch.distributed as dist -from typing import Optional -import glob import sys -from uccl.ep import EventHandle -import tempfile +import time import json +import tempfile from pathlib import Path -import time -import numpy as np - -# import deep_ep as ep -try: - from uccl import ep -except ImportError as exc: - import sys +from typing import Any, Optional, Tuple, Union - sys.stderr.write("Failed to import uccl.ep\n") - raise +import numpy as np +import torch +import torch.distributed as dist -# import deep_ep as ep -try: - from uccl import ep -except ImportError as exc: - import sys +from uccl.ep import ep_cpp - sys.stderr.write("Failed to import uccl.ep\n") - raise +EventHandle = ep_cpp.EventHandle def calc_diff(x: torch.Tensor, y: torch.Tensor): @@ -93,7 +78,7 @@ def init_dist_under_torchrun(local_rank: 
int, num_local_ranks: int): def _gather_peer_ips(group): # Gather local IP strings across ranks world = dist.get_world_size(group) - my_ip = ep.get_oob_ip() + my_ip = ep_cpp.get_oob_ip() ips = [None] * world dist.all_gather_object(ips, my_ip, group=group) return ips @@ -149,7 +134,7 @@ def get_peer_ip(rank: int, num_ranks: int, group: dist.ProcessGroup): def get_cpu_proxies_meta(proxies, rank, scratch_ptr, scratch_bytes, num_ranks, group): - my_ip = ep.get_oob_ip() + my_ip = ep_cpp.get_oob_ip() meta = { "rank": rank, "ptr": int(scratch_ptr), @@ -573,8 +558,8 @@ def initialize_uccl( proxies = [] - for i in range(ep.get_num_proxy_threads()): - proxy = ep.Proxy( + for i in range(ep_cpp.get_num_proxy_threads()): + proxy = ep_cpp.Proxy( thread_idx=i, gpu_buffer_addr=scratch_ptr, total_size=scratch_nbytes, @@ -599,7 +584,7 @@ def initialize_uccl( for proxy in proxies: proxy.set_peers_meta(peers_meta_list) - ep.register_proxies(local_rank, proxies) + ep_cpp.register_proxies(local_rank, proxies) # Set atomic buffer pointer for all proxies BEFORE starting them # This ensures the atomic buffer info is included in connection info exchange @@ -643,7 +628,7 @@ def destroy_uccl(proxies, workers): except Exception: pass try: - ep.unregister_proxy(device_index) + ep_cpp.unregister_proxy(device_index) except Exception: pass try: diff --git a/ep/deep_ep_wrapper/deep_ep/__init__.py b/ep/deep_ep_wrapper/deep_ep/__init__.py index 57374a2a5..c3a2fee22 100644 --- a/ep/deep_ep_wrapper/deep_ep/__init__.py +++ b/ep/deep_ep_wrapper/deep_ep/__init__.py @@ -1,15 +1,48 @@ -from uccl.ep import Config, EventHandle +"""Compatibility layer exposing :mod:`uccl.ep` through the historical ``deep_ep`` API.""" -from .utils import EventOverlap, check_nvlink_connections, initialize_uccl, destroy_uccl -from .buffer import Buffer -import torch.distributed as dist +from __future__ import annotations + +import sys + +try: # Preserve DeepEP's implicit torch import (best-effort only). 
+from uccl.ep import (  # noqa: F401 - symbols are re-exported.
+  as ``uccl.ep.<name>``.
files to INSTALL_DIR/ep/""" def run(self): # Run the standard build first @@ -62,32 +63,41 @@ def run(self): install_dir = os.getenv( "INSTALL_DIR", os.path.join(python_site_packages, "uccl") ) - os.makedirs(install_dir, exist_ok=True) - # Find the built .so file + ep_dir = os.path.join(install_dir, "ep") + os.makedirs(ep_dir, exist_ok=True) + + # --- Install Python source files from ep/python/uccl_ep/ --- + py_src_dir = PROJECT_ROOT / "python" / "uccl_ep" + if py_src_dir.is_dir(): + for py_file in py_src_dir.glob("*.py"): + dest = os.path.join(ep_dir, py_file.name) + print(f"Installing {py_file.name} to {ep_dir}") + shutil.copy2(py_file, dest) + + # --- Install the native .so --- build_lib = self.get_finalized_command("build_ext").build_lib - so_files = list(Path(build_lib).glob("ep*.so")) + so_files = list(Path(build_lib).glob("ep_cpp*.so")) if not so_files: raise RuntimeError(f"Could not find built .so file in {build_lib}") so_file = so_files[0] - dest_path = os.path.join(install_dir, so_file.name) + dest_path = os.path.join(ep_dir, so_file.name) - # Copy the .so file to the install directory - print(f"Installing {so_file.name} to {install_dir}") + print(f"Installing {so_file.name} to {ep_dir}") shutil.copy2(so_file, dest_path) if _use_abi3: - for old in Path(install_dir).glob("ep.cpython-*.so"): + for old in Path(ep_dir).glob("ep_cpp.cpython-*.so"): print(f"Removing stale {old.name}") old.unlink() else: - for old in Path(install_dir).glob("ep.abi3.so"): + for old in Path(ep_dir).glob("ep_cpp.abi3.so"): print(f"Removing stale {old.name}") old.unlink() - print(f"Installation complete. Module installed as: {dest_path}") + print(f"Installation complete. 
uccl.ep installed to: {ep_dir}") class CustomClean(Command): @@ -271,6 +281,7 @@ def run(self): else: # AMD GPU Architecture Detection detected_amd_arch = None + supported_amd_arch = ["gfx942", "gfx950"] try: rocminfo_output = subprocess.check_output( ["rocminfo"], stderr=subprocess.DEVNULL @@ -294,13 +305,16 @@ def run(self): ) # Use environment variable, then detected arch, then fallback - device_arch = os.getenv( - "TORCH_CUDA_ARCH_LIST", - detected_amd_arch if detected_amd_arch else "gfx420", + default_arch = ( + detected_amd_arch if detected_amd_arch else ";".join(supported_amd_arch) ) + device_arch = os.getenv("TORCH_CUDA_ARCH_LIST", default_arch) - for arch in device_arch.split(","): - nvcc_flags.append(f"--offload-arch={arch.lower()}") + nvcc_flags.extend( + f"--offload-arch={arch.lower()}" + for arch in re.split(r"[;,\s]+", device_arch) + if arch in supported_amd_arch + ) # Disable SM90 features on AMD cxx_flags.append("-DDISABLE_SM90_FEATURES") @@ -383,7 +397,7 @@ def run(self): version="0.0.1" + revision, ext_modules=[ CUDAExtension( - name="ep", + name="ep_cpp", include_dirs=include_dirs, library_dirs=library_dirs, sources=sources, diff --git a/ep/src/uccl_ep.cc b/ep/src/uccl_ep.cc index ccdcf6bb2..cdadc6528 100644 --- a/ep/src/uccl_ep.cc +++ b/ep/src/uccl_ep.cc @@ -1635,7 +1635,7 @@ class Buffer { nullptr}; // Device pointer to array of IPC base addresses }; -NB_MODULE(ep, m) { +NB_MODULE(ep_cpp, m) { m.doc() = "Minimal DeepEP-compatible shim with UCCL"; nb::class_(m, "Config") diff --git a/setup.py b/setup.py index 04b4f1cad..cd0e2898b 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,15 @@ +import os import re +import shutil +import subprocess import sys import sysconfig -from setuptools import setup, find_packages, Extension +import importlib.util +from pathlib import Path +from distutils import log +from typing import Optional +from setuptools import setup, find_packages, Extension, Command +from setuptools.command.build_ext import build_ext as 
_build_ext def _is_freethreaded(): @@ -28,6 +36,7 @@ def get_version(): VERSION = get_version() + # Single package "uccl" for all backends (vLLM-style). # Variants are distinguished by PEP 440 local version identifiers in the # wheel filename (e.g. uccl-0.1.0+cu13, uccl-0.1.0+cu12.efa). @@ -43,6 +52,211 @@ def get_version(): py_limited_api=_use_limited_api, define_macros=[("Py_LIMITED_API", "0x030C0000")] if _use_limited_api else [], ) + + +BUILD_SCRIPT = "build_native.sh" + + +class ShellExtension(Extension): + """Extension wrapper that drives external shell-script builds.""" + + def __init__(self, name, sourcedir=".", script=BUILD_SCRIPT, + targets=("all",), env=None): + super().__init__(name, sources=[]) + self.sourcedir = Path(sourcedir).resolve() + self.script = script + self.targets = list(targets) + self.env = env or {} + + +class ShellClean(Command): + """`python setup.py clean` -> run ``build_native.sh clean`` plus wipe + Python build dirs. + """ + + description = ( + "run `build_native.sh clean` and remove build/, dist/, *.egg-info/, etc." + ) + user_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + project_root = Path(__file__).parent.resolve() + + # 1. Delegate to build_native.sh so native sub-modules clean up + # their own .o/.d/.so files. + script = project_root / BUILD_SCRIPT + if script.exists(): + cmd = ["bash", str(script), "clean"] + log.info("running %s in %s", " ".join(cmd), project_root) + subprocess.run(cmd, cwd=str(project_root), check=False) + + # 2. Wipe top-level Python build artefacts. + targets = [ + project_root / "build", + project_root / "dist", + project_root / "wheelhouse", + project_root / "ep" / "build", + ] + targets += list(project_root.glob("*.egg-info")) + targets += list((project_root / "ep").glob("*.egg-info")) + + # 3. Stale shared libraries that may have been copied in-tree by an + # earlier build (e.g. 
uccl/lib/*.so, uccl/p2p*.so, + # ep/python/uccl_ep/ep_cpp*.so). + targets += list((project_root / "uccl" / "lib").glob("*.so")) + targets += list((project_root / "uccl").glob("p2p*.so")) + targets += list( + (project_root / "ep" / "python" / "uccl_ep").glob("ep_cpp*.so") + ) + + for path in targets: + if not path.exists() and not path.is_symlink(): + continue + log.info("removing %s", path) + if path.is_dir() and not path.is_symlink(): + shutil.rmtree(path, ignore_errors=True) + else: + try: + path.unlink() + except FileNotFoundError: + pass + + +class ShellBuildExtension(_build_ext): + """Custom build_ext that invokes ``build_native.sh`` before compiling + C extensions. + """ + + @staticmethod + def _find_rocm_home()->Optional[Path]: + """Return ROCm install dir as ``Path``, or ``None``. + + Adapted from pytorch ``_find_rocm_home``. + """ + # Guess #1 + rocm_home = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') + if rocm_home is None: + # Guess #2: Support for ROCm distribution from TheRock + # rocm-sdk-core installs everything under /_rocm_sdk_core + # (include/, lib/, bin/, ...), so the module's own location is the + # ROCM_HOME we want. Use find_spec to locate it without importing. 
+            # rocm-sdk-core installs everything under <site-packages>/_rocm_sdk_core
+ """ + if env.get("TARGET"): + return env["TARGET"] + rocm_home = ShellBuildExtension._find_rocm_home() + if rocm_home is None: + return "cu12" + major = ShellBuildExtension._detect_rocm_major(rocm_home) + return "roc6" if major == 6 else "roc7" + + def _get_build_output_dir(self): + """``uccl`` package output dir for build_native.sh. + + Editable/inplace -> source ``uccl/``; install/wheel -> ``build_lib/uccl``. + build_native.sh derives ``uccl.ep`` from this internally. + """ + if self.inplace or not self.build_lib: + return Path(__file__).parent.resolve() / "uccl" + return Path(self.build_lib).resolve() / "uccl" + + def run(self): + shell_exts = [ext for ext in self.extensions if isinstance(ext, ShellExtension)] + for ext in shell_exts: + self.build_shell_extension(ext) + # Drop shell-only extensions so the base class doesn't expect artifacts. + self.extensions = [ext for ext in self.extensions if not isinstance(ext, ShellExtension)] + super().run() + + def build_shell_extension(self, ext: ShellExtension): + env = os.environ.copy() + env.setdefault("PYTHON", sys.executable) + env.update(ext.env) + + # Drop PEP 517 build-env PYTHONPATH so the child build script's python + # sees the host venv (otherwise ``import torch`` in ep/setup.py fails). + env.pop("PYTHONPATH", None) + + # build_native.sh dispatches on TARGET; auto-pick when caller didn't. + env["TARGET"] = self._detect_target(env) + + # Tell build_native.sh where to drop the ``uccl`` package's + # artefacts; the script derives the ``uccl.ep`` target dir from + # this value internally. 
+ env["UCCL_PY_DIR"] = str(self._get_build_output_dir()) + + cmd = ["bash", str(ext.sourcedir / ext.script), *ext.targets] + + log.info("running `%s` in %s", " ".join(cmd), ext.sourcedir) + subprocess.check_call(cmd, cwd=str(ext.sourcedir), env=env) + + +shell_ext = ShellExtension( + name="uccl.shell", + sourcedir=Path(__file__).parent, + script=BUILD_SCRIPT, + targets=["all"], +) + setup( name="uccl", version=VERSION, @@ -51,17 +265,23 @@ def get_version(): long_description=open("README.md").read(), long_description_content_type="text/markdown", url="https://github.com/uccl-project/uccl", - packages=find_packages(), - ext_modules=[abi3_ext], + packages=find_packages(include=["uccl", "uccl.*", "uccl.ep"]) + ["uccl.ep"], + package_dir={ + "uccl": "uccl", + "uccl.ep": "ep/python/uccl_ep", + }, + ext_modules=[shell_ext, abi3_ext], package_data={ "uccl": [ "lib/*.so", "p2p*.so", - "ep*.so", "lib/*.a", "collective.py", "utils.py", ], + "uccl.ep": [ + "ep_cpp*.so", + ], }, license="Apache-2.0", install_requires=["intervaltree"], @@ -73,4 +293,8 @@ def get_version(): extras_require={ "rocm": [], }, + cmdclass={ + "build_ext": ShellBuildExtension, + "clean": ShellClean, + }, ) diff --git a/uccl/.gitignore b/uccl/.gitignore index 66df4f2ec..3613cecc1 100644 --- a/uccl/.gitignore +++ b/uccl/.gitignore @@ -1,3 +1,3 @@ collective.py utils.py -_rocm_init.py \ No newline at end of file +_rocm_init.py