diff --git a/.github/workflows/uccl-build-test-gb10.yml b/.github/workflows/uccl-build-test-gb10.yml index 34bad3c98..d6ab9fc82 100644 --- a/.github/workflows/uccl-build-test-gb10.yml +++ b/.github/workflows/uccl-build-test-gb10.yml @@ -80,10 +80,12 @@ jobs: ./build.sh cu13 ep 3.13 --install 2>&1 | tee build.log + pushd /tmp if ! python -c 'import torch; import uccl.ep'; then echo 'Import of torch and uccl.ep failed.' exit 1 fi + popd echo 'Build and Verification Successful!' EOF diff --git a/MANIFEST.in b/MANIFEST.in index 05c629b7d..821041ec8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,6 @@ include uccl/__init__.py include uccl/lib/*.so -include uccl/*.so \ No newline at end of file +include uccl/*.so +recursive-include ep/python/uccl_ep *.py *.so +recursive-include ep/deep_ep_wrapper/deep_ep *.py +include build_native.sh diff --git a/README.md b/README.md index 6fbc8caaa..db980181c 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,15 @@ git clone https://github.com/uccl-project/uccl.git && cd uccl # Eg, bash build.sh cu12 ep --install bash build.sh [cu12|cu13|roc7|roc6|therock] [all|ccl_rdma|ccl_efa|p2p|ep] \ [py_version] [rocm_index_url] --install + +# Install from source. Use the torch package from your current environment so +# CUDA/ROCm backend detection matches the target platform. +pip install nanobind +pip install . --no-build-isolation +# or (legacy, may be removed in newer setuptools): +python3 setup.py install +# or development, install in editable mode instead: +pip install -e . --no-build-isolation ``` > Note: > - By default, `build.sh cu12` targets CUDA 12.8 and `build.sh roc7` targets ROCm 7.1, but you can also specify `cu13|roc6` to target CUDA 13.0 or ROCm 6.4. 
diff --git a/build.sh b/build.sh index b4b95a4e2..929c1c4aa 100755 --- a/build.sh +++ b/build.sh @@ -341,12 +341,14 @@ if [[ "${SKIP_DOCKER_BUILD:-0}" != "1" ]]; then if [[ "$ARCH" == "aarch64" ]]; then ${CONTAINER_ENGINE} build \ + --network=host \ --platform=linux/arm64 \ $BUILD_ARGS \ -t "$IMAGE_NAME" \ -f "$DOCKERFILE" . else ${CONTAINER_ENGINE} build \ + --network=host \ $BUILD_ARGS \ -t "$IMAGE_NAME" \ -f "$DOCKERFILE" . diff --git a/build_inner.sh b/build_inner.sh index 6b3df2e04..ea263364a 100755 --- a/build_inner.sh +++ b/build_inner.sh @@ -5,6 +5,14 @@ # Invoked by build.sh via docker/podman/apptainer; not intended for direct # execution on the host. # +# Responsibilities (packaging only): +# * Drive ``python -m build`` to invoke setup.py -> ShellBuildExtension, +# which delegates the actual native compilation to ``build_native.sh``. +# * Repair / retag / rename the resulting wheel via auditwheel. +# +# Native compilation lives in ``build_native.sh``; this script does not call +# ``make`` directly. +# # Environment variables consumed (set by build.sh before container launch): # # Required: @@ -21,7 +29,7 @@ # UCCL_RETAG_TO_HOST_GLIBC Retag wheel to host glibc version (default "0") # UCCL_LOCAL_VERSION Local version suffix appended to wheel filename (PEP 440) # -# Build feature flags: +# Build feature flags (forwarded to build_native.sh): # USE_DIETGPU Enable DietGPU compression (default "0") # USE_INTEL_RDMA_NIC Enable Intel RDMA NIC / irdma driver (default "0") # PER_EXPERT_BATCHING Enable per-expert batching (default "0") @@ -31,210 +39,6 @@ set -euo pipefail -######################################################## -# Build helper functions -######################################################## - -# Rename cpython-versioned .so files to .abi3.so for stable ABI compatibility. -# Only applies on Python >= 3.12 where nanobind stable ABI is enabled. 
-rename_to_abi3() { - local dir="$1" - local py_stable_abi_ok - py_stable_abi_ok=$(python3 -c "import sys; print(1 if sys.version_info >= (3, 12) else 0)") - if [[ "$py_stable_abi_ok" != "1" ]]; then - echo "Python < 3.12 detected, skipping abi3 rename (nanobind stable ABI not supported)" - return - fi - for f in "$dir"/*.cpython-*.so; do - if [[ -f "$f" ]]; then - local newname - newname=$(echo "$f" | sed 's/\.cpython-[^.]*-[^.]*-[^.]*\.so/.abi3.so/') - echo "Renaming $(basename "$f") -> $(basename "$newname")" - mv "$f" "$newname" - fi - done -} - -build_rccl_nccl_header() { - # Unlike CUDA, ROCM does not include nccl.h. So we need to build rccl to get nccl.h. - if [[ ! -f "thirdparty/rccl/build/release/include/nccl.h" ]]; then - cd thirdparty/rccl - # Just to get nccl.h, not the whole library - CXX=/opt/rocm/bin/hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF >/dev/null 2>&1 || true - cd ../.. - fi -} - -build_ccl_rdma() { - local TARGET="$1" - local ARCH="$2" - local IS_EFA="$3" - - set -euo pipefail - echo "[container] build_ccl_rdma Target: $TARGET" - - if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then - echo "[container] Building with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" - fi - - if [[ "$TARGET" == cu* ]]; then - cd collective/rdma && make clean && make -j$(nproc) USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} && cd ../../ - TARGET_SO=collective/rdma/libnccl-net-uccl.so - elif [[ "$TARGET" == roc[67] ]]; then - if [[ "$ARCH" == "aarch64" ]]; then - echo "Skipping ROCm build on Arm64 (no ROCm toolchain)." - return - fi - cd collective/rdma && make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm && cd ../../ - TARGET_SO=collective/rdma/librccl-net-uccl.so - elif [[ "$TARGET" == "therock" ]]; then - if [[ "$ARCH" == "aarch64" ]]; then - echo "Skipping ROCm build on Arm64 (no ROCm toolchain)." - return - fi - # Unlike CUDA, ROCM does not include nccl.h. So we need to build rccl to get nccl.h. - if [[ ! 
-f "thirdparty/rccl/build/release/include/nccl.h" ]]; then - cd thirdparty/rccl - # Just to get nccl.h, not the whole library - CXX=hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF -DCMAKE_PREFIX_PATH=$(rocm-sdk path --cmake) -DROCM_PATH=$(rocm-sdk path --root) -DHIP_PLATFORM=amd >/dev/null 2>&1 || true - cd ../.. - fi - cd collective/rdma && make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib && cd ../../ - TARGET_SO=collective/rdma/librccl-net-uccl.so - fi - - echo "[container] Copying RDMA .so to uccl/lib/" - mkdir -p uccl/lib - cp ${TARGET_SO} uccl/lib/ -} - -build_ccl_efa() { - local TARGET="$1" - local ARCH="$2" - local IS_EFA="$3" - - set -euo pipefail - echo "[container] build_ccl_efa Target: $TARGET" - - if [[ "$ARCH" == "aarch64" || "$TARGET" == roc[67] || "$TARGET" == "therock" ]]; then - echo "Skipping EFA build on Arm64 (no EFA installer) or ROCm (no CUDA)." - return - fi - - if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then - echo "[container] Building with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" - fi - - cd collective/efa && make clean && make -j$(nproc) USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} && cd ../../ - - # EFA requires a custom NCCL. - cd thirdparty/nccl-sg - make src.build -j$(nproc) NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90" USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} - cd ../.. 
- - echo "[container] Copying EFA .so to uccl/lib/" - mkdir -p uccl/lib - cp collective/efa/libnccl-net-efa.so uccl/lib/ - cp thirdparty/nccl-sg/build/lib/libnccl.so uccl/lib/libnccl-efa.so -} - -build_p2p() { - local TARGET="$1" - local ARCH="$2" - local IS_EFA="$3" - - set -euo pipefail - echo "[container] build_p2p Target: $TARGET" - - if [[ "${USE_DIETGPU:-0}" == "1" ]]; then - cd thirdparty/dietgpu - if [[ "$TARGET" == cu* ]]; then - cd dietgpu/float - CUDA_GPU_ARCH="sm_$(echo "${TORCH_CUDA_ARCH_LIST:-9.0}" | awk '{print $1}' | sed 's/+PTX//; s/\.//')" - echo "Building dietgpu float for CUDA: $CUDA_GPU_ARCH" - make clean -f Makefile.cuda && make -j$(nproc) -f Makefile.cuda GPU_ARCH=$CUDA_GPU_ARCH - else - rm -rf build/ - python3 setup.py build - cd dietgpu/float - echo $TORCH_CUDA_ARCH_LIST - make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm GPU_ARCH=$TORCH_CUDA_ARCH_LIST - fi - cd ../../../.. - cp thirdparty/dietgpu/dietgpu/float/libdietgpu_float.so uccl/lib - fi - - cd p2p - if [[ "$TARGET" == cu* ]]; then - make clean && make -j$(nproc) - elif [[ "$TARGET" == roc[67] ]]; then - make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm - elif [[ "$TARGET" == "therock" ]]; then - make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib - fi - cd .. 
- - echo "[container] Copying P2P .so, collective.py and utils.py to uccl/" - mkdir -p uccl - mkdir -p uccl/lib - cp p2p/libuccl_p2p.so uccl/lib/ - cp p2p/p2p.*.so uccl/ - cp p2p/collective.py uccl/ - cp p2p/utils.py uccl/ - rename_to_abi3 uccl -} - -build_ep() { - local TARGET="$1" - local ARCH="$2" - local IS_EFA="$3" - - set -euo pipefail - echo "[container] build_ep Target: $TARGET" - - if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then - echo "[container] Building EP with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" - fi - - if [[ "$TARGET" == "roc6" ]]; then - echo "ERROR: EP requires roc7 (ROCm 7) for HIP code transformation; roc6 is not supported." >&2 - exit 1 - elif [[ "$TARGET" == "therock" ]]; then - echo "Skipping GPU-driven build on therock (no GPU-driven support yet)." - elif [[ "$TARGET" == roc[67] || "$TARGET" == cu* ]]; then - cd ep - # This may be needed if you traverse through different git commits - # make clean && rm -r build || true - USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} python3 setup.py build - cd .. - echo "[container] Copying GPU-driven .so to uccl/" - mkdir -p uccl/lib - cp ep/build/**/*.so uccl/ - fi - rename_to_abi3 uccl -} - -build_ukernel() { - local TARGET="$1" - local ARCH="$2" - local IS_EFA="$3" - - set -euo pipefail - echo "[container] build_ukernel Target: $TARGET" - - cd experimental/ukernel - if [[ "$TARGET" == cu* ]]; then - make clean -f Makefile && make -j$(nproc) -f Makefile - elif [[ "$TARGET" == roc[67] ]]; then - make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm - fi - cd ../.. 
- - echo "[container] Copying ukernel .so to uccl/" - mkdir -p uccl/lib # mkdir anyway - cp experimental/ukernel/*ukernel*.so uccl/lib -} - ######################################################## # Main build logic ######################################################## @@ -249,32 +53,6 @@ if [[ "$TARGET" == "therock" ]]; then pip3 install --no-cache-dir rocm[libraries,devel] --index-url ${ROCM_IDX_URL} fi -if [[ "$TARGET" == roc[67] ]]; then - build_rccl_nccl_header -fi - -if [[ "$BUILD_TYPE" == "ccl_rdma" ]]; then - build_ccl_rdma "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "ccl_efa" ]]; then - build_ccl_efa "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "p2p" ]]; then - build_p2p "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "ep" ]]; then - build_ep "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "p2p_ep" ]]; then - build_p2p "$TARGET" "$ARCH" "$IS_EFA" - build_ep "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "ukernel" ]]; then - build_ukernel "$TARGET" "$ARCH" "$IS_EFA" -elif [[ "$BUILD_TYPE" == "all" ]]; then - if [[ -n "$IS_EFA" ]]; then - build_ccl_efa "$TARGET" "$ARCH" "$IS_EFA" - else - build_ccl_rdma "$TARGET" "$ARCH" "$IS_EFA" - fi - build_p2p "$TARGET" "$ARCH" "$IS_EFA" - build_ep "$TARGET" "$ARCH" "$IS_EFA" -fi if [[ "$TARGET" == "therock" ]]; then echo " @@ -304,10 +82,11 @@ def initialize(): export PIP_EXTRA_INDEX_URL=${ROCM_IDX_URL} fi -ls -lh uccl/ -ls -lh uccl/lib/ -python3 -m build +# All native build logic lives in ``build_native.sh``, driven by setup.py's +# ShellBuildExtension. ``--no-isolation`` reuses the container's setuptools/wheel. +python3 -m build --wheel --no-isolation +# Restore the original setup.py if we patched it. 
if [[ "$TARGET" == "therock" ]]; then mv ${BACKUP_FN} setup.py fi diff --git a/build_native.sh b/build_native.sh new file mode 100755 index 000000000..7d71872e3 --- /dev/null +++ b/build_native.sh @@ -0,0 +1,337 @@ +#!/bin/bash + +# ----------------------- +# build_native.sh — compile uccl native modules. +# +# Replaces the top-level Makefile. Invoked by setup.py's +# ShellBuildExtension; can also be run standalone for in-tree builds. +# The compile logic is lifted (almost verbatim) from build_inner.sh's +# pre-Makefile incarnation. +# +# Output layout: +# ${UCCL_PY_DIR}/lib/ library .so / .a files (libnccl-net-uccl.so, ...) +# ${UCCL_PY_DIR}/ p2p*.so + collective.py + utils.py +# ${UCCL_EP_DIR}/ ep_cpp*.so (derived from UCCL_PY_DIR, see below) +# +# By default the script writes in-tree (UCCL_PY_DIR=./uccl), matching the +# source layout that the editable install relies on. setup.py overrides +# UCCL_PY_DIR to point at the wheel-staging dir (build_lib/uccl) for +# non-inplace builds. The matching ``uccl.ep`` target is derived +# automatically: +# * source layout (UCCL_PY_DIR == /uccl) -> ./ep/python/uccl_ep +# (matches package_dir) +# * any other UCCL_PY_DIR (wheel staging) -> ${UCCL_PY_DIR}/ep +# so the same script serves all three modes (editable / install / wheel) +# with a single env var from setup.py. +# +# Usage: +# ./build_native.sh [BUILD_TYPE] +# BUILD_TYPE := all (default) | ccl_rdma | ccl_efa | p2p | ep | p2p_ep | ukernel | clean +# When no positional argument is supplied the value of $BUILD_TYPE is used. 
+# +# Environment variables consumed: +# TARGET Build target: cu12, cu13, roc7, roc6, therock (default cu12) +# ARCH Host architecture: x86_64 or aarch64 (default $(uname -m)) +# IS_EFA Non-empty when EFA is detected (swaps ccl_rdma -> ccl_efa) +# BUILD_TYPE Default value when no positional arg is given (default ``all``) +# +# Output staging (set by setup.py during wheel/install builds): +# UCCL_PY_DIR Target dir for the ``uccl`` package (default ./uccl) +# UCCL_EP_DIR Target dir for the ``uccl.ep`` package +# (auto-derived from UCCL_PY_DIR; override only +# if you really know what you're doing) +# +# Feature flags: +# USE_DIETGPU Enable DietGPU compression (default 0) +# USE_INTEL_RDMA_NIC Enable Intel RDMA NIC / irdma driver (default 0) +# TORCH_CUDA_ARCH_LIST CUDA compute capabilities for dietgpu (default 9.0) +# ----------------------- + +set -euo pipefail + +PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${PROJECT_ROOT}" + +TARGET="${TARGET:-cu12}" +ARCH="${ARCH:-$(uname -m)}" +IS_EFA="${IS_EFA:-}" +BUILD_TYPE="${BUILD_TYPE:-all}" + + +UCCL_PY_DIR="${UCCL_PY_DIR:-${PROJECT_ROOT}/uccl}" + +if [[ -z "${UCCL_EP_DIR:-}" ]]; then + if [[ "$(realpath -m "${UCCL_PY_DIR}")" == "$(realpath -m "${PROJECT_ROOT}/uccl")" ]]; then + # for build mode + UCCL_EP_DIR="${PROJECT_ROOT}/ep/python/uccl_ep" + else + # for install/wheel mode + UCCL_EP_DIR="${UCCL_PY_DIR}/ep" + fi +fi +UCCL_LIB_DIR="${UCCL_PY_DIR}/lib" + +# Positional argument overrides BUILD_TYPE. +if [[ $# -gt 0 ]]; then + BUILD_TYPE="$1" +fi + +mkdir -p "${UCCL_PY_DIR}" "${UCCL_LIB_DIR}" "${UCCL_EP_DIR}" + +######################################################## +# Build helper functions +######################################################## + +# Rename cpython-versioned .so files to .abi3.so for stable ABI compatibility. +# Only applies on Python >= 3.12 where nanobind stable ABI is enabled. 
+rename_to_abi3() { + local dir="$1" + local py_stable_abi_ok + py_stable_abi_ok=$(python3 -c "import sys; print(1 if sys.version_info >= (3, 12) else 0)") + if [[ "$py_stable_abi_ok" != "1" ]]; then + echo "Python < 3.12 detected, skipping abi3 rename (nanobind stable ABI not supported)" + return + fi + for f in "$dir"/*.cpython-*.so; do + if [[ -f "$f" ]]; then + local newname + newname=$(echo "$f" | sed 's/\.cpython-[^.]*-[^.]*-[^.]*\.so/.abi3.so/') + echo "Renaming $(basename "$f") -> $(basename "$newname")" + mv "$f" "$newname" + fi + done +} + +build_rccl_nccl_header() { + # Unlike CUDA, ROCM does not include nccl.h. So we need to build rccl to get nccl.h. + if [[ ! -f "thirdparty/rccl/build/release/include/nccl.h" ]]; then + cd thirdparty/rccl + # Just to get nccl.h, not the whole library + CXX=/opt/rocm/bin/hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF >/dev/null 2>&1 || true + cd ../.. + fi +} + +build_ccl_rdma() { + local TARGET="$1" + local ARCH="$2" + local IS_EFA="$3" + + set -euo pipefail + echo "[container] build_ccl_rdma Target: $TARGET" + + if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then + echo "[container] Building with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" + fi + + if [[ "$TARGET" == cu* ]]; then + cd collective/rdma && make clean && make -j$(nproc) USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} && cd ../../ + TARGET_SO=collective/rdma/libnccl-net-uccl.so + elif [[ "$TARGET" == roc[67] ]]; then + if [[ "$ARCH" == "aarch64" ]]; then + echo "Skipping ROCm build on Arm64 (no ROCm toolchain)." + return + fi + cd collective/rdma && make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm && cd ../../ + TARGET_SO=collective/rdma/librccl-net-uccl.so + elif [[ "$TARGET" == "therock" ]]; then + if [[ "$ARCH" == "aarch64" ]]; then + echo "Skipping ROCm build on Arm64 (no ROCm toolchain)." + return + fi + # Unlike CUDA, ROCM does not include nccl.h. So we need to build rccl to get nccl.h. + if [[ ! 
-f "thirdparty/rccl/build/release/include/nccl.h" ]]; then + cd thirdparty/rccl + # Just to get nccl.h, not the whole library + CXX=hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF -DCMAKE_PREFIX_PATH=$(rocm-sdk path --cmake) -DROCM_PATH=$(rocm-sdk path --root) -DHIP_PLATFORM=amd >/dev/null 2>&1 || true + cd ../.. + fi + cd collective/rdma && make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib && cd ../../ + TARGET_SO=collective/rdma/librccl-net-uccl.so + fi + + echo "[container] Copying RDMA .so to ${UCCL_LIB_DIR}" + mkdir -p "${UCCL_LIB_DIR}" + cp ${TARGET_SO} "${UCCL_LIB_DIR}/" +} + +build_ccl_efa() { + local TARGET="$1" + local ARCH="$2" + local IS_EFA="$3" + + set -euo pipefail + echo "[container] build_ccl_efa Target: $TARGET" + + if [[ "$ARCH" == "aarch64" || "$TARGET" == roc[67] || "$TARGET" == "therock" ]]; then + echo "Skipping EFA build on Arm64 (no EFA installer) or ROCm (no CUDA)." + return + fi + + if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then + echo "[container] Building with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" + fi + + cd collective/efa && make clean && make -j$(nproc) USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} && cd ../../ + + # EFA requires a custom NCCL. + cd thirdparty/nccl-sg + make src.build -j$(nproc) NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90" USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} + cd ../.. 
+ + echo "[container] Copying EFA .so to ${UCCL_LIB_DIR}" + mkdir -p "${UCCL_LIB_DIR}" + cp collective/efa/libnccl-net-efa.so "${UCCL_LIB_DIR}/" + cp thirdparty/nccl-sg/build/lib/libnccl.so "${UCCL_LIB_DIR}/libnccl-efa.so" +} + +build_p2p() { + local TARGET="$1" + local ARCH="$2" + local IS_EFA="$3" + + set -euo pipefail + echo "[container] build_p2p Target: $TARGET" + + if [[ "${USE_DIETGPU:-0}" == "1" ]]; then + cd thirdparty/dietgpu + if [[ "$TARGET" == cu* ]]; then + cd dietgpu/float + CUDA_GPU_ARCH="sm_$(echo "${TORCH_CUDA_ARCH_LIST:-9.0}" | awk '{print $1}' | sed 's/+PTX//; s/\.//')" + echo "Building dietgpu float for CUDA: $CUDA_GPU_ARCH" + make clean -f Makefile.cuda && make -j$(nproc) -f Makefile.cuda GPU_ARCH=$CUDA_GPU_ARCH + else + rm -rf build/ + python3 setup.py build + cd dietgpu/float + echo $TORCH_CUDA_ARCH_LIST + make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm GPU_ARCH=$TORCH_CUDA_ARCH_LIST + fi + cd ../../../.. + cp thirdparty/dietgpu/dietgpu/float/libdietgpu_float.so "${UCCL_LIB_DIR}/" + fi + + cd p2p + if [[ "$TARGET" == cu* ]]; then + make clean && make -j$(nproc) + elif [[ "$TARGET" == roc[67] ]]; then + make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm + elif [[ "$TARGET" == "therock" ]]; then + make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib + fi + cd .. 
+ + echo "[container] Copying P2P .so, collective.py and utils.py to ${UCCL_PY_DIR}" + mkdir -p "${UCCL_PY_DIR}" "${UCCL_LIB_DIR}" + cp p2p/libuccl_p2p.so "${UCCL_LIB_DIR}/" + cp p2p/p2p.*.so "${UCCL_PY_DIR}/" + cp p2p/collective.py "${UCCL_PY_DIR}/" + cp p2p/utils.py "${UCCL_PY_DIR}/" + rename_to_abi3 "${UCCL_PY_DIR}" +} + +build_ep() { + local TARGET="$1" + local ARCH="$2" + local IS_EFA="$3" + + set -euo pipefail + echo "[container] build_ep Target: $TARGET" + + if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then + echo "[container] Building EP with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" + fi + + if [[ "$TARGET" == "roc6" ]]; then + echo "ERROR: EP requires roc7 (ROCm 7) for HIP code transformation; roc6 is not supported." >&2 + exit 1 + elif [[ "$TARGET" == "therock" ]]; then + echo "Skipping GPU-driven build on therock (no GPU-driven support yet)." + elif [[ "$TARGET" == roc[67] || "$TARGET" == cu* ]]; then + cd ep + # This may be needed if you traverse through different git commits + # make clean && rm -r build || true + USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} python3 setup.py build + cd .. + echo "[container] Copying GPU-driven .so to ${UCCL_EP_DIR}" + mkdir -p "${UCCL_EP_DIR}" + cp ep/build/**/*.so "${UCCL_EP_DIR}/" + fi + rename_to_abi3 "${UCCL_EP_DIR}" +} + +build_ukernel() { + local TARGET="$1" + local ARCH="$2" + local IS_EFA="$3" + + set -euo pipefail + echo "[container] build_ukernel Target: $TARGET" + + cd experimental/ukernel + if [[ "$TARGET" == cu* ]]; then + make clean -f Makefile && make -j$(nproc) -f Makefile + elif [[ "$TARGET" == roc[67] ]]; then + make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm + fi + cd ../.. + + echo "[container] Copying ukernel .so to ${UCCL_LIB_DIR}" + mkdir -p "${UCCL_LIB_DIR}" + cp experimental/ukernel/*ukernel*.so "${UCCL_LIB_DIR}/" +} + +clean_all() { + # Best-effort clean of every per-module Makefile flavour; ignore missing + # files / missing toolchains. 
+ for f in Makefile Makefile.rocm Makefile.therock; do + make -C collective/rdma -f "$f" clean 2>/dev/null || true + make -C p2p -f "$f" clean 2>/dev/null || true + make -C experimental/ukernel -f "$f" clean 2>/dev/null || true + done + make -C collective/efa clean 2>/dev/null || true + rm -rf ep/build + rm -f "${UCCL_LIB_DIR}"/*.so "${UCCL_LIB_DIR}"/*.a + rm -f "${UCCL_PY_DIR}"/p2p*.so + rm -f "${UCCL_PY_DIR}"/collective.py "${UCCL_PY_DIR}"/utils.py + rm -f "${UCCL_EP_DIR}"/*.so +} + +######################################################## +# Main build logic +######################################################## + +if [[ "$BUILD_TYPE" == "clean" ]]; then + clean_all + exit 0 +fi + +if [[ "$TARGET" == roc[67] ]]; then + build_rccl_nccl_header +fi + +if [[ "$BUILD_TYPE" == "ccl_rdma" ]]; then + build_ccl_rdma "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "ccl_efa" ]]; then + build_ccl_efa "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "p2p" ]]; then + build_p2p "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "ep" ]]; then + build_ep "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "p2p_ep" ]]; then + build_p2p "$TARGET" "$ARCH" "$IS_EFA" + build_ep "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "ukernel" ]]; then + build_ukernel "$TARGET" "$ARCH" "$IS_EFA" +elif [[ "$BUILD_TYPE" == "all" ]]; then + if [[ -n "$IS_EFA" ]]; then + build_ccl_efa "$TARGET" "$ARCH" "$IS_EFA" + else + build_ccl_rdma "$TARGET" "$ARCH" "$IS_EFA" + fi + build_p2p "$TARGET" "$ARCH" "$IS_EFA" + build_ep "$TARGET" "$ARCH" "$IS_EFA" +else + echo "build_native: unknown BUILD_TYPE '$BUILD_TYPE'" >&2 + exit 1 +fi diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda index e45808620..1a99c4268 100644 --- a/docker/Dockerfile.cuda +++ b/docker/Dockerfile.cuda @@ -46,7 +46,7 @@ RUN CUDA_MAJOR=$(nvcc --version | grep -oP 'release \K[0-9]+') && \ # ───────────────────────────────────────────────────────── # Install Python build back-end (for Python ${PY_VER}) # 
───────────────────────────────────────────────────────── -RUN python${PY_VER} -m pip install --no-cache-dir build auditwheel pybind11 nanobind +RUN python${PY_VER} -m pip install --no-cache-dir auditwheel build wheel pybind11 nanobind RUN python${PY_VER} -m pip install --no-cache-dir --upgrade setuptools # ───── Set Python ${PY_VER} as default python3 and python3-config ───── diff --git a/docker/Dockerfile.efa b/docker/Dockerfile.efa index 0beecad1c..81ea2ca78 100644 --- a/docker/Dockerfile.efa +++ b/docker/Dockerfile.efa @@ -70,7 +70,7 @@ RUN CUDA_MAJOR=$(nvcc --version | grep -oP 'release \K[0-9]+') && \ # ───────────────────────────────────────────────────────── # Install Python build back-end (for Python ${PY_VER}) # ───────────────────────────────────────────────────────── -RUN python${PY_VER} -m pip install --no-cache-dir build auditwheel pybind11 nanobind +RUN python${PY_VER} -m pip install --no-cache-dir auditwheel build wheel pybind11 nanobind RUN python${PY_VER} -m pip install --no-cache-dir --upgrade setuptools # ───── Set Python ${PY_VER} as default python3 and python3-config ───── diff --git a/docker/Dockerfile.gh b/docker/Dockerfile.gh index c26538343..8aa5b55b0 100644 --- a/docker/Dockerfile.gh +++ b/docker/Dockerfile.gh @@ -44,7 +44,7 @@ RUN CUDA_MAJOR=$(nvcc --version | grep -oP 'release \K[0-9]+') && \ # ───────────────────────────────────────────────────────── # Install Python build back-end (for Python ${PY_VER}) # ───────────────────────────────────────────────────────── -RUN python${PY_VER} -m pip install --no-cache-dir build auditwheel pybind11 nanobind +RUN python${PY_VER} -m pip install --no-cache-dir auditwheel build wheel pybind11 nanobind RUN python${PY_VER} -m pip install --no-cache-dir --upgrade setuptools # ───── Set Python ${PY_VER} as default python3 and python3-config ───── diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 657212e3a..cc10f92ea 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ 
-42,7 +42,7 @@ RUN python${PY_VER} -m pip install --no-cache-dir --pre torch torchvision \ # ───────────────────────────────────────────────────────── # Install Python build back-end (for Python ${PY_VER}) # ───────────────────────────────────────────────────────── -RUN python${PY_VER} -m pip install --no-cache-dir build auditwheel pybind11 nanobind +RUN python${PY_VER} -m pip install --no-cache-dir auditwheel build wheel pybind11 nanobind RUN python${PY_VER} -m pip install --no-cache-dir --upgrade setuptools diff --git a/ep/Makefile b/ep/Makefile index 6754b4668..9f5b8b9a7 100644 --- a/ep/Makefile +++ b/ep/Makefile @@ -98,8 +98,8 @@ OBJ_CU := $(SRC_CU:.cu=.o) SRC_BIND := src/uccl_ep.cc OBJ_BIND := $(SRC_BIND:.cc=.o) -EP_EXT := ep$(EXT_SUFFIX) -PYTARGET := ep$(EXT_SUFFIX) +EP_EXT := ep_cpp$(EXT_SUFFIX) +PYTARGET := ep_cpp$(EXT_SUFFIX) .PHONY: all py clean @@ -131,17 +131,22 @@ $(EP_EXT): $(OBJ_CPP) $(OBJ_CU) $(OBJ_BIND) $(NB_OBJECTS) py: $(PYTARGET) $(EP_EXT) # Repo uccl package dir (so in-repo runs use the same build) -UCCL_UCCL := $(abspath $(CURDIR)/../uccl/uccl) +UCCL_UCCL := $(abspath $(CURDIR)/../uccl) + +EP_PYTHON_DIR := $(CURDIR)/python/uccl_ep install: $(EP_EXT) - @mkdir -p $(INSTALL_DIR) - @rm -f $(INSTALL_DIR)/ep.cpython-*.so $(INSTALL_DIR)/ep.abi3.so - @cp $(EP_EXT) $(INSTALL_DIR)/ - @echo "Installed $(EP_EXT) -> $(INSTALL_DIR)/" + @mkdir -p $(INSTALL_DIR)/ep + @rm -f $(INSTALL_DIR)/ep/ep_cpp.cpython-*.so $(INSTALL_DIR)/ep/ep_cpp.abi3.so + @cp $(EP_EXT) $(INSTALL_DIR)/ep/ + @cp $(EP_PYTHON_DIR)/*.py $(INSTALL_DIR)/ep/ + @echo "Installed uccl.ep (.so + .py) -> $(INSTALL_DIR)/ep/" @if [ -d "$(UCCL_UCCL)" ]; then \ - rm -f "$(UCCL_UCCL)"/ep.cpython-*.so "$(UCCL_UCCL)"/ep.abi3.so; \ - cp $(EP_EXT) "$(UCCL_UCCL)/"; \ - echo "Installed $(EP_EXT) -> $(UCCL_UCCL)/ (for in-repo runs)"; \ + mkdir -p "$(UCCL_UCCL)/ep"; \ + rm -f "$(UCCL_UCCL)/ep"/ep_cpp.cpython-*.so "$(UCCL_UCCL)/ep"/ep_cpp.abi3.so; \ + cp $(EP_EXT) "$(UCCL_UCCL)/ep/"; \ + cp $(EP_PYTHON_DIR)/*.py 
"$(UCCL_UCCL)/ep/"; \ + echo "Installed uccl.ep (.so + .py) -> $(UCCL_UCCL)/ep/ (for in-repo runs)"; \ fi # Clean all generated files diff --git a/ep/bench/buffer.py b/ep/bench/buffer.py index 9326f1bed..55a49afb9 100644 --- a/ep/bench/buffer.py +++ b/ep/bench/buffer.py @@ -1,38 +1,19 @@ import os from contextlib import nullcontext +from typing import Callable, List, Optional, Tuple, Union + import torch import torch.distributed as dist -from typing import Callable, Tuple, Optional, Union, List - -try: - from uccl import ep -except ImportError as exc: - import sys - - sys.stderr.write("Failed to import uccl.ep\n") - raise - -from uccl.ep import EventHandle, Config -# Support both execution modes: -# 1) As part of the packaged deep_ep_wrapper (symlinked buffer inside a package): uses relative import `.utils`. -# 2) As a standalone benchmark script from the `ep/bench` directory (no package): falls back to plain `utils`. -try: - from .utils import ( - EventOverlap, - check_nvlink_connections, - initialize_uccl, - destroy_uccl, - _fp8_e4m3_dtype, - ) -except ImportError: - from utils import ( - EventOverlap, - check_nvlink_connections, - initialize_uccl, - destroy_uccl, - _fp8_e4m3_dtype, - ) +from uccl.ep import ep_cpp +from uccl.ep import Config, EventHandle +from uccl.ep.utils import ( + EventOverlap, + check_nvlink_connections, + initialize_uccl, + destroy_uccl, + _fp8_e4m3_dtype, +) class Buffer: @@ -93,24 +74,26 @@ def __init__( else: device_index = torch.cuda.current_device() - if hasattr(ep, "get_rdma_buffer"): + if hasattr(ep_cpp, "get_rdma_buffer"): # Allocate outside PyTorch's CUDA allocator so RDMA/IPC sees a raw # cudaMalloc/cudaMallocHost-style allocation instead of a possibly # segmented caching-allocator mapping. 
- scratch_dlpack, rdma_buffer_is_host_allocated = ep.get_rdma_buffer( + scratch_dlpack, rdma_buffer_is_host_allocated = ep_cpp.get_rdma_buffer( num_rdma_bytes, device_index ) self.scratch = torch.utils.dlpack.from_dlpack(scratch_dlpack) else: rdma_buffer_is_host_allocated = False if num_rdma_bytes > 0: - if hasattr(ep, "can_register_rdma_gpu_buffer"): + if hasattr(ep_cpp, "can_register_rdma_gpu_buffer"): rdma_buffer_is_host_allocated = not bool( - ep.can_register_rdma_gpu_buffer(device_index, num_rdma_bytes) + ep_cpp.can_register_rdma_gpu_buffer( + device_index, num_rdma_bytes + ) ) - elif hasattr(ep, "rdma_buffer_should_use_host_alloc"): + elif hasattr(ep_cpp, "rdma_buffer_should_use_host_alloc"): rdma_buffer_is_host_allocated = bool( - ep.rdma_buffer_should_use_host_alloc( + ep_cpp.rdma_buffer_should_use_host_alloc( device_index, num_rdma_bytes ) ) @@ -154,7 +137,7 @@ def __init__( self.low_latency_mode = low_latency_mode self.explicitly_destroy = explicitly_destroy self._next_low_latency_combine_buffer = None - self.runtime = ep.Buffer( + self.runtime = ep_cpp.Buffer( self.rank, self.group_size, num_nvl_bytes, @@ -167,16 +150,12 @@ def __init__( self.runtime.set_rdma_buffer(rdma_buffer_ptr, rdma_buffer_is_host_allocated) # Synchronize device IDs - device_ids = [ - None, - ] * self.group_size + device_ids = [None] * self.group_size local_device_id = self.runtime.get_local_device_id() # print("Before all_gather_object device_ids", local_device_id, flush=True) dist.all_gather_object(device_ids, local_device_id, group) # Synchronize IPC handles - ipc_handles = [ - None, - ] * self.group_size + ipc_handles = [None] * self.group_size local_ipc_handle = self.runtime.get_local_ipc_handle() # print("Before all_gather_object ipc_handles", local_ipc_handle, flush=True) dist.all_gather_object(ipc_handles, local_ipc_handle, group) @@ -217,8 +196,8 @@ def reset_rdma_buffer(self): """ self.runtime.reset_rdma_buffer() - def connect_atomic_buffer(self, proxy: "ep.UcclProxy"): - 
ep.connect_atomic_buffer(proxy, self.runtime) + def connect_atomic_buffer(self, proxy: "ep_cpp.Proxy"): + ep_cpp.connect_atomic_buffer(proxy, self.runtime) def destroy(self): """ @@ -234,7 +213,7 @@ def destroy(self): @staticmethod def is_sm90_compiled(): - return ep.is_sm90_compiled() + return ep_cpp.is_sm90_compiled() @staticmethod def set_num_sms(new_num_sms: int) -> None: @@ -583,7 +562,7 @@ def get_low_latency_rdma_size_hint( Returns: size: the RDMA buffer size recommended. """ - return ep.get_low_latency_rdma_size_hint( + return ep_cpp.get_low_latency_rdma_size_hint( num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts ) diff --git a/ep/bench/rb/Makefile b/ep/bench/rb/Makefile index e3ce43a8d..af7753208 100644 --- a/ep/bench/rb/Makefile +++ b/ep/bench/rb/Makefile @@ -183,8 +183,12 @@ FC_LOAD_LATENCY_TARGET := test_fc_load_latency$(BUILD_SUFFIX) CAS_TARGET := test_cas_throughput$(BUILD_SUFFIX) # EP runtime targets (if EFA is available) +# NOTE: the native extension is named `ep_cpp` so that it can live as +# `uccl.ep.ep_cpp` without shadowing the `uccl.ep` Python package. The +# filename must match the `NB_MODULE(ep_cpp, ...)` declaration in +# ../src/uccl_ep.cc, otherwise Python fails to import it. ifeq ($(HAS_EFA),1) - EP_EXT := ep$(EXT_SUFFIX) + EP_EXT := ep_cpp$(EXT_SUFFIX) endif # Header dependencies @@ -328,9 +332,10 @@ py: $(EP_EXT) @echo "Python extension built: $(EP_EXT)" install: $(EP_EXT) - @mkdir -p $(INSTALL_DIR) - @cp $(EP_EXT) $(INSTALL_DIR)/ - @echo "Installation complete. Module installed to: $(INSTALL_DIR)/$(EP_EXT)" + @mkdir -p $(INSTALL_DIR)/ep + @rm -f $(INSTALL_DIR)/ep/ep_cpp.cpython-*.so $(INSTALL_DIR)/ep/ep_cpp.abi3.so + @cp $(EP_EXT) $(INSTALL_DIR)/ep/ + @echo "Installation complete. Module installed to: $(INSTALL_DIR)/ep/$(EP_EXT)" else $(EP_EXT): @echo "PyTorch not found. Skipping Python extension build." 
diff --git a/ep/bench/rb/benchmark_rdma_fifo.py b/ep/bench/rb/benchmark_rdma_fifo.py index b02eb7772..15bcce2b0 100644 --- a/ep/bench/rb/benchmark_rdma_fifo.py +++ b/ep/bench/rb/benchmark_rdma_fifo.py @@ -24,7 +24,7 @@ import torch.distributed as dist try: - from uccl import ep + from uccl.ep import ep_cpp as ep except ImportError as exc: sys.stderr.write("Failed to import ep\n") raise diff --git a/ep/bench/test_internode.py b/ep/bench/test_internode.py index 5127eac12..b32c02980 100644 --- a/ep/bench/test_internode.py +++ b/ep/bench/test_internode.py @@ -33,7 +33,8 @@ # noinspection PyUnresolvedReferences -from utils import ( + +from uccl.ep.utils import ( # type: ignore[no-redef] init_dist, bench, bench_kineto, @@ -48,7 +49,7 @@ ) # Test compatibility with low latency functions -from buffer import Buffer +from uccl.ep.buffer import Buffer # type: ignore[no-redef] try: from uccl.ep import Config diff --git a/ep/bench/utils.py b/ep/bench/utils.py index 64aae9155..09f8654a8 100644 --- a/ep/bench/utils.py +++ b/ep/bench/utils.py @@ -1,36 +1,21 @@ import inspect -from typing import Any, Optional, Tuple, Union +import glob import os import socket -import torch -import torch.distributed as dist -from typing import Optional -import glob import sys -from uccl.ep import EventHandle -import tempfile +import time import json +import tempfile from pathlib import Path -import time -import numpy as np - -# import deep_ep as ep -try: - from uccl import ep -except ImportError as exc: - import sys +from typing import Any, Optional, Tuple, Union - sys.stderr.write("Failed to import uccl.ep\n") - raise +import numpy as np +import torch +import torch.distributed as dist -# import deep_ep as ep -try: - from uccl import ep -except ImportError as exc: - import sys +from uccl.ep import ep_cpp - sys.stderr.write("Failed to import uccl.ep\n") - raise +EventHandle = ep_cpp.EventHandle def calc_diff(x: torch.Tensor, y: torch.Tensor): @@ -93,7 +78,7 @@ def init_dist_under_torchrun(local_rank: 
int, num_local_ranks: int): def _gather_peer_ips(group): # Gather local IP strings across ranks world = dist.get_world_size(group) - my_ip = ep.get_oob_ip() + my_ip = ep_cpp.get_oob_ip() ips = [None] * world dist.all_gather_object(ips, my_ip, group=group) return ips @@ -149,7 +134,7 @@ def get_peer_ip(rank: int, num_ranks: int, group: dist.ProcessGroup): def get_cpu_proxies_meta(proxies, rank, scratch_ptr, scratch_bytes, num_ranks, group): - my_ip = ep.get_oob_ip() + my_ip = ep_cpp.get_oob_ip() meta = { "rank": rank, "ptr": int(scratch_ptr), @@ -573,8 +558,8 @@ def initialize_uccl( proxies = [] - for i in range(ep.get_num_proxy_threads()): - proxy = ep.Proxy( + for i in range(ep_cpp.get_num_proxy_threads()): + proxy = ep_cpp.Proxy( thread_idx=i, gpu_buffer_addr=scratch_ptr, total_size=scratch_nbytes, @@ -599,7 +584,7 @@ def initialize_uccl( for proxy in proxies: proxy.set_peers_meta(peers_meta_list) - ep.register_proxies(local_rank, proxies) + ep_cpp.register_proxies(local_rank, proxies) # Set atomic buffer pointer for all proxies BEFORE starting them # This ensures the atomic buffer info is included in connection info exchange @@ -643,7 +628,7 @@ def destroy_uccl(proxies, workers): except Exception: pass try: - ep.unregister_proxy(device_index) + ep_cpp.unregister_proxy(device_index) except Exception: pass try: diff --git a/ep/deep_ep_wrapper/deep_ep/__init__.py b/ep/deep_ep_wrapper/deep_ep/__init__.py index 57374a2a5..c3a2fee22 100644 --- a/ep/deep_ep_wrapper/deep_ep/__init__.py +++ b/ep/deep_ep_wrapper/deep_ep/__init__.py @@ -1,15 +1,48 @@ -from uccl.ep import Config, EventHandle +"""Compatibility layer exposing :mod:`uccl.ep` through the historical ``deep_ep`` API.""" -from .utils import EventOverlap, check_nvlink_connections, initialize_uccl, destroy_uccl -from .buffer import Buffer -import torch.distributed as dist +from __future__ import annotations + +import sys + +try: # Preserve DeepEP's implicit torch import (best-effort only). 
+from uccl.ep import (  # noqa: F401 - symbols are re-exported.
+  as ``uccl.ep.<name>``.
files to INSTALL_DIR/ep/""" def run(self): # Run the standard build first @@ -62,32 +63,41 @@ def run(self): install_dir = os.getenv( "INSTALL_DIR", os.path.join(python_site_packages, "uccl") ) - os.makedirs(install_dir, exist_ok=True) - # Find the built .so file + ep_dir = os.path.join(install_dir, "ep") + os.makedirs(ep_dir, exist_ok=True) + + # --- Install Python source files from ep/python/uccl_ep/ --- + py_src_dir = PROJECT_ROOT / "python" / "uccl_ep" + if py_src_dir.is_dir(): + for py_file in py_src_dir.glob("*.py"): + dest = os.path.join(ep_dir, py_file.name) + print(f"Installing {py_file.name} to {ep_dir}") + shutil.copy2(py_file, dest) + + # --- Install the native .so --- build_lib = self.get_finalized_command("build_ext").build_lib - so_files = list(Path(build_lib).glob("ep*.so")) + so_files = list(Path(build_lib).glob("ep_cpp*.so")) if not so_files: raise RuntimeError(f"Could not find built .so file in {build_lib}") so_file = so_files[0] - dest_path = os.path.join(install_dir, so_file.name) + dest_path = os.path.join(ep_dir, so_file.name) - # Copy the .so file to the install directory - print(f"Installing {so_file.name} to {install_dir}") + print(f"Installing {so_file.name} to {ep_dir}") shutil.copy2(so_file, dest_path) if _use_abi3: - for old in Path(install_dir).glob("ep.cpython-*.so"): + for old in Path(ep_dir).glob("ep_cpp.cpython-*.so"): print(f"Removing stale {old.name}") old.unlink() else: - for old in Path(install_dir).glob("ep.abi3.so"): + for old in Path(ep_dir).glob("ep_cpp.abi3.so"): print(f"Removing stale {old.name}") old.unlink() - print(f"Installation complete. Module installed as: {dest_path}") + print(f"Installation complete. 
uccl.ep installed to: {ep_dir}") class CustomClean(Command): @@ -271,6 +281,7 @@ def run(self): else: # AMD GPU Architecture Detection detected_amd_arch = None + supported_amd_arch = ["gfx942", "gfx950"] try: rocminfo_output = subprocess.check_output( ["rocminfo"], stderr=subprocess.DEVNULL @@ -294,13 +305,16 @@ def run(self): ) # Use environment variable, then detected arch, then fallback - device_arch = os.getenv( - "TORCH_CUDA_ARCH_LIST", - detected_amd_arch if detected_amd_arch else "gfx420", + default_arch = ( + detected_amd_arch if detected_amd_arch else ";".join(supported_amd_arch) ) + device_arch = os.getenv("TORCH_CUDA_ARCH_LIST", default_arch) - for arch in device_arch.split(","): - nvcc_flags.append(f"--offload-arch={arch.lower()}") + nvcc_flags.extend( + f"--offload-arch={arch.lower()}" + for arch in re.split(r"[;,\s]+", device_arch) + if arch in supported_amd_arch + ) # Disable SM90 features on AMD cxx_flags.append("-DDISABLE_SM90_FEATURES") @@ -383,7 +397,7 @@ def run(self): version="0.0.1" + revision, ext_modules=[ CUDAExtension( - name="ep", + name="ep_cpp", include_dirs=include_dirs, library_dirs=library_dirs, sources=sources, diff --git a/ep/src/uccl_ep.cc b/ep/src/uccl_ep.cc index ccdcf6bb2..cdadc6528 100644 --- a/ep/src/uccl_ep.cc +++ b/ep/src/uccl_ep.cc @@ -1635,7 +1635,7 @@ class Buffer { nullptr}; // Device pointer to array of IPC base addresses }; -NB_MODULE(ep, m) { +NB_MODULE(ep_cpp, m) { m.doc() = "Minimal DeepEP-compatible shim with UCCL"; nb::class_(m, "Config") diff --git a/setup.py b/setup.py index 04b4f1cad..cd0e2898b 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,15 @@ +import os import re +import shutil +import subprocess import sys import sysconfig -from setuptools import setup, find_packages, Extension +import importlib.util +from pathlib import Path +from distutils import log +from typing import Optional +from setuptools import setup, find_packages, Extension, Command +from setuptools.command.build_ext import build_ext as 
_build_ext def _is_freethreaded(): @@ -28,6 +36,7 @@ def get_version(): VERSION = get_version() + # Single package "uccl" for all backends (vLLM-style). # Variants are distinguished by PEP 440 local version identifiers in the # wheel filename (e.g. uccl-0.1.0+cu13, uccl-0.1.0+cu12.efa). @@ -43,6 +52,211 @@ def get_version(): py_limited_api=_use_limited_api, define_macros=[("Py_LIMITED_API", "0x030C0000")] if _use_limited_api else [], ) + + +BUILD_SCRIPT = "build_native.sh" + + +class ShellExtension(Extension): + """Extension wrapper that drives external shell-script builds.""" + + def __init__(self, name, sourcedir=".", script=BUILD_SCRIPT, + targets=("all",), env=None): + super().__init__(name, sources=[]) + self.sourcedir = Path(sourcedir).resolve() + self.script = script + self.targets = list(targets) + self.env = env or {} + + +class ShellClean(Command): + """`python setup.py clean` -> run ``build_native.sh clean`` plus wipe + Python build dirs. + """ + + description = ( + "run `build_native.sh clean` and remove build/, dist/, *.egg-info/, etc." + ) + user_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + project_root = Path(__file__).parent.resolve() + + # 1. Delegate to build_native.sh so native sub-modules clean up + # their own .o/.d/.so files. + script = project_root / BUILD_SCRIPT + if script.exists(): + cmd = ["bash", str(script), "clean"] + log.info("running %s in %s", " ".join(cmd), project_root) + subprocess.run(cmd, cwd=str(project_root), check=False) + + # 2. Wipe top-level Python build artefacts. + targets = [ + project_root / "build", + project_root / "dist", + project_root / "wheelhouse", + project_root / "ep" / "build", + ] + targets += list(project_root.glob("*.egg-info")) + targets += list((project_root / "ep").glob("*.egg-info")) + + # 3. Stale shared libraries that may have been copied in-tree by an + # earlier build (e.g. 
uccl/lib/*.so, uccl/p2p*.so, + # ep/python/uccl_ep/ep_cpp*.so). + targets += list((project_root / "uccl" / "lib").glob("*.so")) + targets += list((project_root / "uccl").glob("p2p*.so")) + targets += list( + (project_root / "ep" / "python" / "uccl_ep").glob("ep_cpp*.so") + ) + + for path in targets: + if not path.exists() and not path.is_symlink(): + continue + log.info("removing %s", path) + if path.is_dir() and not path.is_symlink(): + shutil.rmtree(path, ignore_errors=True) + else: + try: + path.unlink() + except FileNotFoundError: + pass + + +class ShellBuildExtension(_build_ext): + """Custom build_ext that invokes ``build_native.sh`` before compiling + C extensions. + """ + + @staticmethod + def _find_rocm_home()->Optional[Path]: + """Return ROCm install dir as ``Path``, or ``None``. + + Adapted from pytorch ``_find_rocm_home``. + """ + # Guess #1 + rocm_home = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') + if rocm_home is None: + # Guess #2: Support for ROCm distribution from TheRock + # rocm-sdk-core installs everything under /_rocm_sdk_core + # (include/, lib/, bin/, ...), so the module's own location is the + # ROCM_HOME we want. Use find_spec to locate it without importing. 
+            # rocm-sdk-core installs everything under <site-packages>/_rocm_sdk_core
+ """ + if env.get("TARGET"): + return env["TARGET"] + rocm_home = ShellBuildExtension._find_rocm_home() + if rocm_home is None: + return "cu12" + major = ShellBuildExtension._detect_rocm_major(rocm_home) + return "roc6" if major == 6 else "roc7" + + def _get_build_output_dir(self): + """``uccl`` package output dir for build_native.sh. + + Editable/inplace -> source ``uccl/``; install/wheel -> ``build_lib/uccl``. + build_native.sh derives ``uccl.ep`` from this internally. + """ + if self.inplace or not self.build_lib: + return Path(__file__).parent.resolve() / "uccl" + return Path(self.build_lib).resolve() / "uccl" + + def run(self): + shell_exts = [ext for ext in self.extensions if isinstance(ext, ShellExtension)] + for ext in shell_exts: + self.build_shell_extension(ext) + # Drop shell-only extensions so the base class doesn't expect artifacts. + self.extensions = [ext for ext in self.extensions if not isinstance(ext, ShellExtension)] + super().run() + + def build_shell_extension(self, ext: ShellExtension): + env = os.environ.copy() + env.setdefault("PYTHON", sys.executable) + env.update(ext.env) + + # Drop PEP 517 build-env PYTHONPATH so the child build script's python + # sees the host venv (otherwise ``import torch`` in ep/setup.py fails). + env.pop("PYTHONPATH", None) + + # build_native.sh dispatches on TARGET; auto-pick when caller didn't. + env["TARGET"] = self._detect_target(env) + + # Tell build_native.sh where to drop the ``uccl`` package's + # artefacts; the script derives the ``uccl.ep`` target dir from + # this value internally. 
+ env["UCCL_PY_DIR"] = str(self._get_build_output_dir()) + + cmd = ["bash", str(ext.sourcedir / ext.script), *ext.targets] + + log.info("running `%s` in %s", " ".join(cmd), ext.sourcedir) + subprocess.check_call(cmd, cwd=str(ext.sourcedir), env=env) + + +shell_ext = ShellExtension( + name="uccl.shell", + sourcedir=Path(__file__).parent, + script=BUILD_SCRIPT, + targets=["all"], +) + setup( name="uccl", version=VERSION, @@ -51,17 +265,23 @@ def get_version(): long_description=open("README.md").read(), long_description_content_type="text/markdown", url="https://github.com/uccl-project/uccl", - packages=find_packages(), - ext_modules=[abi3_ext], + packages=find_packages(include=["uccl", "uccl.*", "uccl.ep"]) + ["uccl.ep"], + package_dir={ + "uccl": "uccl", + "uccl.ep": "ep/python/uccl_ep", + }, + ext_modules=[shell_ext, abi3_ext], package_data={ "uccl": [ "lib/*.so", "p2p*.so", - "ep*.so", "lib/*.a", "collective.py", "utils.py", ], + "uccl.ep": [ + "ep_cpp*.so", + ], }, license="Apache-2.0", install_requires=["intervaltree"], @@ -73,4 +293,8 @@ def get_version(): extras_require={ "rocm": [], }, + cmdclass={ + "build_ext": ShellBuildExtension, + "clean": ShellClean, + }, ) diff --git a/uccl/.gitignore b/uccl/.gitignore index 66df4f2ec..3613cecc1 100644 --- a/uccl/.gitignore +++ b/uccl/.gitignore @@ -1,3 +1,3 @@ collective.py utils.py -_rocm_init.py \ No newline at end of file +_rocm_init.py