Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
2aa9bce
add args in transport
derekwin Apr 10, 2026
755a286
signal only last chunk of each qp
derekwin Apr 11, 2026
d5486bf
add NicUsable for p2p
derekwin Apr 11, 2026
8f23d25
Revert "signal only last chunk of each qp"
derekwin Apr 11, 2026
f051012
Revert "add args in transport"
derekwin Apr 11, 2026
15ad87b
async batch submit
derekwin Apr 11, 2026
4764363
Revert "async batch submit"
derekwin Apr 11, 2026
7190eaf
resource pool, reduce lock
derekwin Apr 11, 2026
29e2bfc
Revert "resource pool, reduce lock"
derekwin Apr 11, 2026
4856d45
add p2p rdma as adapter
derekwin Apr 13, 2026
d2b02db
update markdown
derekwin Apr 13, 2026
11b42c6
fix bug of tcp
derekwin Apr 13, 2026
d8b5062
refactor rdma adapter
derekwin Apr 14, 2026
8277c8e
-DNDEBUG fix
derekwin Apr 14, 2026
e25193d
update rdma mr
derekwin Apr 14, 2026
fce16c3
fix mr exchange
derekwin Apr 14, 2026
0df68b2
transport/oob: refactor to put-get-subscribe-sync and fix duplicate c…
derekwin Apr 17, 2026
7a39e01
transport/communicator: migrate OOB bootstrap to put/get with namespa…
derekwin Apr 17, 2026
eb3c8e2
transport: unify buffer_id-based MR/IPC async resource APIs
derekwin Apr 17, 2026
586105f
unified buffer id
derekwin Apr 17, 2026
6501951
valid ctx
derekwin Apr 20, 2026
59269d6
split uccl with rdma
derekwin Apr 20, 2026
7d5850f
split rdma req
derekwin Apr 21, 2026
25676b1
fix some p2p bug(maybe)
derekwin Apr 21, 2026
b244a75
refactor rdma_adapter by NICEndpoint
derekwin Apr 21, 2026
fa783b0
replace assert bcus -DNDEBUG
derekwin Apr 21, 2026
f55b6fd
update test p2p to support env
derekwin Apr 21, 2026
b3bb2f3
force rdma use two-sides
derekwin Apr 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 22 additions & 13 deletions experimental/ukernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,12 @@ NVCCFLAGS += -DUSE_REDIS_OOB --compiler-options "$(REDIS_CFLAGS)"
LDFLAGS += $(REDIS_LIBS)
endif

# UCCL collective support
# RDMA core dependency (collective/rdma static library)
UCCL_ROOT := $(abspath ../../)
UCCL_RDMA_ROOT := $(UCCL_ROOT)/collective/rdma
UCCL_RDMA_STATIC := $(UCCL_RDMA_ROOT)/librdma.a
UCCL_CFLAGS := -I$(UCCL_ROOT) -I$(UCCL_ROOT)/collective/rdma -I$(UCCL_ROOT)/include
UCCL_CFLAGS := -I$(UCCL_ROOT) -I$(UCCL_ROOT)/collective/rdma -I$(UCCL_ROOT)/include \
-I$(UCCL_ROOT)/p2p -I$(UCCL_ROOT)/p2p/include -I$(UCCL_ROOT)/p2p/rdma
UCCL_LDFLAGS := $(UCCL_RDMA_STATIC) -libverbs -lrdmacm

CXXFLAGS += $(UCCL_CFLAGS)
Expand Down Expand Up @@ -118,10 +119,10 @@ BENCH_BINS := $(BENCH_TRANSPORT_BIN) $(BENCH_GDRCOPY_BIN)
STATIC_LIB := libukernel.a
SHARED_LIB := libukernel.so

# Build the core ukernel artifacts plus top-level benchmarks.
all: $(STATIC_LIB) $(SHARED_LIB) bench
# Build the core ukernel artifacts plus benchmarks.
all: $(STATIC_LIB) $(SHARED_LIB) bench_all

# Compilation rules - local includes must come before system/UCCL includes to avoid header conflicts
# Compilation rules - local includes must come before system/RDMA headers to avoid conflicts
%.o: %.cc
$(CXX) $(LOCAL_INCLUDES) $(CXXFLAGS) $(INCLUDES) -MMD -MP -c $< -o $@

Expand All @@ -131,34 +132,44 @@ all: $(STATIC_LIB) $(SHARED_LIB) bench
%.o: %.cu
$(NVCC) $(LOCAL_INCLUDES) $(NVCCFLAGS) $(INCLUDES) -MMD -MP -c $< -o $@

ensure-uccl-rdma:
ensure-rdma-core:
@if [ ! -f "$(UCCL_RDMA_STATIC)" ]; then \
echo "[ukernel] building collective/rdma..."; \
$(MAKE) -C "$(UCCL_RDMA_ROOT)" build; \
fi

# Backward-compatible alias.
ensure-uccl-rdma: ensure-rdma-core

# Static library
$(STATIC_LIB): $(OBJ_CC) $(OBJ_CU)
ar rcs $@ $^

# Shared library
$(SHARED_LIB): $(OBJ_CC) $(OBJ_CU) | ensure-uccl-rdma
$(SHARED_LIB): $(OBJ_CC) $(OBJ_CU) | ensure-rdma-core
$(CXX) -shared $(CXXFLAGS) $(INCLUDES) $^ $(LDFLAGS) -o $@

# Benchmark build rules
$(BENCH_TRANSPORT_OBJ): $(BENCH_TRANSPORT_SRC)
$(CXX) $(LOCAL_INCLUDES) $(CXXFLAGS) $(INCLUDES) -MMD -MP -c $< -o $@

$(BENCH_TRANSPORT_BIN): $(BENCH_TRANSPORT_OBJ) $(STATIC_LIB) | ensure-uccl-rdma
$(BENCH_TRANSPORT_BIN): $(BENCH_TRANSPORT_OBJ) $(STATIC_LIB) | ensure-rdma-core
$(CXX) $(LOCAL_INCLUDES) $(CXXFLAGS) $(INCLUDES) $< $(STATIC_LIB) $(LDFLAGS) -o $@

$(BENCH_GDRCOPY_OBJ): $(BENCH_GDRCOPY_SRC)
$(CXX) $(BENCH_GDRCOPY_CXXFLAGS) $(BENCH_GDRCOPY_INCLUDES) -MMD -MP -c $< -o $@

$(BENCH_GDRCOPY_BIN): $(BENCH_GDRCOPY_OBJ) | ensure-uccl-rdma
$(BENCH_GDRCOPY_BIN): $(BENCH_GDRCOPY_OBJ) | ensure-rdma-core
$(CXX) $< -o $@ $(BENCH_GDRCOPY_LIBS)

bench: $(BENCH_BINS)
bench_all: $(BENCH_BINS)

# Backward-compatible alias.
bench: bench_all

# Build only the transport benchmark binary.
# This rule must exist: bench_transport is declared .PHONY, and a phony target
# with no rule "succeeds" without building anything.
bench_transport: $(BENCH_TRANSPORT_BIN)

# Backward-compatible alias for bench_transport.
transport_bench: bench_transport

# Build only the GDRCopy benchmark binary.
bench_gdrcopy: $(BENCH_GDRCOPY_BIN)

device_test:
$(MAKE) -C src/device test $(SUBMAKE_SM)
Expand Down Expand Up @@ -196,6 +207,4 @@ clean:
# src/device/persistent_kernel_ops.o: src/device/persistent_kernel_ops.cu
# $(NVCC) $(NVCCFLAGS) -Iinclude -Isrc/transport -Isrc/device -Isrc/device/fifo -MMD -MP -c $< -o $@

.PHONY: all clean bench transport_bench device_test ccl_test transport_test transport_suite device_bench ensure-uccl-rdma

transport_bench: $(BENCH_TRANSPORT_BIN)
.PHONY: all clean bench bench_all bench_transport bench_gdrcopy transport_bench device_test ccl_test transport_test transport_suite device_bench ensure-rdma-core ensure-uccl-rdma
230 changes: 48 additions & 182 deletions experimental/ukernel/README.md
Original file line number Diff line number Diff line change
@@ -1,36 +1,19 @@
# UKernel

Minimal build and test entry points for `experimental/ukernel`.
`experimental/ukernel` is a layered runtime with three core modules:

## Prerequisites

- CUDA toolchain for NVIDIA builds
- RDMA / verbs dependencies used by `transport` and `ccl`
- system-installed GDRCopy (`gdrapi.h` + `libgdrapi`) for `device` and `ccl`
- `torchrun` available for CCL multiprocess integration tests

## Quick Install GDRCopy (System)

`experimental/ukernel` no longer builds `thirdparty/gdrcopy`.
Please install GDRCopy from NVIDIA upstream on your machine first.

Ubuntu quick path (source build):
- `transport`: peer bootstrap, path selection, async send/recv, progress
- `device`: device-side FIFO/task/worker execution
- `ccl`: collective planning + executor on top of transport/device

```bash
sudo apt-get update
sudo apt-get install -y build-essential dkms linux-headers-$(uname -r) libelf-dev
git clone https://github.com/NVIDIA/gdrcopy.git
cd gdrcopy
make CUDA=/usr/local/cuda
sudo make CUDA=/usr/local/cuda prefix=/usr/local install
```
## Prerequisites

Optional: if `libgdrapi.so` is not in the default linker path, pass `GDRCOPY_LIBDIR` when building:
- CUDA toolchain (NVIDIA build path)
- RDMA / verbs dependencies (`libibverbs`, `librdmacm`)
- system-installed GDRCopy (`gdrapi.h` + `libgdrapi.so`)
- `torchrun` for multiprocess CCL/Python integration tests

```bash
cd experimental/ukernel
make GDRCOPY_LIBDIR=/usr/local/lib
```
If `libgdrapi.so` is outside the default linker search paths, pass `GDRCOPY_LIBDIR` to `make`, e.g. `make GDRCOPY_LIBDIR=/usr/local/lib`.

## Build

Expand All @@ -42,204 +25,87 @@ make clean -f Makefile
make -j$(nproc) -f Makefile
```

AMD / ROCm:
ROCm:

```bash
cd experimental/ukernel
make clean -f Makefile.rocm
make -j$(nproc) -f Makefile.rocm
```

Common overrides:
Common override example:

```bash
make -f Makefile CUDA_PATH=/usr/local/cuda CONDA_LIB_HOME=/usr/lib SM=80
```

## Test
## Common Targets

Run all transport tests:
Tests:

```bash
cd experimental/ukernel
make transport_test
make device_test SM=80
make ccl_test SM=80
```

Build transport benchmark:

```bash
cd experimental/ukernel
make transport_bench
```

Manual two-process transport check:

```bash
cd experimental/ukernel/src/transport
make test-integration
./test_transport_integration communicator --role=server --case=exchange --exchanger-port 16979
./test_transport_integration communicator --role=client --case=exchange --exchanger-ip 127.0.0.1 --exchanger-port 16979
```

Run all device tests:
Benchmarks:

```bash
cd experimental/ukernel
make device_test SM=80
make bench_transport
make bench_gdrcopy
make bench_all
```

Run all CCL tests:
Compatibility aliases:

```bash
cd experimental/ukernel
make ccl_test SM=80
make transport_bench # alias of bench_transport
make bench # alias of bench_all
```

## Python Binding

`experimental/ukernel/py` contains a `torch`-based Python extension that wraps
the `ccl` executor behind a persistent `ProcessGroup` object. The binding takes
CUDA `torch.Tensor` inputs directly and runs `allreduce` / `alltoall` through
the existing `transport + ccl + device` stack.

Build the extension in place:
Build extension:

```bash
cd experimental/ukernel/py
python setup.py build_ext --inplace
```

Run Python tests (requires 2+ GPUs):
Run common tests:

```bash
cd experimental/ukernel/py
CUDA_VISIBLE_DEVICES=6,7 torchrun --nproc_per_node=2 test_collective.py
CUDA_VISIBLE_DEVICES=0,6,7 torchrun --nproc_per_node=3 test_collective.py # for 3-rank tests
CUDA_VISIBLE_DEVICES=6,7 torchrun --nproc_per_node=2 test_p2p.py
```

Run p2p benchmark flavors:

# rdma battle
```bash
# Collective/rdma path
# UCCL_DEBUG_VLOG_LEVEL=1 : check fast path
UK_P2P_TRANSPORT=uccl UCCL_P2P_MODE=rdma NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_IB_DISABLE=0 NCCL_IB_HCA=mlx5_0 NCCL_DEBUG=INFO CUDA_VISIBLE_DEVICES=6,7 torchrun --nproc_per_node=2 bench_p2p.py
Size | ukernel (ms) | ukernel (GB/s) | UCCL (ms) | UCCL (GB/s) | NCCL (ms) | NCCL (GB/s)
--------------------------------------------------------------------------------------------------------------------------------
1024 B | 0.063 | 0.03 | 0.316 | 0.01 | 0.096 | 0.02
4096 B | 0.093 | 0.09 | 0.306 | 0.03 | 0.099 | 0.08
16384 B | 0.199 | 0.16 | 0.366 | 0.09 | 0.103 | 0.32
65536 B | 0.249 | 0.53 | 0.431 | 0.30 | 0.159 | 0.82
262144 B | 0.437 | 1.20 | 0.687 | 0.76 | 0.385 | 1.36
1048576 B | 1.238 | 1.69 | 1.846 | 1.14 | 1.244 | 1.69
4194304 B | 5.232 | 1.60 | 5.699 | 1.47 | 4.480 | 1.87
16777216 B | 19.853 | 1.69 | 20.715 | 1.62 | 17.434 | 1.92
67108864 B | 79.071 | 1.70 | 78.673 | 1.71 | 69.287 | 1.94
268435456 B | 319.966 | 1.68 | 319.911 | 1.68 | 275.867 | 1.95

Size | ukernel (ms) | ukernel (GB/s) | UCCL (ms) | UCCL (GB/s) | NCCL (ms) | NCCL (GB/s)
--------------------------------------------------------------------------------------------------------------------------------
1024 B | 0.074 | 0.03 | 0.711 | 0.00 | 0.097 | 0.02
4096 B | 0.086 | 0.10 | 0.316 | 0.03 | 0.098 | 0.08
16384 B | 0.196 | 0.17 | 0.240 | 0.14 | 0.099 | 0.33
65536 B | 0.234 | 0.56 | 0.300 | 0.44 | 0.150 | 0.88
262144 B | 0.416 | 1.26 | 0.631 | 0.83 | 0.372 | 1.41
1048576 B | 1.272 | 1.65 | 1.388 | 1.51 | 1.227 | 1.71
4194304 B | 5.083 | 1.65 | 5.726 | 1.46 | 4.475 | 1.87
16777216 B | 20.159 | 1.66 | 24.035 | 1.40 | 17.449 | 1.92
67108864 B | 79.323 | 1.69 | 78.945 | 1.70 | 69.256 | 1.94
268435456 B | 319.898 | 1.68 | 315.428 | 1.70 | 275.951 | 1.95

Size | ukernel (ms) | ukernel (GB/s) | UCCL (ms) | UCCL (GB/s) | NCCL (ms) | NCCL (GB/s)
--------------------------------------------------------------------------------------------------------------------------------
1024 B | 0.086 | 0.02 | 0.140 | 0.01 | 0.137 | 0.01
4096 B | 0.090 | 0.09 | 0.121 | 0.07 | 0.132 | 0.06
16384 B | 0.180 | 0.18 | 0.149 | 0.22 | 0.133 | 0.25
65536 B | 0.246 | 0.53 | 0.205 | 0.64 | 0.151 | 0.87
262144 B | 0.410 | 1.28 | 0.410 | 1.28 | 0.374 | 1.40
1048576 B | 1.268 | 1.65 | 1.243 | 1.69 | 1.225 | 1.71
4194304 B | 5.050 | 1.66 | 5.033 | 1.67 | 4.437 | 1.89
16777216 B | 19.814 | 1.69 | 19.923 | 1.68 | 17.447 | 1.92
67108864 B | 78.855 | 1.70 | 78.982 | 1.70 | 69.237 | 1.94
268435456 B | 319.691 | 1.68 | 315.106 | 1.70 | 276.135 | 1.94

# ipc battle
UK_P2P_TRANSPORT=ipc UCCL_P2P_MODE=ipc CUDA_VISIBLE_DEVICES=6,7 torchrun --nproc_per_node=2 bench_p2p.py
Size | ukernel (ms) | ukernel (GB/s) | UCCL (ms) | UCCL (GB/s) | NCCL (ms) | NCCL (GB/s)
--------------------------------------------------------------------------------------------------------------------------------
1024 B | 3.318 | 0.00 | 0.042 | 0.05 | 0.091 | 0.02
4096 B | 3.470 | 0.00 | 0.042 | 0.20 | 0.090 | 0.09
16384 B | 3.339 | 0.01 | 0.064 | 0.51 | 0.087 | 0.37
65536 B | 3.410 | 0.04 | 0.078 | 1.68 | 0.098 | 1.34
262144 B | 3.439 | 0.15 | 0.185 | 2.84 | 0.088 | 5.96
1048576 B | 3.602 | 0.58 | 0.566 | 3.70 | 0.102 | 20.66
4194304 B | 3.679 | 2.28 | 1.374 | 6.10 | 0.250 | 33.50
16777216 B | 4.104 | 8.18 | 4.683 | 7.17 | 0.799 | 41.99
67108864 B | 6.058 | 22.15 | 17.865 | 7.51 | 2.758 | 48.66
268435456 B | 13.548 | 39.63 | 70.594 | 7.61 | 10.814 | 49.64

# tcp
UK_P2P_TRANSPORT=tcp UCCL_P2P_MODE=ipc CUDA_VISIBLE_DEVICES=6,7 torchrun --nproc_per_node=2 bench_p2p.py
Size | ukernel (ms) | ukernel (GB/s)
------------------------------------------------------
1024 B | 3.318 | 0.00
4096 B | 3.470 | 0.00
16384 B | 3.339 | 0.01
65536 B | 3.410 | 0.04
262144 B | 3.439 | 0.15
1048576 B | 3.602 | 0.58
4194304 B | 3.679 | 2.28
16777216 B | 4.104 | 8.18
67108864 B | 6.058 | 22.15
268435456 B | 13.548 | 39.63
```

Minimal usage under `torchrun`:

```python
import os
import torch
from ukernel_ccl import ProcessGroup

rank = int(os.environ["RANK"])
world = int(os.environ["WORLD_SIZE"])
local_rank = int(os.environ.get("LOCAL_RANK", rank))

torch.cuda.set_device(local_rank)
pg = ProcessGroup(
rank=rank,
world_size=world,
gpu_id=local_rank,
exchanger_ip=os.environ.get("MASTER_ADDR", "127.0.0.1"),
exchanger_port=int(os.environ.get("MASTER_PORT", "29500")),
transport="auto",
)

x = torch.randn(1024 * world + 1, device="cuda", dtype=torch.float32)
pg.allreduce(x, tile_bytes=65536, num_flows=2)

y = torch.randn(1024 * world, device="cuda", dtype=torch.float32)
pg.alltoall(y, tile_bytes=65536, num_flows=2)

send = torch.randn(13, device="cuda", dtype=torch.float32)
recv = torch.empty(13, device="cuda", dtype=torch.float32)
dist.all_to_all_single(
recv,
send,
output_split_sizes=[4, 5, 4],
input_split_sizes=[4, 5, 4],
group=pg,
tile_bytes=65536,
num_flows=2,
)
# p2p/rdma path
UK_P2P_TRANSPORT=rdma UCCL_P2P_MODE=rdma NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_IB_DISABLE=0 NCCL_IB_HCA=mlx5_0 NCCL_DEBUG=INFO CUDA_VISIBLE_DEVICES=6,7 torchrun --nproc_per_node=2 bench_p2p.py

# IPC path
UK_P2P_TRANSPORT=ipc UCCL_P2P_MODE=ipc NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_IB_DISABLE=0 NCCL_IB_HCA=mlx5_0 NCCL_DEBUG=INFO CUDA_VISIBLE_DEVICES=6,7 torchrun --nproc_per_node=2 bench_p2p.py

# TCP path
UK_P2P_TRANSPORT=tcp UCCL_P2P_MODE=ipc NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_IB_DISABLE=0 NCCL_IB_HCA=mlx5_0 NCCL_DEBUG=INFO CUDA_VISIBLE_DEVICES=6,7 torchrun --nproc_per_node=2 bench_p2p.py
```

Current Python binding constraints:
Naming note:

- collective payload tensors must be CUDA and contiguous
- `allreduce` supports non-divisible element counts
- equal-split and variable-split `all_to_all_single` are both supported
- variable-split `all_to_all_single` requires explicit
`input_split_sizes/output_split_sizes` whose sums match local tensor numel
- `UK_P2P_TRANSPORT=uccl` is a compatibility-facing logical selector.
- In ukernel transport, this selector currently maps to the RDMA adapter backend.
- `UCCL_P2P_MODE` controls standalone UCCL behavior and is independent of ukernel adapter naming.

## Modules
## Module Docs

- [`src/transport/README.md`](/Users/jacelau/code/opencode/uccl/experimental/ukernel/src/transport/README.md)
- [`src/device/README.md`](/Users/jacelau/code/opencode/uccl/experimental/ukernel/src/device/README.md)
- [`src/ccl/README.md`](/Users/jacelau/code/opencode/uccl/experimental/ukernel/src/ccl/README.md)
- [`benchmarks/README.md`](/Users/jacelau/code/opencode/uccl/experimental/ukernel/benchmarks/README.md)
- [transport README](src/transport/README.md)
- [device README](src/device/README.md)
- [ccl README](src/ccl/README.md)
- [benchmarks README](benchmarks/README.md)
Loading
Loading