diff --git a/CMakeLists.txt b/CMakeLists.txt index 69e3e9ff..d099480c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,7 @@ option(BOOST_CAPY_BUILD_EXAMPLES "Build boost::capy examples" ${BOOST_CAPY_IS_RO option(BOOST_CAPY_BUILD_BENCH "Build boost::capy benchmarks" ${BOOST_CAPY_IS_ROOT}) option(BOOST_CAPY_BUILD_P2300_EXAMPLES "Build examples that depend on beman-execution (P2300)" OFF) option(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES "Build examples that depend on NVIDIA nvexec (CUDA)" OFF) +option(BOOST_CAPY_BUILD_CUDA_EXAMPLES "Build examples that depend only on CUDA (no stdexec/nvexec)" OFF) option(BOOST_CAPY_MRDOCS_BUILD "Build the target for MrDocs: see mrdocs.yml" OFF) if(BOOST_CAPY_BUILD_P2300_EXAMPLES) @@ -60,14 +61,25 @@ if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES) message(FATAL_ERROR "BOOST_CAPY_BUILD_NVEXEC_EXAMPLES requires CMAKE_CXX_STANDARD >= 23") endif() - enable_language(CUDA) - find_package(CUDAToolkit REQUIRED) # Tell NVIDIA/stdexec to build the nvexec target when its # FetchContent is processed (bench/ and/or the example itself). set(STDEXEC_ENABLE_CUDA ON CACHE BOOL "Build nvexec when configuring NVIDIA/stdexec" FORCE) endif() +if(BOOST_CAPY_BUILD_CUDA_EXAMPLES) + if(NOT DEFINED CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD LESS 20) + message(FATAL_ERROR + "BOOST_CAPY_BUILD_CUDA_EXAMPLES requires CMAKE_CXX_STANDARD >= 20") + endif() +endif() + +# Enable the CUDA language once for whichever CUDA example set is requested. +if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES OR BOOST_CAPY_BUILD_CUDA_EXAMPLES) + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) +endif() + set_property(GLOBAL PROPERTY USE_FOLDERS ON) if(BOOST_CAPY_IS_ROOT AND BUILD_SHARED_LIBS) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index bae311ea..0d48896d 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -28,10 +28,16 @@ if(BOOST_CAPY_BUILD_P2300_EXAMPLES) add_subdirectory(awaitable-sender) endif() +if(BOOST_CAPY_BUILD_CUDA_EXAMPLES) + add_subdirectory(cuda/datamovement) +endif() + if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES) - add_subdirectory(gpu-pipeline) + add_subdirectory(cuda/pipeline) endif() +add_subdirectory(fabrics) + if(TARGET Boost::asio) add_subdirectory(asio) endif() diff --git a/example/cuda/datamovement/CMakeLists.txt b/example/cuda/datamovement/CMakeLists.txt new file mode 100644 index 00000000..ec626253 --- /dev/null +++ b/example/cuda/datamovement/CMakeLists.txt @@ -0,0 +1,52 @@ +# +# Copyright (c) 2026 Steve Gerbino +# +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +# +# Official repository: https://github.com/cppalliance/capy +# + +# CUDA was enabled at the top level when the option was flipped on. +if(NOT CMAKE_CUDA_COMPILER) + message(FATAL_ERROR + "example/cuda/datamovement requires CUDA; " + "did you set BOOST_CAPY_BUILD_CUDA_EXAMPLES?") +endif() + +file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS + *.cu *.cuh *.hpp + CMakeLists.txt + README.md) + +source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES}) + +add_executable(capy_example_cuda_datamovement ${PFILES}) + +set_target_properties(capy_example_cuda_datamovement PROPERTIES + FOLDER "examples" + CUDA_STANDARD 20 + CUDA_STANDARD_REQUIRED ON + CUDA_SEPARABLE_COMPILATION OFF) + +target_compile_features(capy_example_cuda_datamovement PRIVATE cxx_std_20) + +target_link_libraries(capy_example_cuda_datamovement PRIVATE + Boost::capy + CUDA::cudart) + +# The NCCL interop snippet compiles only when NCCL is available. +# Without it, the rest of the example still builds. +find_path(CAPY_NCCL_INCLUDE_DIR nccl.h) +find_library(CAPY_NCCL_LIBRARY nccl) +if(CAPY_NCCL_INCLUDE_DIR AND CAPY_NCCL_LIBRARY) + target_include_directories(capy_example_cuda_datamovement PRIVATE + ${CAPY_NCCL_INCLUDE_DIR}) + target_link_libraries(capy_example_cuda_datamovement PRIVATE + ${CAPY_NCCL_LIBRARY}) + target_compile_definitions(capy_example_cuda_datamovement PRIVATE + CAPY_EXAMPLE_HAS_NCCL=1) + message(STATUS "cuda/datamovement: NCCL found; building NCCL interop") +else() + message(STATUS "cuda/datamovement: NCCL not found; skipping NCCL interop") +endif() diff --git a/example/cuda/datamovement/README.md b/example/cuda/datamovement/README.md new file mode 100644 index 00000000..e645722c --- /dev/null +++ b/example/cuda/datamovement/README.md @@ -0,0 +1,62 @@ +# CUDA data-movement example (P4251R0) + +Validation that the CUDA data-movement listings from +P4251R0 "IoAwaitables for GPU Data Movement" are type-correct against the +real `boost::capy` API and CUDA. The paper flags this code as AI-generated +and unverified; this target proves it compiles. Nothing here is executed +at runtime. + +What is validated: + +- `cuda_stream_awaiter`: the io_env-less baseline. Asserted to be a + standard awaitable but **not** an `IoAwaitable`. +- `cuda_stream`: `memcpy_h2d` / `memcpy_d2h` / `synchronize` return + `IoAwaitable`s. +- NCCL interop: `ncclAllReduce` on `cuda_stream::native_handle()` + followed by `co_await synchronize()`. Built only when NCCL is found at + configure time. +- `cuda_device_stream`: satisfies `WriteStream`, type-erases behind + `any_write_stream`, and the `ingest()` protocol handler compiles once + against both a GPU stream and an in-memory transport. +- CUDA Graphs (`cuda_graphs.cu`): a captured graph is replayed inside + a coroutine that drives `cuda_stream` memcpy / synchronize. + +The non-GPU listings (the byte-oriented compound result and the +RDMA/libfabric/UCX signatures) do not need CUDA and live in the sibling +`example/fabrics` example. The sender bridge is in `example/cuda/pipeline`. + +## Prerequisites + +- NVIDIA GPU and driver visible to `nvidia-smi`. +- CUDA toolkit (13.x works). On Arch: `pacman -S cuda`. +- clang as host and CUDA compiler (verified with clang 22). +- `CMAKE_CXX_STANDARD=20`. + +## Building + +``` +CXX=clang++ cmake -S . -B build-cuda -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=20 \ + -DBOOST_CAPY_BUILD_CUDA_EXAMPLES=ON \ + -DCMAKE_CUDA_COMPILER=clang++ \ + -DCMAKE_CUDA_HOST_COMPILER=clang++ \ + -DCMAKE_CUDA_ARCHITECTURES=89 \ + -DCUDAToolkit_ROOT=/opt/cuda +cmake --build build-cuda --config Release --target capy_example_cuda_datamovement +``` + +Replace `89` with your GPU's compute capability +(`nvidia-smi --query-gpu=compute_cap --format=csv,noheader`). + +A clean build is the pass condition; the binary need not be run. + +## Scope + +No runtime execution and no multi-device topologies. A clean +build with every `static_assert` holding is the whole deliverable. The +NCCL snippet builds only when NCCL is found. NVSHMEM (a GPU member of the +paper's HPC-fabric list) is not verified: `nvshmem_int_put` is device-side +and its headers do not compile under clang-cuda (capy requires clang-cuda, +since nvcc lacks C++20 coroutines). The non-GPU fabric signatures live in +`example/fabrics`, and the sender bridge in `example/cuda/pipeline`. diff --git a/example/cuda/datamovement/cuda_datamovement.cu b/example/cuda/datamovement/cuda_datamovement.cu new file mode 100644 index 00000000..56924279 --- /dev/null +++ b/example/cuda/datamovement/cuda_datamovement.cu @@ -0,0 +1,92 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/capy +// + +#include "cuda_datamovement.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace capy = boost::capy; +namespace ex = capy::example; + +// Intentionally io_env-less: a standard awaitable, not an IoAwaitable. +static_assert(! capy::IoAwaitable); + +// The data-movement awaitables depend on this helper, which the paper +// references but never defines. +static_assert(std::is_same_v< + decltype(ex::make_cuda_error(cudaSuccess)), std::error_code>); + +// The memcpy member functions return IoAwaitables. +static_assert(capy::IoAwaitable< + decltype(std::declval().memcpy_h2d( + nullptr, nullptr, std::size_t{0}))>); +static_assert(capy::IoAwaitable< + decltype(std::declval().memcpy_d2h( + nullptr, nullptr, std::size_t{0}))>); +static_assert(capy::IoAwaitable< + decltype(std::declval().synchronize())>); + +// GPU device memory satisfies WriteStream and type-erases with zero +// per-operation allocation. +static_assert(capy::WriteStream); + +// A protocol handler compiled once, linked against any transport. +capy::task<> +ingest(capy::any_write_stream& dest, std::span data) +{ + auto [ec, n] = co_await dest.write_some( + capy::make_buffer(data.data(), data.size())); + if(ec) + co_return; + // ...protocol logic... +} + +// Reference ingest against two transports to force the "one .o, many +// transports" claim to compile. Never executed. +[[maybe_unused]] void +link_check() +{ + ex::cuda_device_stream gpu(nullptr, nullptr); + capy::any_write_stream gpu_dest(&gpu); // GPU device memory + + capy::test::write_stream mem; + capy::any_write_stream mem_dest(&mem); // in-memory transport + + std::byte payload[8]{}; + (void) ingest(gpu_dest, payload); + (void) ingest(mem_dest, payload); +} + +#if defined(CAPY_EXAMPLE_HAS_NCCL) +#include + +// NCCL interop: a collective enqueues onto the CUDA stream, then +// synchronize() awaits its completion through the same IoAwaitable path. +capy::task<> +all_reduce(ex::cuda_stream& cs, ncclComm_t comm, + float const* sendbuf, float* recvbuf, std::size_t count) +{ + ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, + comm, cs.native_handle()); + co_await cs.synchronize(); +} +#endif + +int main() +{ + return 0; +} diff --git a/example/cuda/datamovement/cuda_datamovement.hpp b/example/cuda/datamovement/cuda_datamovement.hpp new file mode 100644 index 00000000..4855cd67 --- /dev/null +++ b/example/cuda/datamovement/cuda_datamovement.hpp @@ -0,0 +1,353 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/capy +// + +#ifndef BOOST_CAPY_EXAMPLE_CUDA_DATAMOVEMENT_HPP +#define BOOST_CAPY_EXAMPLE_CUDA_DATAMOVEMENT_HPP + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +namespace boost { +namespace capy { +namespace example { + +/// Error category for `cudaError_t` values. +class cuda_error_category + : public std::error_category +{ +public: + char const* name() const noexcept override + { + return "cuda"; + } + + std::string message(int ev) const override + { + return ::cudaGetErrorString(static_cast(ev)); + } +}; + +/// Return the singleton CUDA error category. +inline std::error_category const& cuda_category() noexcept +{ + static cuda_error_category const cat; + return cat; +} + +/// Convert a `cudaError_t` to a `std::error_code`. +inline std::error_code make_cuda_error(cudaError_t e) noexcept +{ + return std::error_code(static_cast(e), cuda_category()); +} + +/// A minimal hand-rolled CUDA-completion awaitable (no executor +/// affinity, cancellation, or frame allocator). Resumes on the CUDA +/// driver callback thread. +struct cuda_stream_awaiter +{ + cudaStream_t stream; + + bool await_ready() const noexcept + { + return false; + } + + void await_suspend(std::coroutine_handle<> h) + { + cudaLaunchHostFunc(stream, + [](void* data) + { + std::coroutine_handle<>::from_address(data).resume(); + }, + h.address()); + } + + void await_resume() noexcept + { + } +}; + +/// A CUDA stream whose data-movement operations are IoAwaitables. +/// +/// `memcpy_h2d`/`memcpy_d2h` issue a `cudaMemcpyAsync` and resume the +/// awaiting coroutine on `env->executor` when the stream's +/// `cudaLaunchHostFunc` callback fires. One operation is in flight per +/// stream at a time, so the resume context is a pre-allocated member +/// rather than a per-operation allocation. +class cuda_stream +{ + cudaStream_t stream_ = nullptr; + continuation cont_; + std::error_code error_; + + struct resume_ctx + { + executor_ref ex; + continuation* cont = nullptr; + }; + + resume_ctx ctx_; + + static void CUDART_CB + on_complete(void* arg) + { + auto* ctx = static_cast(arg); + ctx->ex.post(*ctx->cont); + } + + // The paper hardcodes HostToDevice and describes memcpy_d2h as "the + // same pattern"; a kind field unifies both without duplicating the + // awaitable. + struct copy_awaitable + { + cuda_stream* self; + void* dst; + void const* src; + std::size_t count; + cudaMemcpyKind kind; + + bool await_ready() const noexcept + { + return false; + } + + std::coroutine_handle<> + await_suspend(std::coroutine_handle<> h, io_env const* env) + { + auto err = cudaMemcpyAsync( + dst, src, count, kind, self->stream_); + if(err != cudaSuccess) + { + self->error_ = make_cuda_error(err); + return h; + } + self->cont_.h = h; + self->ctx_ = resume_ctx{env->executor, &self->cont_}; + err = cudaLaunchHostFunc( + self->stream_, &on_complete, &self->ctx_); + if(err != cudaSuccess) + { + self->error_ = make_cuda_error(err); + return h; + } + return std::noop_coroutine(); + } + + void await_resume() + { + if(self->error_) + throw std::system_error(self->error_); + self->error_ = {}; + } + }; + + struct sync_awaitable + { + cuda_stream* self; + + bool await_ready() const noexcept + { + return false; + } + + std::coroutine_handle<> + await_suspend(std::coroutine_handle<> h, io_env const* env) + { + self->cont_.h = h; + self->ctx_ = resume_ctx{env->executor, &self->cont_}; + auto err = cudaLaunchHostFunc( + self->stream_, &on_complete, &self->ctx_); + if(err != cudaSuccess) + { + self->error_ = make_cuda_error(err); + return h; + } + return std::noop_coroutine(); + } + + void await_resume() + { + if(self->error_) + throw std::system_error(self->error_); + self->error_ = {}; + } + }; + +public: + cuda_stream() + { + auto err = cudaStreamCreate(&stream_); + if(err != cudaSuccess) + throw std::system_error(make_cuda_error(err)); + } + + ~cuda_stream() + { + if(stream_) + cudaStreamDestroy(stream_); + } + + cuda_stream(cuda_stream&& other) noexcept + : stream_(std::exchange(other.stream_, nullptr)) + { + } + + cuda_stream& operator=(cuda_stream&& other) noexcept + { + if(this != &other) + { + if(stream_) + cudaStreamDestroy(stream_); + stream_ = std::exchange(other.stream_, nullptr); + } + return *this; + } + + cuda_stream(cuda_stream const&) = delete; + cuda_stream& operator=(cuda_stream const&) = delete; + + /// Return the underlying CUDA stream handle. + cudaStream_t native_handle() const noexcept + { + return stream_; + } + + /// Asynchronously copy `count` bytes from host `src` to device `dst`. + auto memcpy_h2d(void* dst, void const* src, std::size_t count) + { + return copy_awaitable{ + this, dst, src, count, cudaMemcpyHostToDevice}; + } + + /// Asynchronously copy `count` bytes from device `src` to host `dst`. + auto memcpy_d2h(void* dst, void const* src, std::size_t count) + { + return copy_awaitable{ + this, dst, src, count, cudaMemcpyDeviceToHost}; + } + + /// Asynchronously wait for all pending stream operations to complete. + auto synchronize() + { + return sync_awaitable{this}; + } +}; + +/// GPU device memory exposed as a WriteStream. +/// +/// Reshapes the `cuda_stream` memcpy pattern to satisfy `WriteStream`, so device +/// memory can hide behind `any_write_stream`. Because `cudaMemcpyAsync` +/// transfers the whole buffer in one operation, `write_some` never +/// performs a partial write. Errors are delivered via `io_result` +/// rather than exceptions. Does not own `stream_`; the caller is +/// responsible for the stream's lifetime. +class cuda_device_stream +{ + cudaStream_t stream_; + std::byte* d_ptr_; + std::size_t offset_ = 0; + continuation cont_; + std::error_code error_; + + struct resume_ctx + { + executor_ref ex; + continuation* cont = nullptr; + }; + + resume_ctx ctx_; + + static void CUDART_CB + on_complete(void* arg) + { + auto* ctx = static_cast(arg); + ctx->ex.post(*ctx->cont); + } + +public: + cuda_device_stream(cudaStream_t s, std::byte* device_ptr) + : stream_(s) + , d_ptr_(device_ptr) + { + } + + template + auto write_some(Buffers buffers) + { + struct awaitable + { + cuda_device_stream* self; + const_buffer buf; + + bool await_ready() const noexcept + { + return false; + } + + std::coroutine_handle<> + await_suspend(std::coroutine_handle<> h, io_env const* env) + { + auto n = buf.size(); + auto err = cudaMemcpyAsync( + self->d_ptr_ + self->offset_, + buf.data(), n, + cudaMemcpyHostToDevice, + self->stream_); + if(err != cudaSuccess) + { + self->error_ = make_cuda_error(err); + return h; + } + self->cont_.h = h; + self->ctx_ = resume_ctx{env->executor, &self->cont_}; + err = cudaLaunchHostFunc( + self->stream_, &on_complete, &self->ctx_); + if(err != cudaSuccess) + { + self->error_ = make_cuda_error(err); + return h; + } + return std::noop_coroutine(); + } + + io_result + await_resume() + { + if(self->error_) + { + auto ec = self->error_; + self->error_ = {}; + return {ec, 0}; + } + auto n = buf.size(); + self->offset_ += n; + return {{}, n}; + } + }; + return awaitable{this, *capy::begin(buffers)}; + } +}; + +} // namespace example +} // namespace capy +} // namespace boost + +#endif diff --git a/example/cuda/datamovement/cuda_graphs.cu b/example/cuda/datamovement/cuda_graphs.cu new file mode 100644 index 00000000..eb7e041a --- /dev/null +++ b/example/cuda/datamovement/cuda_graphs.cu @@ -0,0 +1,69 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/capy +// + +#include "cuda_datamovement.hpp" + +#include + +#include + +#include + +namespace capy = boost::capy; +namespace ex = capy::example; + +namespace { + +__global__ void +kernel_A(float* y, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < n) + y[i] += 1.0f; +} + +__global__ void +kernel_B(float* y, int n) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < n) + y[i] *= 2.0f; +} + +// A pre-captured CUDA graph is the inner optimized hot path; the +// coroutine is the outer, data-dependent loop (copy in, launch the graph, +// copy out). Graph replay and coroutine orchestration optimize different +// layers and compose without either subsuming the other. +[[maybe_unused]] capy::task<> +graph_replay(ex::cuda_stream& cs, float* d_y, float* h_y, int n) +{ + cudaStream_t stream = cs.native_handle(); + + cudaGraph_t graph; + cudaGraphExec_t instance; + dim3 grid(1); + dim3 block(static_cast(n)); + + cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); + kernel_A<<>>(d_y, n); + kernel_B<<>>(d_y, n); + cudaStreamEndCapture(stream, &graph); + + cudaGraphInstantiate(&instance, graph, 0); + + co_await cs.memcpy_h2d(d_y, h_y, n * sizeof(float)); + cudaGraphLaunch(instance, stream); + co_await cs.synchronize(); + co_await cs.memcpy_d2h(h_y, d_y, n * sizeof(float)); + + cudaGraphExecDestroy(instance); + cudaGraphDestroy(graph); +} + +} // namespace diff --git a/example/gpu-pipeline/CMakeLists.txt b/example/cuda/pipeline/CMakeLists.txt similarity index 83% rename from example/gpu-pipeline/CMakeLists.txt rename to example/cuda/pipeline/CMakeLists.txt index 4873cf04..3913b4cf 100644 --- a/example/gpu-pipeline/CMakeLists.txt +++ b/example/cuda/pipeline/CMakeLists.txt @@ -11,7 +11,7 @@ # Honor a clean error if the user wired around it. if(NOT CMAKE_CUDA_COMPILER) message(FATAL_ERROR - "example/gpu-pipeline requires CUDA; " + "example/cuda-pipeline requires CUDA; " "did you set BOOST_CAPY_BUILD_NVEXEC_EXAMPLES?") endif() @@ -41,17 +41,17 @@ file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES}) -add_executable(capy_example_gpu_pipeline ${PFILES}) +add_executable(capy_example_cuda_pipeline ${PFILES}) -set_target_properties(capy_example_gpu_pipeline PROPERTIES +set_target_properties(capy_example_cuda_pipeline PROPERTIES FOLDER "examples" CUDA_STANDARD 20 CUDA_STANDARD_REQUIRED ON CUDA_SEPARABLE_COMPILATION OFF) -target_compile_features(capy_example_gpu_pipeline PRIVATE cxx_std_23) +target_compile_features(capy_example_cuda_pipeline PRIVATE cxx_std_23) -target_link_libraries(capy_example_gpu_pipeline PRIVATE +target_link_libraries(capy_example_cuda_pipeline PRIVATE Boost::capy STDEXEC::stdexec STDEXEC::nvexec diff --git a/example/gpu-pipeline/Jamfile b/example/cuda/pipeline/Jamfile similarity index 100% rename from example/gpu-pipeline/Jamfile rename to example/cuda/pipeline/Jamfile diff --git a/example/gpu-pipeline/README.md b/example/cuda/pipeline/README.md similarity index 78% rename from example/gpu-pipeline/README.md rename to example/cuda/pipeline/README.md index 4ffb7ae4..719b48ad 100644 --- a/example/gpu-pipeline/README.md +++ b/example/cuda/pipeline/README.md @@ -1,8 +1,9 @@ -# GPU pipeline example +# CUDA pipeline example This example demonstrates that `boost::capy::await_sender` and `boost::capy::as_sender` compose with NVIDIA's `nvexec::stream_scheduler`, -not just with CPU schedulers. Two scenes: +not just with CPU schedulers. Two runnable scenes, plus a third that is +built but not run (P4251R0): 1. **Scene 1 (Direction 1).** A `boost::capy::task` running on `boost::capy::thread_pool` `co_await`s a sender whose terminal action is @@ -23,6 +24,16 @@ not just with CPU schedulers. Two scenes: or returns `noop_coroutine()` after stashing the continuation for the peer to resume. +3. **Scene 3 (P4251R0), built but not run.** `handle_request` shows the + inference-handler shape: a type-erased `any_read_source` read, GPU + dispatch via `await_sender` over a real nvexec kernel, and a type-erased + `any_write_sink` write. It is compiled but not executed (`main` does not + call it). The paper's listing runs a host `run_model()` under a + device-side `then()`, which does not compile on nvexec (host call from + device); this mirrors Scene 1's pattern instead, dispatching a real + kernel and hopping `continues_on(cpu)` before the host-only bridge, and + takes a CPU scheduler the paper's signature omits. + The bridge headers (`awaitable_sender.hpp`, `sender_awaitable.hpp`) are copied verbatim from `bench/stdexec/`; the bridge in the bench was already written against NVIDIA/stdexec. @@ -53,8 +64,8 @@ CXX=clang++ cmake -S . -B build \ -DCUDAToolkit_ROOT=/opt/cuda \ -DBOOST_CAPY_BUILD_STDEXEC_EXAMPLES=ON \ -DBOOST_CAPY_BUILD_NVEXEC_EXAMPLES=ON -cmake --build build --config Release --target capy_example_gpu_pipeline -./build/example/gpu-pipeline/capy_example_gpu_pipeline +cmake --build build --config Release --target capy_example_cuda_pipeline +./build/example/cuda-pipeline/capy_example_cuda_pipeline ``` Replace `89` with your GPU's compute capability (`nvidia-smi diff --git a/example/gpu-pipeline/awaitable_sender.hpp b/example/cuda/pipeline/awaitable_sender.hpp similarity index 100% rename from example/gpu-pipeline/awaitable_sender.hpp rename to example/cuda/pipeline/awaitable_sender.hpp diff --git a/example/gpu-pipeline/gpu_pipeline.cu b/example/cuda/pipeline/cuda_pipeline.cu similarity index 82% rename from example/gpu-pipeline/gpu_pipeline.cu rename to example/cuda/pipeline/cuda_pipeline.cu index 471c2fc4..1071bd34 100644 --- a/example/gpu-pipeline/gpu_pipeline.cu +++ b/example/cuda/pipeline/cuda_pipeline.cu @@ -23,12 +23,15 @@ #include "sender_awaitable.hpp" #include +#include +#include #include #include #include #include +#include #include #include #include @@ -126,6 +129,57 @@ scene1(nvexec::stream_scheduler gpu, co_return h_y0; } +// Scene 3 (P4251R0): the inference-handler shape. Network I/O uses +// type-erased coroutine streams (any_read_source / any_write_sink); GPU +// dispatch uses a sender bridged with await_sender. The paper's +// listing runs a host run_model() under a device-side then(), which does +// not compile on nvexec; this mirrors Scene 1 instead, dispatching a real +// kernel and hopping continues_on(cpu) before the host-only bridge. +[[maybe_unused]] capy::task<> +handle_request( + capy::any_read_source& client, + capy::any_write_sink& response, + nvexec::stream_context& gpu_ctx, + exec::static_thread_pool::scheduler cpu) +{ + // receive request (coroutine, type-erased) + std::array buf; + auto [ec, n] = co_await client.read_some( + capy::mutable_buffer(buf.data(), buf.size())); + if(ec) + co_return; + (void) n; + + // dispatch to GPU (sender, compile-time composition) + auto gpu = gpu_ctx.get_scheduler(); + constexpr int N = 64; + float* d_y = nullptr; + cuda_check(cudaMalloc(&d_y, N * sizeof(float)), "scene3 malloc"); + + co_await capy::await_sender( + ex::just(N, d_y) + | ex::continues_on(gpu) + | nvexec::launch({.grid_size = 1, .block_size = N}, + [] (cudaStream_t, int len, float* y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) + y[i] = static_cast(i); + }) + | ex::continues_on(cpu)); + + cuda_check(cudaSetDevice(0), "scene3 setdevice"); + std::array result{}; + cuda_check(cudaMemcpy(result.data(), d_y, + N * sizeof(float), cudaMemcpyDeviceToHost), "scene3 D2H"); + cuda_check(cudaFree(d_y), "scene3 free"); + + // send result back (coroutine, type-erased) + auto [wec, wn] = co_await capy::write(response, + capy::make_buffer(result.data(), result.size() * sizeof(float))); + (void) wec; + (void) wn; +} + // Adapter run_async-like driver: kicks off scene1 on the capy // thread_pool, blocks the caller until it completes, and returns // the result via the supplied storage. diff --git a/example/gpu-pipeline/sender_awaitable.hpp b/example/cuda/pipeline/sender_awaitable.hpp similarity index 100% rename from example/gpu-pipeline/sender_awaitable.hpp rename to example/cuda/pipeline/sender_awaitable.hpp diff --git a/example/fabrics/CMakeLists.txt b/example/fabrics/CMakeLists.txt new file mode 100644 index 00000000..ec2ed1dd --- /dev/null +++ b/example/fabrics/CMakeLists.txt @@ -0,0 +1,69 @@ +# +# Copyright (c) 2026 Steve Gerbino +# +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +# +# Official repository: https://github.com/cppalliance/capy +# + +# Transport-neutral, non-GPU listings from P4251R0: the byte-oriented +# compound-result pattern (capy only) and the HPC-fabric send signatures. +# No CUDA. + +file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS + *.cpp *.hpp + CMakeLists.txt + README.md) + +source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES}) + +add_executable(capy_example_fabrics ${PFILES}) + +set_target_properties(capy_example_fabrics PROPERTIES FOLDER "examples") + +target_compile_features(capy_example_fabrics PRIVATE cxx_std_20) + +target_link_libraries(capy_example_fabrics PRIVATE Boost::capy) + +# HPC-fabric signature checks: each call is compiled (never run) against the real +# library header only when that library is found, to verify the paper's +# signature. Each library is optional and independent. +find_path(CAPY_IBVERBS_INCLUDE_DIR infiniband/verbs.h) +find_library(CAPY_IBVERBS_LIBRARY ibverbs) +if(CAPY_IBVERBS_INCLUDE_DIR AND CAPY_IBVERBS_LIBRARY) + target_include_directories(capy_example_fabrics PRIVATE + ${CAPY_IBVERBS_INCLUDE_DIR}) + target_link_libraries(capy_example_fabrics PRIVATE + ${CAPY_IBVERBS_LIBRARY}) + target_compile_definitions(capy_example_fabrics PRIVATE + CAPY_EXAMPLE_HAS_IBVERBS=1) + message(STATUS "fabrics: libibverbs found; verifying ibv_post_send") +endif() + +find_path(CAPY_LIBFABRIC_INCLUDE_DIR rdma/fabric.h) +find_library(CAPY_LIBFABRIC_LIBRARY fabric) +if(CAPY_LIBFABRIC_INCLUDE_DIR AND CAPY_LIBFABRIC_LIBRARY) + target_include_directories(capy_example_fabrics PRIVATE + ${CAPY_LIBFABRIC_INCLUDE_DIR}) + target_link_libraries(capy_example_fabrics PRIVATE + ${CAPY_LIBFABRIC_LIBRARY}) + target_compile_definitions(capy_example_fabrics PRIVATE + CAPY_EXAMPLE_HAS_LIBFABRIC=1) + message(STATUS "fabrics: libfabric found; verifying fi_send") +endif() + +# UCX ships in the official 'openucx' package; it is also bundled in the +# NVIDIA HPC SDK. Pass -DCAPY_UCX_INCLUDE_DIR/-DCAPY_UCX_LIBRARY to point at +# a non-standard location. +find_path(CAPY_UCX_INCLUDE_DIR ucp/api/ucp.h) +find_library(CAPY_UCX_LIBRARY ucp) +if(CAPY_UCX_INCLUDE_DIR AND CAPY_UCX_LIBRARY) + target_include_directories(capy_example_fabrics PRIVATE + ${CAPY_UCX_INCLUDE_DIR}) + target_link_libraries(capy_example_fabrics PRIVATE + ${CAPY_UCX_LIBRARY}) + target_compile_definitions(capy_example_fabrics PRIVATE + CAPY_EXAMPLE_HAS_UCX=1) + message(STATUS "fabrics: UCX found; verifying ucp_tag_send_nbx") +endif() diff --git a/example/fabrics/README.md b/example/fabrics/README.md new file mode 100644 index 00000000..d1fda947 --- /dev/null +++ b/example/fabrics/README.md @@ -0,0 +1,43 @@ +# Fabrics example (P4251R0) + +The transport-neutral, non-GPU listings from P4251R0 "IoAwaitables for GPU +Data Movement". Validation that the paper's byte-oriented and +HPC-fabric calls are type-correct against the real `boost::capy` API and the +installed fabric libraries. Nothing here is executed; a clean build is the +deliverable. + +Unlike the `cuda/` examples, this needs **no CUDA toolchain**: only a +C++20 compiler and `boost::capy`, plus whichever fabric libraries happen to +be installed. + +What is validated: + +- `read_with_reset`: `read_some` delivers `(error_code, n)` via structured + bindings; the coroutine branches on a partial-read condition with no + sender channel to choose. Pure capy, no transport library. +- HPC-fabric send signatures, each built only when its library is found: + - libibverbs `ibv_post_send` (RDMA / InfiniBand) + - libfabric `fi_send` (OFI completion-queue model) + - UCX `ucp_tag_send_nbx` (progress-engine callback model) + +NCCL and NVSHMEM are the GPU members of the paper's fabric list; NCCL is +exercised by the `cuda/datamovement` example, and NVSHMEM's device API does +not compile under clang-cuda (see that example's notes). + +## Building + +This builds as part of the normal example set (`BOOST_CAPY_BUILD_EXAMPLES`). +The fabric checks activate automatically when their libraries are present: + +- libibverbs: `libibverbs` package. +- libfabric: `libfabric` package. +- UCX: `openucx` package, or pass + `-DCAPY_UCX_INCLUDE_DIR= -DCAPY_UCX_LIBRARY=` to point at a + non-standard location (for example the UCX bundled in the NVIDIA HPC SDK). + +``` +cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20 +cmake --build build --config Release --target capy_example_fabrics +``` + +A clean build is the pass condition; the binary need not be run. diff --git a/example/fabrics/fabrics.cpp b/example/fabrics/fabrics.cpp new file mode 100644 index 00000000..258fadb7 --- /dev/null +++ b/example/fabrics/fabrics.cpp @@ -0,0 +1,106 @@ +// +// Copyright (c) 2026 Steve Gerbino +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/capy +// + +// The transport-neutral, non-GPU listings from P4251R0: the byte-oriented +// compound-result pattern (capy only) and the HPC-fabric send-call +// signatures (libibverbs / libfabric / UCX). Nothing is executed; the +// build itself is the check. Each fabric block builds only when found. + +#include +#include + +#include +#include + +namespace capy = boost::capy; + +namespace { + +// A byte-oriented read delivers (error_code, n) via structured bindings; +// the coroutine branches on a partial-read condition (the peer reset after +// n bytes arrived) with no sender channel to choose. The same compound +// result is what RDMA work completions carry. +[[maybe_unused]] capy::task<> +read_with_reset(capy::test::stream& s) +{ + std::byte buf[64]; + auto [ec, n] = co_await s.read_some( + capy::mutable_buffer(buf, sizeof buf)); + if(ec == std::errc::connection_reset) + { + // 'n' bytes arrived before the reset. + (void) n; + co_return; + } + (void) n; +} + +} // namespace + +#if defined(CAPY_EXAMPLE_HAS_IBVERBS) +#include + +namespace { + +// libibverbs: completion via a completion-channel file descriptor. +[[maybe_unused]] void +sig_ibverbs() +{ + ibv_qp* qp = nullptr; + ibv_send_wr wr{}; + ibv_send_wr* bad_wr = nullptr; + (void) ibv_post_send(qp, &wr, &bad_wr); +} + +} // namespace +#endif + +#if defined(CAPY_EXAMPLE_HAS_LIBFABRIC) +#include + +namespace { + +// libfabric: completion via a completion-queue poll. +[[maybe_unused]] void +sig_libfabric() +{ + fid_ep* ep = nullptr; + char buffer[16]; + fi_addr_t dest_addr = 0; + void* context = nullptr; + (void) fi_send(ep, buffer, sizeof buffer, nullptr, dest_addr, context); +} + +} // namespace +#endif + +#if defined(CAPY_EXAMPLE_HAS_UCX) +#include + +namespace { + +// UCX: completion via a callback from the progress engine. +[[maybe_unused]] void +sig_ucx() +{ + ucp_ep_h ep = nullptr; + char buffer[16]; + ucp_tag_t tag = 0; + ucp_request_param_t param{}; + (void) ucp_tag_send_nbx(ep, buffer, sizeof buffer, tag, ¶m); +} + +} // namespace +#endif + +// The target exists to prove the listings are type-correct; it is not run. +int main() +{ + return 0; +}