cppalliance · sgerbino · Jun 2, 2026 · Jun 2, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -41,6 +41,7 @@ option(BOOST_CAPY_BUILD_EXAMPLES "Build boost::capy examples" ${BOOST_CAPY_IS_RO
 option(BOOST_CAPY_BUILD_BENCH "Build boost::capy benchmarks" ${BOOST_CAPY_IS_ROOT})
 option(BOOST_CAPY_BUILD_P2300_EXAMPLES "Build examples that depend on beman-execution (P2300)" OFF)
 option(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES "Build examples that depend on NVIDIA nvexec (CUDA)" OFF)
+option(BOOST_CAPY_BUILD_CUDA_EXAMPLES "Build examples that depend only on CUDA (no stdexec/nvexec)" OFF)
 option(BOOST_CAPY_MRDOCS_BUILD "Build the target for MrDocs: see mrdocs.yml" OFF)
 
 if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
@@ -60,14 +61,25 @@ if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
         message(FATAL_ERROR
             "BOOST_CAPY_BUILD_NVEXEC_EXAMPLES requires CMAKE_CXX_STANDARD >= 23")
     endif()
-    enable_language(CUDA)
-    find_package(CUDAToolkit REQUIRED)
     # Tell NVIDIA/stdexec to build the nvexec target when its
     # FetchContent is processed (bench/ and/or the example itself).
     set(STDEXEC_ENABLE_CUDA ON CACHE BOOL
         "Build nvexec when configuring NVIDIA/stdexec" FORCE)
 endif()
 
+# CUDA-only examples need C++20+; 98 is checked since it compares above 20.
+if(BOOST_CAPY_BUILD_CUDA_EXAMPLES AND (NOT DEFINED CMAKE_CXX_STANDARD
+        OR CMAKE_CXX_STANDARD LESS 20 OR CMAKE_CXX_STANDARD EQUAL 98))
+    message(FATAL_ERROR
+        "BOOST_CAPY_BUILD_CUDA_EXAMPLES requires CMAKE_CXX_STANDARD >= 20")
+endif()
+
+# The CUDA language and toolkit are set up once for both CUDA example sets.
+if(BOOST_CAPY_BUILD_CUDA_EXAMPLES OR BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
+    enable_language(CUDA)
+    find_package(CUDAToolkit REQUIRED)
+endif()
+
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 if(BOOST_CAPY_IS_ROOT AND BUILD_SHARED_LIBS)

diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
@@ -28,10 +28,16 @@ if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
     add_subdirectory(awaitable-sender)
 endif()
 
+if(BOOST_CAPY_BUILD_CUDA_EXAMPLES)
+    add_subdirectory(cuda/datamovement)
+endif()
+
 if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
-    add_subdirectory(gpu-pipeline)
+    add_subdirectory(cuda/pipeline)
 endif()
 
+add_subdirectory(fabrics)
+
 if(TARGET Boost::asio)
     add_subdirectory(asio)
 endif()

diff --git a/example/cuda/datamovement/CMakeLists.txt b/example/cuda/datamovement/CMakeLists.txt
@@ -0,0 +1,52 @@
+#
+# Copyright (c) 2026 Steve Gerbino
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+# The top-level CMakeLists.txt enables CUDA when the option is on; stop
+# here if this directory was reached without a CUDA compiler.
+if(NOT CMAKE_CUDA_COMPILER)
+    message(FATAL_ERROR "example/cuda/datamovement requires CUDA; "
+        "did you set BOOST_CAPY_BUILD_CUDA_EXAMPLES?")
+endif()
+
+# All *.cu/*.cuh/*.hpp files plus CMakeLists.txt and README.md under this
+# directory feed the target; CONFIGURE_DEPENDS re-checks the glob on build.
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
+    *.cu *.cuh *.hpp CMakeLists.txt README.md)
+
+add_executable(capy_example_cuda_datamovement ${PFILES})
+
+# Group the files by their on-disk layout for IDE project views.
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+target_compile_features(capy_example_cuda_datamovement PRIVATE cxx_std_20)
+
+target_link_libraries(capy_example_cuda_datamovement
+    PRIVATE Boost::capy CUDA::cudart)
+
+set_target_properties(capy_example_cuda_datamovement PROPERTIES
+    CUDA_STANDARD 20
+    CUDA_STANDARD_REQUIRED ON
+    CUDA_SEPARABLE_COMPILATION OFF
+    FOLDER "examples")
+
+# NCCL is optional: the interop snippet is built only when both the
+# header and the library are located; the rest builds either way.
+find_library(CAPY_NCCL_LIBRARY nccl)
+find_path(CAPY_NCCL_INCLUDE_DIR nccl.h)
+if(NOT CAPY_NCCL_INCLUDE_DIR OR NOT CAPY_NCCL_LIBRARY)
+    message(STATUS "cuda/datamovement: NCCL not found; skipping NCCL interop")
+else()
+    message(STATUS "cuda/datamovement: NCCL found; building NCCL interop")
+    target_compile_definitions(capy_example_cuda_datamovement
+        PRIVATE CAPY_EXAMPLE_HAS_NCCL=1)
+    target_include_directories(capy_example_cuda_datamovement
+        PRIVATE ${CAPY_NCCL_INCLUDE_DIR})
+    target_link_libraries(capy_example_cuda_datamovement
+        PRIVATE ${CAPY_NCCL_LIBRARY})
+endif()
diff --git a/example/cuda/datamovement/README.md b/example/cuda/datamovement/README.md
@@ -0,0 +1,62 @@
+# CUDA data-movement example (P4251R0)
+
+This target validates that the CUDA data-movement listings from
+P4251R0 "IoAwaitables for GPU Data Movement" are type-correct against the
+real `boost::capy` API and CUDA. The paper flags this code as AI-generated
+and unverified; this target proves that it compiles. Nothing here is
+executed at runtime.
+
+What is validated:
+
+- `cuda_stream_awaiter`: the io_env-less baseline. Asserted to be a
+  standard awaitable but **not** an `IoAwaitable`.
+- `cuda_stream`: `memcpy_h2d` / `memcpy_d2h` / `synchronize` return
+  `IoAwaitable`s.
+- NCCL interop: `ncclAllReduce` on `cuda_stream::native_handle()`
+  followed by `co_await synchronize()`. Built only when NCCL is found at
+  configure time.
+- `cuda_device_stream`: satisfies `WriteStream`, type-erases behind
+  `any_write_stream`, and the `ingest()` protocol handler compiles once
+  against both a GPU stream and an in-memory transport.
+- CUDA Graphs (`cuda_graphs.cu`): a captured graph is replayed inside
+  a coroutine that drives `cuda_stream` memcpy / synchronize.
+
+The non-GPU listings (the byte-oriented compound result and the
+RDMA/libfabric/UCX signatures) do not need CUDA and live in the sibling
+`example/fabrics` example. The sender bridge is in `example/cuda/pipeline`.
+
+## Prerequisites
+
+- NVIDIA GPU and driver visible to `nvidia-smi`.
+- CUDA toolkit (13.x works). On Arch: `pacman -S cuda`.
+- clang as host and CUDA compiler (verified with clang 22).
+- `CMAKE_CXX_STANDARD=20`.
+
+## Building
+
+```
+CXX=clang++ cmake -S . -B build-cuda -G Ninja \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_CXX_STANDARD=20 \
+    -DBOOST_CAPY_BUILD_CUDA_EXAMPLES=ON \
+    -DCMAKE_CUDA_COMPILER=clang++ \
+    -DCMAKE_CUDA_HOST_COMPILER=clang++ \
+    -DCMAKE_CUDA_ARCHITECTURES=89 \
+    -DCUDAToolkit_ROOT=/opt/cuda
+cmake --build build-cuda --config Release --target capy_example_cuda_datamovement
+```
+
+Replace `89` with your GPU's compute capability
+(`nvidia-smi --query-gpu=compute_cap --format=csv,noheader`).
+
+A clean build is the pass condition; the binary need not be run.
+
+## Scope
+
+No runtime execution and no multi-device topologies. A clean
+build with every `static_assert` holding is the whole deliverable. The
+NCCL snippet builds only when NCCL is found. NVSHMEM (a GPU member of the
+paper's HPC-fabric list) is not verified: `nvshmem_int_put` is device-side
+and its headers do not compile under clang-cuda (capy requires clang-cuda,
+since nvcc lacks C++20 coroutines). The non-GPU fabric signatures live in
+`example/fabrics`, and the sender bridge in `example/cuda/pipeline`.
diff --git a/example/cuda/datamovement/cuda_datamovement.cu b/example/cuda/datamovement/cuda_datamovement.cu
@@ -0,0 +1,92 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#include "cuda_datamovement.hpp"
+
+#include <boost/capy.hpp>
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/test/write_stream.hpp>
+
+#include <cstddef>
+#include <span>
+#include <system_error>
+#include <type_traits>
+#include <utility>
+
+namespace capy = boost::capy;
+namespace ex   = capy::example;
+
+// cuda_stream_awaiter deliberately carries no io_env, so it must not
+// model IoAwaitable.
+static_assert(!capy::IoAwaitable<ex::cuda_stream_awaiter>);
+
+// make_cuda_error is referenced by the paper but never defined there; the
+// data-movement awaitables need it to produce a std::error_code.
+static_assert(std::is_same<std::error_code,
+    decltype(ex::make_cuda_error(cudaSuccess))>::value);
+
+// Every asynchronous cuda_stream operation (both memcpy directions and
+// synchronize()) must return an IoAwaitable.
+static_assert(requires(ex::cuda_stream& s) {
+    { s.memcpy_h2d(nullptr, nullptr, std::size_t{0}) } -> capy::IoAwaitable;
+    { s.memcpy_d2h(nullptr, nullptr, std::size_t{0}) } -> capy::IoAwaitable;
+    { s.synchronize() } -> capy::IoAwaitable;
+});
+
+// cuda_device_stream (GPU device memory) must model WriteStream so it can
+// be type-erased; the zero per-operation allocation noted for the erased
+// form is not something this assertion checks.
+static_assert(capy::WriteStream<ex::cuda_device_stream>);
+
+// Transport-agnostic protocol handler: issues one write_some on the
+// type-erased destination and stops as soon as that write reports an error.
+capy::task<>
+ingest(capy::any_write_stream& dest, std::span<std::byte const> data)
+{
+    auto [err, written] = co_await dest.write_some(
+        capy::make_buffer(data.data(), data.size()));
+    if(err) co_return;
+    // ...protocol logic...
+}
+
+// Names ingest against two different transports so the "one .o, many
+// transports" claim has to compile. This function is never called.
+[[maybe_unused]] void
+link_check()
+{
+    std::byte payload[8]{};
+
+    ex::cuda_device_stream device(nullptr, nullptr);
+    capy::any_write_stream device_dest(&device);   // GPU device memory
+    (void) ingest(device_dest, payload);
+
+    capy::test::write_stream memory;
+    capy::any_write_stream memory_dest(&memory);   // in-memory transport
+    (void) ingest(memory_dest, payload);
+}
+
+#if defined(CAPY_EXAMPLE_HAS_NCCL)
+#include <nccl.h>
+
+// NCCL interop: the collective is enqueued on the CUDA stream and its
+// completion awaited via synchronize(), but only if the enqueue succeeded.
+capy::task<>
+all_reduce(ex::cuda_stream& cs, ncclComm_t comm,
+    float const* sendbuf, float* recvbuf, std::size_t count)
+{
+    ncclResult_t const rc = ncclAllReduce(sendbuf, recvbuf, count,
+        ncclFloat, ncclSum, comm, cs.native_handle());
+    if(rc == ncclSuccess) co_await cs.synchronize();
+}
+#endif
+
+// Entry point only so the target links; main's implicit return yields 0.
+int main()
+{
+}