Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ option(BOOST_CAPY_BUILD_EXAMPLES "Build boost::capy examples" ${BOOST_CAPY_IS_RO
option(BOOST_CAPY_BUILD_BENCH "Build boost::capy benchmarks" ${BOOST_CAPY_IS_ROOT})
option(BOOST_CAPY_BUILD_P2300_EXAMPLES "Build examples that depend on beman-execution (P2300)" OFF)
option(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES "Build examples that depend on NVIDIA nvexec (CUDA)" OFF)
option(BOOST_CAPY_BUILD_CUDA_EXAMPLES "Build examples that depend only on CUDA (no stdexec/nvexec)" OFF)
option(BOOST_CAPY_MRDOCS_BUILD "Build the target for MrDocs: see mrdocs.yml" OFF)

if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
Expand All @@ -60,14 +61,25 @@ if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
message(FATAL_ERROR
"BOOST_CAPY_BUILD_NVEXEC_EXAMPLES requires CMAKE_CXX_STANDARD >= 23")
endif()
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
# Tell NVIDIA/stdexec to build the nvexec target when its
# FetchContent is processed (bench/ and/or the example itself).
set(STDEXEC_ENABLE_CUDA ON CACHE BOOL
"Build nvexec when configuring NVIDIA/stdexec" FORCE)
endif()

if(BOOST_CAPY_BUILD_CUDA_EXAMPLES)
if(NOT DEFINED CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD LESS 20)
message(FATAL_ERROR
"BOOST_CAPY_BUILD_CUDA_EXAMPLES requires CMAKE_CXX_STANDARD >= 20")
endif()
endif()

# Enable the CUDA language once for whichever CUDA example set is requested.
if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES OR BOOST_CAPY_BUILD_CUDA_EXAMPLES)
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
endif()

set_property(GLOBAL PROPERTY USE_FOLDERS ON)

if(BOOST_CAPY_IS_ROOT AND BUILD_SHARED_LIBS)
Expand Down
8 changes: 7 additions & 1 deletion example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,16 @@ if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
add_subdirectory(awaitable-sender)
endif()

if(BOOST_CAPY_BUILD_CUDA_EXAMPLES)
add_subdirectory(cuda/datamovement)
endif()

if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
add_subdirectory(gpu-pipeline)
add_subdirectory(cuda/pipeline)
endif()

add_subdirectory(fabrics)

if(TARGET Boost::asio)
add_subdirectory(asio)
endif()
Expand Down
52 changes: 52 additions & 0 deletions example/cuda/datamovement/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#
# Copyright (c) 2026 Steve Gerbino
#
# Distributed under the Boost Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#
# Official repository: https://github.com/cppalliance/capy
#

# CUDA was enabled at the top level when the option was flipped on.
if(NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR
"example/cuda/datamovement requires CUDA; "
"did you set BOOST_CAPY_BUILD_CUDA_EXAMPLES?")
endif()

file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
*.cu *.cuh *.hpp
CMakeLists.txt
README.md)

source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})

add_executable(capy_example_cuda_datamovement ${PFILES})

set_target_properties(capy_example_cuda_datamovement PROPERTIES
FOLDER "examples"
CUDA_STANDARD 20
CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF)

target_compile_features(capy_example_cuda_datamovement PRIVATE cxx_std_20)

target_link_libraries(capy_example_cuda_datamovement PRIVATE
Boost::capy
CUDA::cudart)

# The NCCL interop snippet compiles only when NCCL is available.
# Without it, the rest of the example still builds.
find_path(CAPY_NCCL_INCLUDE_DIR nccl.h)
find_library(CAPY_NCCL_LIBRARY nccl)
if(CAPY_NCCL_INCLUDE_DIR AND CAPY_NCCL_LIBRARY)
target_include_directories(capy_example_cuda_datamovement PRIVATE
${CAPY_NCCL_INCLUDE_DIR})
target_link_libraries(capy_example_cuda_datamovement PRIVATE
${CAPY_NCCL_LIBRARY})
target_compile_definitions(capy_example_cuda_datamovement PRIVATE
CAPY_EXAMPLE_HAS_NCCL=1)
message(STATUS "cuda/datamovement: NCCL found; building NCCL interop")
else()
message(STATUS "cuda/datamovement: NCCL not found; skipping NCCL interop")
endif()
62 changes: 62 additions & 0 deletions example/cuda/datamovement/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# CUDA data-movement example (P4251R0)

Validation that the CUDA data-movement listings from
P4251R0 "IoAwaitables for GPU Data Movement" are type-correct against the
real `boost::capy` API and CUDA. The paper flags this code as AI-generated
and unverified; this target proves it compiles. Nothing here is executed
at runtime.

What is validated:

- `cuda_stream_awaiter`: the io_env-less baseline. Asserted to be a
standard awaitable but **not** an `IoAwaitable`.
- `cuda_stream`: `memcpy_h2d` / `memcpy_d2h` / `synchronize` return
`IoAwaitable`s.
- NCCL interop: `ncclAllReduce` on `cuda_stream::native_handle()`
followed by `co_await synchronize()`. Built only when NCCL is found at
configure time.
- `cuda_device_stream`: satisfies `WriteStream`, type-erases behind
`any_write_stream`, and the `ingest()` protocol handler compiles once
against both a GPU stream and an in-memory transport.
- CUDA Graphs (`cuda_graphs.cu`): a captured graph is replayed inside
a coroutine that drives `cuda_stream` memcpy / synchronize.

The non-GPU listings (the byte-oriented compound result and the
RDMA/libfabric/UCX signatures) do not need CUDA and live in the sibling
`example/fabrics` example. The sender bridge is in `example/cuda/pipeline`.

## Prerequisites

- NVIDIA GPU and driver visible to `nvidia-smi`.
- CUDA toolkit (13.x works). On Arch: `pacman -S cuda`.
- clang as host and CUDA compiler (verified with clang 22).
- `CMAKE_CXX_STANDARD=20`.

## Building

```
CXX=clang++ cmake -S . -B build-cuda -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_STANDARD=20 \
-DBOOST_CAPY_BUILD_CUDA_EXAMPLES=ON \
-DCMAKE_CUDA_COMPILER=clang++ \
-DCMAKE_CUDA_HOST_COMPILER=clang++ \
-DCMAKE_CUDA_ARCHITECTURES=89 \
-DCUDAToolkit_ROOT=/opt/cuda
cmake --build build-cuda --config Release --target capy_example_cuda_datamovement
```

Replace `89` with your GPU's compute capability
(`nvidia-smi --query-gpu=compute_cap --format=csv,noheader`).

A clean build is the pass condition; the binary need not be run.

## Scope

No runtime execution and no multi-device topologies. A clean
build with every `static_assert` holding is the whole deliverable. The
NCCL snippet builds only when NCCL is found. NVSHMEM (a GPU member of the
paper's HPC-fabric list) is not verified: `nvshmem_int_put` is device-side
and its headers do not compile under clang-cuda (capy requires clang-cuda,
since nvcc lacks C++20 coroutines). The non-GPU fabric signatures live in
`example/fabrics`, and the sender bridge in `example/cuda/pipeline`.
92 changes: 92 additions & 0 deletions example/cuda/datamovement/cuda_datamovement.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
//
// Copyright (c) 2026 Steve Gerbino
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Official repository: https://github.com/cppalliance/capy
//

#include "cuda_datamovement.hpp"

#include <boost/capy.hpp>
#include <boost/capy/concept/io_awaitable.hpp>
#include <boost/capy/test/write_stream.hpp>

#include <cstddef>
#include <span>
#include <system_error>
#include <type_traits>
#include <utility>

namespace capy = boost::capy;
namespace ex = capy::example;

// Intentionally io_env-less: a standard awaitable, not an IoAwaitable.
static_assert(! capy::IoAwaitable<ex::cuda_stream_awaiter>);

// The data-movement awaitables depend on this helper, which the paper
// references but never defines.
static_assert(std::is_same_v<
decltype(ex::make_cuda_error(cudaSuccess)), std::error_code>);

// The memcpy member functions return IoAwaitables.
static_assert(capy::IoAwaitable<
decltype(std::declval<ex::cuda_stream&>().memcpy_h2d(
nullptr, nullptr, std::size_t{0}))>);
static_assert(capy::IoAwaitable<
decltype(std::declval<ex::cuda_stream&>().memcpy_d2h(
nullptr, nullptr, std::size_t{0}))>);
static_assert(capy::IoAwaitable<
decltype(std::declval<ex::cuda_stream&>().synchronize())>);

// GPU device memory satisfies WriteStream and type-erases with zero
// per-operation allocation.
static_assert(capy::WriteStream<ex::cuda_device_stream>);

// A protocol handler compiled once, linked against any transport.
capy::task<>
ingest(capy::any_write_stream& dest, std::span<std::byte const> data)
{
auto [ec, n] = co_await dest.write_some(
capy::make_buffer(data.data(), data.size()));
if(ec)
co_return;
// ...protocol logic...
}

// Reference ingest against two transports to force the "one .o, many
// transports" claim to compile. Never executed.
[[maybe_unused]] void
link_check()
{
ex::cuda_device_stream gpu(nullptr, nullptr);
capy::any_write_stream gpu_dest(&gpu); // GPU device memory

capy::test::write_stream mem;
capy::any_write_stream mem_dest(&mem); // in-memory transport

std::byte payload[8]{};
(void) ingest(gpu_dest, payload);
(void) ingest(mem_dest, payload);
}

#if defined(CAPY_EXAMPLE_HAS_NCCL)
#include <nccl.h>

// NCCL interop: a collective enqueues onto the CUDA stream, then
// synchronize() awaits its completion through the same IoAwaitable path.
capy::task<>
all_reduce(ex::cuda_stream& cs, ncclComm_t comm,
float const* sendbuf, float* recvbuf, std::size_t count)
{
ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum,
comm, cs.native_handle());
co_await cs.synchronize();
}
#endif

int main()
{
return 0;
}
Loading
Loading