diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69e3e9ff..d099480c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,6 +41,7 @@ option(BOOST_CAPY_BUILD_EXAMPLES "Build boost::capy examples" ${BOOST_CAPY_IS_RO
 option(BOOST_CAPY_BUILD_BENCH "Build boost::capy benchmarks" ${BOOST_CAPY_IS_ROOT})
 option(BOOST_CAPY_BUILD_P2300_EXAMPLES "Build examples that depend on beman-execution (P2300)" OFF)
 option(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES "Build examples that depend on NVIDIA nvexec (CUDA)" OFF)
+option(BOOST_CAPY_BUILD_CUDA_EXAMPLES "Build examples that depend only on CUDA (no stdexec/nvexec)" OFF)
 option(BOOST_CAPY_MRDOCS_BUILD "Build the target for MrDocs: see mrdocs.yml" OFF)
 
 if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
@@ -60,14 +61,25 @@ if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
         message(FATAL_ERROR
             "BOOST_CAPY_BUILD_NVEXEC_EXAMPLES requires CMAKE_CXX_STANDARD >= 23")
     endif()
-    enable_language(CUDA)
-    find_package(CUDAToolkit REQUIRED)
     # Tell NVIDIA/stdexec to build the nvexec target when its
     # FetchContent is processed (bench/ and/or the example itself).
     set(STDEXEC_ENABLE_CUDA ON CACHE BOOL
         "Build nvexec when configuring NVIDIA/stdexec" FORCE)
 endif()
 
+if(BOOST_CAPY_BUILD_CUDA_EXAMPLES)
+    if(NOT DEFINED CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD LESS 20)
+        message(FATAL_ERROR
+            "BOOST_CAPY_BUILD_CUDA_EXAMPLES requires CMAKE_CXX_STANDARD >= 20")
+    endif()
+endif()
+
+# Enable the CUDA language once for whichever CUDA example set is requested.
+if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES OR BOOST_CAPY_BUILD_CUDA_EXAMPLES)
+    enable_language(CUDA)
+    find_package(CUDAToolkit REQUIRED)
+endif()
+
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 if(BOOST_CAPY_IS_ROOT AND BUILD_SHARED_LIBS)
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index bae311ea..0d48896d 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -28,10 +28,16 @@ if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
     add_subdirectory(awaitable-sender)
 endif()
 
+if(BOOST_CAPY_BUILD_CUDA_EXAMPLES)
+    add_subdirectory(cuda/datamovement)
+endif()
+
 if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
-    add_subdirectory(gpu-pipeline)
+    add_subdirectory(cuda/pipeline)
 endif()
 
+add_subdirectory(fabrics)
+
 if(TARGET Boost::asio)
     add_subdirectory(asio)
 endif()
diff --git a/example/cuda/datamovement/CMakeLists.txt b/example/cuda/datamovement/CMakeLists.txt
new file mode 100644
index 00000000..ec626253
--- /dev/null
+++ b/example/cuda/datamovement/CMakeLists.txt
@@ -0,0 +1,52 @@
+#
+# Copyright (c) 2026 Steve Gerbino
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+# CUDA was enabled at the top level when the option was flipped on.
+if(NOT CMAKE_CUDA_COMPILER)
+    message(FATAL_ERROR
+        "example/cuda/datamovement requires CUDA; "
+        "did you set BOOST_CAPY_BUILD_CUDA_EXAMPLES?")
+endif()
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
+    *.cu *.cuh *.hpp
+    CMakeLists.txt
+    README.md)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_cuda_datamovement ${PFILES})
+
+set_target_properties(capy_example_cuda_datamovement PROPERTIES
+    FOLDER "examples"
+    CUDA_STANDARD 20
+    CUDA_STANDARD_REQUIRED ON
+    CUDA_SEPARABLE_COMPILATION OFF)
+
+target_compile_features(capy_example_cuda_datamovement PRIVATE cxx_std_20)
+
+target_link_libraries(capy_example_cuda_datamovement PRIVATE
+    Boost::capy
+    CUDA::cudart)
+
+# The NCCL interop snippet compiles only when NCCL is available.
+# Without it, the rest of the example still builds.
+find_path(CAPY_NCCL_INCLUDE_DIR nccl.h)
+find_library(CAPY_NCCL_LIBRARY nccl)
+if(CAPY_NCCL_INCLUDE_DIR AND CAPY_NCCL_LIBRARY)
+    target_include_directories(capy_example_cuda_datamovement PRIVATE
+        ${CAPY_NCCL_INCLUDE_DIR})
+    target_link_libraries(capy_example_cuda_datamovement PRIVATE
+        ${CAPY_NCCL_LIBRARY})
+    target_compile_definitions(capy_example_cuda_datamovement PRIVATE
+        CAPY_EXAMPLE_HAS_NCCL=1)
+    message(STATUS "cuda/datamovement: NCCL found; building NCCL interop")
+else()
+    message(STATUS "cuda/datamovement: NCCL not found; skipping NCCL interop")
+endif()
diff --git a/example/cuda/datamovement/README.md b/example/cuda/datamovement/README.md
new file mode 100644
index 00000000..e645722c
--- /dev/null
+++ b/example/cuda/datamovement/README.md
@@ -0,0 +1,62 @@
+# CUDA data-movement example (P4251R0)
+
+Validation that the CUDA data-movement listings from
+P4251R0 "IoAwaitables for GPU Data Movement" are type-correct against the
+real `boost::capy` API and CUDA. The paper flags this code as AI-generated
+and unverified; this target proves it compiles. Nothing here is executed
+at runtime.
+
+What is validated:
+
+- `cuda_stream_awaiter`: the io_env-less baseline. Asserted to be a
+  standard awaitable but **not** an `IoAwaitable`.
+- `cuda_stream`: `memcpy_h2d` / `memcpy_d2h` / `synchronize` return
+  `IoAwaitable`s.
+- NCCL interop: `ncclAllReduce` on `cuda_stream::native_handle()`
+  followed by `co_await synchronize()`. Built only when NCCL is found at
+  configure time.
+- `cuda_device_stream`: satisfies `WriteStream`, type-erases behind
+  `any_write_stream`, and the `ingest()` protocol handler compiles once
+  against both a GPU stream and an in-memory transport.
+- CUDA Graphs (`cuda_graphs.cu`): a captured graph is replayed inside
+  a coroutine that drives `cuda_stream` memcpy / synchronize.
+
+The non-GPU listings (the byte-oriented compound result and the
+RDMA/libfabric/UCX signatures) do not need CUDA and live in the sibling
+`example/fabrics` example. The sender bridge is in `example/cuda/pipeline`.
+
+## Prerequisites
+
+- NVIDIA GPU and driver visible to `nvidia-smi`.
+- CUDA toolkit (13.x works). On Arch: `pacman -S cuda`.
+- clang as host and CUDA compiler (verified with clang 22).
+- `CMAKE_CXX_STANDARD=20`.
+
+## Building
+
+```
+CXX=clang++ cmake -S . -B build-cuda -G Ninja \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_CXX_STANDARD=20 \
+    -DBOOST_CAPY_BUILD_CUDA_EXAMPLES=ON \
+    -DCMAKE_CUDA_COMPILER=clang++ \
+    -DCMAKE_CUDA_HOST_COMPILER=clang++ \
+    -DCMAKE_CUDA_ARCHITECTURES=89 \
+    -DCUDAToolkit_ROOT=/opt/cuda
+cmake --build build-cuda --config Release --target capy_example_cuda_datamovement
+```
+
+Replace `89` with your GPU's compute capability
+(`nvidia-smi --query-gpu=compute_cap --format=csv,noheader`).
+
+A clean build is the pass condition; the binary need not be run.
+
+## Scope
+
+No runtime execution and no multi-device topologies. A clean
+build with every `static_assert` holding is the whole deliverable. The
+NCCL snippet builds only when NCCL is found. NVSHMEM (a GPU member of the
+paper's HPC-fabric list) is not verified: `nvshmem_int_put` is device-side
+and its headers do not compile under clang-cuda (capy requires clang-cuda,
+since nvcc lacks C++20 coroutines). The non-GPU fabric signatures live in
+`example/fabrics`, and the sender bridge in `example/cuda/pipeline`.
diff --git a/example/cuda/datamovement/cuda_datamovement.cu b/example/cuda/datamovement/cuda_datamovement.cu
new file mode 100644
index 00000000..56924279
--- /dev/null
+++ b/example/cuda/datamovement/cuda_datamovement.cu
@@ -0,0 +1,92 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#include "cuda_datamovement.hpp"
+
+#include <boost/capy.hpp>
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/test/write_stream.hpp>
+
+#include <cstddef>
+#include <span>
+#include <system_error>
+#include <type_traits>
+#include <utility>
+
+namespace capy = boost::capy;
+namespace ex   = capy::example;
+
+// Intentionally io_env-less: a standard awaitable, not an IoAwaitable.
+static_assert(! capy::IoAwaitable<ex::cuda_stream_awaiter>);
+
+// The data-movement awaitables depend on this helper, which the paper
+// references but never defines.
+static_assert(std::is_same_v<
+    decltype(ex::make_cuda_error(cudaSuccess)), std::error_code>);
+
+// The memcpy member functions return IoAwaitables.
+static_assert(capy::IoAwaitable<
+    decltype(std::declval<ex::cuda_stream&>().memcpy_h2d(
+        nullptr, nullptr, std::size_t{0}))>);
+static_assert(capy::IoAwaitable<
+    decltype(std::declval<ex::cuda_stream&>().memcpy_d2h(
+        nullptr, nullptr, std::size_t{0}))>);
+static_assert(capy::IoAwaitable<
+    decltype(std::declval<ex::cuda_stream&>().synchronize())>);
+
+// GPU device memory satisfies WriteStream and type-erases with zero
+// per-operation allocation.
+static_assert(capy::WriteStream<ex::cuda_device_stream>);
+
+// A protocol handler compiled once, linked against any transport.
+capy::task<>
+ingest(capy::any_write_stream& dest, std::span<std::byte const> data)
+{
+    auto [ec, n] = co_await dest.write_some(
+        capy::make_buffer(data.data(), data.size()));
+    if(ec)
+        co_return;
+    // ...protocol logic...
+}
+
+// Reference ingest against two transports to force the "one .o, many
+// transports" claim to compile. Never executed.
+[[maybe_unused]] void
+link_check()
+{
+    ex::cuda_device_stream gpu(nullptr, nullptr);
+    capy::any_write_stream gpu_dest(&gpu);     // GPU device memory
+
+    capy::test::write_stream mem;
+    capy::any_write_stream mem_dest(&mem);     // in-memory transport
+
+    std::byte payload[8]{};
+    (void) ingest(gpu_dest, payload);
+    (void) ingest(mem_dest, payload);
+}
+
+#if defined(CAPY_EXAMPLE_HAS_NCCL)
+#include <nccl.h>
+
+// NCCL interop: a collective enqueues onto the CUDA stream, then
+// synchronize() awaits its completion through the same IoAwaitable path.
+capy::task<>
+all_reduce(ex::cuda_stream& cs, ncclComm_t comm,
+    float const* sendbuf, float* recvbuf, std::size_t count)
+{
+    ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum,
+        comm, cs.native_handle());
+    co_await cs.synchronize();
+}
+#endif
+
+int main()
+{
+    return 0;
+}
diff --git a/example/cuda/datamovement/cuda_datamovement.hpp b/example/cuda/datamovement/cuda_datamovement.hpp
new file mode 100644
index 00000000..4855cd67
--- /dev/null
+++ b/example/cuda/datamovement/cuda_datamovement.hpp
@@ -0,0 +1,353 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_EXAMPLE_CUDA_DATAMOVEMENT_HPP
+#define BOOST_CAPY_EXAMPLE_CUDA_DATAMOVEMENT_HPP
+
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/ex/executor_ref.hpp>
+#include <boost/capy/io_result.hpp>
+#include <boost/capy/buffers.hpp>
+#include <boost/capy/concept/const_buffer_sequence.hpp>
+
+#include <cuda_runtime.h>
+
+#include <coroutine>
+#include <cstddef>
+#include <string>
+#include <system_error>
+#include <utility>
+
+namespace boost {
+namespace capy {
+namespace example {
+
+/// Error category for `cudaError_t` values.
+class cuda_error_category
+    : public std::error_category
+{
+public:
+    char const* name() const noexcept override
+    {
+        return "cuda";
+    }
+
+    std::string message(int ev) const override
+    {
+        return ::cudaGetErrorString(static_cast<cudaError_t>(ev));
+    }
+};
+
+/// Return the singleton CUDA error category.
+inline std::error_category const& cuda_category() noexcept
+{
+    static cuda_error_category const cat;
+    return cat;
+}
+
+/// Convert a `cudaError_t` to a `std::error_code`.
+inline std::error_code make_cuda_error(cudaError_t e) noexcept
+{
+    return std::error_code(static_cast<int>(e), cuda_category());
+}
+
+/// A minimal hand-rolled CUDA-completion awaitable (no executor
+/// affinity, cancellation, or frame allocator). Resumes on the CUDA
+/// driver callback thread.
+struct cuda_stream_awaiter
+{
+    cudaStream_t stream;
+
+    bool await_ready() const noexcept
+    {
+        return false;
+    }
+
+    void await_suspend(std::coroutine_handle<> h)
+    {
+        cudaLaunchHostFunc(stream,
+            [](void* data)
+            {
+                std::coroutine_handle<>::from_address(data).resume();
+            },
+            h.address());
+    }
+
+    void await_resume() noexcept
+    {
+    }
+};
+
+/// A CUDA stream whose data-movement operations are IoAwaitables.
+///
+/// `memcpy_h2d`/`memcpy_d2h` issue a `cudaMemcpyAsync` and resume the
+/// awaiting coroutine on `env->executor` when the stream's
+/// `cudaLaunchHostFunc` callback fires. One operation is in flight per
+/// stream at a time, so the resume context is a pre-allocated member
+/// rather than a per-operation allocation.
+class cuda_stream
+{
+    cudaStream_t stream_ = nullptr;
+    continuation cont_;
+    std::error_code error_;
+
+    struct resume_ctx
+    {
+        executor_ref ex;
+        continuation* cont = nullptr;
+    };
+
+    resume_ctx ctx_;
+
+    static void CUDART_CB
+    on_complete(void* arg)
+    {
+        auto* ctx = static_cast<resume_ctx*>(arg);
+        ctx->ex.post(*ctx->cont);
+    }
+
+    // The paper hardcodes HostToDevice and describes memcpy_d2h as "the
+    // same pattern"; a kind field unifies both without duplicating the
+    // awaitable.
+    struct copy_awaitable
+    {
+        cuda_stream* self;
+        void* dst;
+        void const* src;
+        std::size_t count;
+        cudaMemcpyKind kind;
+
+        bool await_ready() const noexcept
+        {
+            return false;
+        }
+
+        std::coroutine_handle<>
+        await_suspend(std::coroutine_handle<> h, io_env const* env)
+        {
+            auto err = cudaMemcpyAsync(
+                dst, src, count, kind, self->stream_);
+            if(err != cudaSuccess)
+            {
+                self->error_ = make_cuda_error(err);
+                return h;
+            }
+            self->cont_.h = h;
+            self->ctx_ = resume_ctx{env->executor, &self->cont_};
+            err = cudaLaunchHostFunc(
+                self->stream_, &on_complete, &self->ctx_);
+            if(err != cudaSuccess)
+            {
+                self->error_ = make_cuda_error(err);
+                return h;
+            }
+            return std::noop_coroutine();
+        }
+
+        void await_resume()
+        {
+            if(self->error_)
+                throw std::system_error(self->error_);
+            self->error_ = {};
+        }
+    };
+
+    struct sync_awaitable
+    {
+        cuda_stream* self;
+
+        bool await_ready() const noexcept
+        {
+            return false;
+        }
+
+        std::coroutine_handle<>
+        await_suspend(std::coroutine_handle<> h, io_env const* env)
+        {
+            self->cont_.h = h;
+            self->ctx_ = resume_ctx{env->executor, &self->cont_};
+            auto err = cudaLaunchHostFunc(
+                self->stream_, &on_complete, &self->ctx_);
+            if(err != cudaSuccess)
+            {
+                self->error_ = make_cuda_error(err);
+                return h;
+            }
+            return std::noop_coroutine();
+        }
+
+        void await_resume()
+        {
+            if(self->error_)
+                throw std::system_error(self->error_);
+            self->error_ = {};
+        }
+    };
+
+public:
+    cuda_stream()
+    {
+        auto err = cudaStreamCreate(&stream_);
+        if(err != cudaSuccess)
+            throw std::system_error(make_cuda_error(err));
+    }
+
+    ~cuda_stream()
+    {
+        if(stream_)
+            cudaStreamDestroy(stream_);
+    }
+
+    cuda_stream(cuda_stream&& other) noexcept
+        : stream_(std::exchange(other.stream_, nullptr))
+    {
+    }
+
+    cuda_stream& operator=(cuda_stream&& other) noexcept
+    {
+        if(this != &other)
+        {
+            if(stream_)
+                cudaStreamDestroy(stream_);
+            stream_ = std::exchange(other.stream_, nullptr);
+        }
+        return *this;
+    }
+
+    cuda_stream(cuda_stream const&) = delete;
+    cuda_stream& operator=(cuda_stream const&) = delete;
+
+    /// Return the underlying CUDA stream handle.
+    cudaStream_t native_handle() const noexcept
+    {
+        return stream_;
+    }
+
+    /// Asynchronously copy `count` bytes from host `src` to device `dst`.
+    auto memcpy_h2d(void* dst, void const* src, std::size_t count)
+    {
+        return copy_awaitable{
+            this, dst, src, count, cudaMemcpyHostToDevice};
+    }
+
+    /// Asynchronously copy `count` bytes from device `src` to host `dst`.
+    auto memcpy_d2h(void* dst, void const* src, std::size_t count)
+    {
+        return copy_awaitable{
+            this, dst, src, count, cudaMemcpyDeviceToHost};
+    }
+
+    /// Asynchronously wait for all pending stream operations to complete.
+    auto synchronize()
+    {
+        return sync_awaitable{this};
+    }
+};
+
+/// GPU device memory exposed as a WriteStream.
+///
+/// Reshapes the `cuda_stream` memcpy pattern to satisfy `WriteStream`, so device
+/// memory can hide behind `any_write_stream`. Because `cudaMemcpyAsync`
+/// transfers the whole buffer in one operation, `write_some` never
+/// performs a partial write. Errors are delivered via `io_result`
+/// rather than exceptions. Does not own `stream_`; the caller is
+/// responsible for the stream's lifetime.
+class cuda_device_stream
+{
+    cudaStream_t stream_;
+    std::byte* d_ptr_;
+    std::size_t offset_ = 0;
+    continuation cont_;
+    std::error_code error_;
+
+    struct resume_ctx
+    {
+        executor_ref ex;
+        continuation* cont = nullptr;
+    };
+
+    resume_ctx ctx_;
+
+    static void CUDART_CB
+    on_complete(void* arg)
+    {
+        auto* ctx = static_cast<resume_ctx*>(arg);
+        ctx->ex.post(*ctx->cont);
+    }
+
+public:
+    cuda_device_stream(cudaStream_t s, std::byte* device_ptr)
+        : stream_(s)
+        , d_ptr_(device_ptr)
+    {
+    }
+
+    template<ConstBufferSequence Buffers>
+    auto write_some(Buffers buffers)
+    {
+        struct awaitable
+        {
+            cuda_device_stream* self;
+            const_buffer buf;
+
+            bool await_ready() const noexcept
+            {
+                return false;
+            }
+
+            std::coroutine_handle<>
+            await_suspend(std::coroutine_handle<> h, io_env const* env)
+            {
+                auto n = buf.size();
+                auto err = cudaMemcpyAsync(
+                    self->d_ptr_ + self->offset_,
+                    buf.data(), n,
+                    cudaMemcpyHostToDevice,
+                    self->stream_);
+                if(err != cudaSuccess)
+                {
+                    self->error_ = make_cuda_error(err);
+                    return h;
+                }
+                self->cont_.h = h;
+                self->ctx_ = resume_ctx{env->executor, &self->cont_};
+                err = cudaLaunchHostFunc(
+                    self->stream_, &on_complete, &self->ctx_);
+                if(err != cudaSuccess)
+                {
+                    self->error_ = make_cuda_error(err);
+                    return h;
+                }
+                return std::noop_coroutine();
+            }
+
+            io_result<std::size_t>
+            await_resume()
+            {
+                if(self->error_)
+                {
+                    auto ec = self->error_;
+                    self->error_ = {};
+                    return {ec, 0};
+                }
+                auto n = buf.size();
+                self->offset_ += n;
+                return {{}, n};
+            }
+        };
+        return awaitable{this, *capy::begin(buffers)};
+    }
+};
+
+} // namespace example
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/example/cuda/datamovement/cuda_graphs.cu b/example/cuda/datamovement/cuda_graphs.cu
new file mode 100644
index 00000000..eb7e041a
--- /dev/null
+++ b/example/cuda/datamovement/cuda_graphs.cu
@@ -0,0 +1,69 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#include "cuda_datamovement.hpp"
+
+#include <boost/capy.hpp>
+
+#include <cuda_runtime.h>
+
+#include <cstddef>
+
+namespace capy = boost::capy;
+namespace ex   = capy::example;
+
+namespace {
+
+__global__ void
+kernel_A(float* y, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i < n)
+        y[i] += 1.0f;
+}
+
+__global__ void
+kernel_B(float* y, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i < n)
+        y[i] *= 2.0f;
+}
+
+// A pre-captured CUDA graph is the inner optimized hot path; the
+// coroutine is the outer, data-dependent loop (copy in, launch the graph,
+// copy out). Graph replay and coroutine orchestration optimize different
+// layers and compose without either subsuming the other.
+[[maybe_unused]] capy::task<>
+graph_replay(ex::cuda_stream& cs, float* d_y, float* h_y, int n)
+{
+    cudaStream_t stream = cs.native_handle();
+
+    cudaGraph_t graph;
+    cudaGraphExec_t instance;
+    dim3 grid(1);
+    dim3 block(static_cast<unsigned>(n));
+
+    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+    kernel_A<<<grid, block, 0, stream>>>(d_y, n);
+    kernel_B<<<grid, block, 0, stream>>>(d_y, n);
+    cudaStreamEndCapture(stream, &graph);
+
+    cudaGraphInstantiate(&instance, graph, 0);
+
+    co_await cs.memcpy_h2d(d_y, h_y, n * sizeof(float));
+    cudaGraphLaunch(instance, stream);
+    co_await cs.synchronize();
+    co_await cs.memcpy_d2h(h_y, d_y, n * sizeof(float));
+
+    cudaGraphExecDestroy(instance);
+    cudaGraphDestroy(graph);
+}
+
+} // namespace
diff --git a/example/gpu-pipeline/CMakeLists.txt b/example/cuda/pipeline/CMakeLists.txt
similarity index 83%
rename from example/gpu-pipeline/CMakeLists.txt
rename to example/cuda/pipeline/CMakeLists.txt
index 4873cf04..3913b4cf 100644
--- a/example/gpu-pipeline/CMakeLists.txt
+++ b/example/cuda/pipeline/CMakeLists.txt
@@ -11,7 +11,7 @@
 # Honor a clean error if the user wired around it.
 if(NOT CMAKE_CUDA_COMPILER)
     message(FATAL_ERROR
-        "example/gpu-pipeline requires CUDA; "
+        "example/cuda-pipeline requires CUDA; "
         "did you set BOOST_CAPY_BUILD_NVEXEC_EXAMPLES?")
 endif()
 
@@ -41,17 +41,17 @@ file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
 
 source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
 
-add_executable(capy_example_gpu_pipeline ${PFILES})
+add_executable(capy_example_cuda_pipeline ${PFILES})
 
-set_target_properties(capy_example_gpu_pipeline PROPERTIES
+set_target_properties(capy_example_cuda_pipeline PROPERTIES
     FOLDER "examples"
     CUDA_STANDARD 20
     CUDA_STANDARD_REQUIRED ON
     CUDA_SEPARABLE_COMPILATION OFF)
 
-target_compile_features(capy_example_gpu_pipeline PRIVATE cxx_std_23)
+target_compile_features(capy_example_cuda_pipeline PRIVATE cxx_std_23)
 
-target_link_libraries(capy_example_gpu_pipeline PRIVATE
+target_link_libraries(capy_example_cuda_pipeline PRIVATE
     Boost::capy
     STDEXEC::stdexec
     STDEXEC::nvexec
diff --git a/example/gpu-pipeline/Jamfile b/example/cuda/pipeline/Jamfile
similarity index 100%
rename from example/gpu-pipeline/Jamfile
rename to example/cuda/pipeline/Jamfile
diff --git a/example/gpu-pipeline/README.md b/example/cuda/pipeline/README.md
similarity index 78%
rename from example/gpu-pipeline/README.md
rename to example/cuda/pipeline/README.md
index 4ffb7ae4..719b48ad 100644
--- a/example/gpu-pipeline/README.md
+++ b/example/cuda/pipeline/README.md
@@ -1,8 +1,9 @@
-# GPU pipeline example
+# CUDA pipeline example
 
 This example demonstrates that `boost::capy::await_sender` and
 `boost::capy::as_sender` compose with NVIDIA's `nvexec::stream_scheduler`,
-not just with CPU schedulers. Two scenes:
+not just with CPU schedulers. Two runnable scenes, plus a third that is
+built but not run (P4251R0):
 
 1. **Scene 1 (Direction 1).** A `boost::capy::task` running on
    `boost::capy::thread_pool` `co_await`s a sender whose terminal action is
@@ -23,6 +24,16 @@ not just with CPU schedulers. Two scenes:
    or returns `noop_coroutine()` after stashing the continuation for the
    peer to resume.
 
+3. **Scene 3 (P4251R0), built but not run.** `handle_request` shows the
+   inference-handler shape: a type-erased `any_read_source` read, GPU
+   dispatch via `await_sender` over a real nvexec kernel, and a type-erased
+   `any_write_sink` write. It is compiled but not executed (`main` does not
+   call it). The paper's listing runs a host `run_model()` under a
+   device-side `then()`, which does not compile on nvexec (host call from
+   device); this mirrors Scene 1's pattern instead, dispatching a real
+   kernel and hopping `continues_on(cpu)` before the host-only bridge, and
+   takes a CPU scheduler the paper's signature omits.
+
 The bridge headers (`awaitable_sender.hpp`, `sender_awaitable.hpp`) are
 copied verbatim from `bench/stdexec/`; the bridge in the bench was already
 written against NVIDIA/stdexec.
@@ -53,8 +64,8 @@ CXX=clang++ cmake -S . -B build \
     -DCUDAToolkit_ROOT=/opt/cuda \
     -DBOOST_CAPY_BUILD_STDEXEC_EXAMPLES=ON \
     -DBOOST_CAPY_BUILD_NVEXEC_EXAMPLES=ON
-cmake --build build --config Release --target capy_example_gpu_pipeline
-./build/example/gpu-pipeline/capy_example_gpu_pipeline
+cmake --build build --config Release --target capy_example_cuda_pipeline
+./build/example/cuda-pipeline/capy_example_cuda_pipeline
 ```
 
 Replace `89` with your GPU's compute capability (`nvidia-smi
diff --git a/example/gpu-pipeline/awaitable_sender.hpp b/example/cuda/pipeline/awaitable_sender.hpp
similarity index 100%
rename from example/gpu-pipeline/awaitable_sender.hpp
rename to example/cuda/pipeline/awaitable_sender.hpp
diff --git a/example/gpu-pipeline/gpu_pipeline.cu b/example/cuda/pipeline/cuda_pipeline.cu
similarity index 82%
rename from example/gpu-pipeline/gpu_pipeline.cu
rename to example/cuda/pipeline/cuda_pipeline.cu
index 471c2fc4..1071bd34 100644
--- a/example/gpu-pipeline/gpu_pipeline.cu
+++ b/example/cuda/pipeline/cuda_pipeline.cu
@@ -23,12 +23,15 @@
 #include "sender_awaitable.hpp"
 
 #include <boost/capy.hpp>
+#include <boost/capy/io/any_read_source.hpp>
+#include <boost/capy/io/any_write_sink.hpp>
 #include <boost/capy/test/stream.hpp>
 
 #include <stdexec/execution.hpp>
 #include <exec/static_thread_pool.hpp>
 #include <nvexec/stream_context.cuh>
 
+#include <array>
 #include <cassert>
 #include <cstddef>
 #include <cstdio>
@@ -126,6 +129,57 @@ scene1(nvexec::stream_scheduler gpu,
     co_return h_y0;
 }
 
+// Scene 3 (P4251R0): the inference-handler shape. Network I/O uses
+// type-erased coroutine streams (any_read_source / any_write_sink); GPU
+// dispatch uses a sender bridged with await_sender. The paper's
+// listing runs a host run_model() under a device-side then(), which does
+// not compile on nvexec; this mirrors Scene 1 instead, dispatching a real
+// kernel and hopping continues_on(cpu) before the host-only bridge.
+[[maybe_unused]] capy::task<>
+handle_request(
+    capy::any_read_source& client,
+    capy::any_write_sink& response,
+    nvexec::stream_context& gpu_ctx,
+    exec::static_thread_pool::scheduler cpu)
+{
+    // receive request (coroutine, type-erased)
+    std::array<std::byte, 4096> buf;
+    auto [ec, n] = co_await client.read_some(
+        capy::mutable_buffer(buf.data(), buf.size()));
+    if(ec)
+        co_return;
+    (void) n;
+
+    // dispatch to GPU (sender, compile-time composition)
+    auto gpu = gpu_ctx.get_scheduler();
+    constexpr int N = 64;
+    float* d_y = nullptr;
+    cuda_check(cudaMalloc(&d_y, N * sizeof(float)), "scene3 malloc");
+
+    co_await capy::await_sender(
+        ex::just(N, d_y)
+        | ex::continues_on(gpu)
+        | nvexec::launch({.grid_size = 1, .block_size = N},
+            [] (cudaStream_t, int len, float* y) {
+                int i = blockIdx.x * blockDim.x + threadIdx.x;
+                if (i < len)
+                    y[i] = static_cast<float>(i);
+            })
+        | ex::continues_on(cpu));
+
+    cuda_check(cudaSetDevice(0), "scene3 setdevice");
+    std::array<float, N> result{};
+    cuda_check(cudaMemcpy(result.data(), d_y,
+        N * sizeof(float), cudaMemcpyDeviceToHost), "scene3 D2H");
+    cuda_check(cudaFree(d_y), "scene3 free");
+
+    // send result back (coroutine, type-erased)
+    auto [wec, wn] = co_await capy::write(response,
+        capy::make_buffer(result.data(), result.size() * sizeof(float)));
+    (void) wec;
+    (void) wn;
+}
+
 // Adapter run_async-like driver: kicks off scene1 on the capy
 // thread_pool, blocks the caller until it completes, and returns
 // the result via the supplied storage.
diff --git a/example/gpu-pipeline/sender_awaitable.hpp b/example/cuda/pipeline/sender_awaitable.hpp
similarity index 100%
rename from example/gpu-pipeline/sender_awaitable.hpp
rename to example/cuda/pipeline/sender_awaitable.hpp
diff --git a/example/fabrics/CMakeLists.txt b/example/fabrics/CMakeLists.txt
new file mode 100644
index 00000000..ec2ed1dd
--- /dev/null
+++ b/example/fabrics/CMakeLists.txt
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2026 Steve Gerbino
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+# Transport-neutral, non-GPU listings from P4251R0: the byte-oriented
+# compound-result pattern (capy only) and the HPC-fabric send signatures.
+# No CUDA.
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
+    *.cpp *.hpp
+    CMakeLists.txt
+    README.md)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_fabrics ${PFILES})
+
+set_target_properties(capy_example_fabrics PROPERTIES FOLDER "examples")
+
+target_compile_features(capy_example_fabrics PRIVATE cxx_std_20)
+
+target_link_libraries(capy_example_fabrics PRIVATE Boost::capy)
+
+# HPC-fabric signature checks: each call is compiled (never run) against the real
+# library header only when that library is found, to verify the paper's
+# signature. Each library is optional and independent.
+find_path(CAPY_IBVERBS_INCLUDE_DIR infiniband/verbs.h)
+find_library(CAPY_IBVERBS_LIBRARY ibverbs)
+if(CAPY_IBVERBS_INCLUDE_DIR AND CAPY_IBVERBS_LIBRARY)
+    target_include_directories(capy_example_fabrics PRIVATE
+        ${CAPY_IBVERBS_INCLUDE_DIR})
+    target_link_libraries(capy_example_fabrics PRIVATE
+        ${CAPY_IBVERBS_LIBRARY})
+    target_compile_definitions(capy_example_fabrics PRIVATE
+        CAPY_EXAMPLE_HAS_IBVERBS=1)
+    message(STATUS "fabrics: libibverbs found; verifying ibv_post_send")
+endif()
+
+find_path(CAPY_LIBFABRIC_INCLUDE_DIR rdma/fabric.h)
+find_library(CAPY_LIBFABRIC_LIBRARY fabric)
+if(CAPY_LIBFABRIC_INCLUDE_DIR AND CAPY_LIBFABRIC_LIBRARY)
+    target_include_directories(capy_example_fabrics PRIVATE
+        ${CAPY_LIBFABRIC_INCLUDE_DIR})
+    target_link_libraries(capy_example_fabrics PRIVATE
+        ${CAPY_LIBFABRIC_LIBRARY})
+    target_compile_definitions(capy_example_fabrics PRIVATE
+        CAPY_EXAMPLE_HAS_LIBFABRIC=1)
+    message(STATUS "fabrics: libfabric found; verifying fi_send")
+endif()
+
+# UCX ships in the official 'openucx' package; it is also bundled in the
+# NVIDIA HPC SDK. Pass -DCAPY_UCX_INCLUDE_DIR/-DCAPY_UCX_LIBRARY to point at
+# a non-standard location.
+find_path(CAPY_UCX_INCLUDE_DIR ucp/api/ucp.h)
+find_library(CAPY_UCX_LIBRARY ucp)
+if(CAPY_UCX_INCLUDE_DIR AND CAPY_UCX_LIBRARY)
+    target_include_directories(capy_example_fabrics PRIVATE
+        ${CAPY_UCX_INCLUDE_DIR})
+    target_link_libraries(capy_example_fabrics PRIVATE
+        ${CAPY_UCX_LIBRARY})
+    target_compile_definitions(capy_example_fabrics PRIVATE
+        CAPY_EXAMPLE_HAS_UCX=1)
+    message(STATUS "fabrics: UCX found; verifying ucp_tag_send_nbx")
+endif()
diff --git a/example/fabrics/README.md b/example/fabrics/README.md
new file mode 100644
index 00000000..d1fda947
--- /dev/null
+++ b/example/fabrics/README.md
@@ -0,0 +1,43 @@
+# Fabrics example (P4251R0)
+
+The transport-neutral, non-GPU listings from P4251R0 "IoAwaitables for GPU
+Data Movement". Validation that the paper's byte-oriented and
+HPC-fabric calls are type-correct against the real `boost::capy` API and the
+installed fabric libraries. Nothing here is executed; a clean build is the
+deliverable.
+
+Unlike the `cuda/` examples, this needs **no CUDA toolchain**: only a
+C++20 compiler and `boost::capy`, plus whichever fabric libraries happen to
+be installed.
+
+What is validated:
+
+- `read_with_reset`: `read_some` delivers `(error_code, n)` via structured
+  bindings; the coroutine branches on a partial-read condition with no
+  sender channel to choose. Pure capy, no transport library.
+- HPC-fabric send signatures, each built only when its library is found:
+  - libibverbs `ibv_post_send` (RDMA / InfiniBand)
+  - libfabric `fi_send` (OFI completion-queue model)
+  - UCX `ucp_tag_send_nbx` (progress-engine callback model)
+
+NCCL and NVSHMEM are the GPU members of the paper's fabric list; NCCL is
+exercised by the `cuda/datamovement` example, and NVSHMEM's device API does
+not compile under clang-cuda (see that example's notes).
+
+## Building
+
+This builds as part of the normal example set (`BOOST_CAPY_BUILD_EXAMPLES`).
+The fabric checks activate automatically when their libraries are present:
+
+- libibverbs: `libibverbs` package.
+- libfabric: `libfabric` package.
+- UCX: `openucx` package, or pass
+  `-DCAPY_UCX_INCLUDE_DIR=<dir> -DCAPY_UCX_LIBRARY=<libucp.so>` to point at a
+  non-standard location (for example the UCX bundled in the NVIDIA HPC SDK).
+
+```
+cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20
+cmake --build build --config Release --target capy_example_fabrics
+```
+
+A clean build is the pass condition; the binary need not be run.
diff --git a/example/fabrics/fabrics.cpp b/example/fabrics/fabrics.cpp
new file mode 100644
index 00000000..258fadb7
--- /dev/null
+++ b/example/fabrics/fabrics.cpp
@@ -0,0 +1,106 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// The transport-neutral, non-GPU listings from P4251R0: the byte-oriented
+// compound-result pattern (capy only) and the HPC-fabric send-call
+// signatures (libibverbs / libfabric / UCX). Nothing is executed; the
+// build itself is the check. Each fabric block builds only when found.
+
+#include <boost/capy.hpp>
+#include <boost/capy/test/stream.hpp>
+
+#include <cstddef>
+#include <system_error>
+
+namespace capy = boost::capy;
+
+namespace {
+
+// A byte-oriented read delivers (error_code, n) via structured bindings;
+// the coroutine branches on a partial-read condition (the peer reset after
+// n bytes arrived) with no sender channel to choose. The same compound
+// result is what RDMA work completions carry.
+[[maybe_unused]] capy::task<>
+read_with_reset(capy::test::stream& s)
+{
+    std::byte buf[64];
+    auto [ec, n] = co_await s.read_some(
+        capy::mutable_buffer(buf, sizeof buf));
+    if(ec == std::errc::connection_reset)
+    {
+        // 'n' bytes arrived before the reset.
+        (void) n;
+        co_return;
+    }
+    (void) n;
+}
+
+} // namespace
+
+#if defined(CAPY_EXAMPLE_HAS_IBVERBS)
+#include <infiniband/verbs.h>
+
+namespace {
+
+// libibverbs: completion via a completion-channel file descriptor.
+[[maybe_unused]] void
+sig_ibverbs()
+{
+    ibv_qp* qp = nullptr;
+    ibv_send_wr wr{};
+    ibv_send_wr* bad_wr = nullptr;
+    (void) ibv_post_send(qp, &wr, &bad_wr);
+}
+
+} // namespace
+#endif
+
+#if defined(CAPY_EXAMPLE_HAS_LIBFABRIC)
+#include <rdma/fi_endpoint.h>
+
+namespace {
+
+// libfabric: completion via a completion-queue poll.
+[[maybe_unused]] void
+sig_libfabric()
+{
+    fid_ep* ep = nullptr;
+    char buffer[16];
+    fi_addr_t dest_addr = 0;
+    void* context = nullptr;
+    (void) fi_send(ep, buffer, sizeof buffer, nullptr, dest_addr, context);
+}
+
+} // namespace
+#endif
+
+#if defined(CAPY_EXAMPLE_HAS_UCX)
+#include <ucp/api/ucp.h>
+
+namespace {
+
+// UCX: completion via a callback from the progress engine.
+[[maybe_unused]] void
+sig_ucx()
+{
+    ucp_ep_h ep = nullptr;
+    char buffer[16];
+    ucp_tag_t tag = 0;
+    ucp_request_param_t param{};
+    (void) ucp_tag_send_nbx(ep, buffer, sizeof buffer, tag, &param);
+}
+
+} // namespace
+#endif
+
+// The target exists to prove the listings are type-correct; it is not run.
+int main()
+{
+    return 0;
+}