cppalliance · sgerbino · Jun 29, 2026 · Jun 29, 2026
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
@@ -30,6 +30,7 @@ endif()
 
 if(BOOST_CAPY_BUILD_CUDA_EXAMPLES)
     add_subdirectory(cuda/datamovement)
+    add_subdirectory(cuda/notification-strategies)
 endif()
 
 if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)

diff --git a/example/cuda/notification-strategies/CMakeLists.txt b/example/cuda/notification-strategies/CMakeLists.txt
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2026 Steve Gerbino
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+# CUDA was enabled at the top level when the option was flipped on.
+if(NOT CMAKE_CUDA_COMPILER)
+    message(FATAL_ERROR
+        "example/cuda/notification-strategies requires CUDA; "
+        "did you set BOOST_CAPY_BUILD_CUDA_EXAMPLES?")
+endif()
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
+    *.cu *.cuh *.hpp
+    CMakeLists.txt
+    README.md)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_cuda_notification_strategies ${PFILES})
+
+set_target_properties(capy_example_cuda_notification_strategies PROPERTIES
+    FOLDER "examples"
+    CUDA_STANDARD 20
+    CUDA_STANDARD_REQUIRED ON
+    CUDA_SEPARABLE_COMPILATION OFF)
+
+target_compile_features(capy_example_cuda_notification_strategies PRIVATE cxx_std_20)
+
+target_link_libraries(capy_example_cuda_notification_strategies PRIVATE
+    Boost::capy
+    CUDA::cudart)
diff --git a/example/cuda/notification-strategies/README.md b/example/cuda/notification-strategies/README.md
@@ -0,0 +1,81 @@
+# CUDA notification-strategies example
+
+One GPU completion, three notification mechanisms, one protocol.
+
+The same pipeline (fill a device buffer, copy it back to host, await the
+CUDA stream) is awaited three structurally different ways. All three are
+`boost::capy::IoAwaitable`s, and all three produce the identical result
+at runtime. This demonstrates that the IoAwaitable protocol is
+independent of how asynchronous completion is detected: a host-function
+callback is only one option, and not the best-scaling one.
+
+## The three mechanisms
+
+| Mechanism | How completion is detected | Resumes on |
+|-----------|----------------------------|------------|
+| `callback_awaitable` | `cudaLaunchHostFunc` enqueued on the stream | a CUDA driver thread re-posts through the executor |
+| `poll_awaitable` | a service thread loops `cudaEventQuery` on a recorded event | the poll thread posts when ready |
+| `deferred_sync_awaitable` | a service thread runs blocking `cudaStreamSynchronize` | the service thread posts when it returns |
+
+Each awaitable captures the executor and posts the continuation through
+it, so the coroutine always resumes on a worker thread, never on a CUDA
+or service thread.
+
+The callback mechanism is the only one that cannot report a stream error
+through its host function; `cudaLaunchHostFunc` does not pass completion
+status to the callback, so `callback_awaitable` always resumes with
+success — this is an inherent limitation of the API.
+
+### Service lifetime
+
+`poll_service` and `sync_service` own threads that post continuations to
+the worker executor. Construct them after the worker `thread_pool` and
+destroy them before it, so no continuation is ever posted to a destroyed
+executor. The driver joins every pipeline (via a `std::latch`) before
+shutdown, so no wait is outstanding at teardown.
+
+## Scaling tradeoff
+
+This example proves the three mechanisms are *equivalent in result*. It
+does not measure their *throughput under load*, which a single-GPU
+developer box cannot show. That comparison needs many worker threads
+driving a server-class GPU.
+
+For that measurement: E. Cano, M. Fila, A. Krasznahorkay,
+"Scheduling for Next Generation Triggers", CHEP 2026,
+<https://indico.cern.ch/event/1471803/contributions/6967272/>. They
+report that the CUDA host-function callback handler scales poorly as the
+number of worker threads grows, while event polling and deferred
+synchronization remain stable. In a multi-threaded framework, prefer the
+poll or deferred-sync mechanisms; reach for the callback mechanism for
+its simplicity in low-concurrency settings.
+
+## Prerequisites
+
+- NVIDIA GPU and driver visible to `nvidia-smi`.
+- CUDA toolkit 13.x.
+- clang as host and CUDA compiler (verified with clang 22).
+- `CMAKE_CXX_STANDARD=20`.
+
+## Building and running
+
+```
+CXX=clang++ cmake -S . -B build-cuda -G Ninja \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_CXX_STANDARD=20 \
+    -DBOOST_CAPY_BUILD_CUDA_EXAMPLES=ON \
+    -DCMAKE_CUDA_COMPILER=clang++ \
+    -DCMAKE_CUDA_HOST_COMPILER=clang++ \
+    -DCMAKE_CUDA_ARCHITECTURES=89 \
+    -DCUDAToolkit_ROOT=/opt/cuda
+cmake --build build-cuda --config Release \
+    --target capy_example_cuda_notification_strategies
+./build-cuda/example/cuda/notification-strategies/capy_example_cuda_notification_strategies
+```
+
+Replace `89` with your GPU's compute capability
+(`nvidia-smi --query-gpu=compute_cap --format=csv,noheader`).
+
+Unlike the sibling `cuda/datamovement` example, this one is meant to be
+run. The pass condition is all three mechanisms printing the same
+checksum and a zero exit code.
diff --git a/example/cuda/notification-strategies/notification_strategies.cu b/example/cuda/notification-strategies/notification_strategies.cu
@@ -0,0 +1,200 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// One GPU completion, three notification mechanisms.
+//
+// The IoAwaitable protocol does not care how completion is detected. The
+// same pipeline (fill a device buffer, copy it back, await the stream)
+// runs once per mechanism (host-function callback, event polling,
+// deferred blocking synchronize) and all three must produce the same
+// checksum. See README.md for the multi-threaded scaling tradeoff, which
+// a single-GPU box cannot measure.
+
+#include "notification_strategies.hpp"
+
+#include <boost/capy.hpp>
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/ex/thread_pool.hpp>
+
+#include <cstdlib>
+#include <iostream>
+#include <latch>
+#include <system_error>
+#include <type_traits>
+#include <vector>
+
+namespace capy = boost::capy;
+namespace ex   = boost::capy::example;
+
+static_assert(std::is_same_v<
+    decltype(ex::make_cuda_error(cudaSuccess)), std::error_code>);
+
+// The whole point: three different mechanisms, all IoAwaitables.
+static_assert(capy::IoAwaitable<ex::callback_awaitable>);
+static_assert(capy::IoAwaitable<ex::poll_awaitable>);
+static_assert(capy::IoAwaitable<ex::deferred_sync_awaitable>);
+
+namespace {
+
+constexpr int buffer_len = 256;
+constexpr int fill_value = 7;
+
+__global__ void
+fill_kernel(int* p, int n, int v)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i < n)
+        p[i] = v;
+}
+
+enum class notify
+{
+    callback,
+    poll,
+    deferred_sync
+};
+
+char const*
+name_of(notify how) noexcept
+{
+    switch(how)
+    {
+    case notify::callback:      return "callback";
+    case notify::poll:          return "poll";
+    case notify::deferred_sync: return "deferred-sync";
+    }
+    return "?";
+}
+
+// Await the stream's completion using the selected mechanism. poll waits
+// on the event; callback and deferred-sync wait on the stream.
+capy::task<std::error_code>
+wait(ex::cuda_stream& stream,
+     ex::cuda_event& event,
+     notify how,
+     ex::poll_service& poll_svc,
+     ex::sync_service& sync_svc)
+{
+    switch(how)
+    {
+    case notify::callback:
+        co_return co_await stream.sync_via_callback();
+    case notify::poll:
+        co_return co_await event.sync_via_poll(poll_svc);
+    case notify::deferred_sync:
+        co_return co_await stream.sync_via_deferred(sync_svc);
+    }
+    co_return std::error_code{};
+}
+
+// Fill a device buffer, copy it back, await completion via `how`, and
+// return the host-side checksum. Identical across mechanisms.
+capy::task<long>
+run_pipeline(ex::cuda_stream& stream,
+             ex::cuda_event& event,
+             notify how,
+             ex::poll_service& poll_svc,
+             ex::sync_service& sync_svc)
+{
+    auto s = stream.native_handle();
+
+    int* d_buf = nullptr;
+    auto err = cudaMallocAsync(
+        reinterpret_cast<void**>(&d_buf),
+        buffer_len * sizeof(int), s);
+    if(err != cudaSuccess)
+        co_return -1;
+
+    fill_kernel<<<(buffer_len + 63) / 64, 64, 0, s>>>(
+        d_buf, buffer_len, fill_value);
+
+    std::vector<int> host(buffer_len, 0);
+    cudaMemcpyAsync(
+        host.data(), d_buf, buffer_len * sizeof(int),
+        cudaMemcpyDeviceToHost, s);
+    cudaFreeAsync(d_buf, s);
+    event.record(s);
+
+    auto ec = co_await wait(stream, event, how, poll_svc, sync_svc);
+    if(ec)
+        co_return -1;
+
+    long sum = 0;
+    for(int v : host)
+        sum += v;
+    co_return sum;
+}
+
+// Drive one pipeline run to completion on `pool` and return its checksum.
+long
+run_one(capy::thread_pool& pool,
+        ex::cuda_stream& stream,
+        ex::cuda_event& event,
+        notify how,
+        ex::poll_service& poll_svc,
+        ex::sync_service& sync_svc)
+{
+    long result = 0;
+    std::latch done{1};
+    capy::run_async(pool.get_executor(),
+        [&](long r) { result = r; done.count_down(); })(
+            run_pipeline(stream, event, how, poll_svc, sync_svc));
+    done.wait();
+    return result;
+}
+
+} // namespace
+
+int
+main()
+{
+    int device_count = 0;
+    if(cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0)
+    {
+        std::cout << "No CUDA device available.\n";
+        return EXIT_FAILURE;
+    }
+
+    // Declaration order fixes teardown: services stop before the pool
+    // they post to; the stream/event close first of all.
+    capy::thread_pool pool(4);
+    ex::poll_service poll_svc;
+    ex::sync_service sync_svc;
+    ex::cuda_stream stream;
+    ex::cuda_event event;
+
+    notify const modes[] =
+        { notify::callback, notify::poll, notify::deferred_sync };
+
+    long first = 0;
+    bool ok = true;
+    std::cout << "mechanism      checksum\n";
+    for(std::size_t i = 0; i < std::size(modes); ++i)
+    {
+        long r = run_one(pool, stream, event, modes[i], poll_svc, sync_svc);
+        std::cout << name_of(modes[i]) << "   " << r << "\n";
+        if(i == 0)
+            first = r;
+        else if(r != first)
+            ok = false;
+    }
+
+    long const expected =
+        static_cast<long>(buffer_len) * fill_value;
+    if(! ok || first != expected)
+    {
+        std::cout << "MISMATCH: mechanisms disagree or wrong result "
+                     "(expected " << expected << ")\n";
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "All three mechanisms produced " << first
+              << " (identical).\n";
+    return EXIT_SUCCESS;
+}