Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ endif()

if(BOOST_CAPY_BUILD_CUDA_EXAMPLES)
add_subdirectory(cuda/datamovement)
add_subdirectory(cuda/notification-strategies)
endif()

if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
Expand Down
36 changes: 36 additions & 0 deletions example/cuda/notification-strategies/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#
# Copyright (c) 2026 Steve Gerbino
#
# Distributed under the Boost Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#
# Official repository: https://github.com/cppalliance/capy
#

# CUDA was enabled at the top level when the option was flipped on.
if(NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR
"example/cuda/notification-strategies requires CUDA; "
"did you set BOOST_CAPY_BUILD_CUDA_EXAMPLES?")
endif()

file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
*.cu *.cuh *.hpp
CMakeLists.txt
README.md)

source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})

add_executable(capy_example_cuda_notification_strategies ${PFILES})

set_target_properties(capy_example_cuda_notification_strategies PROPERTIES
FOLDER "examples"
CUDA_STANDARD 20
CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF)

target_compile_features(capy_example_cuda_notification_strategies PRIVATE cxx_std_20)

target_link_libraries(capy_example_cuda_notification_strategies PRIVATE
Boost::capy
CUDA::cudart)
81 changes: 81 additions & 0 deletions example/cuda/notification-strategies/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# CUDA notification-strategies example

One GPU completion, three notification mechanisms, one protocol.

The same pipeline (fill a device buffer, copy it back to host, await the
CUDA stream) is awaited three structurally different ways. All three are
`boost::capy::IoAwaitable`s, and all three produce the identical result
at runtime. This demonstrates that the IoAwaitable protocol is
independent of how asynchronous completion is detected: a host-function
callback is only one option, and not the best-scaling one.

## The three mechanisms

| Mechanism | How completion is detected | Resumes on |
|-----------|----------------------------|------------|
| `callback_awaitable` | `cudaLaunchHostFunc` enqueued on the stream | a CUDA driver thread re-posts through the executor |
| `poll_awaitable` | a service thread loops `cudaEventQuery` on a recorded event | the poll thread posts when ready |
| `deferred_sync_awaitable` | a service thread runs blocking `cudaStreamSynchronize` | the service thread posts when it returns |

Each awaitable captures the executor and posts the continuation through
it, so the coroutine always resumes on a worker thread, never on a CUDA
or service thread.

The callback mechanism is the only one that cannot report a stream error
through its host function; `cudaLaunchHostFunc` does not pass completion
status to the callback, so `callback_awaitable` always resumes with
success — this is an inherent limitation of the API.

### Service lifetime

`poll_service` and `sync_service` own threads that post continuations to
the worker executor. Construct them after the worker `thread_pool` and
destroy them before it, so no continuation is ever posted to a destroyed
executor. The driver joins every pipeline (via a `std::latch`) before
shutdown, so no wait is outstanding at teardown.

## Scaling tradeoff

This example proves the three mechanisms are *equivalent in result*. It
does not measure their *throughput under load*, which a single-GPU
developer box cannot show. That comparison needs many worker threads
driving a server-class GPU.

For that measurement: E. Cano, M. Fila, A. Krasznahorkay,
"Scheduling for Next Generation Triggers", CHEP 2026,
<https://indico.cern.ch/event/1471803/contributions/6967272/>. They
report that the CUDA host-function callback handler scales poorly as the
number of worker threads grows, while event polling and deferred
synchronization remain stable. In a multi-threaded framework, prefer the
poll or deferred-sync mechanisms; reach for the callback mechanism for
its simplicity in low-concurrency settings.

## Prerequisites

- NVIDIA GPU and driver visible to `nvidia-smi`.
- CUDA toolkit 13.x.
- clang as host and CUDA compiler (verified with clang 22).
- `CMAKE_CXX_STANDARD=20`.

## Building and running

```
CXX=clang++ cmake -S . -B build-cuda -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_STANDARD=20 \
-DBOOST_CAPY_BUILD_CUDA_EXAMPLES=ON \
-DCMAKE_CUDA_COMPILER=clang++ \
-DCMAKE_CUDA_HOST_COMPILER=clang++ \
-DCMAKE_CUDA_ARCHITECTURES=89 \
-DCUDAToolkit_ROOT=/opt/cuda
cmake --build build-cuda --config Release \
--target capy_example_cuda_notification_strategies
./build-cuda/example/cuda/notification-strategies/capy_example_cuda_notification_strategies
```

Replace `89` with your GPU's compute capability
(`nvidia-smi --query-gpu=compute_cap --format=csv,noheader`).

Unlike the sibling `cuda/datamovement` example, this one is meant to be
run. The pass condition is all three mechanisms printing the same
checksum and a zero exit code.
200 changes: 200 additions & 0 deletions example/cuda/notification-strategies/notification_strategies.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
//
// Copyright (c) 2026 Steve Gerbino
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Official repository: https://github.com/cppalliance/capy
//

// One GPU completion, three notification mechanisms.
//
// The IoAwaitable protocol does not care how completion is detected. The
// same pipeline (fill a device buffer, copy it back, await the stream)
// runs once per mechanism (host-function callback, event polling,
// deferred blocking synchronize) and all three must produce the same
// checksum. See README.md for the multi-threaded scaling tradeoff, which
// a single-GPU box cannot measure.

#include "notification_strategies.hpp"

#include <boost/capy.hpp>
#include <boost/capy/concept/io_awaitable.hpp>
#include <boost/capy/ex/thread_pool.hpp>

#include <cstdlib>
#include <iostream>
#include <latch>
#include <system_error>
#include <type_traits>
#include <vector>

namespace capy = boost::capy;
namespace ex = boost::capy::example;

static_assert(std::is_same_v<
decltype(ex::make_cuda_error(cudaSuccess)), std::error_code>);

// The whole point: three different mechanisms, all IoAwaitables.
static_assert(capy::IoAwaitable<ex::callback_awaitable>);
static_assert(capy::IoAwaitable<ex::poll_awaitable>);
static_assert(capy::IoAwaitable<ex::deferred_sync_awaitable>);

namespace {

constexpr int buffer_len = 256;
constexpr int fill_value = 7;

__global__ void
fill_kernel(int* p, int n, int v)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < n)
p[i] = v;
}

enum class notify
{
callback,
poll,
deferred_sync
};

char const*
name_of(notify how) noexcept
{
switch(how)
{
case notify::callback: return "callback";
case notify::poll: return "poll";
case notify::deferred_sync: return "deferred-sync";
}
return "?";
}

// Await the stream's completion using the selected mechanism. poll waits
// on the event; callback and deferred-sync wait on the stream.
capy::task<std::error_code>
wait(ex::cuda_stream& stream,
ex::cuda_event& event,
notify how,
ex::poll_service& poll_svc,
ex::sync_service& sync_svc)
{
switch(how)
{
case notify::callback:
co_return co_await stream.sync_via_callback();
case notify::poll:
co_return co_await event.sync_via_poll(poll_svc);
case notify::deferred_sync:
co_return co_await stream.sync_via_deferred(sync_svc);
}
co_return std::error_code{};
}

// Fill a device buffer, copy it back, await completion via `how`, and
// return the host-side checksum. Identical across mechanisms.
capy::task<long>
run_pipeline(ex::cuda_stream& stream,
ex::cuda_event& event,
notify how,
ex::poll_service& poll_svc,
ex::sync_service& sync_svc)
{
auto s = stream.native_handle();

int* d_buf = nullptr;
auto err = cudaMallocAsync(
reinterpret_cast<void**>(&d_buf),
buffer_len * sizeof(int), s);
if(err != cudaSuccess)
co_return -1;

fill_kernel<<<(buffer_len + 63) / 64, 64, 0, s>>>(
d_buf, buffer_len, fill_value);

std::vector<int> host(buffer_len, 0);
cudaMemcpyAsync(
host.data(), d_buf, buffer_len * sizeof(int),
cudaMemcpyDeviceToHost, s);
cudaFreeAsync(d_buf, s);
event.record(s);

auto ec = co_await wait(stream, event, how, poll_svc, sync_svc);
if(ec)
co_return -1;

long sum = 0;
for(int v : host)
sum += v;
co_return sum;
}

// Drive one pipeline run to completion on `pool` and return its checksum.
long
run_one(capy::thread_pool& pool,
ex::cuda_stream& stream,
ex::cuda_event& event,
notify how,
ex::poll_service& poll_svc,
ex::sync_service& sync_svc)
{
long result = 0;
std::latch done{1};
capy::run_async(pool.get_executor(),
[&](long r) { result = r; done.count_down(); })(
run_pipeline(stream, event, how, poll_svc, sync_svc));
done.wait();
return result;
}

} // namespace

int
main()
{
int device_count = 0;
if(cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0)
{
std::cout << "No CUDA device available.\n";
return EXIT_FAILURE;
}

// Declaration order fixes teardown: services stop before the pool
// they post to; the stream/event close first of all.
capy::thread_pool pool(4);
ex::poll_service poll_svc;
ex::sync_service sync_svc;
ex::cuda_stream stream;
ex::cuda_event event;

notify const modes[] =
{ notify::callback, notify::poll, notify::deferred_sync };

long first = 0;
bool ok = true;
std::cout << "mechanism checksum\n";
for(std::size_t i = 0; i < std::size(modes); ++i)
{
long r = run_one(pool, stream, event, modes[i], poll_svc, sync_svc);
std::cout << name_of(modes[i]) << " " << r << "\n";
if(i == 0)
first = r;
else if(r != first)
ok = false;
}

long const expected =
static_cast<long>(buffer_len) * fill_value;
if(! ok || first != expected)
{
std::cout << "MISMATCH: mechanisms disagree or wrong result "
"(expected " << expected << ")\n";
return EXIT_FAILURE;
}

std::cout << "All three mechanisms produced " << first
<< " (identical).\n";
return EXIT_SUCCESS;
}
Loading
Loading