Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 65 additions & 46 deletions cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include <cuda/experimental/__stf/internal/thread_hierarchy.cuh>
#include <cuda/experimental/__stf/internal/void_interface.cuh>

#include <memory>
#include <type_traits>

namespace cuda::experimental::stf
Expand Down Expand Up @@ -238,28 +239,24 @@ public:
t.set_symbol(symbol);
}

cudaEvent_t start_event, end_event;
cudaEvent_t start_event = nullptr, end_event = nullptr;
const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file();
// Set only once both timing events exist and the start event has been recorded.
// The timing setup is done below, after the SCOPE(exit) guard is installed, so a
// throw from those cuda_try calls cannot skip t.end_uncleared()/t.clear().
bool timing_active = false;

t.start();

if constexpr (::std::is_same_v<Ctx, stream_ctx>)
{
if (record_time)
{
cuda_safe_call(cudaEventCreate(&start_event));
cuda_safe_call(cudaEventCreate(&end_event));
cuda_safe_call(cudaEventRecord(start_event, t.get_stream()));
}
}

SCOPE(exit)
{
t.end_uncleared();
if constexpr (::std::is_same_v<Ctx, stream_ctx>)
{
if (record_time)
if (timing_active)
{
// Inside the noexcept SCOPE(exit) body; keep cuda_safe_call so a CUDA
// error aborts rather than throwing through the guard.
cuda_safe_call(cudaEventRecord(end_event, t.get_stream()));
cuda_safe_call(cudaEventSynchronize(end_event));

Expand All @@ -280,11 +277,27 @@ public:
t.clear();
};

if constexpr (::std::is_same_v<Ctx, stream_ctx>)
{
if (record_time)
{
// cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
// so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded
// cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate).
start_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
end_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
cuda_try<cudaEventRecord>(start_event, t.get_stream());
timing_active = true;
}
}

if constexpr (fun_invocable_untyped)
{
// --- Untyped dispatch path ---
auto* resolved = new ::std::pair<Fun, host_launch_deps>{::std::forward<Fun>(f), host_launch_deps{}};
auto& hld = resolved->second;
auto resolved =
::std::make_unique<::std::pair<Fun, host_launch_deps>>(::std::forward<Fun>(f), host_launch_deps{});

auto& hld = resolved->second;

const size_t ndeps = deps.size();
hld.lds_.resize(ndeps);
Expand All @@ -298,32 +311,38 @@ public:
hld.dtor_ = user_data_dtor_;
user_data_dtor_ = nullptr;

if constexpr (::std::is_same_v<Ctx, graph_ctx>)
{
using wrapper_type = ::std::remove_reference_t<decltype(*resolved)>;
auto resource = ::std::make_shared<host_callback_args_resource<wrapper_type>>(resolved);
ctx.add_resource(mv(resource));
}

auto callback = [](void* raw) {
auto* w = static_cast<decltype(resolved)>(raw);
w->first(w->second);
if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
auto* w = static_cast<decltype(resolved.get())>(raw);
SCOPE(exit)
{
delete w;
}
if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
{
delete w;
}
};
w->first(w->second);
};

if constexpr (::std::is_same_v<Ctx, graph_ctx>)
{
cudaHostNodeParams params = {.fn = callback, .userData = resolved};
cudaHostNodeParams params = {.fn = callback, .userData = resolved.get()};
auto lock = t.lock_ctx_graph();
cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, &params));
t.get_node() = cuda_try<cudaGraphAddHostNode>(t.get_ctx_graph(), nullptr, 0, &params);
// The node now references the args; hand ownership to a ctx resource
// that deletes them (in release_in_callback) when the ctx is released.
using wrapper_type = ::std::remove_reference_t<decltype(*resolved)>;
ctx.add_resource(::std::make_shared<host_callback_args_resource<wrapper_type>>(resolved.get()));
}
else
{
cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, resolved));
// For a stream the callback owns the args once the launch succeeds.
cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, resolved.get());
}
// Ownership has transferred (to the ctx resource for graph, or to the
// callback for stream). These enqueues are asynchronous, so on a throw
// above the callback has not run and the unique_ptr still owns the args;
// release it now that ownership has moved on.
resolved.release();
}
else
{
Expand All @@ -338,17 +357,17 @@ public:
return deps.instance(t);
}
}();
auto* wrapper = new ::std::pair<Fun, decltype(payload)>{::std::forward<Fun>(f), mv(payload)};

if constexpr (::std::is_same_v<Ctx, graph_ctx>)
{
using wrapper_type = ::std::remove_reference_t<decltype(*wrapper)>;
auto resource = ::std::make_shared<host_callback_args_resource<wrapper_type>>(wrapper);
ctx.add_resource(mv(resource));
}
auto wrapper = ::std::make_unique<::std::pair<Fun, decltype(payload)>>(::std::forward<Fun>(f), mv(payload));

auto callback = [](void* untyped_wrapper) {
auto w = static_cast<decltype(wrapper)>(untyped_wrapper);
auto w = static_cast<decltype(wrapper.get())>(untyped_wrapper);
SCOPE(exit)
{
if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
{
delete w;
}
};

constexpr bool fun_invocable_task_deps = reserved::is_applicable_v<Fun, decltype(payload)>;
constexpr bool fun_invocable_task_non_void_deps =
Expand All @@ -365,23 +384,23 @@ public:
{
::std::apply(::std::forward<Fun>(w->first), reserved::remove_void_interface(mv(w->second)));
}

if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
{
delete w;
}
};

if constexpr (::std::is_same_v<Ctx, graph_ctx>)
{
cudaHostNodeParams params = {.fn = callback, .userData = wrapper};
cudaHostNodeParams params = {.fn = callback, .userData = wrapper.get()};
auto lock = t.lock_ctx_graph();
cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, &params));
t.get_node() = cuda_try<cudaGraphAddHostNode>(t.get_ctx_graph(), nullptr, 0, &params);
// Transfer ownership only after the node references the args, so a throw
// from cudaGraphAddHostNode leaves the unique_ptr as the sole owner.
using wrapper_type = ::std::remove_reference_t<decltype(*wrapper)>;
ctx.add_resource(::std::make_shared<host_callback_args_resource<wrapper_type>>(wrapper.get()));
}
else
{
cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, wrapper));
cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, wrapper.get());
}
wrapper.release();
}
}

Expand Down
64 changes: 40 additions & 24 deletions cudax/include/cuda/experimental/__stf/internal/launch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ void cuda_launcher(interpreted_spec interpreted_policy, Fun&& f, void** args, St
lconfig.dynamicSmemBytes = mem_config[2];
lconfig.stream = stream;

cuda_safe_call(cudaLaunchKernelExC(&lconfig, (void*) f, args));
cuda_try<cudaLaunchKernelExC>(&lconfig, (void*) f, args);
}

template <typename interpreted_spec, typename Fun>
Expand All @@ -81,15 +81,15 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar
kconfig.kernelParams = args;
kconfig.sharedMemBytes = static_cast<int>(mem_config[2]);

cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig));
n = cuda_try<cudaGraphAddKernelNode>(g, nullptr, 0, &kconfig);

// Enable cooperative kernel if necessary by updating the node attributes

bool cooperative_kernel = interpreted_policy.need_cooperative_kernel_launch();

cudaKernelNodeAttrValue val;
val.cooperative = cooperative_kernel ? 1 : 0;
cuda_safe_call(cudaGraphKernelNodeSetAttribute(n, cudaKernelNodeAttributeCooperative, &val));
cuda_try<cudaGraphKernelNodeSetAttribute>(n, cudaKernelNodeAttributeCooperative, &val);
}

template <typename Fun, typename interpreted_spec, typename Arg>
Expand Down Expand Up @@ -120,20 +120,25 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg

if (th_mem_config[1] > 0)
{
cuda_safe_call(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream));
cuda_try(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream));
th.set_device_tmp(th_dev_tmp_ptr);
}

// Free the temporary device memory on the way out, even if the launch throws.
// cuda_safe_call (not cuda_try) because SCOPE(exit) is noexcept.
SCOPE(exit)
{
if (th_dev_tmp_ptr)
{
cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream));
}
};

auto kernel_args = tuple_prepend(mv(th), mv(arg));
using args_type = decltype(kernel_args);
void* all_args[] = {&f, &kernel_args};

cuda_launcher(interpreted_policy, reserved::launch_kernel<Fun, args_type>, all_args, stream);

if (th_mem_config[1] > 0)
{
cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream));
}
};
}

Expand Down Expand Up @@ -358,20 +363,12 @@ public:
nvtx_range nr(t.get_symbol().c_str());
t.start();

int device;
cudaEvent_t start_event, end_event;

if constexpr (::std::is_same_v<Ctx, stream_ctx>)
{
if (record_time)
{
cudaGetDevice(&device); // We will use this to force it during the next run
// Events must be created here to avoid issues with multi-gpu
cuda_safe_call(cudaEventCreate(&start_event));
cuda_safe_call(cudaEventCreate(&end_event));
cuda_safe_call(cudaEventRecord(start_event, t.get_stream()));
}
}
int device = -1;
cudaEvent_t start_event = nullptr, end_event = nullptr;
// Set only once both timing events exist and the start event has been recorded.
// The timing setup is done below, after the SCOPE(exit) guard is installed, so a
// throw from those cuda_try calls cannot skip t.end_uncleared()/t.clear().
bool timing_active = false;

const size_t grid_size = e_place.size();

Expand Down Expand Up @@ -403,8 +400,11 @@ public:
deallocateManagedMemory(hostMemoryArrivedList, grid_size, t.get_stream());
}

if (record_time)
if (timing_active)
{
// These run inside the enclosing SCOPE(exit) body, which is noexcept;
// keep cuda_safe_call so a CUDA error aborts rather than throwing
// through the guard (which would call std::terminate).
cuda_safe_call(cudaEventRecord(end_event, t.get_stream()));
cuda_safe_call(cudaEventSynchronize(end_event));

Expand All @@ -426,6 +426,22 @@ public:
t.clear();
};

if constexpr (::std::is_same_v<Ctx, stream_ctx>)
{
if (record_time)
{
device = cuda_try<cudaGetDevice>(); // We will use this to force it during the next run
// Events must be created here to avoid issues with multi-gpu.
// cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
// so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded
// cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate).
start_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
end_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
cuda_try<cudaEventRecord>(start_event, t.get_stream());
timing_active = true;
}
}

/* Should only be allocated / deallocated if the last level used is system wide. Unnecessary and wasteful
* otherwise. */
if (grid_size > 1)
Expand Down
Loading