From 1c9ba065daf175944340b07e272cbee6eb38d3d6 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Wed, 3 Jun 2026 23:40:29 -0400 Subject: [PATCH 1/8] cudax/stf: migrate internal/ launch + host_launch_scope to cuda_try Third internal/ slice, covering the kernel/host launch scopes and their shared event-timing pattern. - Convert eligible calls to the templated cuda_try form: cudaLaunchKernelExC, cudaGraphAddKernelNode (out-param -> ref), cudaGraphKernelNodeSetAttribute, cudaFreeAsync, cudaEventRecord (start), cudaGraphAddHostNode (out-param -> ref), cudaLaunchHostFunc. - cudaEventCreate and cudaMallocAsync stay in the runtime-status form: both are overload sets (cuda_runtime.h flags overload / templated wrapper), so cuda_try cannot name them. - Event timing's end record/synchronize/elapsed run inside the noexcept SCOPE(exit) body, so they keep cuda_safe_call: a CUDA error there should abort rather than throw through the guard (which would std::terminate). - The two stream-path cudaLaunchHostFunc enqueues now get a SCOPE(fail) that deletes the heap callback args (resolved / wrapper) if the enqueue throws -- the callback only takes ownership once the enqueue succeeds, so this closes the leak the new throw path would otherwise introduce. The graph-path host nodes are already covered because their args are owned by a ctx resource added before the node is created. Pre-existing and left as-is: the timing events created here are never cudaEventDestroy'd (a leak in the calibration path, unrelated to this change). --- .../__stf/internal/host_launch_scope.cuh | 30 ++++++++++++++----- .../experimental/__stf/internal/launch.cuh | 23 ++++++++------ 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh index 5c20dcfe4ba..93144f7c4e4 100644 --- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh @@ -247,9 +247,11 @@ public: { if (record_time) { - cuda_safe_call(cudaEventCreate(&start_event)); - cuda_safe_call(cudaEventCreate(&end_event)); - cuda_safe_call(cudaEventRecord(start_event, t.get_stream())); + // cudaEventCreate is an overload set (cuda_runtime.h adds a flags + // overload), so it keeps the runtime-status cuda_try form. + cuda_try(cudaEventCreate(&start_event)); + cuda_try(cudaEventCreate(&end_event)); + cuda_try(start_event, t.get_stream()); } } @@ -260,6 +262,8 @@ public: { if (record_time) { + // Inside the noexcept SCOPE(exit) body; keep cuda_safe_call so a CUDA + // error aborts rather than throwing through the guard. cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); cuda_safe_call(cudaEventSynchronize(end_event)); @@ -318,11 +322,17 @@ public: { cudaHostNodeParams params = {.fn = callback, .userData = resolved}; auto lock = t.lock_ctx_graph(); - cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, ¶ms)); + t.get_node() = cuda_try(t.get_ctx_graph(), nullptr, 0, ¶ms); } else { - cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, resolved)); + // The callback owns `resolved` once enqueued; delete it if the enqueue + // throws so it does not leak. + SCOPE(fail) + { + delete resolved; + }; + cuda_try(t.get_stream(), callback, resolved); } } else @@ -376,11 +386,17 @@ public: { cudaHostNodeParams params = {.fn = callback, .userData = wrapper}; auto lock = t.lock_ctx_graph(); - cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, ¶ms)); + t.get_node() = cuda_try(t.get_ctx_graph(), nullptr, 0, ¶ms); } else { - cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, wrapper)); + // The callback owns `wrapper` once enqueued; delete it if the enqueue + // throws so it does not leak. + SCOPE(fail) + { + delete wrapper; + }; + cuda_try(t.get_stream(), callback, wrapper); } } } diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index 2cc352adc43..bc3de9b7b13 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -64,7 +64,7 @@ void cuda_launcher(interpreted_spec interpreted_policy, Fun&& f, void** args, St lconfig.dynamicSmemBytes = mem_config[2]; lconfig.stream = stream; - cuda_safe_call(cudaLaunchKernelExC(&lconfig, (void*) f, args)); + cuda_try(&lconfig, (void*) f, args); } template @@ -81,7 +81,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar kconfig.kernelParams = args; kconfig.sharedMemBytes = static_cast(mem_config[2]); - cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig)); + n = cuda_try(g, nullptr, 0, &kconfig); // Enable cooperative kernel if necessary by updating the node attributes @@ -89,7 +89,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar cudaKernelNodeAttrValue val; val.cooperative = cooperative_kernel ? 1 : 0; - cuda_safe_call(cudaGraphKernelNodeSetAttribute(n, cudaKernelNodeAttributeCooperative, &val)); + cuda_try(n, cudaKernelNodeAttributeCooperative, &val); } template @@ -120,7 +120,7 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg if (th_mem_config[1] > 0) { - cuda_safe_call(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream)); + cuda_try(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream)); th.set_device_tmp(th_dev_tmp_ptr); } @@ -132,7 +132,7 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg if (th_mem_config[1] > 0) { - cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream)); + cuda_try(th_dev_tmp_ptr, stream); } }; } @@ -366,10 +366,12 @@ public: if (record_time) { cudaGetDevice(&device); // We will use this to force it during the next run - // Events must be created here to avoid issues with multi-gpu - cuda_safe_call(cudaEventCreate(&start_event)); - cuda_safe_call(cudaEventCreate(&end_event)); - cuda_safe_call(cudaEventRecord(start_event, t.get_stream())); + // Events must be created here to avoid issues with multi-gpu. + // cudaEventCreate keeps the runtime-status form: it is an overload set + // (cuda_runtime.h adds a flags overload), so cuda_try cannot name it. + cuda_try(cudaEventCreate(&start_event)); + cuda_try(cudaEventCreate(&end_event)); + cuda_try(start_event, t.get_stream()); } } @@ -405,6 +407,9 @@ public: if (record_time) { + // These run inside the enclosing SCOPE(exit) body, which is noexcept; + // keep cuda_safe_call so a CUDA error aborts rather than throwing + // through the guard (which would call std::terminate). cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); cuda_safe_call(cudaEventSynchronize(end_event)); From d2105b3f696bacab5ba9b86f01e3ad7668e002b1 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Wed, 3 Jun 2026 23:47:20 -0400 Subject: [PATCH 2/8] cudax/stf: use cuda_try for timing event creation cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload), so it cannot be named by the templated cuda_try form. Use the non-overloaded cudaEventCreateWithFlags with cudaEventDefault instead, which is exactly what cudaEventCreate(&e) does internally, so behavior is unchanged while keeping the templated form. --- .../experimental/__stf/internal/host_launch_scope.cuh | 9 +++++---- .../include/cuda/experimental/__stf/internal/launch.cuh | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh index 93144f7c4e4..566a7ecaf2e 100644 --- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh @@ -247,10 +247,11 @@ public: { if (record_time) { - // cudaEventCreate is an overload set (cuda_runtime.h adds a flags - // overload), so it keeps the runtime-status cuda_try form. - cuda_try(cudaEventCreate(&start_event)); - cuda_try(cudaEventCreate(&end_event)); + // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload), + // so cuda_try cannot name it; use the non-overloaded + // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate). + start_event = cuda_try(cudaEventDefault); + end_event = cuda_try(cudaEventDefault); cuda_try(start_event, t.get_stream()); } } diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index bc3de9b7b13..7ff921fe892 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -367,10 +367,11 @@ public: { cudaGetDevice(&device); // We will use this to force it during the next run // Events must be created here to avoid issues with multi-gpu. - // cudaEventCreate keeps the runtime-status form: it is an overload set - // (cuda_runtime.h adds a flags overload), so cuda_try cannot name it. - cuda_try(cudaEventCreate(&start_event)); - cuda_try(cudaEventCreate(&end_event)); + // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload), + // so cuda_try cannot name it; use the non-overloaded + // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate). + start_event = cuda_try(cudaEventDefault); + end_event = cuda_try(cudaEventDefault); cuda_try(start_event, t.get_stream()); } } From 39c1cabd4986d54bae6fa3aaaaae01dc4f1d02ff Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Thu, 4 Jun 2026 00:12:31 -0400 Subject: [PATCH 3/8] cudax/stf: fix host_launch callback-arg ownership ordering The host_launch callback args (resolved / wrapper) are heap-allocated and guarded by SCOPE(fail) { delete ...; }. Transfer of ownership to the graph-path ctx resource was happening in the wrong order: - Untyped path: `resolved` was set to nullptr right after add_resource, but it is also used as the host node's userData. That made the graph node receive a null userData, so the callback dereferenced null on the success path. - Typed path: add_resource ran before cudaGraphAddHostNode, so a throw from the node creation would delete `wrapper` twice (SCOPE(fail) plus the resource's release_in_callback). Fix both by creating the host node first (while resolved/wrapper is still a valid userData), then handing ownership to the ctx resource, then nulling the pointer once at the end to disarm SCOPE(fail). On a throw before that point the resource has not been added, so SCOPE(fail) is the sole owner and frees the args exactly once. --- .../__stf/internal/host_launch_scope.cuh | 74 ++++++++++--------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh index 566a7ecaf2e..da8c7495aeb 100644 --- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh @@ -289,7 +289,13 @@ public: { // --- Untyped dispatch path --- auto* resolved = new ::std::pair{::std::forward(f), host_launch_deps{}}; - auto& hld = resolved->second; + // Whenever we pass ownership we assigned nullptr to resolved. + SCOPE(fail) + { + delete resolved; + }; + + auto& hld = resolved->second; const size_t ndeps = deps.size(); hld.lds_.resize(ndeps); @@ -303,20 +309,16 @@ public: hld.dtor_ = user_data_dtor_; user_data_dtor_ = nullptr; - if constexpr (::std::is_same_v) - { - using wrapper_type = ::std::remove_reference_t; - auto resource = ::std::make_shared>(resolved); - ctx.add_resource(mv(resource)); - } - auto callback = [](void* raw) { auto* w = static_cast(raw); - w->first(w->second); - if constexpr (!::std::is_same_v) + SCOPE(exit) { - delete w; - } + if constexpr (!::std::is_same_v) + { + delete w; + } + }; + w->first(w->second); }; if constexpr (::std::is_same_v) @@ -324,17 +326,21 @@ public: cudaHostNodeParams params = {.fn = callback, .userData = resolved}; auto lock = t.lock_ctx_graph(); t.get_node() = cuda_try(t.get_ctx_graph(), nullptr, 0, ¶ms); + // The node now references `resolved`; hand ownership to a ctx resource + // that deletes it (in release_in_callback) when the ctx is released. + using wrapper_type = ::std::remove_reference_t; + ctx.add_resource(::std::make_shared>(resolved)); } else { - // The callback owns `resolved` once enqueued; delete it if the enqueue - // throws so it does not leak. - SCOPE(fail) - { - delete resolved; - }; + // For a stream the callback owns `resolved` once the launch succeeds. cuda_try(t.get_stream(), callback, resolved); } + // Ownership has transferred (to the ctx resource for graph, or to the + // callback for stream). These enqueues are asynchronous, so on a throw + // above the callback has not run and SCOPE(fail) still owns `resolved`; + // past this point disarm it. + resolved = nullptr; } else { @@ -350,16 +356,20 @@ public: } }(); auto* wrapper = new ::std::pair{::std::forward(f), mv(payload)}; - - if constexpr (::std::is_same_v) + SCOPE(fail) { - using wrapper_type = ::std::remove_reference_t; - auto resource = ::std::make_shared>(wrapper); - ctx.add_resource(mv(resource)); - } + delete wrapper; + }; auto callback = [](void* untyped_wrapper) { auto w = static_cast(untyped_wrapper); + SCOPE(exit) + { + if constexpr (!::std::is_same_v) + { + delete w; + } + }; constexpr bool fun_invocable_task_deps = reserved::is_applicable_v; constexpr bool fun_invocable_task_non_void_deps = @@ -376,11 +386,6 @@ public: { ::std::apply(::std::forward(w->first), reserved::remove_void_interface(mv(w->second))); } - - if constexpr (!::std::is_same_v) - { - delete w; - } }; if constexpr (::std::is_same_v) @@ -388,17 +393,16 @@ public: cudaHostNodeParams params = {.fn = callback, .userData = wrapper}; auto lock = t.lock_ctx_graph(); t.get_node() = cuda_try(t.get_ctx_graph(), nullptr, 0, ¶ms); + // Transfer ownership only after the node references `wrapper`, so a throw + // from cudaGraphAddHostNode leaves SCOPE(fail) as the sole owner. + using wrapper_type = ::std::remove_reference_t; + ctx.add_resource(::std::make_shared>(wrapper)); } else { - // The callback owns `wrapper` once enqueued; delete it if the enqueue - // throws so it does not leak. - SCOPE(fail) - { - delete wrapper; - }; cuda_try(t.get_stream(), callback, wrapper); } + wrapper = nullptr; } } From d679c2390eeddcfb9a870d7cc75f2b12022a2c6e Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Thu, 4 Jun 2026 00:23:02 -0400 Subject: [PATCH 4/8] cudax/stf: own host_launch callback args with unique_ptr Replace the raw new + SCOPE(fail){delete} + manual nulling design for the host_launch callback arguments with std::unique_ptr. The args are borrowed via .get() for the host node userData / cudaLaunchHostFunc argument and the ctx resource, and ownership is handed off with .release() once the node has been created (graph) or the launch has been enqueued (stream). On a throw before that point the unique_ptr frees the args; afterwards the ctx resource (graph) or the callback (stream) owns and frees them. Adds . --- .../__stf/internal/host_launch_scope.cuh | 49 ++++++++----------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh index da8c7495aeb..4ba2b2c34c9 100644 --- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh @@ -35,6 +35,7 @@ #include #include +#include #include namespace cuda::experimental::stf @@ -288,12 +289,8 @@ public: if constexpr (fun_invocable_untyped) { // --- Untyped dispatch path --- - auto* resolved = new ::std::pair{::std::forward(f), host_launch_deps{}}; - // Whenever we pass ownership we assigned nullptr to resolved. - SCOPE(fail) - { - delete resolved; - }; + auto resolved = + ::std::make_unique<::std::pair>(::std::forward(f), host_launch_deps{}); auto& hld = resolved->second; @@ -310,7 +307,7 @@ public: user_data_dtor_ = nullptr; auto callback = [](void* raw) { - auto* w = static_cast(raw); + auto* w = static_cast(raw); SCOPE(exit) { if constexpr (!::std::is_same_v) @@ -323,24 +320,24 @@ public: if constexpr (::std::is_same_v) { - cudaHostNodeParams params = {.fn = callback, .userData = resolved}; + cudaHostNodeParams params = {.fn = callback, .userData = resolved.get()}; auto lock = t.lock_ctx_graph(); t.get_node() = cuda_try(t.get_ctx_graph(), nullptr, 0, ¶ms); - // The node now references `resolved`; hand ownership to a ctx resource - // that deletes it (in release_in_callback) when the ctx is released. + // The node now references the args; hand ownership to a ctx resource + // that deletes them (in release_in_callback) when the ctx is released. using wrapper_type = ::std::remove_reference_t; - ctx.add_resource(::std::make_shared>(resolved)); + ctx.add_resource(::std::make_shared>(resolved.get())); } else { - // For a stream the callback owns `resolved` once the launch succeeds. - cuda_try(t.get_stream(), callback, resolved); + // For a stream the callback owns the args once the launch succeeds. + cuda_try(t.get_stream(), callback, resolved.get()); } // Ownership has transferred (to the ctx resource for graph, or to the // callback for stream). These enqueues are asynchronous, so on a throw - // above the callback has not run and SCOPE(fail) still owns `resolved`; - // past this point disarm it. - resolved = nullptr; + // above the callback has not run and the unique_ptr still owns the args; + // release it now that ownership has moved on. + resolved.release(); } else { @@ -355,14 +352,10 @@ public: return deps.instance(t); } }(); - auto* wrapper = new ::std::pair{::std::forward(f), mv(payload)}; - SCOPE(fail) - { - delete wrapper; - }; + auto wrapper = ::std::make_unique<::std::pair>(::std::forward(f), mv(payload)); auto callback = [](void* untyped_wrapper) { - auto w = static_cast(untyped_wrapper); + auto w = static_cast(untyped_wrapper); SCOPE(exit) { if constexpr (!::std::is_same_v) @@ -390,19 +383,19 @@ public: if constexpr (::std::is_same_v) { - cudaHostNodeParams params = {.fn = callback, .userData = wrapper}; + cudaHostNodeParams params = {.fn = callback, .userData = wrapper.get()}; auto lock = t.lock_ctx_graph(); t.get_node() = cuda_try(t.get_ctx_graph(), nullptr, 0, ¶ms); - // Transfer ownership only after the node references `wrapper`, so a throw - // from cudaGraphAddHostNode leaves SCOPE(fail) as the sole owner. + // Transfer ownership only after the node references the args, so a throw + // from cudaGraphAddHostNode leaves the unique_ptr as the sole owner. using wrapper_type = ::std::remove_reference_t; - ctx.add_resource(::std::make_shared>(wrapper)); + ctx.add_resource(::std::make_shared>(wrapper.get())); } else { - cuda_try(t.get_stream(), callback, wrapper); + cuda_try(t.get_stream(), callback, wrapper.get()); } - wrapper = nullptr; + wrapper.release(); } } From 70f0713392cd0fa6a43bb854b0a6a0a27c0314d6 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Thu, 4 Jun 2026 00:27:44 -0400 Subject: [PATCH 5/8] cudax/stf: free launch temp device memory via SCOPE(exit) launch_impl allocates a temporary device buffer (cudaMallocAsync) and freed it after cuda_launcher returned. Now that cuda_launcher throws on error (via cuda_try), the trailing cudaFreeAsync was skipped on a throw, leaking the buffer. Free it from a SCOPE(exit) placed right after the allocation so it runs on both normal and exceptional exit. cuda_safe_call is used inside the noexcept SCOPE(exit) body. --- .../cuda/experimental/__stf/internal/launch.cuh | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index 7ff921fe892..de4d924f9a5 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -124,16 +124,21 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg th.set_device_tmp(th_dev_tmp_ptr); } + // Free the temporary device memory on the way out, even if the launch throws. + // cuda_safe_call (not cuda_try) because SCOPE(exit) is noexcept. + SCOPE(exit) + { + if (th_dev_tmp_ptr) + { + cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream)); + } + }; + auto kernel_args = tuple_prepend(mv(th), mv(arg)); using args_type = decltype(kernel_args); void* all_args[] = {&f, &kernel_args}; cuda_launcher(interpreted_policy, reserved::launch_kernel, all_args, stream); - - if (th_mem_config[1] > 0) - { - cuda_try(th_dev_tmp_ptr, stream); - } }; } From 92c226060d84a612979567ba7c4cd700fe705800 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Thu, 4 Jun 2026 00:35:49 -0400 Subject: [PATCH 6/8] cudax/stf: check cudaGetDevice in launch timing path The cudaGetDevice call in the timing branch was unchecked. Use the templated cuda_try form so a failure is reported. --- cudax/include/cuda/experimental/__stf/internal/launch.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index de4d924f9a5..c6eb02773af 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -363,14 +363,14 @@ public: nvtx_range nr(t.get_symbol().c_str()); t.start(); - int device; - cudaEvent_t start_event, end_event; + int device = -1; + cudaEvent_t start_event = nullptr, end_event = nullptr; if constexpr (::std::is_same_v) { if (record_time) { - cudaGetDevice(&device); // We will use this to force it during the next run + device = cuda_try(); // We will use this to force it during the next run // Events must be created here to avoid issues with multi-gpu. // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload), // so cuda_try cannot name it; use the non-overloaded From 14834d8ec73cdb9155fdaa3abaeb2e874688cbb9 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Thu, 4 Jun 2026 18:23:42 -0400 Subject: [PATCH 7/8] [STF] Initialize host_launch_scope timing events to nullptr Match launch.cuh and satisfy GCC -Wmaybe-uninitialized: if cuda_try throws before both events are created, SCOPE(exit) still runs with record_time set. --- .../cuda/experimental/__stf/internal/host_launch_scope.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh index 4ba2b2c34c9..ff49075cd66 100644 --- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh @@ -239,7 +239,7 @@ public: t.set_symbol(symbol); } - cudaEvent_t start_event, end_event; + cudaEvent_t start_event = nullptr, end_event = nullptr; const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file(); t.start(); From 7a9d75f1a1e651398b4c549036430ea4d6802d86 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Thu, 4 Jun 2026 21:45:38 -0400 Subject: [PATCH 8/8] cudax/stf: install task cleanup guard before throwing timing calls After the cuda_safe_call -> cuda_try migration, the timing-event setup (cudaGetDevice / cudaEventCreateWithFlags / cudaEventRecord) can throw, but in launch.cuh and host_launch_scope.cuh it ran *before* the SCOPE(exit) that calls t.end_uncleared()/t.clear(). A throw there left the task half-open and leaked any partially-created event. Install the SCOPE(exit) guard first, then do the timing setup, and gate the end-of-scope timing teardown on a new timing_active flag that is only set once both events exist and the start event has been recorded. On a throw from the timing cuda_try calls the guard now runs the task cleanup, and the teardown is skipped because timing_active is still false. No success-path behavior change. Addresses CodeRabbit review comments on PR #9249. --- .../__stf/internal/host_launch_scope.cuh | 33 ++++++++++------- .../experimental/__stf/internal/launch.cuh | 37 +++++++++++-------- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh index ff49075cd66..aeae4c78454 100644 --- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh @@ -241,28 +241,19 @@ public: cudaEvent_t start_event = nullptr, end_event = nullptr; const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file(); + // Set only once both timing events exist and the start event has been recorded. + // The timing setup is done below, after the SCOPE(exit) guard is installed, so a + // throw from those cuda_try calls cannot skip t.end_uncleared()/t.clear(). + bool timing_active = false; t.start(); - if constexpr (::std::is_same_v) - { - if (record_time) - { - // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload), - // so cuda_try cannot name it; use the non-overloaded - // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate). - start_event = cuda_try(cudaEventDefault); - end_event = cuda_try(cudaEventDefault); - cuda_try(start_event, t.get_stream()); - } - } - SCOPE(exit) { t.end_uncleared(); if constexpr (::std::is_same_v) { - if (record_time) + if (timing_active) { // Inside the noexcept SCOPE(exit) body; keep cuda_safe_call so a CUDA // error aborts rather than throwing through the guard. @@ -286,6 +277,20 @@ public: t.clear(); }; + if constexpr (::std::is_same_v) + { + if (record_time) + { + // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload), + // so cuda_try cannot name it; use the non-overloaded + // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate). + start_event = cuda_try(cudaEventDefault); + end_event = cuda_try(cudaEventDefault); + cuda_try(start_event, t.get_stream()); + timing_active = true; + } + } + if constexpr (fun_invocable_untyped) { // --- Untyped dispatch path --- diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index c6eb02773af..b4b88714ad9 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -365,21 +365,10 @@ public: int device = -1; cudaEvent_t start_event = nullptr, end_event = nullptr; - - if constexpr (::std::is_same_v) - { - if (record_time) - { - device = cuda_try(); // We will use this to force it during the next run - // Events must be created here to avoid issues with multi-gpu. - // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload), - // so cuda_try cannot name it; use the non-overloaded - // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate). - start_event = cuda_try(cudaEventDefault); - end_event = cuda_try(cudaEventDefault); - cuda_try(start_event, t.get_stream()); - } - } + // Set only once both timing events exist and the start event has been recorded. + // The timing setup is done below, after the SCOPE(exit) guard is installed, so a + // throw from those cuda_try calls cannot skip t.end_uncleared()/t.clear(). + bool timing_active = false; const size_t grid_size = e_place.size(); @@ -411,7 +400,7 @@ public: deallocateManagedMemory(hostMemoryArrivedList, grid_size, t.get_stream()); } - if (record_time) + if (timing_active) { // These run inside the enclosing SCOPE(exit) body, which is noexcept; // keep cuda_safe_call so a CUDA error aborts rather than throwing @@ -437,6 +426,22 @@ public: t.clear(); }; + if constexpr (::std::is_same_v) + { + if (record_time) + { + device = cuda_try(); // We will use this to force it during the next run + // Events must be created here to avoid issues with multi-gpu. + // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload), + // so cuda_try cannot name it; use the non-overloaded + // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate). + start_event = cuda_try(cudaEventDefault); + end_event = cuda_try(cudaEventDefault); + cuda_try(start_event, t.get_stream()); + timing_active = true; + } + } + /* Should only be allocated / deallocated if the last level used is system wide. Unnecessary and wasteful * otherwise. */ if (grid_size > 1)