From 1c9ba065daf175944340b07e272cbee6eb38d3d6 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Wed, 3 Jun 2026 23:40:29 -0400
Subject: [PATCH 1/8] cudax/stf: migrate internal/ launch + host_launch_scope
 to cuda_try

Third internal/ slice, covering the kernel/host launch scopes and their
shared event-timing pattern.

- Convert eligible calls to the templated cuda_try<F> form: cudaLaunchKernelExC,
  cudaGraphAddKernelNode (out-param -> ref), cudaGraphKernelNodeSetAttribute,
  cudaFreeAsync, cudaEventRecord (start), cudaGraphAddHostNode (out-param -> ref),
  cudaLaunchHostFunc.
- cudaEventCreate and cudaMallocAsync stay in the runtime-status form: both are
  overload sets (cuda_runtime.h flags overload / templated wrapper), so
  cuda_try<F> cannot name them.
- Event timing's end record/synchronize/elapsed run inside the noexcept
  SCOPE(exit) body, so they keep cuda_safe_call: a CUDA error there should
  abort rather than throw through the guard (which would std::terminate).
- The two stream-path cudaLaunchHostFunc enqueues now get a SCOPE(fail) that
  deletes the heap callback args (resolved / wrapper) if the enqueue throws --
  the callback only takes ownership once the enqueue succeeds, so this closes
  the leak the new throw path would otherwise introduce. The graph-path host
  nodes are already covered because their args are owned by a ctx resource added
  before the node is created.

Pre-existing and left as-is: the timing events created here are never
cudaEventDestroy'd (a leak in the calibration path, unrelated to this change).
---
 .../__stf/internal/host_launch_scope.cuh      | 30 ++++++++++++++-----
 .../experimental/__stf/internal/launch.cuh    | 23 ++++++++------
 2 files changed, 37 insertions(+), 16 deletions(-)
diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
index 5c20dcfe4ba..93144f7c4e4 100644
--- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
@@ -247,9 +247,11 @@ public:
     {
       if (record_time)
       {
-        cuda_safe_call(cudaEventCreate(&start_event));
-        cuda_safe_call(cudaEventCreate(&end_event));
-        cuda_safe_call(cudaEventRecord(start_event, t.get_stream()));
+        // cudaEventCreate is an overload set (cuda_runtime.h adds a flags
+        // overload), so it keeps the runtime-status cuda_try form.
+        cuda_try(cudaEventCreate(&start_event));
+        cuda_try(cudaEventCreate(&end_event));
+        cuda_try<cudaEventRecord>(start_event, t.get_stream());
       }
     }
 
@@ -260,6 +262,8 @@ public:
       {
         if (record_time)
         {
+          // Inside the noexcept SCOPE(exit) body; keep cuda_safe_call so a CUDA
+          // error aborts rather than throwing through the guard.
           cuda_safe_call(cudaEventRecord(end_event, t.get_stream()));
           cuda_safe_call(cudaEventSynchronize(end_event));
 
@@ -318,11 +322,17 @@ public:
       {
         cudaHostNodeParams params = {.fn = callback, .userData = resolved};
         auto lock                 = t.lock_ctx_graph();
-        cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, &params));
+        t.get_node()              = cuda_try<cudaGraphAddHostNode>(t.get_ctx_graph(), nullptr, 0, &params);
       }
       else
       {
-        cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, resolved));
+        // The callback owns `resolved` once enqueued; delete it if the enqueue
+        // throws so it does not leak.
+        SCOPE(fail)
+        {
+          delete resolved;
+        };
+        cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, resolved);
       }
     }
     else
@@ -376,11 +386,17 @@ public:
       {
         cudaHostNodeParams params = {.fn = callback, .userData = wrapper};
         auto lock                 = t.lock_ctx_graph();
-        cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, &params));
+        t.get_node()              = cuda_try<cudaGraphAddHostNode>(t.get_ctx_graph(), nullptr, 0, &params);
       }
       else
       {
-        cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, wrapper));
+        // The callback owns `wrapper` once enqueued; delete it if the enqueue
+        // throws so it does not leak.
+        SCOPE(fail)
+        {
+          delete wrapper;
+        };
+        cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, wrapper);
       }
     }
   }
diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
index 2cc352adc43..bc3de9b7b13 100644
--- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -64,7 +64,7 @@ void cuda_launcher(interpreted_spec interpreted_policy, Fun&& f, void** args, St
   lconfig.dynamicSmemBytes = mem_config[2];
   lconfig.stream           = stream;
 
-  cuda_safe_call(cudaLaunchKernelExC(&lconfig, (void*) f, args));
+  cuda_try<cudaLaunchKernelExC>(&lconfig, (void*) f, args);
 }
 
 template <typename interpreted_spec, typename Fun>
@@ -81,7 +81,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar
   kconfig.kernelParams   = args;
   kconfig.sharedMemBytes = static_cast<int>(mem_config[2]);
 
-  cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig));
+  n = cuda_try<cudaGraphAddKernelNode>(g, nullptr, 0, &kconfig);
 
   // Enable cooperative kernel if necessary by updating the node attributes
 
@@ -89,7 +89,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar
 
   cudaKernelNodeAttrValue val;
   val.cooperative = cooperative_kernel ? 1 : 0;
-  cuda_safe_call(cudaGraphKernelNodeSetAttribute(n, cudaKernelNodeAttributeCooperative, &val));
+  cuda_try<cudaGraphKernelNodeSetAttribute>(n, cudaKernelNodeAttributeCooperative, &val);
 }
 
 template <typename Fun, typename interpreted_spec, typename Arg>
@@ -120,7 +120,7 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg
 
     if (th_mem_config[1] > 0)
     {
-      cuda_safe_call(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream));
+      cuda_try(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream));
       th.set_device_tmp(th_dev_tmp_ptr);
     }
 
@@ -132,7 +132,7 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg
 
     if (th_mem_config[1] > 0)
     {
-      cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream));
+      cuda_try<cudaFreeAsync>(th_dev_tmp_ptr, stream);
     }
   };
 }
@@ -366,10 +366,12 @@ public:
       if (record_time)
       {
         cudaGetDevice(&device); // We will use this to force it during the next run
-        // Events must be created here to avoid issues with multi-gpu
-        cuda_safe_call(cudaEventCreate(&start_event));
-        cuda_safe_call(cudaEventCreate(&end_event));
-        cuda_safe_call(cudaEventRecord(start_event, t.get_stream()));
+        // Events must be created here to avoid issues with multi-gpu.
+        // cudaEventCreate keeps the runtime-status form: it is an overload set
+        // (cuda_runtime.h adds a flags overload), so cuda_try<cudaEventCreate> cannot name it.
+        cuda_try(cudaEventCreate(&start_event));
+        cuda_try(cudaEventCreate(&end_event));
+        cuda_try<cudaEventRecord>(start_event, t.get_stream());
       }
     }
 
@@ -405,6 +407,9 @@ public:
 
         if (record_time)
         {
+          // These run inside the enclosing SCOPE(exit) body, which is noexcept;
+          // keep cuda_safe_call so a CUDA error aborts rather than throwing
+          // through the guard (which would call std::terminate).
           cuda_safe_call(cudaEventRecord(end_event, t.get_stream()));
           cuda_safe_call(cudaEventSynchronize(end_event));
 

From d2105b3f696bacab5ba9b86f01e3ad7668e002b1 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Wed, 3 Jun 2026 23:47:20 -0400
Subject: [PATCH 2/8] cudax/stf: use cuda_try<cudaEventCreateWithFlags> for
 timing event creation

cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
so it cannot be named by the templated cuda_try<F> form. Use the
non-overloaded cudaEventCreateWithFlags with cudaEventDefault instead,
which is exactly what cudaEventCreate(&e) does internally, so behavior is
unchanged while keeping the templated form.
---
 .../experimental/__stf/internal/host_launch_scope.cuh    | 9 +++++----
 .../include/cuda/experimental/__stf/internal/launch.cuh  | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
index 93144f7c4e4..566a7ecaf2e 100644
--- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
@@ -247,10 +247,11 @@ public:
     {
       if (record_time)
       {
-        // cudaEventCreate is an overload set (cuda_runtime.h adds a flags
-        // overload), so it keeps the runtime-status cuda_try form.
-        cuda_try(cudaEventCreate(&start_event));
-        cuda_try(cudaEventCreate(&end_event));
+        // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
+        // so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded
+        // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate).
+        start_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
+        end_event   = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
         cuda_try<cudaEventRecord>(start_event, t.get_stream());
       }
     }
diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
index bc3de9b7b13..7ff921fe892 100644
--- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -367,10 +367,11 @@ public:
       {
         cudaGetDevice(&device); // We will use this to force it during the next run
         // Events must be created here to avoid issues with multi-gpu.
-        // cudaEventCreate keeps the runtime-status form: it is an overload set
-        // (cuda_runtime.h adds a flags overload), so cuda_try<cudaEventCreate> cannot name it.
-        cuda_try(cudaEventCreate(&start_event));
-        cuda_try(cudaEventCreate(&end_event));
+        // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
+        // so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded
+        // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate).
+        start_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
+        end_event   = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
         cuda_try<cudaEventRecord>(start_event, t.get_stream());
       }
     }

From 39c1cabd4986d54bae6fa3aaaaae01dc4f1d02ff Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Thu, 4 Jun 2026 00:12:31 -0400
Subject: [PATCH 3/8] cudax/stf: fix host_launch callback-arg ownership
 ordering

The host_launch callback args (resolved / wrapper) are heap-allocated and
guarded by SCOPE(fail) { delete ...; }. Transfer of ownership to the
graph-path ctx resource was happening in the wrong order:

- Untyped path: `resolved` was set to nullptr right after add_resource,
  but it is also used as the host node's userData. That made the graph
  node receive a null userData, so the callback dereferenced null on the
  success path.
- Typed path: add_resource ran before cudaGraphAddHostNode, so a throw
  from the node creation would delete `wrapper` twice (SCOPE(fail) plus
  the resource's release_in_callback).

Fix both by creating the host node first (while resolved/wrapper is still
a valid userData), then handing ownership to the ctx resource, then
nulling the pointer once at the end to disarm SCOPE(fail). On a throw
before that point the resource has not been added, so SCOPE(fail) is the
sole owner and frees the args exactly once.
---
 .../__stf/internal/host_launch_scope.cuh      | 74 ++++++++++---------
 1 file changed, 39 insertions(+), 35 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
index 566a7ecaf2e..da8c7495aeb 100644
--- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
@@ -289,7 +289,13 @@ public:
     {
       // --- Untyped dispatch path ---
       auto* resolved = new ::std::pair<Fun, host_launch_deps>{::std::forward<Fun>(f), host_launch_deps{}};
-      auto& hld      = resolved->second;
+      // Whenever we pass ownership we assigned nullptr to resolved.
+      SCOPE(fail)
+      {
+        delete resolved;
+      };
+
+      auto& hld = resolved->second;
 
       const size_t ndeps = deps.size();
       hld.lds_.resize(ndeps);
@@ -303,20 +309,16 @@ public:
       hld.dtor_          = user_data_dtor_;
       user_data_dtor_    = nullptr;
 
-      if constexpr (::std::is_same_v<Ctx, graph_ctx>)
-      {
-        using wrapper_type = ::std::remove_reference_t<decltype(*resolved)>;
-        auto resource      = ::std::make_shared<host_callback_args_resource<wrapper_type>>(resolved);
-        ctx.add_resource(mv(resource));
-      }
-
       auto callback = [](void* raw) {
         auto* w = static_cast<decltype(resolved)>(raw);
-        w->first(w->second);
-        if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
+        SCOPE(exit)
         {
-          delete w;
-        }
+          if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
+          {
+            delete w;
+          }
+        };
+        w->first(w->second);
       };
 
       if constexpr (::std::is_same_v<Ctx, graph_ctx>)
@@ -324,17 +326,21 @@ public:
         cudaHostNodeParams params = {.fn = callback, .userData = resolved};
         auto lock                 = t.lock_ctx_graph();
         t.get_node()              = cuda_try<cudaGraphAddHostNode>(t.get_ctx_graph(), nullptr, 0, &params);
+        // The node now references `resolved`; hand ownership to a ctx resource
+        // that deletes it (in release_in_callback) when the ctx is released.
+        using wrapper_type = ::std::remove_reference_t<decltype(*resolved)>;
+        ctx.add_resource(::std::make_shared<host_callback_args_resource<wrapper_type>>(resolved));
       }
       else
       {
-        // The callback owns `resolved` once enqueued; delete it if the enqueue
-        // throws so it does not leak.
-        SCOPE(fail)
-        {
-          delete resolved;
-        };
+        // For a stream the callback owns `resolved` once the launch succeeds.
         cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, resolved);
       }
+      // Ownership has transferred (to the ctx resource for graph, or to the
+      // callback for stream). These enqueues are asynchronous, so on a throw
+      // above the callback has not run and SCOPE(fail) still owns `resolved`;
+      // past this point disarm it.
+      resolved = nullptr;
     }
     else
     {
@@ -350,16 +356,20 @@ public:
         }
       }();
       auto* wrapper = new ::std::pair<Fun, decltype(payload)>{::std::forward<Fun>(f), mv(payload)};
-
-      if constexpr (::std::is_same_v<Ctx, graph_ctx>)
+      SCOPE(fail)
       {
-        using wrapper_type = ::std::remove_reference_t<decltype(*wrapper)>;
-        auto resource      = ::std::make_shared<host_callback_args_resource<wrapper_type>>(wrapper);
-        ctx.add_resource(mv(resource));
-      }
+        delete wrapper;
+      };
 
       auto callback = [](void* untyped_wrapper) {
         auto w = static_cast<decltype(wrapper)>(untyped_wrapper);
+        SCOPE(exit)
+        {
+          if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
+          {
+            delete w;
+          }
+        };
 
         constexpr bool fun_invocable_task_deps = reserved::is_applicable_v<Fun, decltype(payload)>;
         constexpr bool fun_invocable_task_non_void_deps =
@@ -376,11 +386,6 @@ public:
         {
           ::std::apply(::std::forward<Fun>(w->first), reserved::remove_void_interface(mv(w->second)));
         }
-
-        if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
-        {
-          delete w;
-        }
       };
 
       if constexpr (::std::is_same_v<Ctx, graph_ctx>)
@@ -388,17 +393,16 @@ public:
         cudaHostNodeParams params = {.fn = callback, .userData = wrapper};
         auto lock                 = t.lock_ctx_graph();
         t.get_node()              = cuda_try<cudaGraphAddHostNode>(t.get_ctx_graph(), nullptr, 0, &params);
+        // Transfer ownership only after the node references `wrapper`, so a throw
+        // from cudaGraphAddHostNode leaves SCOPE(fail) as the sole owner.
+        using wrapper_type = ::std::remove_reference_t<decltype(*wrapper)>;
+        ctx.add_resource(::std::make_shared<host_callback_args_resource<wrapper_type>>(wrapper));
       }
       else
       {
-        // The callback owns `wrapper` once enqueued; delete it if the enqueue
-        // throws so it does not leak.
-        SCOPE(fail)
-        {
-          delete wrapper;
-        };
         cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, wrapper);
       }
+      wrapper = nullptr;
     }
   }
 

From d679c2390eeddcfb9a870d7cc75f2b12022a2c6e Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Thu, 4 Jun 2026 00:23:02 -0400
Subject: [PATCH 4/8] cudax/stf: own host_launch callback args with unique_ptr

Replace the raw new + SCOPE(fail){delete} + manual nulling design for the
host_launch callback arguments with std::unique_ptr. The args are borrowed
via .get() for the host node userData / cudaLaunchHostFunc argument and the
ctx resource, and ownership is handed off with .release() once the node has
been created (graph) or the launch has been enqueued (stream). On a throw
before that point the unique_ptr frees the args; afterwards the ctx resource
(graph) or the callback (stream) owns and frees them. Adds <memory>.
---
 .../__stf/internal/host_launch_scope.cuh      | 49 ++++++++-----------
 1 file changed, 21 insertions(+), 28 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
index da8c7495aeb..4ba2b2c34c9 100644
--- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
@@ -35,6 +35,7 @@
 #include <cuda/experimental/__stf/internal/thread_hierarchy.cuh>
 #include <cuda/experimental/__stf/internal/void_interface.cuh>
 
+#include <memory>
 #include <type_traits>
 
 namespace cuda::experimental::stf
@@ -288,12 +289,8 @@ public:
     if constexpr (fun_invocable_untyped)
     {
       // --- Untyped dispatch path ---
-      auto* resolved = new ::std::pair<Fun, host_launch_deps>{::std::forward<Fun>(f), host_launch_deps{}};
-      // Whenever we pass ownership we assigned nullptr to resolved.
-      SCOPE(fail)
-      {
-        delete resolved;
-      };
+      auto resolved =
+        ::std::make_unique<::std::pair<Fun, host_launch_deps>>(::std::forward<Fun>(f), host_launch_deps{});
 
       auto& hld = resolved->second;
 
@@ -310,7 +307,7 @@ public:
       user_data_dtor_    = nullptr;
 
       auto callback = [](void* raw) {
-        auto* w = static_cast<decltype(resolved)>(raw);
+        auto* w = static_cast<decltype(resolved.get())>(raw);
         SCOPE(exit)
         {
           if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
@@ -323,24 +320,24 @@ public:
 
       if constexpr (::std::is_same_v<Ctx, graph_ctx>)
       {
-        cudaHostNodeParams params = {.fn = callback, .userData = resolved};
+        cudaHostNodeParams params = {.fn = callback, .userData = resolved.get()};
         auto lock                 = t.lock_ctx_graph();
         t.get_node()              = cuda_try<cudaGraphAddHostNode>(t.get_ctx_graph(), nullptr, 0, &params);
-        // The node now references `resolved`; hand ownership to a ctx resource
-        // that deletes it (in release_in_callback) when the ctx is released.
+        // The node now references the args; hand ownership to a ctx resource
+        // that deletes them (in release_in_callback) when the ctx is released.
         using wrapper_type = ::std::remove_reference_t<decltype(*resolved)>;
-        ctx.add_resource(::std::make_shared<host_callback_args_resource<wrapper_type>>(resolved));
+        ctx.add_resource(::std::make_shared<host_callback_args_resource<wrapper_type>>(resolved.get()));
       }
       else
       {
-        // For a stream the callback owns `resolved` once the launch succeeds.
-        cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, resolved);
+        // For a stream the callback owns the args once the launch succeeds.
+        cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, resolved.get());
       }
       // Ownership has transferred (to the ctx resource for graph, or to the
       // callback for stream). These enqueues are asynchronous, so on a throw
-      // above the callback has not run and SCOPE(fail) still owns `resolved`;
-      // past this point disarm it.
-      resolved = nullptr;
+      // above the callback has not run and the unique_ptr still owns the args;
+      // release it now that ownership has moved on.
+      resolved.release();
     }
     else
     {
@@ -355,14 +352,10 @@ public:
           return deps.instance(t);
         }
       }();
-      auto* wrapper = new ::std::pair<Fun, decltype(payload)>{::std::forward<Fun>(f), mv(payload)};
-      SCOPE(fail)
-      {
-        delete wrapper;
-      };
+      auto wrapper = ::std::make_unique<::std::pair<Fun, decltype(payload)>>(::std::forward<Fun>(f), mv(payload));
 
       auto callback = [](void* untyped_wrapper) {
-        auto w = static_cast<decltype(wrapper)>(untyped_wrapper);
+        auto w = static_cast<decltype(wrapper.get())>(untyped_wrapper);
         SCOPE(exit)
         {
           if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
@@ -390,19 +383,19 @@ public:
 
       if constexpr (::std::is_same_v<Ctx, graph_ctx>)
       {
-        cudaHostNodeParams params = {.fn = callback, .userData = wrapper};
+        cudaHostNodeParams params = {.fn = callback, .userData = wrapper.get()};
         auto lock                 = t.lock_ctx_graph();
         t.get_node()              = cuda_try<cudaGraphAddHostNode>(t.get_ctx_graph(), nullptr, 0, &params);
-        // Transfer ownership only after the node references `wrapper`, so a throw
-        // from cudaGraphAddHostNode leaves SCOPE(fail) as the sole owner.
+        // Transfer ownership only after the node references the args, so a throw
+        // from cudaGraphAddHostNode leaves the unique_ptr as the sole owner.
         using wrapper_type = ::std::remove_reference_t<decltype(*wrapper)>;
-        ctx.add_resource(::std::make_shared<host_callback_args_resource<wrapper_type>>(wrapper));
+        ctx.add_resource(::std::make_shared<host_callback_args_resource<wrapper_type>>(wrapper.get()));
       }
       else
       {
-        cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, wrapper);
+        cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, wrapper.get());
       }
-      wrapper = nullptr;
+      wrapper.release();
     }
   }
 

From 70f0713392cd0fa6a43bb854b0a6a0a27c0314d6 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Thu, 4 Jun 2026 00:27:44 -0400
Subject: [PATCH 5/8] cudax/stf: free launch temp device memory via SCOPE(exit)

launch_impl allocates a temporary device buffer (cudaMallocAsync) and freed
it after cuda_launcher returned. Now that cuda_launcher throws on error (via
cuda_try<cudaLaunchKernelExC>), the trailing cudaFreeAsync was skipped on a
throw, leaking the buffer. Free it from a SCOPE(exit) placed right after the
allocation so it runs on both normal and exceptional exit. cuda_safe_call is
used inside the noexcept SCOPE(exit) body.
---
 .../cuda/experimental/__stf/internal/launch.cuh   | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
index 7ff921fe892..de4d924f9a5 100644
--- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -124,16 +124,21 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg
       th.set_device_tmp(th_dev_tmp_ptr);
     }
 
+    // Free the temporary device memory on the way out, even if the launch throws.
+    // cuda_safe_call (not cuda_try) because SCOPE(exit) is noexcept.
+    SCOPE(exit)
+    {
+      if (th_dev_tmp_ptr)
+      {
+        cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream));
+      }
+    };
+
     auto kernel_args = tuple_prepend(mv(th), mv(arg));
     using args_type  = decltype(kernel_args);
     void* all_args[] = {&f, &kernel_args};
 
     cuda_launcher(interpreted_policy, reserved::launch_kernel<Fun, args_type>, all_args, stream);
-
-    if (th_mem_config[1] > 0)
-    {
-      cuda_try<cudaFreeAsync>(th_dev_tmp_ptr, stream);
-    }
   };
 }
 

From 92c226060d84a612979567ba7c4cd700fe705800 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Thu, 4 Jun 2026 00:35:49 -0400
Subject: [PATCH 6/8] cudax/stf: check cudaGetDevice in launch timing path

The cudaGetDevice call in the timing branch was unchecked. Use the
templated cuda_try<cudaGetDevice> form so a failure is reported.
---
 cudax/include/cuda/experimental/__stf/internal/launch.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
index de4d924f9a5..c6eb02773af 100644
--- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -363,14 +363,14 @@ public:
     nvtx_range nr(t.get_symbol().c_str());
     t.start();
 
-    int device;
-    cudaEvent_t start_event, end_event;
+    int device              = -1;
+    cudaEvent_t start_event = nullptr, end_event = nullptr;
 
     if constexpr (::std::is_same_v<Ctx, stream_ctx>)
     {
       if (record_time)
       {
-        cudaGetDevice(&device); // We will use this to force it during the next run
+        device = cuda_try<cudaGetDevice>(); // We will use this to force it during the next run
         // Events must be created here to avoid issues with multi-gpu.
         // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
         // so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded

From 14834d8ec73cdb9155fdaa3abaeb2e874688cbb9 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Thu, 4 Jun 2026 18:23:42 -0400
Subject: [PATCH 7/8] [STF] Initialize host_launch_scope timing events to
 nullptr

Match launch.cuh and satisfy GCC -Wmaybe-uninitialized: if cuda_try throws
before both events are created, SCOPE(exit) still runs with record_time set.
---
 .../cuda/experimental/__stf/internal/host_launch_scope.cuh      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
index 4ba2b2c34c9..ff49075cd66 100644
--- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
@@ -239,7 +239,7 @@ public:
       t.set_symbol(symbol);
     }
 
-    cudaEvent_t start_event, end_event;
+    cudaEvent_t start_event = nullptr, end_event = nullptr;
     const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file();
 
     t.start();

From 7a9d75f1a1e651398b4c549036430ea4d6802d86 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Thu, 4 Jun 2026 21:45:38 -0400
Subject: [PATCH 8/8] cudax/stf: install task cleanup guard before throwing
 timing calls

After the cuda_safe_call -> cuda_try migration, the timing-event setup
(cudaGetDevice / cudaEventCreateWithFlags / cudaEventRecord) can throw, but
in launch.cuh and host_launch_scope.cuh it ran *before* the SCOPE(exit) that
calls t.end_uncleared()/t.clear(). A throw there left the task half-open and
leaked any partially-created event.

Install the SCOPE(exit) guard first, then do the timing setup, and gate the
end-of-scope timing teardown on a new timing_active flag that is only set once
both events exist and the start event has been recorded. On a throw from the
timing cuda_try calls the guard now runs the task cleanup, and the teardown is
skipped because timing_active is still false. No success-path behavior change.

Addresses CodeRabbit review comments on PR #9249.
---
 .../__stf/internal/host_launch_scope.cuh      | 33 ++++++++++-------
 .../experimental/__stf/internal/launch.cuh    | 37 +++++++++++--------
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
index ff49075cd66..aeae4c78454 100644
--- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh
@@ -241,28 +241,19 @@ public:
 
     cudaEvent_t start_event = nullptr, end_event = nullptr;
     const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file();
+    // Set only once both timing events exist and the start event has been recorded.
+    // The timing setup is done below, after the SCOPE(exit) guard is installed, so a
+    // throw from those cuda_try calls cannot skip t.end_uncleared()/t.clear().
+    bool timing_active = false;
 
     t.start();
 
-    if constexpr (::std::is_same_v<Ctx, stream_ctx>)
-    {
-      if (record_time)
-      {
-        // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
-        // so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded
-        // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate).
-        start_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
-        end_event   = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
-        cuda_try<cudaEventRecord>(start_event, t.get_stream());
-      }
-    }
-
     SCOPE(exit)
     {
       t.end_uncleared();
       if constexpr (::std::is_same_v<Ctx, stream_ctx>)
       {
-        if (record_time)
+        if (timing_active)
         {
           // Inside the noexcept SCOPE(exit) body; keep cuda_safe_call so a CUDA
           // error aborts rather than throwing through the guard.
@@ -286,6 +277,20 @@ public:
       t.clear();
     };
 
+    if constexpr (::std::is_same_v<Ctx, stream_ctx>)
+    {
+      if (record_time)
+      {
+        // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
+        // so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded
+        // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate).
+        start_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
+        end_event   = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
+        cuda_try<cudaEventRecord>(start_event, t.get_stream());
+        timing_active = true;
+      }
+    }
+
     if constexpr (fun_invocable_untyped)
     {
       // --- Untyped dispatch path ---
diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
index c6eb02773af..b4b88714ad9 100644
--- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -365,21 +365,10 @@ public:
 
     int device              = -1;
     cudaEvent_t start_event = nullptr, end_event = nullptr;
-
-    if constexpr (::std::is_same_v<Ctx, stream_ctx>)
-    {
-      if (record_time)
-      {
-        device = cuda_try<cudaGetDevice>(); // We will use this to force it during the next run
-        // Events must be created here to avoid issues with multi-gpu.
-        // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
-        // so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded
-        // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate).
-        start_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
-        end_event   = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
-        cuda_try<cudaEventRecord>(start_event, t.get_stream());
-      }
-    }
+    // Set only once both timing events exist and the start event has been recorded.
+    // The timing setup is done below, after the SCOPE(exit) guard is installed, so a
+    // throw from those cuda_try calls cannot skip t.end_uncleared()/t.clear().
+    bool timing_active = false;
 
     const size_t grid_size = e_place.size();
 
@@ -411,7 +400,7 @@ public:
           deallocateManagedMemory(hostMemoryArrivedList, grid_size, t.get_stream());
         }
 
-        if (record_time)
+        if (timing_active)
         {
           // These run inside the enclosing SCOPE(exit) body, which is noexcept;
           // keep cuda_safe_call so a CUDA error aborts rather than throwing
@@ -437,6 +426,22 @@ public:
       t.clear();
     };
 
+    if constexpr (::std::is_same_v<Ctx, stream_ctx>)
+    {
+      if (record_time)
+      {
+        device = cuda_try<cudaGetDevice>(); // We will use this to force it during the next run
+        // Events must be created here to avoid issues with multi-gpu.
+        // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
+        // so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded
+        // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate).
+        start_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
+        end_event   = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
+        cuda_try<cudaEventRecord>(start_event, t.get_stream());
+        timing_active = true;
+      }
+    }
+
     /* Should only be allocated / deallocated if the last level used is system wide. Unnecessary and wasteful
      * otherwise. */
     if (grid_size > 1)