NVIDIA · andralex · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -35,6 +35,7 @@
 #include <cuda/experimental/__stf/internal/thread_hierarchy.cuh>
 #include <cuda/experimental/__stf/internal/void_interface.cuh>
 
+#include <memory>
 #include <type_traits>
 
 namespace cuda::experimental::stf
@@ -238,28 +239,24 @@ public:
       t.set_symbol(symbol);
     }
 
-    cudaEvent_t start_event, end_event;
+    cudaEvent_t start_event = nullptr, end_event = nullptr;
     const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file();
+    // Set only once both timing events exist and the start event has been recorded.
+    // The timing setup is done below, after the SCOPE(exit) guard is installed, so a
+    // throw from those cuda_try calls cannot skip t.end_uncleared()/t.clear().
+    bool timing_active = false;
 
     t.start();
 
-    if constexpr (::std::is_same_v<Ctx, stream_ctx>)
-    {
-      if (record_time)
-      {
-        cuda_safe_call(cudaEventCreate(&start_event));
-        cuda_safe_call(cudaEventCreate(&end_event));
-        cuda_safe_call(cudaEventRecord(start_event, t.get_stream()));
-      }
-    }
-
     SCOPE(exit)
     {
       t.end_uncleared();
       if constexpr (::std::is_same_v<Ctx, stream_ctx>)
       {
-        if (record_time)
+        if (timing_active)
         {
+          // Inside the noexcept SCOPE(exit) body; keep cuda_safe_call so a CUDA
+          // error aborts rather than throwing through the guard.
           cuda_safe_call(cudaEventRecord(end_event, t.get_stream()));
           cuda_safe_call(cudaEventSynchronize(end_event));
 
@@ -280,11 +277,27 @@ public:
       t.clear();
     };
 
+    if constexpr (::std::is_same_v<Ctx, stream_ctx>)
+    {
+      if (record_time)
+      {
+        // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
+        // so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded
+        // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate).
+        start_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
+        end_event   = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
+        cuda_try<cudaEventRecord>(start_event, t.get_stream());
+        timing_active = true;
+      }
+    }
+
     if constexpr (fun_invocable_untyped)
     {
       // --- Untyped dispatch path ---
-      auto* resolved = new ::std::pair<Fun, host_launch_deps>{::std::forward<Fun>(f), host_launch_deps{}};
-      auto& hld      = resolved->second;
+      auto resolved =
+        ::std::make_unique<::std::pair<Fun, host_launch_deps>>(::std::forward<Fun>(f), host_launch_deps{});
+
+      auto& hld = resolved->second;
 
       const size_t ndeps = deps.size();
       hld.lds_.resize(ndeps);
@@ -298,32 +311,38 @@ public:
       hld.dtor_          = user_data_dtor_;
       user_data_dtor_    = nullptr;
 
-      if constexpr (::std::is_same_v<Ctx, graph_ctx>)
-      {
-        using wrapper_type = ::std::remove_reference_t<decltype(*resolved)>;
-        auto resource      = ::std::make_shared<host_callback_args_resource<wrapper_type>>(resolved);
-        ctx.add_resource(mv(resource));
-      }
-
       auto callback = [](void* raw) {
-        auto* w = static_cast<decltype(resolved)>(raw);
-        w->first(w->second);
-        if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
+        auto* w = static_cast<decltype(resolved.get())>(raw);
+        SCOPE(exit)
         {
-          delete w;
-        }
+          if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
+          {
+            delete w;
+          }
+        };
+        w->first(w->second);
       };
 
       if constexpr (::std::is_same_v<Ctx, graph_ctx>)
       {
-        cudaHostNodeParams params = {.fn = callback, .userData = resolved};
+        cudaHostNodeParams params = {.fn = callback, .userData = resolved.get()};
         auto lock                 = t.lock_ctx_graph();
-        cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, &params));
+        t.get_node()              = cuda_try<cudaGraphAddHostNode>(t.get_ctx_graph(), nullptr, 0, &params);
+        // The node now references the args; hand ownership to a ctx resource
+        // that deletes them (in release_in_callback) when the ctx is released.
+        using wrapper_type = ::std::remove_reference_t<decltype(*resolved)>;
+        ctx.add_resource(::std::make_shared<host_callback_args_resource<wrapper_type>>(resolved.get()));
       }
       else
       {
-        cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, resolved));
+        // For a stream the callback owns the args once the launch succeeds.
+        cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, resolved.get());
       }
+      // Ownership has transferred (to the ctx resource for graph, or to the
+      // callback for stream). These enqueues are asynchronous, so on a throw
+      // above the callback has not run and the unique_ptr still owns the args;
+      // release it now that ownership has moved on.
+      resolved.release();
     }
     else
     {
@@ -338,17 +357,17 @@ public:
           return deps.instance(t);
         }
       }();
-      auto* wrapper = new ::std::pair<Fun, decltype(payload)>{::std::forward<Fun>(f), mv(payload)};
-
-      if constexpr (::std::is_same_v<Ctx, graph_ctx>)
-      {
-        using wrapper_type = ::std::remove_reference_t<decltype(*wrapper)>;
-        auto resource      = ::std::make_shared<host_callback_args_resource<wrapper_type>>(wrapper);
-        ctx.add_resource(mv(resource));
-      }
+      auto wrapper = ::std::make_unique<::std::pair<Fun, decltype(payload)>>(::std::forward<Fun>(f), mv(payload));
 
       auto callback = [](void* untyped_wrapper) {
-        auto w = static_cast<decltype(wrapper)>(untyped_wrapper);
+        auto w = static_cast<decltype(wrapper.get())>(untyped_wrapper);
+        SCOPE(exit)
+        {
+          if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
+          {
+            delete w;
+          }
+        };
 
         constexpr bool fun_invocable_task_deps = reserved::is_applicable_v<Fun, decltype(payload)>;
         constexpr bool fun_invocable_task_non_void_deps =
@@ -365,23 +384,23 @@ public:
         {
           ::std::apply(::std::forward<Fun>(w->first), reserved::remove_void_interface(mv(w->second)));
         }
-
-        if constexpr (!::std::is_same_v<Ctx, graph_ctx>)
-        {
-          delete w;
-        }
       };
 
       if constexpr (::std::is_same_v<Ctx, graph_ctx>)
       {
-        cudaHostNodeParams params = {.fn = callback, .userData = wrapper};
+        cudaHostNodeParams params = {.fn = callback, .userData = wrapper.get()};
         auto lock                 = t.lock_ctx_graph();
-        cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, &params));
+        t.get_node()              = cuda_try<cudaGraphAddHostNode>(t.get_ctx_graph(), nullptr, 0, &params);
+        // Transfer ownership only after the node references the args, so a throw
+        // from cudaGraphAddHostNode leaves the unique_ptr as the sole owner.
+        using wrapper_type = ::std::remove_reference_t<decltype(*wrapper)>;
+        ctx.add_resource(::std::make_shared<host_callback_args_resource<wrapper_type>>(wrapper.get()));
       }
       else
       {
-        cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, wrapper));
+        cuda_try<cudaLaunchHostFunc>(t.get_stream(), callback, wrapper.get());
       }
+      wrapper.release();
     }
   }
 

@@ -64,7 +64,7 @@ void cuda_launcher(interpreted_spec interpreted_policy, Fun&& f, void** args, St
   lconfig.dynamicSmemBytes = mem_config[2];
   lconfig.stream           = stream;
 
-  cuda_safe_call(cudaLaunchKernelExC(&lconfig, (void*) f, args));
+  cuda_try<cudaLaunchKernelExC>(&lconfig, (void*) f, args);
 }
 
 template <typename interpreted_spec, typename Fun>
@@ -81,15 +81,15 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar
   kconfig.kernelParams   = args;
   kconfig.sharedMemBytes = static_cast<int>(mem_config[2]);
 
-  cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig));
+  n = cuda_try<cudaGraphAddKernelNode>(g, nullptr, 0, &kconfig);
 
   // Enable cooperative kernel if necessary by updating the node attributes
 
   bool cooperative_kernel = interpreted_policy.need_cooperative_kernel_launch();
 
   cudaKernelNodeAttrValue val;
   val.cooperative = cooperative_kernel ? 1 : 0;
-  cuda_safe_call(cudaGraphKernelNodeSetAttribute(n, cudaKernelNodeAttributeCooperative, &val));
+  cuda_try<cudaGraphKernelNodeSetAttribute>(n, cudaKernelNodeAttributeCooperative, &val);
 }
 
 template <typename Fun, typename interpreted_spec, typename Arg>
@@ -120,20 +120,25 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg
 
     if (th_mem_config[1] > 0)
     {
-      cuda_safe_call(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream));
+      cuda_try(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream));
       th.set_device_tmp(th_dev_tmp_ptr);
     }
 
+    // Free the temporary device memory on the way out, even if the launch throws.
+    // cuda_safe_call (not cuda_try) because SCOPE(exit) is noexcept.
+    SCOPE(exit)
+    {
+      if (th_dev_tmp_ptr)
+      {
+        cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream));
+      }
+    };
+
     auto kernel_args = tuple_prepend(mv(th), mv(arg));
     using args_type  = decltype(kernel_args);
     void* all_args[] = {&f, &kernel_args};
 
     cuda_launcher(interpreted_policy, reserved::launch_kernel<Fun, args_type>, all_args, stream);
-
-    if (th_mem_config[1] > 0)
-    {
-      cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream));
-    }
   };
 }
 
@@ -358,20 +363,12 @@ public:
     nvtx_range nr(t.get_symbol().c_str());
     t.start();
 
-    int device;
-    cudaEvent_t start_event, end_event;
-
-    if constexpr (::std::is_same_v<Ctx, stream_ctx>)
-    {
-      if (record_time)
-      {
-        cudaGetDevice(&device); // We will use this to force it during the next run
-        // Events must be created here to avoid issues with multi-gpu
-        cuda_safe_call(cudaEventCreate(&start_event));
-        cuda_safe_call(cudaEventCreate(&end_event));
-        cuda_safe_call(cudaEventRecord(start_event, t.get_stream()));
-      }
-    }
+    int device              = -1;
+    cudaEvent_t start_event = nullptr, end_event = nullptr;
+    // Set only once both timing events exist and the start event has been recorded.
+    // The timing setup is done below, after the SCOPE(exit) guard is installed, so a
+    // throw from those cuda_try calls cannot skip t.end_uncleared()/t.clear().
+    bool timing_active = false;
 
     const size_t grid_size = e_place.size();
 
@@ -403,8 +400,11 @@ public:
           deallocateManagedMemory(hostMemoryArrivedList, grid_size, t.get_stream());
         }
 
-        if (record_time)
+        if (timing_active)
         {
+          // These run inside the enclosing SCOPE(exit) body, which is noexcept;
+          // keep cuda_safe_call so a CUDA error aborts rather than throwing
+          // through the guard (which would call std::terminate).
           cuda_safe_call(cudaEventRecord(end_event, t.get_stream()));
           cuda_safe_call(cudaEventSynchronize(end_event));
 
@@ -426,6 +426,22 @@ public:
       t.clear();
     };
 
+    if constexpr (::std::is_same_v<Ctx, stream_ctx>)
+    {
+      if (record_time)
+      {
+        device = cuda_try<cudaGetDevice>(); // We will use this to force it during the next run
+        // Events must be created here to avoid issues with multi-gpu.
+        // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload),
+        // so cuda_try<cudaEventCreate> cannot name it; use the non-overloaded
+        // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate).
+        start_event = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
+        end_event   = cuda_try<cudaEventCreateWithFlags>(cudaEventDefault);
+        cuda_try<cudaEventRecord>(start_event, t.get_stream());
+        timing_active = true;
+      }
+    }
+
     /* Should only be allocated / deallocated if the last level used is system wide. Unnecessary and wasteful
      * otherwise. */
     if (grid_size > 1)