diff --git a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh index 5c20dcfe4ba..aeae4c78454 100644 --- a/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/host_launch_scope.cuh @@ -35,6 +35,7 @@ #include #include +#include #include namespace cuda::experimental::stf @@ -238,28 +239,24 @@ public: t.set_symbol(symbol); } - cudaEvent_t start_event, end_event; + cudaEvent_t start_event = nullptr, end_event = nullptr; const bool record_time = t.schedule_task() || statistics.is_calibrating_to_file(); + // Set only once both timing events exist and the start event has been recorded. + // The timing setup is done below, after the SCOPE(exit) guard is installed, so a + // throw from those cuda_try calls cannot skip t.end_uncleared()/t.clear(). + bool timing_active = false; t.start(); - if constexpr (::std::is_same_v) - { - if (record_time) - { - cuda_safe_call(cudaEventCreate(&start_event)); - cuda_safe_call(cudaEventCreate(&end_event)); - cuda_safe_call(cudaEventRecord(start_event, t.get_stream())); - } - } - SCOPE(exit) { t.end_uncleared(); if constexpr (::std::is_same_v) { - if (record_time) + if (timing_active) { + // Inside the noexcept SCOPE(exit) body; keep cuda_safe_call so a CUDA + // error aborts rather than throwing through the guard. cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); cuda_safe_call(cudaEventSynchronize(end_event)); @@ -280,11 +277,27 @@ public: t.clear(); }; + if constexpr (::std::is_same_v) + { + if (record_time) + { + // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload), + // so cuda_try cannot name it; use the non-overloaded + // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate). + start_event = cuda_try(cudaEventDefault); + end_event = cuda_try(cudaEventDefault); + cuda_try(start_event, t.get_stream()); + timing_active = true; + } + } + if constexpr (fun_invocable_untyped) { // --- Untyped dispatch path --- - auto* resolved = new ::std::pair{::std::forward(f), host_launch_deps{}}; - auto& hld = resolved->second; + auto resolved = + ::std::make_unique<::std::pair>(::std::forward(f), host_launch_deps{}); + + auto& hld = resolved->second; const size_t ndeps = deps.size(); hld.lds_.resize(ndeps); @@ -298,32 +311,38 @@ public: hld.dtor_ = user_data_dtor_; user_data_dtor_ = nullptr; - if constexpr (::std::is_same_v) - { - using wrapper_type = ::std::remove_reference_t; - auto resource = ::std::make_shared>(resolved); - ctx.add_resource(mv(resource)); - } - auto callback = [](void* raw) { - auto* w = static_cast(raw); - w->first(w->second); - if constexpr (!::std::is_same_v) + auto* w = static_cast(raw); + SCOPE(exit) { - delete w; - } + if constexpr (!::std::is_same_v) + { + delete w; + } + }; + w->first(w->second); }; if constexpr (::std::is_same_v) { - cudaHostNodeParams params = {.fn = callback, .userData = resolved}; + cudaHostNodeParams params = {.fn = callback, .userData = resolved.get()}; auto lock = t.lock_ctx_graph(); - cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, ¶ms)); + t.get_node() = cuda_try(t.get_ctx_graph(), nullptr, 0, ¶ms); + // The node now references the args; hand ownership to a ctx resource + // that deletes them (in release_in_callback) when the ctx is released. + using wrapper_type = ::std::remove_reference_t; + ctx.add_resource(::std::make_shared>(resolved.get())); } else { - cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, resolved)); + // For a stream the callback owns the args once the launch succeeds. + cuda_try(t.get_stream(), callback, resolved.get()); } + // Ownership has transferred (to the ctx resource for graph, or to the + // callback for stream). These enqueues are asynchronous, so on a throw + // above the callback has not run and the unique_ptr still owns the args; + // release it now that ownership has moved on. + resolved.release(); } else { @@ -338,17 +357,17 @@ public: return deps.instance(t); } }(); - auto* wrapper = new ::std::pair{::std::forward(f), mv(payload)}; - - if constexpr (::std::is_same_v) - { - using wrapper_type = ::std::remove_reference_t; - auto resource = ::std::make_shared>(wrapper); - ctx.add_resource(mv(resource)); - } + auto wrapper = ::std::make_unique<::std::pair>(::std::forward(f), mv(payload)); auto callback = [](void* untyped_wrapper) { - auto w = static_cast(untyped_wrapper); + auto w = static_cast(untyped_wrapper); + SCOPE(exit) + { + if constexpr (!::std::is_same_v) + { + delete w; + } + }; constexpr bool fun_invocable_task_deps = reserved::is_applicable_v; constexpr bool fun_invocable_task_non_void_deps = @@ -365,23 +384,23 @@ public: { ::std::apply(::std::forward(w->first), reserved::remove_void_interface(mv(w->second))); } - - if constexpr (!::std::is_same_v) - { - delete w; - } }; if constexpr (::std::is_same_v) { - cudaHostNodeParams params = {.fn = callback, .userData = wrapper}; + cudaHostNodeParams params = {.fn = callback, .userData = wrapper.get()}; auto lock = t.lock_ctx_graph(); - cuda_safe_call(cudaGraphAddHostNode(&t.get_node(), t.get_ctx_graph(), nullptr, 0, ¶ms)); + t.get_node() = cuda_try(t.get_ctx_graph(), nullptr, 0, ¶ms); + // Transfer ownership only after the node references the args, so a throw + // from cudaGraphAddHostNode leaves the unique_ptr as the sole owner. + using wrapper_type = ::std::remove_reference_t; + ctx.add_resource(::std::make_shared>(wrapper.get())); } else { - cuda_safe_call(cudaLaunchHostFunc(t.get_stream(), callback, wrapper)); + cuda_try(t.get_stream(), callback, wrapper.get()); } + wrapper.release(); } } diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index 2cc352adc43..b4b88714ad9 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -64,7 +64,7 @@ void cuda_launcher(interpreted_spec interpreted_policy, Fun&& f, void** args, St lconfig.dynamicSmemBytes = mem_config[2]; lconfig.stream = stream; - cuda_safe_call(cudaLaunchKernelExC(&lconfig, (void*) f, args)); + cuda_try(&lconfig, (void*) f, args); } template @@ -81,7 +81,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar kconfig.kernelParams = args; kconfig.sharedMemBytes = static_cast(mem_config[2]); - cuda_safe_call(cudaGraphAddKernelNode(&n, g, nullptr, 0, &kconfig)); + n = cuda_try(g, nullptr, 0, &kconfig); // Enable cooperative kernel if necessary by updating the node attributes @@ -89,7 +89,7 @@ void cuda_launcher_graph(interpreted_spec interpreted_policy, Fun&& f, void** ar cudaKernelNodeAttrValue val; val.cooperative = cooperative_kernel ? 1 : 0; - cuda_safe_call(cudaGraphKernelNodeSetAttribute(n, cudaKernelNodeAttributeCooperative, &val)); + cuda_try(n, cudaKernelNodeAttributeCooperative, &val); } template @@ -120,20 +120,25 @@ void launch_impl(interpreted_spec interpreted_policy, exec_place& p, Fun f, Arg if (th_mem_config[1] > 0) { - cuda_safe_call(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream)); + cuda_try(cudaMallocAsync(&th_dev_tmp_ptr, th_mem_config[1], stream)); th.set_device_tmp(th_dev_tmp_ptr); } + // Free the temporary device memory on the way out, even if the launch throws. + // cuda_safe_call (not cuda_try) because SCOPE(exit) is noexcept. + SCOPE(exit) + { + if (th_dev_tmp_ptr) + { + cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream)); + } + }; + auto kernel_args = tuple_prepend(mv(th), mv(arg)); using args_type = decltype(kernel_args); void* all_args[] = {&f, &kernel_args}; cuda_launcher(interpreted_policy, reserved::launch_kernel, all_args, stream); - - if (th_mem_config[1] > 0) - { - cuda_safe_call(cudaFreeAsync(th_dev_tmp_ptr, stream)); - } }; } @@ -358,20 +363,12 @@ public: nvtx_range nr(t.get_symbol().c_str()); t.start(); - int device; - cudaEvent_t start_event, end_event; - - if constexpr (::std::is_same_v) - { - if (record_time) - { - cudaGetDevice(&device); // We will use this to force it during the next run - // Events must be created here to avoid issues with multi-gpu - cuda_safe_call(cudaEventCreate(&start_event)); - cuda_safe_call(cudaEventCreate(&end_event)); - cuda_safe_call(cudaEventRecord(start_event, t.get_stream())); - } - } + int device = -1; + cudaEvent_t start_event = nullptr, end_event = nullptr; + // Set only once both timing events exist and the start event has been recorded. + // The timing setup is done below, after the SCOPE(exit) guard is installed, so a + // throw from those cuda_try calls cannot skip t.end_uncleared()/t.clear(). + bool timing_active = false; const size_t grid_size = e_place.size(); @@ -403,8 +400,11 @@ public: deallocateManagedMemory(hostMemoryArrivedList, grid_size, t.get_stream()); } - if (record_time) + if (timing_active) { + // These run inside the enclosing SCOPE(exit) body, which is noexcept; + // keep cuda_safe_call so a CUDA error aborts rather than throwing + // through the guard (which would call std::terminate). cuda_safe_call(cudaEventRecord(end_event, t.get_stream())); cuda_safe_call(cudaEventSynchronize(end_event)); @@ -426,6 +426,22 @@ public: t.clear(); }; + if constexpr (::std::is_same_v) + { + if (record_time) + { + device = cuda_try(); // We will use this to force it during the next run + // Events must be created here to avoid issues with multi-gpu. + // cudaEventCreate is an overload set (cuda_runtime.h adds a flags overload), + // so cuda_try cannot name it; use the non-overloaded + // cudaEventCreateWithFlags with the default flags (equivalent to cudaEventCreate). + start_event = cuda_try(cudaEventDefault); + end_event = cuda_try(cudaEventDefault); + cuda_try(start_event, t.get_stream()); + timing_active = true; + } + } + /* Should only be allocated / deallocated if the last level used is system wide. Unnecessary and wasteful * otherwise. */ if (grid_size > 1)