From b6dd048249d2daee4f833f3dd9cc321a96c7c6d8 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Thu, 4 Jun 2026 21:34:52 -0400
Subject: [PATCH 1/2] cudax/stf: migrate stream/interfaces/ from cuda_safe_call
 to cuda_try

Covers the stream-backend data interfaces (hashtable, slice, slice reduction
ops). Uses the templated cuda_try<F> form for single-function calls
(cudaMemcpyAsync, cudaMemcpy2DAsync, cudaMemsetAsync, cudaStreamSynchronize,
cudaFreeHost, cudaFreeAsync, cudaPointerGetAttributes).

cudaHostAlloc and cudaMallocAsync stay in the runtime-status form: both are
overload sets (cuda_runtime.h templated wrappers), so cuda_try<F> cannot name
them.

hashtable stream_data_allocate: after the device cudaMallocAsync succeeds, the
buffer is freed via SCOPE(fail) if the subsequent cudaMemsetAsync throws, so the
new throw path does not leak the allocation. Adds scope_guard.cuh.

Verified locally by building cudax.test.stf.hashtable.test,
cudax.test.stf.reductions.slice2d_reduction (2D cudaMemcpy2DAsync path), and
cudax.test.stf.reductions.reduce_sum.
---
 .../interfaces/hashtable_linearprobing.cuh    | 27 ++++++++++++-------
 .../__stf/stream/interfaces/slice.cuh         | 13 +++++----
 .../stream/interfaces/slice_reduction_ops.cuh |  4 +--
 3 files changed, 26 insertions(+), 18 deletions(-)
diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh
index 6968f058f85..d51e5e58e7c 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh
@@ -29,6 +29,7 @@
 
 #include <cuda/experimental/__stf/internal/hashtable_linearprobing.cuh>
 #include <cuda/experimental/__stf/stream/stream_data_interface.cuh>
+#include <cuda/experimental/__stf/utility/scope_guard.cuh>
 
 namespace cuda::experimental::stf
 {
@@ -77,7 +78,7 @@ public:
     size_t sz               = this->shape.get_capacity() * sizeof(reserved::KeyValue);
 
     // NAIVE method !
-    cuda_safe_call(cudaMemcpyAsync((void*) dst, (void*) src, sz, kind, s));
+    cuda_try<cudaMemcpyAsync>((void*) dst, (void*) src, sz, kind, s);
   }
 
   void stream_data_allocate(
@@ -95,18 +96,26 @@ public:
 
     if (memory_node.is_host())
     {
-      // Fallback to a synchronous method
-      cuda_safe_call(cudaStreamSynchronize(stream));
-      cuda_safe_call(cudaHostAlloc(&base_ptr, s, cudaHostAllocMapped));
+      // Fallback to a synchronous method. cudaHostAlloc is an overload set
+      // (cuda_runtime.h templated wrapper), so it keeps the runtime-status form.
+      cuda_try<cudaStreamSynchronize>(stream);
+      cuda_try(cudaHostAlloc(&base_ptr, s, cudaHostAllocMapped));
       memset(base_ptr, 0xff, s);
     }
     else
     {
-      cuda_safe_call(cudaMallocAsync(&base_ptr, s, stream));
+      // cudaMallocAsync is an overload set (templated wrapper), so it keeps the
+      // runtime-status form.
+      cuda_try(cudaMallocAsync(&base_ptr, s, stream));
+      // Free the buffer if the initialization below throws.
+      SCOPE(fail)
+      {
+        cuda_safe_call(cudaFreeAsync(base_ptr, stream));
+      };
 
       // We also need to initialize the hashtable
       static_assert(reserved::kEmpty == 0xffffffff, "memset expected kEmpty=0xffffffff");
-      cuda_safe_call(cudaMemsetAsync(base_ptr, 0xff, s, stream));
+      cuda_try<cudaMemsetAsync>(base_ptr, 0xff, s, stream);
     }
 
     local_desc.addr = base_ptr;
@@ -123,12 +132,12 @@ public:
     if (memory_node.is_host())
     {
       // Fallback to a synchronous method
-      cuda_safe_call(cudaStreamSynchronize(stream));
-      cuda_safe_call(cudaFreeHost(local_desc.addr));
+      cuda_try<cudaStreamSynchronize>(stream);
+      cuda_try<cudaFreeHost>(local_desc.addr);
     }
     else
     {
-      cuda_safe_call(cudaFreeAsync(local_desc.addr, stream));
+      cuda_try<cudaFreeAsync>(local_desc.addr, stream);
     }
     local_desc.addr = nullptr; // not strictly necessary, but helps debugging
   }
diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
index d19ea175674..a94a3ee751f 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
@@ -232,15 +232,15 @@ public:
 
     if constexpr (dimensions == 0)
     {
-      cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s));
+      cuda_try<cudaMemcpyAsync>(dst_ptr, src_ptr, sizeof(T), kind, s);
     }
     else if constexpr (dimensions == 1)
     {
-      cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s));
+      cuda_try<cudaMemcpyAsync>(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s);
     }
     else if constexpr (dimensions == 2)
     {
-      cuda_safe_call(cudaMemcpy2DAsync(
+      cuda_try<cudaMemcpy2DAsync>(
         dst_ptr,
         dst_instance.stride(1) * sizeof(T),
         src_ptr,
@@ -248,14 +248,14 @@ public:
         b.extent(0) * sizeof(T),
         b.extent(1),
         kind,
-        s));
+        s);
     }
     else
     {
       // We only support higher dimensions if they are contiguous !
       if ((contiguous_dims(src_instance) == dimensions) && (contiguous_dims(dst_instance) == dimensions))
       {
-        cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s));
+        cuda_try<cudaMemcpyAsync>(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s);
       }
       else
       {
@@ -281,8 +281,7 @@ public:
   {
     auto s = this->instance(instance_id);
 
-    cudaPointerAttributes attributes{};
-    cuda_safe_call(cudaPointerGetAttributes(&attributes, s.data_handle()));
+    const auto attributes = cuda_try<cudaPointerGetAttributes>(s.data_handle());
 
     // Implicitly converted to an optional
     return attributes.type;
diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh
index 453823392ac..f89956c72ac 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh
@@ -115,7 +115,7 @@ public:
     if (e.affine_data_place().is_host())
     {
       // TODO make a callback when the situation gets better
-      cuda_safe_call(cudaStreamSynchronize(s));
+      cuda_try<cudaStreamSynchronize>(s);
       // slice_print(in, "in before op");
       // slice_print(inout, "inout before op");
 
@@ -160,7 +160,7 @@ public:
     if (e.affine_data_place().is_host())
     {
       // TODO make a callback when the situation gets better
-      cuda_safe_call(cudaStreamSynchronize(s));
+      cuda_try<cudaStreamSynchronize>(s);
       if constexpr (dimensions == 1)
       {
         for (size_t i = 0; i < out.extent(0); i++)

From 3ae7ba156d90a930111436ff71e9d103c1eaba0e Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Thu, 4 Jun 2026 21:51:12 -0400
Subject: [PATCH 2/2] cudax/stf: use runtime-status cuda_try for
 cudaMemcpyAsync (overload set)

cudaMemcpyAsync is an overload set on CTK 13.2+ (cuda_runtime.h adds an
alternate-spelling wrapper), so cuda_try<cudaMemcpyAsync> is ill-formed.
Clang rejects it; GCC accepts it leniently, which is why local GCC builds
missed it. Revert the cudaMemcpyAsync calls in the stream slice and
hashtable interfaces to the runtime-status cuda_try(cudaMemcpyAsync(...))
form. The other (non-overloaded) calls keep the templated form.
---
 .../__stf/stream/interfaces/hashtable_linearprobing.cuh   | 4 +++-
 .../cuda/experimental/__stf/stream/interfaces/slice.cuh   | 8 +++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh
index d51e5e58e7c..aa6771b007e 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh
@@ -78,7 +78,9 @@ public:
     size_t sz               = this->shape.get_capacity() * sizeof(reserved::KeyValue);
 
     // NAIVE method !
-    cuda_try<cudaMemcpyAsync>((void*) dst, (void*) src, sz, kind, s);
+    // cudaMemcpyAsync is an overload set (cuda_runtime.h alternate-spelling wrapper),
+    // so it keeps the runtime-status cuda_try form.
+    cuda_try(cudaMemcpyAsync((void*) dst, (void*) src, sz, kind, s));
   }
 
   void stream_data_allocate(
diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
index a94a3ee751f..cbda86ef9f8 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
@@ -232,11 +232,13 @@ public:
 
     if constexpr (dimensions == 0)
     {
-      cuda_try<cudaMemcpyAsync>(dst_ptr, src_ptr, sizeof(T), kind, s);
+      // cudaMemcpyAsync is an overload set (cuda_runtime.h adds an alternate-spelling
+      // wrapper), so it keeps the runtime-status cuda_try form.
+      cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s));
     }
     else if constexpr (dimensions == 1)
     {
-      cuda_try<cudaMemcpyAsync>(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s);
+      cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s));
     }
     else if constexpr (dimensions == 2)
     {
@@ -255,7 +257,7 @@ public:
       // We only support higher dimensions if they are contiguous !
       if ((contiguous_dims(src_instance) == dimensions) && (contiguous_dims(dst_instance) == dimensions))
       {
-        cuda_try<cudaMemcpyAsync>(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s);
+        cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s));
       }
       else
       {