diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh
index 6968f058f85..aa6771b007e 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh
@@ -29,6 +29,7 @@
 
 #include <cuda/experimental/__stf/internal/hashtable_linearprobing.cuh>
 #include <cuda/experimental/__stf/stream/stream_data_interface.cuh>
+#include <cuda/experimental/__stf/utility/scope_guard.cuh>
 
 namespace cuda::experimental::stf
 {
@@ -77,7 +78,9 @@ public:
     size_t sz               = this->shape.get_capacity() * sizeof(reserved::KeyValue);
 
     // NAIVE method !
-    cuda_safe_call(cudaMemcpyAsync((void*) dst, (void*) src, sz, kind, s));
+    // cudaMemcpyAsync is an overload set (cuda_runtime.h alternate-spelling wrapper),
+    // so it keeps the runtime-status cuda_try form.
+    cuda_try(cudaMemcpyAsync((void*) dst, (void*) src, sz, kind, s));
   }
 
   void stream_data_allocate(
@@ -95,18 +98,26 @@ public:
 
     if (memory_node.is_host())
     {
-      // Fallback to a synchronous method
-      cuda_safe_call(cudaStreamSynchronize(stream));
-      cuda_safe_call(cudaHostAlloc(&base_ptr, s, cudaHostAllocMapped));
+      // Fallback to a synchronous method. cudaHostAlloc is an overload set
+      // (cuda_runtime.h templated wrapper), so it keeps the runtime-status form.
+      cuda_try<cudaStreamSynchronize>(stream);
+      cuda_try(cudaHostAlloc(&base_ptr, s, cudaHostAllocMapped));
       memset(base_ptr, 0xff, s);
     }
     else
     {
-      cuda_safe_call(cudaMallocAsync(&base_ptr, s, stream));
+      // cudaMallocAsync is an overload set (templated wrapper), so it keeps the
+      // runtime-status form.
+      cuda_try(cudaMallocAsync(&base_ptr, s, stream));
+      // Free the buffer if the initialization below throws.
+      SCOPE(fail)
+      {
+        cuda_safe_call(cudaFreeAsync(base_ptr, stream));
+      };
 
       // We also need to initialize the hashtable
       static_assert(reserved::kEmpty == 0xffffffff, "memset expected kEmpty=0xffffffff");
-      cuda_safe_call(cudaMemsetAsync(base_ptr, 0xff, s, stream));
+      cuda_try<cudaMemsetAsync>(base_ptr, 0xff, s, stream);
     }
 
     local_desc.addr = base_ptr;
@@ -123,12 +134,12 @@ public:
     if (memory_node.is_host())
     {
       // Fallback to a synchronous method
-      cuda_safe_call(cudaStreamSynchronize(stream));
-      cuda_safe_call(cudaFreeHost(local_desc.addr));
+      cuda_try<cudaStreamSynchronize>(stream);
+      cuda_try<cudaFreeHost>(local_desc.addr);
     }
     else
     {
-      cuda_safe_call(cudaFreeAsync(local_desc.addr, stream));
+      cuda_try<cudaFreeAsync>(local_desc.addr, stream);
     }
     local_desc.addr = nullptr; // not strictly necessary, but helps debugging
   }
diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
index d19ea175674..cbda86ef9f8 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh
@@ -232,15 +232,17 @@ public:
 
     if constexpr (dimensions == 0)
     {
-      cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s));
+      // cudaMemcpyAsync is an overload set (cuda_runtime.h adds an alternate-spelling
+      // wrapper), so it keeps the runtime-status cuda_try form.
+      cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s));
     }
     else if constexpr (dimensions == 1)
     {
-      cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s));
+      cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s));
     }
     else if constexpr (dimensions == 2)
     {
-      cuda_safe_call(cudaMemcpy2DAsync(
+      cuda_try<cudaMemcpy2DAsync>(
         dst_ptr,
         dst_instance.stride(1) * sizeof(T),
         src_ptr,
@@ -248,14 +250,14 @@ public:
         b.extent(0) * sizeof(T),
         b.extent(1),
         kind,
-        s));
+        s);
     }
     else
     {
       // We only support higher dimensions if they are contiguous !
       if ((contiguous_dims(src_instance) == dimensions) && (contiguous_dims(dst_instance) == dimensions))
       {
-        cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s));
+        cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s));
       }
       else
       {
@@ -281,8 +283,7 @@ public:
   {
     auto s = this->instance(instance_id);
 
-    cudaPointerAttributes attributes{};
-    cuda_safe_call(cudaPointerGetAttributes(&attributes, s.data_handle()));
+    const auto attributes = cuda_try<cudaPointerGetAttributes>(s.data_handle());
 
     // Implicitly converted to an optional
     return attributes.type;
diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh
index 453823392ac..f89956c72ac 100644
--- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh
+++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh
@@ -115,7 +115,7 @@ public:
     if (e.affine_data_place().is_host())
     {
       // TODO make a callback when the situation gets better
-      cuda_safe_call(cudaStreamSynchronize(s));
+      cuda_try<cudaStreamSynchronize>(s);
       // slice_print(in, "in before op");
       // slice_print(inout, "inout before op");
 
@@ -160,7 +160,7 @@ public:
     if (e.affine_data_place().is_host())
     {
       // TODO make a callback when the situation gets better
-      cuda_safe_call(cudaStreamSynchronize(s));
+      cuda_try<cudaStreamSynchronize>(s);
       if constexpr (dimensions == 1)
       {
         for (size_t i = 0; i < out.extent(0); i++)