From b6dd048249d2daee4f833f3dd9cc321a96c7c6d8 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Thu, 4 Jun 2026 21:34:52 -0400 Subject: [PATCH 1/2] cudax/stf: migrate stream/interfaces/ from cuda_safe_call to cuda_try Covers the stream-backend data interfaces (hashtable, slice, slice reduction ops). Uses the templated cuda_try form for single-function calls (cudaMemcpyAsync, cudaMemcpy2DAsync, cudaMemsetAsync, cudaStreamSynchronize, cudaFreeHost, cudaFreeAsync, cudaPointerGetAttributes). cudaHostAlloc and cudaMallocAsync stay in the runtime-status form: both are overload sets (cuda_runtime.h templated wrappers), so cuda_try cannot name them. hashtable stream_data_allocate: after the device cudaMallocAsync succeeds, the buffer is freed via SCOPE(fail) if the subsequent cudaMemsetAsync throws, so the new throw path does not leak the allocation. Adds scope_guard.cuh. Verified locally by building cudax.test.stf.hashtable.test, cudax.test.stf.reductions.slice2d_reduction (2D cudaMemcpy2DAsync path), and cudax.test.stf.reductions.reduce_sum. --- .../interfaces/hashtable_linearprobing.cuh | 27 ++++++++++++------- .../__stf/stream/interfaces/slice.cuh | 13 +++++---- .../stream/interfaces/slice_reduction_ops.cuh | 4 +-- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh index 6968f058f85..d51e5e58e7c 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh @@ -29,6 +29,7 @@ #include #include +#include namespace cuda::experimental::stf { @@ -77,7 +78,7 @@ public: size_t sz = this->shape.get_capacity() * sizeof(reserved::KeyValue); // NAIVE method ! - cuda_safe_call(cudaMemcpyAsync((void*) dst, (void*) src, sz, kind, s)); + cuda_try((void*) dst, (void*) src, sz, kind, s); } void stream_data_allocate( @@ -95,18 +96,26 @@ public: if (memory_node.is_host()) { - // Fallback to a synchronous method - cuda_safe_call(cudaStreamSynchronize(stream)); - cuda_safe_call(cudaHostAlloc(&base_ptr, s, cudaHostAllocMapped)); + // Fallback to a synchronous method. cudaHostAlloc is an overload set + // (cuda_runtime.h templated wrapper), so it keeps the runtime-status form. + cuda_try(stream); + cuda_try(cudaHostAlloc(&base_ptr, s, cudaHostAllocMapped)); memset(base_ptr, 0xff, s); } else { - cuda_safe_call(cudaMallocAsync(&base_ptr, s, stream)); + // cudaMallocAsync is an overload set (templated wrapper), so it keeps the + // runtime-status form. + cuda_try(cudaMallocAsync(&base_ptr, s, stream)); + // Free the buffer if the initialization below throws. + SCOPE(fail) + { + cuda_safe_call(cudaFreeAsync(base_ptr, stream)); + }; // We also need to initialize the hashtable static_assert(reserved::kEmpty == 0xffffffff, "memset expected kEmpty=0xffffffff"); - cuda_safe_call(cudaMemsetAsync(base_ptr, 0xff, s, stream)); + cuda_try(base_ptr, 0xff, s, stream); } local_desc.addr = base_ptr; @@ -123,12 +132,12 @@ public: if (memory_node.is_host()) { // Fallback to a synchronous method - cuda_safe_call(cudaStreamSynchronize(stream)); - cuda_safe_call(cudaFreeHost(local_desc.addr)); + cuda_try(stream); + cuda_try(local_desc.addr); } else { - cuda_safe_call(cudaFreeAsync(local_desc.addr, stream)); + cuda_try(local_desc.addr, stream); } local_desc.addr = nullptr; // not strictly necessary, but helps debugging } diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh index d19ea175674..a94a3ee751f 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh @@ -232,15 +232,15 @@ public: if constexpr (dimensions == 0) { - cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s)); + cuda_try(dst_ptr, src_ptr, sizeof(T), kind, s); } else if constexpr (dimensions == 1) { - cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s)); + cuda_try(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s); } else if constexpr (dimensions == 2) { - cuda_safe_call(cudaMemcpy2DAsync( + cuda_try( dst_ptr, dst_instance.stride(1) * sizeof(T), src_ptr, @@ -248,14 +248,14 @@ public: b.extent(0) * sizeof(T), b.extent(1), kind, - s)); + s); } else { // We only support higher dimensions if they are contiguous ! if ((contiguous_dims(src_instance) == dimensions) && (contiguous_dims(dst_instance) == dimensions)) { - cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s)); + cuda_try(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s); } else { @@ -281,8 +281,7 @@ public: { auto s = this->instance(instance_id); - cudaPointerAttributes attributes{}; - cuda_safe_call(cudaPointerGetAttributes(&attributes, s.data_handle())); + const auto attributes = cuda_try(s.data_handle()); // Implicitly converted to an optional return attributes.type; diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh index 453823392ac..f89956c72ac 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh @@ -115,7 +115,7 @@ public: if (e.affine_data_place().is_host()) { // TODO make a callback when the situation gets better - cuda_safe_call(cudaStreamSynchronize(s)); + cuda_try(s); // slice_print(in, "in before op"); // slice_print(inout, "inout before op"); @@ -160,7 +160,7 @@ public: if (e.affine_data_place().is_host()) { // TODO make a callback when the situation gets better - cuda_safe_call(cudaStreamSynchronize(s)); + cuda_try(s); if constexpr (dimensions == 1) { for (size_t i = 0; i < out.extent(0); i++) From 3ae7ba156d90a930111436ff71e9d103c1eaba0e Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Thu, 4 Jun 2026 21:51:12 -0400 Subject: [PATCH 2/2] cudax/stf: use runtime-status cuda_try for cudaMemcpyAsync (overload set) cudaMemcpyAsync is an overload set on CTK 13.2+ (cuda_runtime.h adds an alternate-spelling wrapper), so cuda_try is ill-formed. Clang rejects it; GCC accepts it leniently, which is why local GCC builds missed it. Revert the cudaMemcpyAsync calls in the stream slice and hashtable interfaces to the runtime-status cuda_try(cudaMemcpyAsync(...)) form. The other (non-overloaded) calls keep the templated form. --- .../__stf/stream/interfaces/hashtable_linearprobing.cuh | 4 +++- .../cuda/experimental/__stf/stream/interfaces/slice.cuh | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh index d51e5e58e7c..aa6771b007e 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh @@ -78,7 +78,9 @@ public: size_t sz = this->shape.get_capacity() * sizeof(reserved::KeyValue); // NAIVE method ! - cuda_try((void*) dst, (void*) src, sz, kind, s); + // cudaMemcpyAsync is an overload set (cuda_runtime.h alternate-spelling wrapper), + // so it keeps the runtime-status cuda_try form. + cuda_try(cudaMemcpyAsync((void*) dst, (void*) src, sz, kind, s)); } void stream_data_allocate( diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh index a94a3ee751f..cbda86ef9f8 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh @@ -232,11 +232,13 @@ public: if constexpr (dimensions == 0) { - cuda_try(dst_ptr, src_ptr, sizeof(T), kind, s); + // cudaMemcpyAsync is an overload set (cuda_runtime.h adds an alternate-spelling + // wrapper), so it keeps the runtime-status cuda_try form. + cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s)); } else if constexpr (dimensions == 1) { - cuda_try(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s); + cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s)); } else if constexpr (dimensions == 2) { @@ -255,7 +257,7 @@ public: // We only support higher dimensions if they are contiguous ! if ((contiguous_dims(src_instance) == dimensions) && (contiguous_dims(dst_instance) == dimensions)) { - cuda_try(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s); + cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s)); } else {