diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh index 6968f058f85..aa6771b007e 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/hashtable_linearprobing.cuh @@ -29,6 +29,7 @@ #include #include +#include namespace cuda::experimental::stf { @@ -77,7 +78,9 @@ public: size_t sz = this->shape.get_capacity() * sizeof(reserved::KeyValue); // NAIVE method ! - cuda_safe_call(cudaMemcpyAsync((void*) dst, (void*) src, sz, kind, s)); + // cudaMemcpyAsync is an overload set (cuda_runtime.h alternate-spelling wrapper), + // so it keeps the runtime-status cuda_try form. + cuda_try(cudaMemcpyAsync((void*) dst, (void*) src, sz, kind, s)); } void stream_data_allocate( @@ -95,18 +98,26 @@ public: if (memory_node.is_host()) { - // Fallback to a synchronous method - cuda_safe_call(cudaStreamSynchronize(stream)); - cuda_safe_call(cudaHostAlloc(&base_ptr, s, cudaHostAllocMapped)); + // Fallback to a synchronous method. cudaHostAlloc is an overload set + // (cuda_runtime.h templated wrapper), so it keeps the runtime-status form. + cuda_try(stream); + cuda_try(cudaHostAlloc(&base_ptr, s, cudaHostAllocMapped)); memset(base_ptr, 0xff, s); } else { - cuda_safe_call(cudaMallocAsync(&base_ptr, s, stream)); + // cudaMallocAsync is an overload set (templated wrapper), so it keeps the + // runtime-status form. + cuda_try(cudaMallocAsync(&base_ptr, s, stream)); + // Free the buffer if the initialization below throws. + SCOPE(fail) + { + cuda_safe_call(cudaFreeAsync(base_ptr, stream)); + }; // We also need to initialize the hashtable static_assert(reserved::kEmpty == 0xffffffff, "memset expected kEmpty=0xffffffff"); - cuda_safe_call(cudaMemsetAsync(base_ptr, 0xff, s, stream)); + cuda_try(base_ptr, 0xff, s, stream); } local_desc.addr = base_ptr; @@ -123,12 +134,12 @@ public: if (memory_node.is_host()) { // Fallback to a synchronous method - cuda_safe_call(cudaStreamSynchronize(stream)); - cuda_safe_call(cudaFreeHost(local_desc.addr)); + cuda_try(stream); + cuda_try(local_desc.addr); } else { - cuda_safe_call(cudaFreeAsync(local_desc.addr, stream)); + cuda_try(local_desc.addr, stream); } local_desc.addr = nullptr; // not strictly necessary, but helps debugging } diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh index d19ea175674..cbda86ef9f8 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice.cuh @@ -232,15 +232,17 @@ public: if constexpr (dimensions == 0) { - cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s)); + // cudaMemcpyAsync is an overload set (cuda_runtime.h adds an alternate-spelling + // wrapper), so it keeps the runtime-status cuda_try form. + cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, sizeof(T), kind, s)); } else if constexpr (dimensions == 1) { - cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s)); + cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, b.extent(0) * sizeof(T), kind, s)); } else if constexpr (dimensions == 2) { - cuda_safe_call(cudaMemcpy2DAsync( + cuda_try( dst_ptr, dst_instance.stride(1) * sizeof(T), src_ptr, @@ -248,14 +250,14 @@ public: b.extent(0) * sizeof(T), b.extent(1), kind, - s)); + s); } else { // We only support higher dimensions if they are contiguous ! if ((contiguous_dims(src_instance) == dimensions) && (contiguous_dims(dst_instance) == dimensions)) { - cuda_safe_call(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s)); + cuda_try(cudaMemcpyAsync(dst_ptr, src_ptr, b.size() * sizeof(T), kind, s)); } else { @@ -281,8 +283,7 @@ public: { auto s = this->instance(instance_id); - cudaPointerAttributes attributes{}; - cuda_safe_call(cudaPointerGetAttributes(&attributes, s.data_handle())); + const auto attributes = cuda_try(s.data_handle()); // Implicitly converted to an optional return attributes.type; diff --git a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh index 453823392ac..f89956c72ac 100644 --- a/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/interfaces/slice_reduction_ops.cuh @@ -115,7 +115,7 @@ public: if (e.affine_data_place().is_host()) { // TODO make a callback when the situation gets better - cuda_safe_call(cudaStreamSynchronize(s)); + cuda_try(s); // slice_print(in, "in before op"); // slice_print(inout, "inout before op"); @@ -160,7 +160,7 @@ public: if (e.affine_data_place().is_host()) { // TODO make a callback when the situation gets better - cuda_safe_call(cudaStreamSynchronize(s)); + cuda_try(s); if constexpr (dimensions == 1) { for (size_t i = 0; i < out.extent(0); i++)