From 53ab6963845c2b4e31e86d320b5ce145b45dba22 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 3 Jun 2026 13:24:24 -0700 Subject: [PATCH 1/8] implementation --- .../include/cuda/__warp/warp_match_any.h | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 libcudacxx/include/cuda/__warp/warp_match_any.h diff --git a/libcudacxx/include/cuda/__warp/warp_match_any.h b/libcudacxx/include/cuda/__warp/warp_match_any.h new file mode 100644 index 00000000000..7c3366fbf38 --- /dev/null +++ b/libcudacxx/include/cuda/__warp/warp_match_any.h @@ -0,0 +1,84 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___WARP_WARP_MATCH_ANY_H +#define _CUDA___WARP_WARP_MATCH_ANY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUDA_COMPILATION() + +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE + +extern "C" _CCCL_DEVICE void __cuda__match_any_sync_is_not_supported_before_SM_70__(); + +//! @brief Returns the mask of lanes with the same bitwise value as the calling lane. +//! +//! @param[in] __data The data to compare across lanes. +//! @param[in] __lane_mask The mask of participating lanes. +//! +//! @return A lane mask containing lanes in `__lane_mask` whose `__data` matches the calling lane's data. +template +[[nodiscard]] _CCCL_DEVICE_API lane_mask +warp_match_any(const _Tp& __data, const lane_mask __lane_mask = lane_mask::all()) noexcept +{ + static_assert(is_trivially_copyable_v<_Tp>, "data must be trivially copyable"); + _CCCL_ASSERT(__lane_mask != lane_mask::none(), "lane_mask must be non-zero"); + + constexpr int __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(::cuda::std::uint32_t)); + ::cuda::std::uint32_t __array[__ratio]{}; + +# if defined(_CCCL_BUILTIN_CLEAR_PADDING) + auto __data_copy = __data; + _CCCL_BUILTIN_CLEAR_PADDING(&__data_copy); + const auto __data_ptr = ::cuda::std::addressof(__data_copy); +# else // ^^^ _CCCL_BUILTIN_CLEAR_PADDING ^^^ / vvv !_CCCL_BUILTIN_CLEAR_PADDING vvv + static_assert(is_bitwise_comparable_v<_Tp>, "data must be bitwise comparable"); + const auto __data_ptr = ::cuda::std::addressof(__data); +# endif // _CCCL_BUILTIN_CLEAR_PADDING + ::cuda::std::memcpy(__array, __data_ptr, sizeof(_Tp)); + + lane_mask __ret = __lane_mask; + _CCCL_PRAGMA_UNROLL_FULL() + for (int i = 0; i < __ratio; ++i) + { + ::cuda::std::uint32_t __match_any_result = 0; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70, + (__match_any_result = ::__match_any_sync(__lane_mask.value(), __array[i]);), + (::cuda::device::__cuda__match_any_sync_is_not_supported_before_SM_70__();)); + __ret &= lane_mask{__match_any_result}; + } + return __ret; +} + +_CCCL_END_NAMESPACE_CUDA_DEVICE + +# include + +#endif // _CCCL_CUDA_COMPILATION() +#endif // _CUDA___WARP_WARP_MATCH_ANY_H From a5064cfcddffc66374121efbd1241575b9ce5b88 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 3 Jun 2026 13:24:38 -0700 Subject: [PATCH 2/8] header --- libcudacxx/include/cuda/warp | 1 + 1 file changed, 1 insertion(+) diff --git a/libcudacxx/include/cuda/warp b/libcudacxx/include/cuda/warp index e2d6dd95e0f..da440289374 100644 --- a/libcudacxx/include/cuda/warp +++ b/libcudacxx/include/cuda/warp @@ -23,6 +23,7 @@ #include #include +#include #include #endif // _CUDA_WARP From efeaa27531429a3d8ea663e63b710d202c659934 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 3 Jun 2026 13:24:49 -0700 Subject: [PATCH 3/8] unit test --- .../cuda/warp/warp_match_any.pass.cpp | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp diff --git a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp new file mode 100644 index 00000000000..2a48524fa78 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: pre-sm-70 + +// UNSUPPORTED: enable-tile +// error: asm statement is unsupported in tile code + +#include +#include +#include +#include + +#include "test_macros.h" + +TEST_DEVICE_FUNC uint32_t make_low_mask(unsigned count) +{ + return count == 32 ? 0xFFFFFFFF : ((1u << count) - 1); +} + +TEST_DEVICE_FUNC uint32_t make_stride_mask(unsigned count, unsigned step, unsigned remainder) +{ + uint32_t mask = 0; + for (unsigned lane = 0; lane < count; ++lane) + { + if ((lane % step) == remainder) + { + mask |= uint32_t{1} << lane; + } + } + return mask; +} + +template +TEST_DEVICE_FUNC void test_all_equal(T value = T{}) +{ + for (unsigned i = 1; i <= 32; ++i) + { + auto mask = cuda::device::lane_mask{make_low_mask(i)}; + if (threadIdx.x < i) + { + assert(cuda::device::warp_match_any(value, mask) == mask); + } + } +} + +// two different groups of lanes +template +TEST_DEVICE_FUNC void test_grouped(T valueA = T{}, T valueB = T{1}) +{ + for (unsigned i = 2; i <= 32; ++i) + { + auto mask = cuda::device::lane_mask{make_low_mask(i)}; + if (threadIdx.x < i) + { + auto value = threadIdx.x % 2 == 0 ? valueA : valueB; + auto expected = cuda::device::lane_mask{make_stride_mask(i, 2, threadIdx.x % 2)}; + assert(cuda::device::warp_match_any(value, mask) == expected); + } + } +} + +__global__ void test_kernel() +{ + using array_t = cuda::std::array; + + test_all_equal(); + test_all_equal(); + test_all_equal(); + test_all_equal(); +#if _CCCL_HAS_INT128() + test_all_equal<__uint128_t>(); +#endif + test_all_equal(char3{0, 0, 0}); + test_all_equal(array_t{0, 0, 0, 0, 0, 0}); + + test_grouped(); + test_grouped(); + test_grouped(); + test_grouped(); +#if _CCCL_HAS_INT128() + test_grouped<__uint128_t>(); +#endif + test_grouped(char3{0, 0, 0}, char3{1, 1, 1}); + test_grouped(array_t{0, 0, 0, 0, 0, 0}, array_t{1, 1, 1, 1, 1, 1}); +} + +int main(int, char**) +{ + NV_IF_TARGET(NV_IS_HOST, (test_kernel<<<1, 32>>>();)) + return 0; +} From d5c0bdb9f1c7eff90e54b1a8df3e61e49abc6c46 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 3 Jun 2026 13:24:58 -0700 Subject: [PATCH 4/8] documentation --- docs/libcudacxx/extended_api/warp.rst | 6 ++ .../extended_api/warp/warp_match_all.rst | 18 ++-- .../extended_api/warp/warp_match_any.rst | 98 +++++++++++++++++++ 3 files changed, 115 insertions(+), 7 deletions(-) create mode 100644 docs/libcudacxx/extended_api/warp/warp_match_any.rst diff --git a/docs/libcudacxx/extended_api/warp.rst b/docs/libcudacxx/extended_api/warp.rst index 08df953de89..c60e13b076b 100644 --- a/docs/libcudacxx/extended_api/warp.rst +++ b/docs/libcudacxx/extended_api/warp.rst @@ -9,6 +9,7 @@ Warp warp/warp_shuffle warp/warp_match_all + warp/warp_match_any warp/lane_mask .. list-table:: @@ -45,6 +46,11 @@ Warp - CCCL 3.1.0 - CUDA 13.1 + * - :ref:`warp_match_any ` + - Get the mask of lanes with the same value + - CCCL 3.5.0 + - CUDA 13.5 + * - :ref:`lane_mask ` - Class to represent a mask of lanes in a warp - CCCL 3.1.0 diff --git a/docs/libcudacxx/extended_api/warp/warp_match_all.rst b/docs/libcudacxx/extended_api/warp/warp_match_all.rst index 596504d674b..50fe3314f0b 100644 --- a/docs/libcudacxx/extended_api/warp/warp_match_all.rst +++ b/docs/libcudacxx/extended_api/warp/warp_match_all.rst @@ -18,6 +18,10 @@ Defined in ```` header. The functionality provides a generalized and safe alternative to CUDA warp match all intrinsic ``__match_all_sync``. The function allows bitwise comparison of any data size, including raw arrays, pointers, and structs. +.. note:: + + The underlying CUDA intrinsic does not provide memory ordering. + **Parameters** - ``data``: data to compare. @@ -30,7 +34,7 @@ The function allows bitwise comparison of any data size, including raw arrays, p **Constraints** - ``T`` shall be trivially copyable, see :ref:`cuda::is_trivially_copyable `. -- When ``__builtin_clear_padding`` is not supported, ``T`` shall have no padding bits, that is, ``T``'s value representation shall be identical to its object representation. +- ``T`` shall be bitwise comparable, see :ref:`cuda::is_bitwise_comparable `, except when ``__builtin_clear_padding`` is supported. In the latter case, ``T`` can have padding bits. **Preconditions** @@ -39,17 +43,17 @@ The function allows bitwise comparison of any data size, including raw arrays, p **Undefined Behavior** -- ``lane_mask`` must represent a subset of the active lanes, undefined behavior otherwise. +- ``lane_mask`` must represent a subset of the active lanes. +- All non-exited lanes specified by ``lane_mask`` must execute the function with the same ``lane_mask`` value. **Performance considerations** - The function calls the PTX instruction ``match.sync`` :math:`ceil\left(\frac{sizeof(data)}{4}\right)` times. -- The function is slightly faster when called with a mask of all active lanes (overload function) even if all lanes participates in the call. -- The function is slower when called with a non-fully active warp. +- The function is faster when called with a mask representing all active lanes in a warp (default value of the second parameter ``lane_mask``). **References** -- `CUDA match_all Intrinsics `_ +- `CUDA match_all Intrinsics `_ - `PTX match.sync instruction `_ Example @@ -69,7 +73,7 @@ Example __global__ void warp_match_kernel() { assert(cuda::device::warp_match_all(2)); assert(cuda::device::warp_match_all(2, cuda::device::lane_mask::all())); - assert(cuda::device::warp_match_all(MyStruct{1.0, 3})); // Undefined Behavior + assert(cuda::device::warp_match_all(MyStruct{1.0, 3})); // compile error, except when __builtin_clear_padding is supported assert(!cuda::device::warp_match_all(threadIdx.x)); } @@ -79,4 +83,4 @@ Example return 0; } -`See it on Godbolt 🔗 `_ +`See it on Godbolt 🔗 `_ diff --git a/docs/libcudacxx/extended_api/warp/warp_match_any.rst b/docs/libcudacxx/extended_api/warp/warp_match_any.rst new file mode 100644 index 00000000000..e321afa157c --- /dev/null +++ b/docs/libcudacxx/extended_api/warp/warp_match_any.rst @@ -0,0 +1,98 @@ +.. _libcudacxx-extended-api-warp-warp-match-any: + +``cuda::device::warp_match_any`` +================================ + +Defined in ```` header. + +.. code:: cuda + + namespace cuda::device { + + template + [[nodiscard]] __device__ lane_mask + warp_match_any(const T& data, lane_mask = lane_mask::all()); + + } // namespace cuda::device + +The functionality provides a generalized and safe alternative to CUDA warp match any intrinsic ``__match_any_sync``. +The function allows bitwise comparison of any data size, including raw arrays, pointers, and structs. + +.. note:: + + The underlying CUDA intrinsic does not provide memory ordering. + +**Parameters** + +- ``data``: data to compare. +- ``lane_mask``: mask of the active lanes. + +**Return value** + +- A ``lane_mask`` representing the non-exited lanes in ``lane_mask`` that have the same bitwise value for ``data`` as the calling lane. + +**Constraints** + +- ``T`` shall be trivially copyable, see :ref:`cuda::is_trivially_copyable `. +- When ``__builtin_clear_padding`` is not supported, ``T`` shall have no padding bits, that is, ``T``'s value representation shall be identical to its object representation. + +**Preconditions** + +- The functionality is only supported on ``SM >= 70``. +- ``lane_mask`` must be non-zero. + +**Undefined Behavior** + +- ``lane_mask`` must represent a subset of the active lanes. +- All non-exited lanes specified by ``lane_mask`` must execute the function with the same ``lane_mask`` value. + +**Performance considerations** + +- The function calls the PTX instruction ``match.sync`` :math:`ceil\left(\frac{sizeof(data)}{4}\right)` times. +- The function is faster when called with a mask representing all active lanes in a warp (default value of the second parameter ``lane_mask``). + +**References** + +- `CUDA match_any Intrinsics `_ +- `PTX match.sync instruction `_ + +Example +------- + +.. code:: cuda + + #include + #include + #include + + struct MyStruct { + double x; // 8 bytes + int y; // 4 bytes + }; // 4 bytes of padding + + __global__ void warp_match_kernel() { + { + auto mask = cuda::device::warp_match_any(threadIdx.x / 4); + auto expected = cuda::device::lane_mask{0b1111 << ((threadIdx.x / 4) * 4)}; + assert(mask == expected); + } + { + auto mask = cuda::device::warp_match_any(2); + auto expected = cuda::device::lane_mask{0xFFFFFFFF}; + assert(mask == expected); + } + { + // compile error, except when __builtin_clear_padding is supported + auto mask = cuda::device::warp_match_any(MyStruct{1.0, 3}); + auto expected = cuda::device::lane_mask{0xFFFFFFFF}; + assert(mask == expected); + } + } + + int main() { + warp_match_kernel<<<1, 32>>>(); + cudaDeviceSynchronize(); + return 0; + } + +`See it on Godbolt 🔗 `_ From feb923ed853f75e286a72892bf64f81d35c249ad Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Thu, 4 Jun 2026 09:24:31 -0700 Subject: [PATCH 5/8] Update libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp Co-authored-by: Michael Schellenberger Costa --- libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp index 2a48524fa78..4402bd7a4ec 100644 --- a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp @@ -75,7 +75,7 @@ __global__ void test_kernel() test_all_equal(); #if _CCCL_HAS_INT128() test_all_equal<__uint128_t>(); -#endif +#endif // _CCCL_HAS_INT128() test_all_equal(char3{0, 0, 0}); test_all_equal(array_t{0, 0, 0, 0, 0, 0}); From c0fe5cf51468bf7bf3328aff5dcdef5e688253b7 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Thu, 4 Jun 2026 09:24:40 -0700 Subject: [PATCH 6/8] Update libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp Co-authored-by: Michael Schellenberger Costa --- libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp index 4402bd7a4ec..e9c1671ee2f 100644 --- a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp @@ -85,7 +85,7 @@ __global__ void test_kernel() test_grouped(); #if _CCCL_HAS_INT128() test_grouped<__uint128_t>(); -#endif +#endif // _CCCL_HAS_INT128() test_grouped(char3{0, 0, 0}, char3{1, 1, 1}); test_grouped(array_t{0, 0, 0, 0, 0, 0}, array_t{1, 1, 1, 1, 1, 1}); } From faa21ec1fa9bf8b3e8f9e502b6cff0585d203be6 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 4 Jun 2026 09:35:15 -0700 Subject: [PATCH 7/8] formatting --- docs/libcudacxx/extended_api/warp/warp_match_any.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/libcudacxx/extended_api/warp/warp_match_any.rst b/docs/libcudacxx/extended_api/warp/warp_match_any.rst index e321afa157c..7beee65db5b 100644 --- a/docs/libcudacxx/extended_api/warp/warp_match_any.rst +++ b/docs/libcudacxx/extended_api/warp/warp_match_any.rst @@ -21,7 +21,7 @@ The function allows bitwise comparison of any data size, including raw arrays, p .. note:: The underlying CUDA intrinsic does not provide memory ordering. - + **Parameters** - ``data``: data to compare. From de51be0a903576c5b25d8c1cbfce8c8deeaf8147 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 5 Jun 2026 12:09:50 -0700 Subject: [PATCH 8/8] avoid launching a kernel --- .../test/libcudacxx/cuda/warp/warp_match_any.pass.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp index e9c1671ee2f..1caece548b2 100644 --- a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp @@ -65,10 +65,9 @@ TEST_DEVICE_FUNC void test_grouped(T valueA = T{}, T valueB = T{1}) } } -__global__ void test_kernel() +TEST_DEVICE_FUNC void test() { using array_t = cuda::std::array; - test_all_equal(); test_all_equal(); test_all_equal(); @@ -85,13 +84,13 @@ __global__ void test_kernel() test_grouped(); #if _CCCL_HAS_INT128() test_grouped<__uint128_t>(); -#endif // _CCCL_HAS_INT128() +#endif // _CCCL_HAS_INT128() test_grouped(char3{0, 0, 0}, char3{1, 1, 1}); test_grouped(array_t{0, 0, 0, 0, 0, 0}, array_t{1, 1, 1, 1, 1, 1}); } int main(int, char**) { - NV_IF_TARGET(NV_IS_HOST, (test_kernel<<<1, 32>>>();)) + NV_DISPATCH_TARGET(NV_IS_HOST, (cuda_thread_count = 32;), NV_IS_DEVICE, (test();)) return 0; }