diff --git a/docs/libcudacxx/extended_api/warp.rst b/docs/libcudacxx/extended_api/warp.rst index 08df953de89..c60e13b076b 100644 --- a/docs/libcudacxx/extended_api/warp.rst +++ b/docs/libcudacxx/extended_api/warp.rst @@ -9,6 +9,7 @@ Warp warp/warp_shuffle warp/warp_match_all + warp/warp_match_any warp/lane_mask .. list-table:: @@ -45,6 +46,11 @@ Warp - CCCL 3.1.0 - CUDA 13.1 + * - :ref:`warp_match_any ` + - Get the mask of lanes with the same value + - CCCL 3.5.0 + - CUDA 13.5 + * - :ref:`lane_mask ` - Class to represent a mask of lanes in a warp - CCCL 3.1.0 diff --git a/docs/libcudacxx/extended_api/warp/warp_match_all.rst b/docs/libcudacxx/extended_api/warp/warp_match_all.rst index 596504d674b..50fe3314f0b 100644 --- a/docs/libcudacxx/extended_api/warp/warp_match_all.rst +++ b/docs/libcudacxx/extended_api/warp/warp_match_all.rst @@ -18,6 +18,10 @@ Defined in ```` header. The functionality provides a generalized and safe alternative to CUDA warp match all intrinsic ``__match_all_sync``. The function allows bitwise comparison of any data size, including raw arrays, pointers, and structs. +.. note:: + + The underlying CUDA intrinsic does not provide memory ordering. + **Parameters** - ``data``: data to compare. @@ -30,7 +34,7 @@ The function allows bitwise comparison of any data size, including raw arrays, p **Constraints** - ``T`` shall be trivially copyable, see :ref:`cuda::is_trivially_copyable `. -- When ``__builtin_clear_padding`` is not supported, ``T`` shall have no padding bits, that is, ``T``'s value representation shall be identical to its object representation. +- ``T`` shall be bitwise comparable, see :ref:`cuda::is_bitwise_comparable `, except when ``__builtin_clear_padding`` is supported. In the latter case, ``T`` can have padding bits. **Preconditions** @@ -39,17 +43,17 @@ The function allows bitwise comparison of any data size, including raw arrays, p **Undefined Behavior** -- ``lane_mask`` must represent a subset of the active lanes, undefined behavior otherwise. +- ``lane_mask`` must represent a subset of the active lanes. +- All non-exited lanes specified by ``lane_mask`` must execute the function with the same ``lane_mask`` value. **Performance considerations** - The function calls the PTX instruction ``match.sync`` :math:`ceil\left(\frac{sizeof(data)}{4}\right)` times. -- The function is slightly faster when called with a mask of all active lanes (overload function) even if all lanes participates in the call. -- The function is slower when called with a non-fully active warp. +- The function is faster when called with a mask representing all active lanes in a warp (default value of the second parameter ``lane_mask``). **References** -- `CUDA match_all Intrinsics `_ +- `CUDA match_all Intrinsics `_ - `PTX match.sync instruction `_ Example @@ -69,7 +73,7 @@ Example __global__ void warp_match_kernel() { assert(cuda::device::warp_match_all(2)); assert(cuda::device::warp_match_all(2, cuda::device::lane_mask::all())); - assert(cuda::device::warp_match_all(MyStruct{1.0, 3})); // Undefined Behavior + assert(cuda::device::warp_match_all(MyStruct{1.0, 3})); // compile error, except when __builtin_clear_padding is supported assert(!cuda::device::warp_match_all(threadIdx.x)); } @@ -79,4 +83,4 @@ Example return 0; } -`See it on Godbolt 🔗 `_ +`See it on Godbolt 🔗 `_ diff --git a/docs/libcudacxx/extended_api/warp/warp_match_any.rst b/docs/libcudacxx/extended_api/warp/warp_match_any.rst new file mode 100644 index 00000000000..7beee65db5b --- /dev/null +++ b/docs/libcudacxx/extended_api/warp/warp_match_any.rst @@ -0,0 +1,98 @@ +.. _libcudacxx-extended-api-warp-warp-match-any: + +``cuda::device::warp_match_any`` +================================ + +Defined in ```` header. + +.. code:: cuda + + namespace cuda::device { + + template + [[nodiscard]] __device__ lane_mask + warp_match_any(const T& data, lane_mask = lane_mask::all()); + + } // namespace cuda::device + +The functionality provides a generalized and safe alternative to CUDA warp match any intrinsic ``__match_any_sync``. +The function allows bitwise comparison of any data size, including raw arrays, pointers, and structs. + +.. note:: + + The underlying CUDA intrinsic does not provide memory ordering. + +**Parameters** + +- ``data``: data to compare. +- ``lane_mask``: mask of the active lanes. + +**Return value** + +- A ``lane_mask`` representing the non-exited lanes in ``lane_mask`` that have the same bitwise value for ``data`` as the calling lane. + +**Constraints** + +- ``T`` shall be trivially copyable, see :ref:`cuda::is_trivially_copyable `. +- When ``__builtin_clear_padding`` is not supported, ``T`` shall have no padding bits, that is, ``T``'s value representation shall be identical to its object representation. + +**Preconditions** + +- The functionality is only supported on ``SM >= 70``. +- ``lane_mask`` must be non-zero. + +**Undefined Behavior** + +- ``lane_mask`` must represent a subset of the active lanes. +- All non-exited lanes specified by ``lane_mask`` must execute the function with the same ``lane_mask`` value. + +**Performance considerations** + +- The function calls the PTX instruction ``match.sync`` :math:`ceil\left(\frac{sizeof(data)}{4}\right)` times. +- The function is faster when called with a mask representing all active lanes in a warp (default value of the second parameter ``lane_mask``). + +**References** + +- `CUDA match_any Intrinsics `_ +- `PTX match.sync instruction `_ + +Example +------- + +.. code:: cuda + + #include + #include + #include + + struct MyStruct { + double x; // 8 bytes + int y; // 4 bytes + }; // 4 bytes of padding + + __global__ void warp_match_kernel() { + { + auto mask = cuda::device::warp_match_any(threadIdx.x / 4); + auto expected = cuda::device::lane_mask{0b1111 << ((threadIdx.x / 4) * 4)}; + assert(mask == expected); + } + { + auto mask = cuda::device::warp_match_any(2); + auto expected = cuda::device::lane_mask{0xFFFFFFFF}; + assert(mask == expected); + } + { + // compile error, except when __builtin_clear_padding is supported + auto mask = cuda::device::warp_match_any(MyStruct{1.0, 3}); + auto expected = cuda::device::lane_mask{0xFFFFFFFF}; + assert(mask == expected); + } + } + + int main() { + warp_match_kernel<<<1, 32>>>(); + cudaDeviceSynchronize(); + return 0; + } + +`See it on Godbolt 🔗 `_ diff --git a/libcudacxx/include/cuda/__warp/warp_match_any.h b/libcudacxx/include/cuda/__warp/warp_match_any.h new file mode 100644 index 00000000000..7c3366fbf38 --- /dev/null +++ b/libcudacxx/include/cuda/__warp/warp_match_any.h @@ -0,0 +1,84 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___WARP_WARP_MATCH_ANY_H +#define _CUDA___WARP_WARP_MATCH_ANY_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUDA_COMPILATION() + +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE + +extern "C" _CCCL_DEVICE void __cuda__match_any_sync_is_not_supported_before_SM_70__(); + +//! @brief Returns the mask of lanes with the same bitwise value as the calling lane. +//! +//! @param[in] __data The data to compare across lanes. +//! @param[in] __lane_mask The mask of participating lanes. +//! +//! @return A lane mask containing lanes in `__lane_mask` whose `__data` matches the calling lane's data. +template +[[nodiscard]] _CCCL_DEVICE_API lane_mask +warp_match_any(const _Tp& __data, const lane_mask __lane_mask = lane_mask::all()) noexcept +{ + static_assert(is_trivially_copyable_v<_Tp>, "data must be trivially copyable"); + _CCCL_ASSERT(__lane_mask != lane_mask::none(), "lane_mask must be non-zero"); + + constexpr int __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(::cuda::std::uint32_t)); + ::cuda::std::uint32_t __array[__ratio]{}; + +# if defined(_CCCL_BUILTIN_CLEAR_PADDING) + auto __data_copy = __data; + _CCCL_BUILTIN_CLEAR_PADDING(&__data_copy); + const auto __data_ptr = ::cuda::std::addressof(__data_copy); +# else // ^^^ _CCCL_BUILTIN_CLEAR_PADDING ^^^ / vvv !_CCCL_BUILTIN_CLEAR_PADDING vvv + static_assert(is_bitwise_comparable_v<_Tp>, "data must be bitwise comparable"); + const auto __data_ptr = ::cuda::std::addressof(__data); +# endif // _CCCL_BUILTIN_CLEAR_PADDING + ::cuda::std::memcpy(__array, __data_ptr, sizeof(_Tp)); + + lane_mask __ret = __lane_mask; + _CCCL_PRAGMA_UNROLL_FULL() + for (int i = 0; i < __ratio; ++i) + { + ::cuda::std::uint32_t __match_any_result = 0; + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70, + (__match_any_result = ::__match_any_sync(__lane_mask.value(), __array[i]);), + (::cuda::device::__cuda__match_any_sync_is_not_supported_before_SM_70__();)); + __ret &= lane_mask{__match_any_result}; + } + return __ret; +} + +_CCCL_END_NAMESPACE_CUDA_DEVICE + +# include + +#endif // _CCCL_CUDA_COMPILATION() +#endif // _CUDA___WARP_WARP_MATCH_ANY_H diff --git a/libcudacxx/include/cuda/warp b/libcudacxx/include/cuda/warp index e2d6dd95e0f..da440289374 100644 --- a/libcudacxx/include/cuda/warp +++ b/libcudacxx/include/cuda/warp @@ -23,6 +23,7 @@ #include #include +#include #include #endif // _CUDA_WARP diff --git a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp new file mode 100644 index 00000000000..1caece548b2 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: pre-sm-70 + +// UNSUPPORTED: enable-tile +// error: asm statement is unsupported in tile code + +#include +#include +#include +#include + +#include "test_macros.h" + +TEST_DEVICE_FUNC uint32_t make_low_mask(unsigned count) +{ + return count == 32 ? 0xFFFFFFFF : ((1u << count) - 1); +} + +TEST_DEVICE_FUNC uint32_t make_stride_mask(unsigned count, unsigned step, unsigned remainder) +{ + uint32_t mask = 0; + for (unsigned lane = 0; lane < count; ++lane) + { + if ((lane % step) == remainder) + { + mask |= uint32_t{1} << lane; + } + } + return mask; +} + +template +TEST_DEVICE_FUNC void test_all_equal(T value = T{}) +{ + for (unsigned i = 1; i <= 32; ++i) + { + auto mask = cuda::device::lane_mask{make_low_mask(i)}; + if (threadIdx.x < i) + { + assert(cuda::device::warp_match_any(value, mask) == mask); + } + } +} + +// two different groups of lanes +template +TEST_DEVICE_FUNC void test_grouped(T valueA = T{}, T valueB = T{1}) +{ + for (unsigned i = 2; i <= 32; ++i) + { + auto mask = cuda::device::lane_mask{make_low_mask(i)}; + if (threadIdx.x < i) + { + auto value = threadIdx.x % 2 == 0 ? valueA : valueB; + auto expected = cuda::device::lane_mask{make_stride_mask(i, 2, threadIdx.x % 2)}; + assert(cuda::device::warp_match_any(value, mask) == expected); + } + } +} + +TEST_DEVICE_FUNC void test() +{ + using array_t = cuda::std::array; + test_all_equal(); + test_all_equal(); + test_all_equal(); + test_all_equal(); +#if _CCCL_HAS_INT128() + test_all_equal<__uint128_t>(); +#endif // _CCCL_HAS_INT128() + test_all_equal(char3{0, 0, 0}); + test_all_equal(array_t{0, 0, 0, 0, 0, 0}); + + test_grouped(); + test_grouped(); + test_grouped(); + test_grouped(); +#if _CCCL_HAS_INT128() + test_grouped<__uint128_t>(); +#endif // _CCCL_HAS_INT128() + test_grouped(char3{0, 0, 0}, char3{1, 1, 1}); + test_grouped(array_t{0, 0, 0, 0, 0, 0}, array_t{1, 1, 1, 1, 1, 1}); +} + +int main(int, char**) +{ + NV_DISPATCH_TARGET(NV_IS_HOST, (cuda_thread_count = 32;), NV_IS_DEVICE, (test();)) + return 0; +}