From 53ab6963845c2b4e31e86d320b5ce145b45dba22 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 3 Jun 2026 13:24:24 -0700
Subject: [PATCH 1/8] implementation

---
 .../include/cuda/__warp/warp_match_any.h      | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 libcudacxx/include/cuda/__warp/warp_match_any.h
diff --git a/libcudacxx/include/cuda/__warp/warp_match_any.h b/libcudacxx/include/cuda/__warp/warp_match_any.h
new file mode 100644
index 00000000000..7c3366fbf38
--- /dev/null
+++ b/libcudacxx/include/cuda/__warp/warp_match_any.h
@@ -0,0 +1,84 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___WARP_WARP_MATCH_ANY_H
+#define _CUDA___WARP_WARP_MATCH_ANY_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_CUDA_COMPILATION()
+
+#  include <cuda/__cmath/ceil_div.h>
+#  include <cuda/__type_traits/is_bitwise_comparable.h>
+#  include <cuda/__type_traits/is_trivially_copyable.h>
+#  include <cuda/__warp/lane_mask.h>
+#  include <cuda/std/__cstring/memcpy.h>
+#  include <cuda/std/__memory/addressof.h>
+#  include <cuda/std/cstdint>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE
+
+extern "C" _CCCL_DEVICE void __cuda__match_any_sync_is_not_supported_before_SM_70__();
+
+//! @brief Returns the mask of lanes with the same bitwise value as the calling lane.
+//!
+//! @param[in] __data The data to compare across lanes.
+//! @param[in] __lane_mask The mask of participating lanes.
+//!
+//! @return A lane mask containing lanes in `__lane_mask` whose `__data` matches the calling lane's data.
+template <class _Tp>
+[[nodiscard]] _CCCL_DEVICE_API lane_mask
+warp_match_any(const _Tp& __data, const lane_mask __lane_mask = lane_mask::all()) noexcept
+{
+  static_assert(is_trivially_copyable_v<_Tp>, "data must be trivially copyable");
+  _CCCL_ASSERT(__lane_mask != lane_mask::none(), "lane_mask must be non-zero");
+
+  constexpr int __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(::cuda::std::uint32_t));
+  ::cuda::std::uint32_t __array[__ratio]{};
+
+#  if defined(_CCCL_BUILTIN_CLEAR_PADDING)
+  auto __data_copy = __data;
+  _CCCL_BUILTIN_CLEAR_PADDING(&__data_copy);
+  const auto __data_ptr = ::cuda::std::addressof(__data_copy);
+#  else // ^^^ _CCCL_BUILTIN_CLEAR_PADDING ^^^ / vvv !_CCCL_BUILTIN_CLEAR_PADDING vvv
+  static_assert(is_bitwise_comparable_v<_Tp>, "data must be bitwise comparable");
+  const auto __data_ptr = ::cuda::std::addressof(__data);
+#  endif // _CCCL_BUILTIN_CLEAR_PADDING
+  ::cuda::std::memcpy(__array, __data_ptr, sizeof(_Tp));
+
+  lane_mask __ret = __lane_mask;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (int i = 0; i < __ratio; ++i)
+  {
+    ::cuda::std::uint32_t __match_any_result = 0;
+    NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
+                      (__match_any_result = ::__match_any_sync(__lane_mask.value(), __array[i]);),
+                      (::cuda::device::__cuda__match_any_sync_is_not_supported_before_SM_70__();));
+    __ret &= lane_mask{__match_any_result};
+  }
+  return __ret;
+}
+
+_CCCL_END_NAMESPACE_CUDA_DEVICE
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_CUDA_COMPILATION()
+#endif // _CUDA___WARP_WARP_MATCH_ANY_H

From a5064cfcddffc66374121efbd1241575b9ce5b88 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 3 Jun 2026 13:24:38 -0700
Subject: [PATCH 2/8] header

---
 libcudacxx/include/cuda/warp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libcudacxx/include/cuda/warp b/libcudacxx/include/cuda/warp
index e2d6dd95e0f..da440289374 100644
--- a/libcudacxx/include/cuda/warp
+++ b/libcudacxx/include/cuda/warp
@@ -23,6 +23,7 @@
 
 #include <cuda/__warp/lane_mask.h>
 #include <cuda/__warp/warp_match_all.h>
+#include <cuda/__warp/warp_match_any.h>
 #include <cuda/__warp/warp_shuffle.h>
 
 #endif // _CUDA_WARP

From efeaa27531429a3d8ea663e63b710d202c659934 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 3 Jun 2026 13:24:49 -0700
Subject: [PATCH 3/8] unit test

---
 .../cuda/warp/warp_match_any.pass.cpp         | 97 +++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp

diff --git a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
new file mode 100644
index 00000000000..2a48524fa78
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
@@ -0,0 +1,97 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: pre-sm-70
+
+// UNSUPPORTED: enable-tile
+// error: asm statement is unsupported in tile code
+
+#include <cuda/std/array>
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+#include <cuda/warp>
+
+#include "test_macros.h"
+
+TEST_DEVICE_FUNC uint32_t make_low_mask(unsigned count)
+{
+  return count == 32 ? 0xFFFFFFFF : ((1u << count) - 1);
+}
+
+TEST_DEVICE_FUNC uint32_t make_stride_mask(unsigned count, unsigned step, unsigned remainder)
+{
+  uint32_t mask = 0;
+  for (unsigned lane = 0; lane < count; ++lane)
+  {
+    if ((lane % step) == remainder)
+    {
+      mask |= uint32_t{1} << lane;
+    }
+  }
+  return mask;
+}
+
+template <typename T>
+TEST_DEVICE_FUNC void test_all_equal(T value = T{})
+{
+  for (unsigned i = 1; i <= 32; ++i)
+  {
+    auto mask = cuda::device::lane_mask{make_low_mask(i)};
+    if (threadIdx.x < i)
+    {
+      assert(cuda::device::warp_match_any(value, mask) == mask);
+    }
+  }
+}
+
+// two different groups of lanes
+template <typename T>
+TEST_DEVICE_FUNC void test_grouped(T valueA = T{}, T valueB = T{1})
+{
+  for (unsigned i = 2; i <= 32; ++i)
+  {
+    auto mask = cuda::device::lane_mask{make_low_mask(i)};
+    if (threadIdx.x < i)
+    {
+      auto value    = threadIdx.x % 2 == 0 ? valueA : valueB;
+      auto expected = cuda::device::lane_mask{make_stride_mask(i, 2, threadIdx.x % 2)};
+      assert(cuda::device::warp_match_any(value, mask) == expected);
+    }
+  }
+}
+
+__global__ void test_kernel()
+{
+  using array_t = cuda::std::array<char, 6>;
+
+  test_all_equal<uint8_t>();
+  test_all_equal<uint16_t>();
+  test_all_equal<uint32_t>();
+  test_all_equal<uint64_t>();
+#if _CCCL_HAS_INT128()
+  test_all_equal<__uint128_t>();
+#endif
+  test_all_equal(char3{0, 0, 0});
+  test_all_equal(array_t{0, 0, 0, 0, 0, 0});
+
+  test_grouped<uint8_t>();
+  test_grouped<uint16_t>();
+  test_grouped<uint32_t>();
+  test_grouped<uint64_t>();
+#if _CCCL_HAS_INT128()
+  test_grouped<__uint128_t>();
+#endif
+  test_grouped(char3{0, 0, 0}, char3{1, 1, 1});
+  test_grouped(array_t{0, 0, 0, 0, 0, 0}, array_t{1, 1, 1, 1, 1, 1});
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, (test_kernel<<<1, 32>>>();))
+  return 0;
+}

From d5c0bdb9f1c7eff90e54b1a8df3e61e49abc6c46 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Wed, 3 Jun 2026 13:24:58 -0700
Subject: [PATCH 4/8] documentation

---
 docs/libcudacxx/extended_api/warp.rst         |  6 ++
 .../extended_api/warp/warp_match_all.rst      | 18 ++--
 .../extended_api/warp/warp_match_any.rst      | 98 +++++++++++++++++++
 3 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 docs/libcudacxx/extended_api/warp/warp_match_any.rst

diff --git a/docs/libcudacxx/extended_api/warp.rst b/docs/libcudacxx/extended_api/warp.rst
index 08df953de89..c60e13b076b 100644
--- a/docs/libcudacxx/extended_api/warp.rst
+++ b/docs/libcudacxx/extended_api/warp.rst
@@ -9,6 +9,7 @@ Warp
 
    warp/warp_shuffle
    warp/warp_match_all
+   warp/warp_match_any
    warp/lane_mask
 
 .. list-table::
@@ -45,6 +46,11 @@ Warp
      - CCCL 3.1.0
      - CUDA 13.1
 
+   * - :ref:`warp_match_any <libcudacxx-extended-api-warp-warp-match-any>`
+     - Get the mask of lanes with the same value
+     - CCCL 3.5.0
+     - CUDA 13.5
+
    * - :ref:`lane_mask <libcudacxx-extended-api-warp-lane-mask>`
      - Class to represent a mask of lanes in a warp
      - CCCL 3.1.0
diff --git a/docs/libcudacxx/extended_api/warp/warp_match_all.rst b/docs/libcudacxx/extended_api/warp/warp_match_all.rst
index 596504d674b..50fe3314f0b 100644
--- a/docs/libcudacxx/extended_api/warp/warp_match_all.rst
+++ b/docs/libcudacxx/extended_api/warp/warp_match_all.rst
@@ -18,6 +18,10 @@ Defined in ``<cuda/warp>`` header.
 The functionality provides a generalized and safe alternative to CUDA warp match all intrinsic ``__match_all_sync``.
 The function allows bitwise comparison of any data size, including raw arrays, pointers, and structs.
 
+.. note::
+
+  The underlying CUDA intrinsic does not provide memory ordering.
+
 **Parameters**
 
 - ``data``: data to compare.
@@ -30,7 +34,7 @@ The function allows bitwise comparison of any data size, including raw arrays, p
 **Constraints**
 
 - ``T`` shall be trivially copyable, see :ref:`cuda::is_trivially_copyable <libcudacxx-extended-api-type_traits-is_trivially_copyable>`.
-- When ``__builtin_clear_padding`` is not supported, ``T`` shall have no padding bits, that is, ``T``'s value representation shall be identical to its object representation.
+- ``T`` shall be bitwise comparable, see :ref:`cuda::is_bitwise_comparable <libcudacxx-extended-api-type_traits-is_bitwise_comparable>`, except when ``__builtin_clear_padding`` is supported. In the latter case, ``T`` can have padding bits.
 
 **Preconditions**
 
@@ -39,17 +43,17 @@ The function allows bitwise comparison of any data size, including raw arrays, p
 
 **Undefined Behavior**
 
-- ``lane_mask`` must represent a subset of the active lanes, undefined behavior otherwise.
+- ``lane_mask`` must represent a subset of the active lanes.
+- All non-exited lanes specified by ``lane_mask`` must execute the function with the same ``lane_mask`` value.
 
 **Performance considerations**
 
 - The function calls the PTX instruction ``match.sync`` :math:`ceil\left(\frac{sizeof(data)}{4}\right)` times.
-- The function is slightly faster when called with a mask of all active lanes (overload function) even if all lanes participates in the call.
-- The function is slower when called with a non-fully active warp.
+- The function is faster when called with a mask representing all active lanes in a warp (default value of the second parameter ``lane_mask``).
 
 **References**
 
-- `CUDA match_all Intrinsics <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-match-functions>`_
+- `CUDA match_all Intrinsics <https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/cpp-language-extensions.html#warp-match-functions>`_
 - `PTX match.sync instruction <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync>`_
 
 Example
@@ -69,7 +73,7 @@ Example
     __global__ void warp_match_kernel() {
         assert(cuda::device::warp_match_all(2));
         assert(cuda::device::warp_match_all(2, cuda::device::lane_mask::all()));
-        assert(cuda::device::warp_match_all(MyStruct{1.0, 3})); // Undefined Behavior
+        assert(cuda::device::warp_match_all(MyStruct{1.0, 3})); // compile error, except when __builtin_clear_padding is supported
         assert(!cuda::device::warp_match_all(threadIdx.x));
     }
 
@@ -79,4 +83,4 @@ Example
         return 0;
     }
 
-`See it on Godbolt 🔗 <https://godbolt.org/z/Eq81fTb8z>`_
+`See it on Godbolt 🔗 <https://godbolt.org/z/x1sWbx14r>`_
diff --git a/docs/libcudacxx/extended_api/warp/warp_match_any.rst b/docs/libcudacxx/extended_api/warp/warp_match_any.rst
new file mode 100644
index 00000000000..e321afa157c
--- /dev/null
+++ b/docs/libcudacxx/extended_api/warp/warp_match_any.rst
@@ -0,0 +1,98 @@
+.. _libcudacxx-extended-api-warp-warp-match-any:
+
+``cuda::device::warp_match_any``
+================================
+
+Defined in ``<cuda/warp>`` header.
+
+.. code:: cuda
+
+    namespace cuda::device {
+
+    template <typename T>
+    [[nodiscard]] __device__ lane_mask
+    warp_match_any(const T& data, lane_mask = lane_mask::all());
+
+    } // namespace cuda::device
+
+The functionality provides a generalized and safe alternative to CUDA warp match any intrinsic ``__match_any_sync``.
+The function allows bitwise comparison of any data size, including raw arrays, pointers, and structs.
+
+.. note::
+
+  The underlying CUDA intrinsic does not provide memory ordering.
+  
+**Parameters**
+
+- ``data``: data to compare.
+- ``lane_mask``: mask of the active lanes.
+
+**Return value**
+
+- A ``lane_mask`` representing the non-exited lanes in ``lane_mask`` that have the same bitwise value for ``data`` as  the calling lane.
+
+**Constraints**
+
+- ``T`` shall be trivially copyable, see :ref:`cuda::is_trivially_copyable <libcudacxx-extended-api-type_traits-is_trivially_copyable>`.
+- When ``__builtin_clear_padding`` is not supported, ``T`` shall have no padding bits, that is, ``T``'s value representation shall be identical to its object representation.
+
+**Preconditions**
+
+- The functionality is only supported on ``SM >= 70``.
+- ``lane_mask`` must be non-zero.
+
+**Undefined Behavior**
+
+- ``lane_mask`` must represent a subset of the active lanes.
+- All non-exited lanes specified by ``lane_mask`` must execute the function with the same ``lane_mask`` value.
+
+**Performance considerations**
+
+- The function calls the PTX instruction ``match.sync`` :math:`ceil\left(\frac{sizeof(data)}{4}\right)` times.
+- The function is faster when called with a mask representing all active lanes in a warp (default value of the second parameter ``lane_mask``).
+
+**References**
+
+- `CUDA match_any Intrinsics <https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/cpp-language-extensions.html#warp-match-functions>`_
+- `PTX match.sync instruction <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync>`_
+
+Example
+-------
+
+.. code:: cuda
+
+    #include <cuda/std/array>
+    #include <cuda/std/cassert>
+    #include <cuda/warp>
+
+    struct MyStruct {
+        double x; // 8 bytes
+        int    y; // 4 bytes
+    };            // 4 bytes of padding
+
+    __global__ void warp_match_kernel() {
+        {
+            auto mask     = cuda::device::warp_match_any(threadIdx.x / 4);
+            auto expected = cuda::device::lane_mask{0b1111 << ((threadIdx.x / 4) * 4)};
+            assert(mask == expected);
+        }
+        {
+            auto mask     = cuda::device::warp_match_any(2);
+            auto expected = cuda::device::lane_mask{0xFFFFFFFF};
+            assert(mask == expected);
+        }
+        {
+            // compile error, except when __builtin_clear_padding is supported
+            auto mask     = cuda::device::warp_match_any(MyStruct{1.0, 3});
+            auto expected = cuda::device::lane_mask{0xFFFFFFFF};
+            assert(mask == expected);
+        }
+    }
+
+    int main() {
+        warp_match_kernel<<<1, 32>>>();
+        cudaDeviceSynchronize();
+        return 0;
+    }
+
+`See it on Godbolt 🔗 <https://godbolt.org/z/Ys1McG8nv>`_

From feb923ed853f75e286a72892bf64f81d35c249ad Mon Sep 17 00:00:00 2001
From: Federico Busato <50413820+fbusato@users.noreply.github.com>
Date: Thu, 4 Jun 2026 09:24:31 -0700
Subject: [PATCH 5/8] Update
 libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp

Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com>
---
 libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
index 2a48524fa78..4402bd7a4ec 100644
--- a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
@@ -75,7 +75,7 @@ __global__ void test_kernel()
   test_all_equal<uint64_t>();
 #if _CCCL_HAS_INT128()
   test_all_equal<__uint128_t>();
-#endif
+#endif // _CCCL_HAS_INT128()
   test_all_equal(char3{0, 0, 0});
   test_all_equal(array_t{0, 0, 0, 0, 0, 0});
 

From c0fe5cf51468bf7bf3328aff5dcdef5e688253b7 Mon Sep 17 00:00:00 2001
From: Federico Busato <50413820+fbusato@users.noreply.github.com>
Date: Thu, 4 Jun 2026 09:24:40 -0700
Subject: [PATCH 6/8] Update
 libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp

Co-authored-by: Michael Schellenberger Costa <miscco@nvidia.com>
---
 libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
index 4402bd7a4ec..e9c1671ee2f 100644
--- a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
@@ -85,7 +85,7 @@ __global__ void test_kernel()
   test_grouped<uint64_t>();
 #if _CCCL_HAS_INT128()
   test_grouped<__uint128_t>();
-#endif
+#endif // _CCCL_HAS_INT128() 
   test_grouped(char3{0, 0, 0}, char3{1, 1, 1});
   test_grouped(array_t{0, 0, 0, 0, 0, 0}, array_t{1, 1, 1, 1, 1, 1});
 }

From faa21ec1fa9bf8b3e8f9e502b6cff0585d203be6 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Thu, 4 Jun 2026 09:35:15 -0700
Subject: [PATCH 7/8] formatting

---
 docs/libcudacxx/extended_api/warp/warp_match_any.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/libcudacxx/extended_api/warp/warp_match_any.rst b/docs/libcudacxx/extended_api/warp/warp_match_any.rst
index e321afa157c..7beee65db5b 100644
--- a/docs/libcudacxx/extended_api/warp/warp_match_any.rst
+++ b/docs/libcudacxx/extended_api/warp/warp_match_any.rst
@@ -21,7 +21,7 @@ The function allows bitwise comparison of any data size, including raw arrays, p
 .. note::
 
   The underlying CUDA intrinsic does not provide memory ordering.
-  
+
 **Parameters**
 
 - ``data``: data to compare.

From de51be0a903576c5b25d8c1cbfce8c8deeaf8147 Mon Sep 17 00:00:00 2001
From: fbusato <fbusato@nvidia.com>
Date: Fri, 5 Jun 2026 12:09:50 -0700
Subject: [PATCH 8/8] avoid launching a kernel

---
 .../test/libcudacxx/cuda/warp/warp_match_any.pass.cpp      | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
index e9c1671ee2f..1caece548b2 100644
--- a/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
@@ -65,10 +65,9 @@ TEST_DEVICE_FUNC void test_grouped(T valueA = T{}, T valueB = T{1})
   }
 }
 
-__global__ void test_kernel()
+TEST_DEVICE_FUNC void test()
 {
   using array_t = cuda::std::array<char, 6>;
-
   test_all_equal<uint8_t>();
   test_all_equal<uint16_t>();
   test_all_equal<uint32_t>();
@@ -85,13 +84,13 @@ __global__ void test_kernel()
   test_grouped<uint64_t>();
 #if _CCCL_HAS_INT128()
   test_grouped<__uint128_t>();
-#endif // _CCCL_HAS_INT128() 
+#endif // _CCCL_HAS_INT128()
   test_grouped(char3{0, 0, 0}, char3{1, 1, 1});
   test_grouped(array_t{0, 0, 0, 0, 0, 0}, array_t{1, 1, 1, 1, 1, 1});
 }
 
 int main(int, char**)
 {
-  NV_IF_TARGET(NV_IS_HOST, (test_kernel<<<1, 32>>>();))
+  NV_DISPATCH_TARGET(NV_IS_HOST, (cuda_thread_count = 32;), NV_IS_DEVICE, (test();))
   return 0;
 }