Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,16 @@ jobs:
'name = "rapids-singlecell"',
f'name = "rapids-singlecell-cu{cuda}"',
)
# Rename matching extra to "rapids", remove the other
# Rename matching extra to "rapids", remove the other CUDA extra
text = text.replace(f'rapids-cu{cuda} =', 'rapids =')
# Remove the other CUDA extra line entirely
lines = text.splitlines(keepends=True)
text = "".join(l for l in lines if f'rapids-cu{other}' not in l)
# Remove the other CUDA extra (handles multi-line TOML arrays)
import re
text = re.sub(
rf'^rapids-cu{other}\s*=\s*\[.*?\]\s*\n',
'',
text,
flags=re.MULTILINE | re.DOTALL,
)

# Set CUDA architectures (replace "native" with CI target archs)
text = text.replace(
Expand Down Expand Up @@ -112,14 +117,31 @@ jobs:
CIBW_ENVIRONMENT_PASS_LINUX: SETUPTOOLS_SCM_PRETEND_VERSION
CIBW_ENVIRONMENT: >
CUDA_PATH=/usr/local/cuda
LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
PATH=/usr/local/cuda/bin:$PATH
CIBW_BEFORE_BUILD: >
python -m pip install -U pip
scikit-build-core cmake ninja nanobind
librmm-cu${{ matrix.cuda_major }} &&
RMM_ROOT=$(python -c "import librmm;print(librmm.__path__[0])") &&
LOG_ROOT=$(python -c "import rapids_logger;print(rapids_logger.__path__[0])") &&
echo "[rsc-build] librmm=$RMM_ROOT" &&
echo "[rsc-build] rapids_logger=$LOG_ROOT" &&
ln -sf "$RMM_ROOT/lib64/librmm.so" /usr/local/lib/librmm.so &&
ln -sf "$LOG_ROOT/lib64/librapids_logger.so" /usr/local/lib/librapids_logger.so &&
ldconfig &&
python -c "import librmm;print(librmm.__path__[0])" > /tmp/.librmm_dir &&
echo "[rsc-build] marker=$(cat /tmp/.librmm_dir)"
CIBW_TEST_SKIP: "*"
CIBW_TEST_COMMAND: ""
CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude libcublas.so.${{ matrix.cuda_major }} --exclude libcublasLt.so.${{ matrix.cuda_major }} --exclude libcudart.so.${{ matrix.cuda_major }} -w {dest_dir} {wheel}"
CIBW_REPAIR_WHEEL_COMMAND: >
auditwheel repair
--exclude libcublas.so.${{ matrix.cuda_major }}
--exclude libcublasLt.so.${{ matrix.cuda_major }}
--exclude libcudart.so.${{ matrix.cuda_major }}
--exclude librmm.so
--exclude librapids_logger.so
-w {dest_dir} {wheel}
&& pipx run abi3audit --strict --report {wheel}
CIBW_BUILD_VERBOSITY: "1"

- uses: actions/upload-artifact@v4
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ __pycache__/
/data/
test-data/
.vscode/
.codex

# Distribution / packaging
/dist/
Expand Down Expand Up @@ -50,3 +51,4 @@ CLAUDE.md

# tmp_scripts
tmp_scripts/
benchmarks/
80 changes: 79 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,79 @@ if (RSC_BUILD_EXTENSIONS)
find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})
find_package(nanobind CONFIG REQUIRED)
find_package(CUDAToolkit REQUIRED)

# Find librmm cmake config.
# Searches all plausible locations: env vars, Python prefix, base_prefix
# (survives build isolation), and CI marker files. Works with conda, pip,
# pixi, uv, hatch, and cibuildwheel.
# NOTE: all probing below runs at configure time via execute_process /
# $ENV{} reads — build-time environment changes are not picked up.
set(_env_roots "")
# Explicit override: LIBRMM_DIR points directly at the librmm package root;
# the sibling rapids_logger package is probed alongside it because rmm's
# cmake config requires rapids_logger to be findable as well.
if(DEFINED ENV{LIBRMM_DIR})
file(GLOB _librmm_hints "$ENV{LIBRMM_DIR}/lib*/cmake"
"$ENV{LIBRMM_DIR}/../rapids_logger/lib*/cmake")
list(APPEND CMAKE_PREFIX_PATH ${_librmm_hints})
endif()
# Environment managers (conda, venv, pixi) — roots searched later below
foreach(_var CONDA_PREFIX VIRTUAL_ENV PIXI_PROJECT_ROOT)
if(DEFINED ENV{${_var}})
list(APPEND _env_roots "$ENV{${_var}}")
endif()
endforeach()
# Python prefix, base_prefix, and real executable's env root
foreach(_attr prefix base_prefix)
execute_process(
COMMAND "${Python_EXECUTABLE}" -c "import sys; print(sys.${_attr})"
OUTPUT_VARIABLE _pp OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
if(_pp)
list(APPEND _env_roots "${_pp}")
endif()
endforeach()
# Resolve symlinks to find the real Python env (works through build isolation)
execute_process(
COMMAND "${Python_EXECUTABLE}" -c
"import sys,pathlib; print(pathlib.Path(sys.executable).resolve().parents[1])"
OUTPUT_VARIABLE _real_prefix OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
if(_real_prefix)
list(APPEND _env_roots "${_real_prefix}")
endif()
# Direct site-packages search — the most reliable way to find pip-installed
# librmm regardless of venv nesting depth
# (site.getsitepackages() already emits a ';'-joined string, which CMake
# treats as a list)
execute_process(
COMMAND "${Python_EXECUTABLE}" -c
"import site; print(';'.join(site.getsitepackages()))"
OUTPUT_VARIABLE _site_paths OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
if(_site_paths)
foreach(_sp ${_site_paths})
file(GLOB _sp_hints "${_sp}/librmm/lib*/cmake"
"${_sp}/rapids_logger/lib*/cmake")
foreach(_h ${_sp_hints})
get_filename_component(_dir "${_h}" DIRECTORY)
list(APPEND CMAKE_PREFIX_PATH "${_dir}")
endforeach()
endforeach()
endif()
# CI/cibuildwheel: CIBW_BEFORE_BUILD writes the librmm path to a marker file
# (see .github/workflows/publish.yml). The fixed /tmp path is Linux-only,
# which matches the manylinux containers cibuildwheel runs in.
if(EXISTS "/tmp/.librmm_dir")
file(READ "/tmp/.librmm_dir" _rmm_marker)
string(STRIP "${_rmm_marker}" _rmm_marker)
file(GLOB _marker_hints "${_rmm_marker}/lib*/cmake"
"${_rmm_marker}/../rapids_logger/lib*/cmake")
list(APPEND CMAKE_PREFIX_PATH ${_marker_hints})
endif()
# Search each root for rmm + rapids_logger cmake configs; globs cover both
# conda-style (<root>/lib/cmake/rmm) and pip-wheel layouts (inside
# site-packages/librmm and site-packages/rapids_logger).
list(REMOVE_DUPLICATES _env_roots)
foreach(_root ${_env_roots})
file(GLOB _hints "${_root}/lib/cmake/rmm"
"${_root}/lib/python*/site-packages/librmm/lib*/cmake/rmm"
"${_root}/lib/python*/site-packages/rapids_logger/lib*/cmake/rapids_logger")
foreach(_h ${_hints})
get_filename_component(_dir "${_h}" DIRECTORY)
list(APPEND CMAKE_PREFIX_PATH "${_dir}")
endforeach()
endforeach()
message(STATUS "rmm search roots: ${_env_roots}")
message(STATUS "rmm CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}")
# REQUIRED: configuration fails here if no rmm config was found in any of
# the paths collected above, so rmm_FOUND is guaranteed TRUE past this line.
find_package(rmm CONFIG REQUIRED)
message(STATUS "Building for CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(STATUS "RSC_BUILD_EXTENSIONS=OFF -> skipping compiled extensions for docs")
Expand Down Expand Up @@ -84,7 +157,8 @@ if (RSC_BUILD_EXTENSIONS)
add_nb_cuda_module(_edistance_cuda src/rapids_singlecell/_cuda/edistance/edistance.cu)
add_nb_cuda_module(_hvg_cuda src/rapids_singlecell/_cuda/hvg/hvg.cu)
add_nb_cuda_module(_kde_cuda src/rapids_singlecell/_cuda/kde/kde.cu)
add_nb_cuda_module(_wilcoxon_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu)
add_nb_cuda_module(_wilcoxon_ovr_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_ovr.cu)
add_nb_cuda_module(_wilcoxon_ovo_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_ovo.cu)
# Harmony CUDA modules
add_nb_cuda_module(_harmony_scatter_cuda src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
add_nb_cuda_module(_harmony_outer_cuda src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
Expand All @@ -100,4 +174,8 @@ if (RSC_BUILD_EXTENSIONS)
target_link_libraries(_harmony_correction_batched_cuda PRIVATE CUDA::cublas)
# Wilcoxon binned histogram CUDA module
add_nb_cuda_module(_wilcoxon_binned_cuda src/rapids_singlecell/_cuda/wilcoxon_binned/wilcoxon_binned.cu)
# Link the Wilcoxon OVR/OVO modules against librmm (RAPIDS Memory Manager).
# find_package(rmm CONFIG REQUIRED) above guarantees rmm_FOUND when
# extensions are being built; the guard is purely defensive.
if(rmm_FOUND)
target_link_libraries(_wilcoxon_ovr_cuda PRIVATE rmm::rmm)
target_link_libraries(_wilcoxon_ovo_cuda PRIVATE rmm::rmm)
endif()
endif()
22 changes: 20 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ requires = [
"scikit-build-core>=0.10",
"nanobind>=2.0.0",
"setuptools-scm>=8",
# librmm headers are needed at build time for the Wilcoxon CUDA kernels.
# The headers are identical across cu12/cu13; the runtime .so is loaded
# from the installed RAPIDS package.
"librmm-cu12>=25.10",
]
build-backend = "scikit_build_core.build"

Expand Down Expand Up @@ -32,8 +36,22 @@ dependencies = [
]

[project.optional-dependencies]
rapids-cu13 = [ "cupy-cuda13x", "cudf-cu13>=25.10", "cuml-cu13>=25.10", "cugraph-cu13>=25.10", "cuvs-cu13>=25.10" ]
rapids-cu12 = [ "cupy-cuda12x", "cudf-cu12>=25.10", "cuml-cu12>=25.10", "cugraph-cu12>=25.10", "cuvs-cu12>=25.10" ]
rapids-cu13 = [
"cupy-cuda13x",
"librmm-cu13>=25.10",
"cudf-cu13>=25.10",
"cuml-cu13>=25.10",
"cugraph-cu13>=25.10",
"cuvs-cu13>=25.10",
]
rapids-cu12 = [
"cupy-cuda12x",
"librmm-cu12>=25.10",
"cudf-cu12>=25.10",
"cuml-cu12>=25.10",
"cugraph-cu12>=25.10",
"cuvs-cu12>=25.10",
]

doc = [
"sphinx>=4.5.0",
Expand Down
13 changes: 12 additions & 1 deletion src/rapids_singlecell/_cuda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@

import importlib

# Best-effort preload of librmm's shared library (and its dependencies) so
# the dynamic linker can resolve them when the nanobind extensions that
# link against rmm are imported.  Mirrors the pattern used by other RAPIDS
# packages (cuml, cuvs).  A missing librmm package (ImportError) or a
# failed dlopen (OSError) is tolerated: the affected extensions simply
# fail to import later, on their own.
try:
    librmm = importlib.import_module("librmm")
    librmm.load_library()
except (ImportError, OSError):
    pass

__all__ = [
"_aggr_cuda",
"_aucell_cuda",
Expand Down Expand Up @@ -44,7 +54,8 @@
"_sparse2dense_cuda",
"_spca_cuda",
"_wilcoxon_binned_cuda",
"_wilcoxon_cuda",
"_wilcoxon_ovo_cuda",
"_wilcoxon_ovr_cuda",
]


Expand Down
2 changes: 1 addition & 1 deletion src/rapids_singlecell/_cuda/cooc/kernels_cooc.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ __global__ void occur_count_kernel_csr_catpairs_tiled(
// Binary search for threshold bin
int lo = 0, hi = l_val;
while (lo < hi) {
int mid = (lo + hi) >> 1;
int mid = lo + ((hi - lo) >> 1);
if (dist_sq <= thresholds[mid])
hi = mid;
else
Expand Down
7 changes: 7 additions & 0 deletions src/rapids_singlecell/_cuda/nb_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ using gpu_array = nb::ndarray<T, Device>;
template <typename T, typename Device, typename Contig>
using gpu_array_contig = nb::ndarray<T, Device, Contig>;

// Host (NumPy) array aliases
template <typename T>
using host_array = nb::ndarray<T, nb::numpy, nb::ndim<1>>;

template <typename T>
using host_array_2d = nb::ndarray<T, nb::numpy, nb::ndim<2>>;

// Register bindings for both regular CUDA and managed-memory arrays.
// Usage:
// template <typename Device>
Expand Down
Loading
Loading