scverse · Intron7 · Apr 9, 2026 · Apr 10, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -74,11 +74,16 @@ jobs:
               'name = "rapids-singlecell"',
               f'name = "rapids-singlecell-cu{cuda}"',
           )
-          # Rename matching extra to "rapids", remove the other
+          # Rename the matching CUDA extra to "rapids" (the other CUDA extra is removed below)
           text = text.replace(f'rapids-cu{cuda} =', 'rapids =')
-          # Remove the other CUDA extra line entirely
-          lines = text.splitlines(keepends=True)
-          text = "".join(l for l in lines if f'rapids-cu{other}' not in l)
+          # Remove the other CUDA extra (handles multi-line TOML arrays)
+          import re
+          text = re.sub(
+              rf'^rapids-cu{other}\s*=\s*\[.*?\]\s*\n',
+              '',
+              text,
+              flags=re.MULTILINE | re.DOTALL,
+          )
 
           # Set CUDA architectures (replace "native" with CI target archs)
           text = text.replace(
@@ -112,14 +117,23 @@ jobs:
           CIBW_ENVIRONMENT_PASS_LINUX: SETUPTOOLS_SCM_PRETEND_VERSION
           CIBW_ENVIRONMENT: >
             CUDA_PATH=/usr/local/cuda
-            LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
             PATH=/usr/local/cuda/bin:$PATH
           CIBW_BEFORE_BUILD: >
             python -m pip install -U pip
             scikit-build-core cmake ninja nanobind
+            librmm-cu${{ matrix.cuda_major }} &&
+            SITE=$(python -c "import sysconfig;print(sysconfig.get_path('purelib'))") &&
+            echo "[rsc-build] site-packages=$SITE" &&
+            echo "[rsc-build] librmm=$(ls $SITE/librmm/lib64/*.so 2>/dev/null)" &&
+            echo "[rsc-build] rapids_logger=$(ls $SITE/rapids_logger/lib64/*.so 2>/dev/null)" &&
+            ln -sf "$SITE/librmm/lib64/librmm.so" /usr/local/lib/librmm.so &&
+            ln -sf "$SITE/rapids_logger/lib64/librapids_logger.so" /usr/local/lib/librapids_logger.so &&
+            ldconfig &&
+            python -c "import librmm;print(librmm.__path__[0])" > /tmp/.librmm_dir &&
+            echo "[rsc-build] marker=$(cat /tmp/.librmm_dir)"
           CIBW_TEST_SKIP: "*"
           CIBW_TEST_COMMAND: ""
-          CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude libcublas.so.${{ matrix.cuda_major }} --exclude libcublasLt.so.${{ matrix.cuda_major }} --exclude libcudart.so.${{ matrix.cuda_major }} -w {dest_dir} {wheel}"
+          CIBW_REPAIR_WHEEL_COMMAND: "auditwheel repair --exclude libcublas.so.${{ matrix.cuda_major }} --exclude libcublasLt.so.${{ matrix.cuda_major }} --exclude libcudart.so.${{ matrix.cuda_major }} --exclude librmm.so --exclude librapids_logger.so -w {dest_dir} {wheel}"
           CIBW_BUILD_VERBOSITY: "1"
 
       - uses: actions/upload-artifact@v4

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -14,6 +14,47 @@ if (RSC_BUILD_EXTENSIONS)
   find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT})
   find_package(nanobind CONFIG REQUIRED)
   find_package(CUDAToolkit REQUIRED)
+
+  # Find librmm cmake config.
+  # Works with conda, pixi, uv, venv — uses env root to find site-packages.
+  # Priority: LIBRMM_DIR env var > CONDA_PREFIX > VIRTUAL_ENV > PIXI_PROJECT_ROOT > Python prefix.
+  set(_env_roots "")
+  if(DEFINED ENV{LIBRMM_DIR})
+    list(APPEND _env_roots "$ENV{LIBRMM_DIR}/..")
+  endif()
+  foreach(_var CONDA_PREFIX VIRTUAL_ENV PIXI_PROJECT_ROOT)
+    if(DEFINED ENV{${_var}})
+      list(APPEND _env_roots "$ENV{${_var}}")
+    endif()
+  endforeach()
+  # Fallback: Python prefix (works for any env manager)
+  execute_process(
+    COMMAND "${Python_EXECUTABLE}" -c "import sys; print(sys.prefix)"
+    OUTPUT_VARIABLE _py_prefix OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET)
+  if(_py_prefix)
+    list(APPEND _env_roots "${_py_prefix}")
+  endif()
+  # CI/cibuildwheel: CIBW_BEFORE_BUILD writes the librmm path to a marker file
+  if(EXISTS "/tmp/.librmm_dir")
+    file(READ "/tmp/.librmm_dir" _rmm_marker)
+    string(STRIP "${_rmm_marker}" _rmm_marker)
+    # Marker contains e.g. /opt/.../site-packages/librmm — find cmake dir + deps
+    file(GLOB _marker_hints "${_rmm_marker}/lib*/cmake"
+                            "${_rmm_marker}/../rapids_logger/lib*/cmake")
+    list(APPEND CMAKE_PREFIX_PATH ${_marker_hints})
+  endif()
+  foreach(_root ${_env_roots})
+    file(GLOB _hints "${_root}/lib/cmake/rmm"
+                     "${_root}/lib/python*/site-packages/librmm/lib*/cmake/rmm"
+                     "${_root}/lib/python*/site-packages/rapids_logger/lib*/cmake/rapids_logger")
+    foreach(_h ${_hints})
+      get_filename_component(_dir "${_h}" DIRECTORY)
+      list(APPEND CMAKE_PREFIX_PATH "${_dir}")
+    endforeach()
+  endforeach()
+  message(STATUS "rmm search roots: ${_env_roots}")
+  message(STATUS "rmm CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}")
+  find_package(rmm CONFIG REQUIRED)
   message(STATUS "Building for CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 else()
   message(STATUS "RSC_BUILD_EXTENSIONS=OFF -> skipping compiled extensions for docs")
@@ -84,7 +125,8 @@ if (RSC_BUILD_EXTENSIONS)
   add_nb_cuda_module(_edistance_cuda    src/rapids_singlecell/_cuda/edistance/edistance.cu)
   add_nb_cuda_module(_hvg_cuda          src/rapids_singlecell/_cuda/hvg/hvg.cu)
   add_nb_cuda_module(_kde_cuda          src/rapids_singlecell/_cuda/kde/kde.cu)
-  add_nb_cuda_module(_wilcoxon_cuda     src/rapids_singlecell/_cuda/wilcoxon/wilcoxon.cu)
+  add_nb_cuda_module(_wilcoxon_ovr_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_ovr.cu)
+  add_nb_cuda_module(_wilcoxon_ovo_cuda src/rapids_singlecell/_cuda/wilcoxon/wilcoxon_ovo.cu)
   # Harmony CUDA modules
   add_nb_cuda_module(_harmony_scatter_cuda   src/rapids_singlecell/_cuda/harmony/scatter/scatter.cu)
   add_nb_cuda_module(_harmony_outer_cuda     src/rapids_singlecell/_cuda/harmony/outer/outer.cu)
@@ -100,4 +142,8 @@ if (RSC_BUILD_EXTENSIONS)
   target_link_libraries(_harmony_correction_batched_cuda PRIVATE CUDA::cublas)
   # Wilcoxon binned histogram CUDA module
   add_nb_cuda_module(_wilcoxon_binned_cuda   src/rapids_singlecell/_cuda/wilcoxon_binned/wilcoxon_binned.cu)
+  if(rmm_FOUND)
+    target_link_libraries(_wilcoxon_ovr_cuda PRIVATE rmm::rmm)
+    target_link_libraries(_wilcoxon_ovo_cuda PRIVATE rmm::rmm)
+  endif()
 endif()
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,8 +32,22 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-rapids-cu13 = [ "cupy-cuda13x", "cudf-cu13>=25.10", "cuml-cu13>=25.10", "cugraph-cu13>=25.10", "cuvs-cu13>=25.10" ]
-rapids-cu12 = [ "cupy-cuda12x", "cudf-cu12>=25.10", "cuml-cu12>=25.10", "cugraph-cu12>=25.10", "cuvs-cu12>=25.10" ]
+rapids-cu13 = [
+    "cupy-cuda13x",
+    "librmm-cu13>=25.10",
+    "cudf-cu13>=25.10",
+    "cuml-cu13>=25.10",
+    "cugraph-cu13>=25.10",
+    "cuvs-cu13>=25.10",
+]
+rapids-cu12 = [
+    "cupy-cuda12x",
+    "librmm-cu12>=25.10",
+    "cudf-cu12>=25.10",
+    "cuml-cu12>=25.10",
+    "cugraph-cu12>=25.10",
+    "cuvs-cu12>=25.10",
+]
 
 doc = [
     "sphinx>=4.5.0",

diff --git a/src/rapids_singlecell/_cuda/__init__.py b/src/rapids_singlecell/_cuda/__init__.py
@@ -13,6 +13,18 @@
 
 import importlib
 
+# Pre-load librmm.so + deps so the dynamic linker can resolve them when our
+# nanobind extensions (which link rmm) are imported (same pattern as cuml, cuvs,
+# and other RAPIDS packages).  Best-effort: ImportError/OSError are ignored here.
+try:
+    import librmm
+
+    librmm.load_library()
+except (ImportError, OSError):
+    pass
+
+_RMM_MODULES = {"_wilcoxon_ovo_cuda", "_wilcoxon_ovr_cuda"}
+
 __all__ = [
     "_aggr_cuda",
     "_aucell_cuda",
@@ -44,7 +56,8 @@
     "_sparse2dense_cuda",
     "_spca_cuda",
     "_wilcoxon_binned_cuda",
-    "_wilcoxon_cuda",
+    "_wilcoxon_ovo_cuda",
+    "_wilcoxon_ovr_cuda",
 ]
 
 

diff --git a/src/rapids_singlecell/_cuda/nb_types.h b/src/rapids_singlecell/_cuda/nb_types.h
@@ -42,6 +42,13 @@ using gpu_array = nb::ndarray<T, Device>;
 template <typename T, typename Device, typename Contig>
 using gpu_array_contig = nb::ndarray<T, Device, Contig>;
 
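+// Host (NumPy) array aliases. NOTE(review): host_array is 1-D (nb::ndim<1>); host_array_2d has no ndim constraint — confirm intended.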
+template <typename T>
+using host_array = nb::ndarray<T, nb::numpy, nb::ndim<1>>;
+
+template <typename T>
+using host_array_2d = nb::ndarray<T, nb::numpy>;
+
 // Register bindings for both regular CUDA and managed-memory arrays.
 // Usage:
 //   template <typename Device>