Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ if (RSC_BUILD_EXTENSIONS)
# CUDA modules
add_nb_cuda_module(_mean_var_cuda src/rapids_singlecell/_cuda/mean_var/mean_var.cu)
add_nb_cuda_module(_sparse2dense_cuda src/rapids_singlecell/_cuda/sparse2dense/sparse2dense.cu)
add_nb_cuda_module(_jaccard_cuda src/rapids_singlecell/_cuda/jaccard/jaccard.cu)
add_nb_cuda_module(_scale_cuda src/rapids_singlecell/_cuda/scale/scale.cu)
add_nb_cuda_module(_qc_cuda src/rapids_singlecell/_cuda/qc/qc.cu)
add_nb_cuda_module(_qc_dask_cuda src/rapids_singlecell/_cuda/qc_dask/qc_kernels_dask.cu)
Expand Down
4 changes: 4 additions & 0 deletions docs/release-notes/0.16.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
* Batched cuSOLVER precision-Cholesky for the full-covariance GMM (`gmm_fit_predict`), ~2-3x faster {pr}`688` {smaller}`S Dicks`
* {class}`~rapids_singlecell.ptg.Distance` with ``metric="edistance"`` now accepts sparse CSR input (a sparse layer or ``layer_key="X"``), densified inside the CUDA kernel so the dense matrix is never materialized on the GPU {pr}`689` {smaller}`S Dicks`

```{rubric} Bug fixes
```
* {func}`~rapids_singlecell.pp.neighbors` with ``method="jaccard"`` no longer crashes with ``CUDA_ERROR_ILLEGAL_ADDRESS`` on large datasets where ``n_obs * (n_neighbors - 1)**2`` exceeds ``2**31`` {pr}`700` {smaller}`S Dicks`

```{rubric} Misc
```
* Scanpy compat: {func}`~rapids_singlecell.pp.scale`, {func}`~rapids_singlecell.pp.log1p` and {func}`~rapids_singlecell.pp.pca` rename their first argument from ``adata`` to ``data`` and now accept a matrix in addition to an {class}`~anndata.AnnData`, matching Scanpy {pr}`699` {smaller}`S Dicks`
Expand Down
1 change: 1 addition & 0 deletions src/rapids_singlecell/_cuda/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
"_harmony_pen_cuda",
"_harmony_scatter_cuda",
"_hvg_cuda",
"_jaccard_cuda",
"_kde_cuda",
"_ligrec_cuda",
"_mean_var_cuda",
Expand Down
37 changes: 37 additions & 0 deletions src/rapids_singlecell/_cuda/jaccard/jaccard.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#include <cuda_runtime.h>
#include "../nb_types.h"

#include "kernels_jaccard.cuh"

using namespace nb::literals;

constexpr int JACCARD_BLOCK_SIZE = 256;

static inline void launch_jaccard_shared_counts(const int* knn, int n_obs,
int k, float* jaccard_vals,
cudaStream_t stream) {
const long long n_edges = (long long)n_obs * k;
if (n_edges == 0) return;
unsigned grid = strided_grid(n_edges, JACCARD_BLOCK_SIZE);
jaccard_shared_counts_kernel<<<grid, JACCARD_BLOCK_SIZE, 0, stream>>>(
knn, n_obs, k, jaccard_vals);
CUDA_CHECK_LAST_ERROR(jaccard_shared_counts_kernel);
}
Comment thread
Intron7 marked this conversation as resolved.

template <typename Device>
void register_bindings(nb::module_& m) {
m.def(
"jaccard_shared_counts",
[](gpu_array_c<const int, Device> knn, int n_obs, int k,
gpu_array_c<float, Device> jaccard_vals, std::uintptr_t stream) {
launch_jaccard_shared_counts(knn.data(), n_obs, k,
jaccard_vals.data(),
(cudaStream_t)stream);
},
"knn"_a, nb::kw_only(), "n_obs"_a, "k"_a, "jaccard_vals"_a,
"stream"_a = 0);
}

NB_MODULE(_jaccard_cuda, m) {
REGISTER_GPU_BINDINGS(register_bindings, m);
}
36 changes: 36 additions & 0 deletions src/rapids_singlecell/_cuda/jaccard/kernels_jaccard.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#pragma once

#include <cuda_runtime.h>

// One thread per KNN entry (i, slot): edge = i*k + slot, j = knn[edge].
// Writes |N(i) & N(j)| (shared-neighbor count) for each directed KNN edge.
// The self entry (j == i) and any out-of-range index write 0 and are dropped
// downstream. Inner loops skip column 0 so the count matches the exclude-self
// definition. All row offsets are computed in 64-bit (overflow-proof for any
// n_obs); the grid-stride loop covers n_edges beyond one launch.
__global__ void jaccard_shared_counts_kernel(const int* __restrict__ knn,
int n_obs, int k,
float* __restrict__ jaccard_vals) {
const long long n_edges = (long long)n_obs * k;
const long long stride = (long long)blockDim.x * gridDim.x;
for (long long edge = (long long)blockIdx.x * blockDim.x + threadIdx.x;
edge < n_edges; edge += stride) {
const long long i = edge / k;
const int j = knn[edge];
if (j == (int)i || j < 0 || j >= n_obs) {
jaccard_vals[edge] = 0.0f;
continue;
}
const int* Ni = knn + i * (long long)k;
const int* Nj = knn + (long long)j * k;
int c = 0;
for (int a = 1; a < k; ++a) {
const int va = Ni[a];
for (int b = 1; b < k; ++b) {
c += (va == Nj[b]);
}
}
jaccard_vals[edge] = (float)c / (2 * (k - 1) - c);
;
Comment thread
Intron7 marked this conversation as resolved.
Outdated
}
}
32 changes: 15 additions & 17 deletions src/rapids_singlecell/preprocessing/_neighbors/_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,25 +231,23 @@ def _get_connectivities_jaccard(
-------
Symmetric CSR connectivity matrix.
"""
k_no_self = n_neighbors - 1

# Binary adjacency (self excluded)
rows_adj = cp.repeat(cp.arange(n_obs, dtype=cp.int32), k_no_self)
cols_adj = knn_indices[:, 1:].ravel()
data_adj = cp.ones(n_obs * k_no_self, dtype=cp.float32)
adjacency = cp_sparse.csr_matrix(
(data_adj, (rows_adj, cols_adj)), shape=(n_obs, n_obs)
)
from rapids_singlecell._cuda._jaccard_cuda import jaccard_shared_counts

knn = cp.ascontiguousarray(knn_indices, dtype=cp.int32)

# For each directed KNN edge (i->j), compute shared neighbor count
i_idx = rows_adj
j_idx = cols_adj
rows_i = adjacency[i_idx]
rows_j = adjacency[j_idx]
shared = cp.asarray(rows_i.multiply(rows_j).sum(axis=1)).ravel()
# Jaccard weight per KNN entry (i -> j), straight from the KNN lists
# (no replicated adjacency matrix, so no int32 nnz overflow).
jaccard_vals = cp.empty(n_obs * n_neighbors, dtype=cp.float32)
jaccard_shared_counts(
knn,
n_obs=n_obs,
k=n_neighbors,
jaccard_vals=jaccard_vals,
stream=cp.cuda.get_current_stream().ptr,
)

# Jaccard: |N(i) & N(j)| / (2*(k-1) - |N(i) & N(j)|)
jaccard_vals = shared / (2 * k_no_self - shared)
i_idx = cp.repeat(cp.arange(n_obs, dtype=cp.int32), n_neighbors)
j_idx = knn.ravel()

# Filter zeros and build sparse matrix
mask = jaccard_vals != 0
Expand Down
Loading