Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "pdex"
version = "0.1.14"
version = "0.1.15"
description = "Parallel differential expression for single-cell perturbation sequencing"
readme = "README.md"
authors = [{ name = "noam teyssier", email = "noam.teyssier@arcinstitute.org" }]
Expand Down
17 changes: 13 additions & 4 deletions src/pdex/_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
import anndata as ad
import numpy as np

# A heuristic to determine if the data is log-transformed
# Checks if the mean cell umi count is greater than a certain threshold
# If the mean cell UMI count is < UPPER_LIMIT_LOG, it is assumed that the data is log-transformed
#
# This limit is set to 15 (log-data with >15 average UMI counts would mean an
# average UMI count of $e^{15} - 1 \approx 3.27M$, which is unlikely at this point)
UPPER_LIMIT_LOG = 15


def guess_is_log(adata: ad.AnnData, num_cells: int | float = 5e2) -> bool:
"""Make an *educated* guess whether the provided anndata is log-transformed.
Expand All @@ -15,9 +23,10 @@ def guess_is_log(adata: ad.AnnData, num_cells: int | float = 5e2) -> bool:
mask = np.random.choice(adata.shape[0], size=num_cells, replace=False)

# Sum the matrix across the selected cell subset
sums = adata[mask].X.sum(axis=1)
sums = adata[mask].X.sum(axis=1) # type: ignore

# Extract the fractional components of the array
decimals, _ = np.modf(sums)
# Determine the mean cell umi count
mean_umi_count = np.mean(sums)

return np.any(decimals != 0.0)
# Return True if the mean cell umi count is less than the upper limit
return bool(mean_umi_count < UPPER_LIMIT_LOG)