-
Notifications
You must be signed in to change notification settings - Fork 755
feat: add illicofor rank_genes_groups
#4038
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 79 commits
c5591c8
7aca4b3
c2f3738
c80958a
72318fb
97b4f7c
b9c8257
5394d2b
8928dfd
897a646
af1f523
40d5946
74b6d87
8352445
1cad431
f0d78b4
d9ad811
c83f82b
8f201ba
3ca2957
0bf0976
a375d31
d0f0db0
e1c43a1
53a2f74
5e2ee5e
fa454d7
ee037e6
6557bcf
9299b23
480a57a
c755a14
ac76f90
259d8f3
7373f55
739fffb
db85c7c
92a2953
f66ca68
9a62053
9f16597
0b319b8
3971219
585aa87
3dac202
7fcdac7
ef5ead2
023aa0f
9fe02a3
6aca9ca
0437475
d15aede
d6f2c51
44cfc6e
b448347
2b60935
3f902dd
2467433
d1aa5fb
2585d2c
6a1f917
152c344
c24ebd0
e13cb6a
62e6c87
bd5f675
a9d9f0a
aac3189
487be58
977e254
36c5039
36acd5b
9678e40
168de8b
c1f0fbc
5cc6647
7ab15f9
3d19c95
2363ee7
b522c71
dbbe64d
4216aad
98e3d71
264c683
c99dd41
0c33493
59b9119
192d3ea
4479fc0
1492048
e808e69
ef18687
a623d40
0940df2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,7 @@ | |
| import numba | ||
| import numpy as np | ||
| import pandas as pd | ||
| from anndata import AnnData | ||
| from fast_array_utils.numba import njit | ||
| from fast_array_utils.stats import mean_var | ||
| from scipy import sparse | ||
|
|
@@ -28,7 +29,6 @@ | |
| from collections.abc import Generator, Iterable | ||
| from typing import Literal | ||
|
|
||
| from anndata import AnnData | ||
| from numpy.typing import NDArray | ||
|
|
||
|
|
||
|
|
@@ -47,6 +47,35 @@ def _select_top_n(scores: NDArray, n_top: int): | |
| return global_indices | ||
|
|
||
|
|
||
| def _illico_results_to_iter( | ||
| illico_df: pd.DataFrame, | ||
| groups_order: NDArray, | ||
| ireference: int | None, | ||
| *, | ||
| copy_pvalues: bool, | ||
| ): | ||
|
ilan-gold marked this conversation as resolved.
Outdated
|
||
| """Yield per-group ``(index, z, p)`` tuples from illico's long-form output. | ||
|
|
||
| illico returns a DataFrame with a 2-level MultiIndex ``(pert, feature)`` | ||
| and columns including ``z_score`` and ``p_value``. We stream one group | ||
| at a time via `pandas.Series.loc`, trusting illico_df groups are ordered | ||
| by ``var_name``. | ||
| """ | ||
| ref_label = None if ireference is None else groups_order[ireference] | ||
| z_series = illico_df["z_score"] | ||
| p_series = illico_df["p_value"] | ||
| illico_groups = set(illico_df.index.unique(level="pert")) | ||
| return ( | ||
| ( | ||
| group_index, | ||
| z_series.loc[group_name].to_numpy(), | ||
| p_series.loc[group_name].to_numpy(copy=copy_pvalues), | ||
| ) | ||
| for group_index, group_name in enumerate(groups_order) | ||
| if group_name != ref_label and group_name in illico_groups | ||
| ) | ||
|
|
||
|
|
||
| @njit | ||
| def rankdata(data: NDArray[np.number]) -> NDArray[np.float64]: | ||
| """Parallelized version of scipy.stats.rankdata.""" | ||
|
|
@@ -141,6 +170,7 @@ def __init__( | |
| self.expm1_func = lambda x: np.expm1(x * np.log(base)) | ||
| else: | ||
| self.expm1_func = np.expm1 | ||
| self.group_col = adata.obs[groupby].array | ||
|
|
||
| self.groups_order, self.groups_masks_obs = _utils.select_groups( | ||
| adata, groups, groupby | ||
|
|
@@ -441,8 +471,40 @@ def compute_statistics( # noqa: PLR0912 | |
| if not mean_in_log_space: | ||
| # If we are not exponentiating after the mean aggregation, we need to recalculate the stats. | ||
| self._basic_stats(exponentiate_values=True) | ||
| elif method == "wilcoxon": | ||
| generate_test_results = self.wilcoxon(tie_correct=tie_correct) | ||
| elif "wilcoxon" in method: | ||
| if "illico" in method: | ||
| from illico import asymptotic_wilcoxon | ||
|
|
||
| illico_df = asymptotic_wilcoxon( | ||
| AnnData( | ||
| X=self.X, | ||
| var=pd.DataFrame(index=self.var_names), | ||
| obs=pd.DataFrame( | ||
| index=pd.RangeIndex(self.X.shape[0]).astype("str"), | ||
| data={"group": self.group_col}, | ||
| ), | ||
| ), | ||
| reference=self.groups_order[self.ireference] | ||
| if self.ireference is not None | ||
| else None, | ||
| group_keys="group", | ||
| return_as_scanpy=False, | ||
| is_log1p=True, | ||
| tie_correct=tie_correct, | ||
| use_continuity=False, | ||
| alternative="two-sided", | ||
| use_rust=False, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ilan-gold curious why this is hardcoded?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From what @remydubois said on zulip: https://scverse.zulipchat.com/#narrow/channel/315570-random/topic/Article.20on.20speeding.20up.20computation/near/581587388 it is basically not worth the headache. Furthermore, if we were to ever adopt the codebase, it would be the |
||
| groups=self.groups_order, | ||
| ) | ||
| generate_test_results = _illico_results_to_iter( | ||
| illico_df, | ||
| self.groups_order, | ||
| self.ireference, | ||
| # p-values are altered by this correction method | ||
| copy_pvalues=corr_method == "benjamini-hochberg", | ||
| ) | ||
| else: | ||
| generate_test_results = self.wilcoxon(tie_correct=tie_correct) | ||
|
ilan-gold marked this conversation as resolved.
Outdated
|
||
| # If we're not exponentiating after the mean aggregation, then do it now. | ||
| self._basic_stats(exponentiate_values=not mean_in_log_space) | ||
| elif method == "logreg": | ||
|
|
@@ -451,7 +513,6 @@ def compute_statistics( # noqa: PLR0912 | |
| self.stats = None | ||
|
|
||
| n_genes = self.X.shape[1] | ||
|
|
||
| for group_index, scores, pvals in generate_test_results: | ||
| group_name = str(self.groups_order[group_index]) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
which feature does this PR need?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This comes from
illicobut it seemed as good a time as any to require this. I will split this out into a separate PR though, fair point, there are probably version checks floating around our code baseThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is no reason for
illicoto require 0.11. I can lower the lower bound to 0.10.8 in the next release (v0.6.0).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No worries, really, We should upgrade to
0.11anyway for the next scanpy minor release. I'd rather keep the community moving on this front.