simplify and and optimize prcomp_irlba#52
Conversation
This gives a significant speed-up for sparse matrices
library(irlba)
#> Loading required package: Matrix
library(Matrix)
prcomp_irlba_new <- function(x, n = 3, retx = TRUE, center = TRUE, scale. = FALSE, ...)
{
if (hasArg(tol))
warning("The `tol` truncation argument from `prcomp` is not supported by
`prcomp_irlba`. If specified, `tol` is passed to the `irlba` function to
control that algorithm's convergence tolerance. See `?prcomp_irlba` for help.")
# Try to convert data frame to matrix...
if (is.data.frame(x)) x <- as.matrix(x)
col_means <- colMeans(x)
center <- if (!is.logical(center)) center else if (center) col_means else 0
col_vars <- (colMeans(x^2) - 2*col_means*center + center^2) / (1 - 1/nrow(x))
scale. <- if (!is.logical(scale.)) scale. else if (scale.) sqrt(col_vars) else 1
args <- list(A=x, nv=n)
if(!isTRUE(all(center==0))) args$center <- center # center & scale are only supplied to irlba if
if(!isTRUE(all(scale.==1))) args$scale <- scale. # centering/scaling would actually be performed
args <- c(args, list(...))
s <- do.call(irlba, args=args)
ans <-list(
sdev = s$d / sqrt(nrow(x) - 1),
rotation = s$v,
center = if(is.null(args$center)) FALSE else args$center,
scale = if(is.null(args$center)) FALSE else args$center
)
colnames(ans$rotation) <- paste("PC", seq_len(ncol(ans$rotation)), sep="")
if (retx)
{
ans$x <- s$u %*% diag(s$d)
colnames(ans$x) <- colnames(ans$rotation)
}
ans$totalvar <- sum(col_vars/scale.^2)
class(ans) <- c("irlba_prcomp", "prcomp")
ans
}
prcomp_irlba_old <- function(x, n = 3, retx = TRUE, center = TRUE, scale. = FALSE, ...)
{
a <- names(as.list(match.call()))
ans <- list(scale=scale.)
if ("tol" %in% a)
warning("The `tol` truncation argument from `prcomp` is not supported by
`prcomp_irlba`. If specified, `tol` is passed to the `irlba` function to
control that algorithm's convergence tolerance. See `?prcomp_irlba` for help.")
# Try to convert data frame to matrix...
if (is.data.frame(x)) x <- as.matrix(x)
args <- list(A=x, nv=n)
if (is.logical(center))
{
if (center) args$center <- colMeans(x)
} else args$center <- center
if (is.logical(scale.))
{
if (is.numeric(args$center))
{
f <- function(i) sqrt(sum((x[, i] - args$center[i]) ^ 2) / (nrow(x) - 1L))
scale. <- vapply(seq(ncol(x)), f, pi, USE.NAMES=FALSE)
if (ans$scale) ans$totalvar <- ncol(x)
else ans$totalvar <- sum(scale. ^ 2)
} else
{
if (ans$scale)
{
scale. <- apply(x, 2L, function(v) sqrt(sum(v ^ 2) / max(1, length(v) - 1L)))
f <- function(i) sqrt(sum((x[, i] / scale.[i]) ^ 2) / (nrow(x) - 1L))
ans$totalvar <- sum(vapply(seq(ncol(x)), f, pi, USE.NAMES=FALSE) ^ 2)
} else
{
f <- function(i) sum(x[, i] ^ 2) / (nrow(x) - 1L)
ans$totalvar <- sum(vapply(seq(ncol(x)), f, pi, USE.NAMES=FALSE))
}
}
if (ans$scale) args$scale <- scale.
} else
{
args$scale <- scale.
f <- function(i) sqrt(sum((x[, i] / scale.[i]) ^ 2) / (nrow(x) - 1L))
ans$totalvar <- sum(vapply(seq(ncol(x)), f, pi, USE.NAMES=FALSE))
}
if (!missing(...)) args <- c(args, list(...))
s <- do.call(irlba, args=args)
ans$sdev <- s$d / sqrt(max(1, nrow(x) - 1))
ans$rotation <- s$v
colnames(ans$rotation) <- paste("PC", seq(1, ncol(ans$rotation)), sep="")
ans$center <- args$center
if (retx)
{
ans <- c(ans, list(x = sweep(s$u, 2, s$d, FUN=`*`)))
colnames(ans$x) <- paste("PC", seq(1, ncol(ans$rotation)), sep="")
}
class(ans) <- c("irlba_prcomp", "prcomp")
ans
}
n <- 10000
p <- 1000
mat <- matrix(rpois(n = n*p, lambda = 0.005), n, p)
sparse_mat <- as(mat, "sparseMatrix")
(lb <- bench::mark(
prcomp_irlba_old(mat, scale.=TRUE),
prcomp_irlba_old(sparse_mat, scale.=TRUE),
prcomp_irlba_new(mat, scale.=TRUE),
prcomp_irlba_new(sparse_mat, scale.=TRUE),
check = FALSE
))
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 4 x 6
#> expression min median `itr/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl>
#> 1 prcomp_irlba_old(mat, scale. = TRUE) 1.84s 1.84s 0.542
#> 2 prcomp_irlba_old(sparse_mat, scale. = TRUE) 296.94ms 361.07ms 2.77
#> 3 prcomp_irlba_new(mat, scale. = TRUE) 1.9s 1.9s 0.527
#> 4 prcomp_irlba_new(sparse_mat, scale. = TRUE) 30.73ms 32.94ms 20.8
#> # ... with 2 more variables: mem_alloc <bch:byt>, `gc/sec` <dbl>
plot(lb)
#> Loading required namespace: tidyrCreated on 2019-11-18 by the reprex package (v0.3.0) |
Codecov Report
@@ Coverage Diff @@
## master #52 +/- ##
==========================================
- Coverage 89.1% 88.76% -0.34%
==========================================
Files 8 8
Lines 881 801 -80
==========================================
- Hits 785 711 -74
+ Misses 96 90 -6
Continue to review full report at Codecov.
|
|
This PR changes the order of the elements in the returned list to match the order of |
|
Some comments from a developer perspective:
x <- matrix(rnorm(100), 10, 10) + 1e9
center <- col_means <- colMeans(x)
col_vars <- (colMeans(x^2) - 2*col_means*center + center^2) / (1 - 1/nrow(x))
col_vars
## [1] 142.2222 0.0000 -142.2222 142.2222 0.0000 0.0000 -142.2222
## [8] -142.2222 -142.2222 142.2222
true_col_vars <- matrixStats::colVars(x)
true_col_vars
## [1] 0.5213468 0.7576393 1.7165307 0.9792770 0.9100585 0.7977485 1.0333882
## [8] 1.0694728 0.9085411 1.1655892Perhaps this wouldn't be likely to occur for single-cell data, but people should be able to use irlba for other things with arbitrary location. |
|
Thanks for this! I agree a speedup is desirable in the sparse case, and the order alignment with prcomp is a good idea. I'm carefully considering the @LTLA 's comments. We can, for instance, easily make the scaling parameters optionally computed as required. And hopefully carefully address numerical stability. After which I hope to merge and modify this. -bwl |
|
Thanks for the comments, Aaron & for consideration @bwlewis!
It seems to me like these have also been computed before and that this is necessary to compute
Now this is more serious, I would be happy to use matrixStats::colVars(x, center=center)
## [1] 0.0000 134.7368 0.0000 0.0000 0.0000 134.7368 0.0000 0.0000 0.0000 134.7368 |
|
OK, figured it out: # normal case
x <- matrix(rnorm(200), 20, 8) + 5
center <- colMeans(x) + 1
colMeans(t(t(x)-center)^2)/(1-1/nrow(x) ) # slow
#> [1] 2.276515 1.955017 1.922532 2.172460 2.162209 2.235109 2.140299 2.092379
matrixStats::colVars(x) + (colMeans(x)-center)^2/(1-1/nrow(x)) # ok
#> [1] 2.276515 1.955017 1.922532 2.172460 2.162209 2.235109 2.140299 2.092379
(colMeans(x^2) - 2*colMeans(x)*center + center^2) / (1 - 1/nrow(x)) # instable
#> [1] 2.276515 1.955017 1.922532 2.172460 2.162209 2.235109 2.140299 2.092379
# toxic case
x <- matrix(rnorm(200), 20, 8) + 1e9
center <- colMeans(x) + 1
colMeans(t(t(x)-center)^2)/(1-1/nrow(x) ) # slow
#> [1] 1.575861 2.406821 2.810052 1.853597 1.428814 1.697791 2.128663 1.925782
matrixStats::colVars(x) + (colMeans(x)-center)^2/(1-1/nrow(x)) # ok
#> [1] 1.575861 2.406821 2.810052 1.853597 1.428814 1.697791 2.128663 1.925782
(colMeans(x^2) - 2*colMeans(x)*center + center^2) / (1 - 1/nrow(x)) # instable
#> [1] -134.7368 0.0000 0.0000 0.0000 -134.7368 0.0000 0.0000
#> [8] 0.0000
# variance case
x <- matrix(rnorm(200), 20, 7) + 1e9
#> Warning in matrix(rnorm(200), 20, 7): data length [200] is not a sub-multiple or
#> multiple of the number of columns [7]
center <- colMeans(x)
matrixStats::colVars(x) #variance only
#> [1] 1.1096721 0.9205107 1.1990229 1.0338478 0.9222606 1.0248208 0.8031727
colMeans(t(t(x)-center)^2)/(1-1/nrow(x) ) # slow
#> [1] 1.1096721 0.9205107 1.1990229 1.0338478 0.9222606 1.0248208 0.8031727
matrixStats::colVars(x) + (colMeans(x)-center)^2/(1-1/nrow(x)) # ok
#> [1] 1.1096721 0.9205107 1.1990229 1.0338478 0.9222606 1.0248208 0.8031727
(colMeans(x^2) - 2*colMeans(x)*center + center^2) / (1 - 1/nrow(x)) # instable
#> [1] 0.0000 0.0000 134.7368 0.0000 0.0000 -134.7368 134.7368The EDIT: I just realized |
|
There are several points here that warrant further discussion. The IMO, the best solution is to keep the original if/else branches and replace the As for the ## NOTE: untested, but you should get the idea.
setGeneric(".my_colVars", function(x, ...) standardGeneric(".my_colVars"))
#' @importFrom Matrix t rowSums
setMethod(".my_colVars", "ANY", function(x, center=NULL) {
if (!is.null(center)) {
y <- t(x) - center
rowSums(y^2)/(ncol(y)-1)
} else {
colSums(x^2)/(nrow(x)-1)
}
})
#' @importFrom Matrix t colSums
setMethod(".my_colVars", "dgCMatrix", function(x, center=NULL) {
if (!is.null(center)) {
nzero <- diff(x@p)
expanded <- rep(center, nzero)
x@x <- (x@x - expanded)^2
(colSums(x) + nzero * center^2)/(nrow(x)-1)
} else {
colSums(x^2)/(nrow(x)-1)
}
})Optimized implementations for other classes are left as an exercise for the reader. I must admit that I never realized that |
|
@bwlewis Would you be happy relying on Bioconductor/MatrixGenerics? |
|
Hi Jan, Aaron: Sorry I have been so negligent in maintenance here --
it has been a bit of a crazy year (indeed for the whole world).
Jan, I have a bunch of unincorportaed ideas from Aaron to get fully
implemented. I will be working on this very soon.
Can we revisit this in a week and discuss after I get some of the
changes in place?
Best,
Bryan
…On 2/1/21, Jan Gleixner ***@***.***> wrote:
@bwlewis Would you be happy relying on
[Bioconductor/MatrixGenerics](https://github.com/Bioconductor/MatrixGenerics)?
--
You are receiving this because you were mentioned.
Reply to this email directly or view it on GitHub:
#52 (comment)
|
|
I'll be honest with you guys, I've forgotten most of what I suggested. But I would very much like to get back into this - particularly interested in seeing how we can improve the R-side IRLBA code, which I depend on a lot for my S4 matrix abstractions. |

This gives a significant speed-up for sparse matrices by avoiding to create dense intermediates.