Updates on RSpectra: new "center" and "scale" parameters for svds()
Per the suggestion by @robmaz, RSpectra::svds() now has two new parameters, center and scale, to support implicit centering and scaling of matrices in partial SVD. The minimum version for this new feature is RSpectra >= 0.16-0.
These two parameters are very useful for principal component analysis (PCA)
based on the covariance or correlation matrix, without actually forming them.
Below we simulate a random data matrix, and use both R's built-in prcomp() and the svds() function in RSpectra to compute PCA.
library(RSpectra)
library(Matrix)
# Simulate a dense data matrix (2000 x 5000) of standard normal draws
set.seed(123)
n <- 2000
p <- 5000
k <- 10
x <- matrix(rnorm(n * p), n)
# R's built-in function
system.time(res1 <- prcomp(x, center = TRUE, scale. = FALSE, rank. = k))
## user system elapsed
## 6.918 0.135 7.053
# svds() with implicit centering (no centered copy of x is formed)
system.time(res2 <- svds(x, k, nu = 0, opts = list(center = TRUE, scale = FALSE)))
## user system elapsed
## 1.432 0.000 1.432
# Check explained variances
head(res1$sdev, k)
## [1] 2.581690 2.569330 2.562992 2.560865 2.558364 2.552452 2.547646 2.545435
## [9] 2.542844 2.538125
# Here we need to normalize the result by `sqrt(n-1)` if `scale = FALSE`
res2$d / sqrt(n - 1)
## [1] 2.581690 2.569330 2.562992 2.560865 2.558364 2.552452 2.547646 2.545435
## [9] 2.542844 2.538125
# Check factor loadings (eigenvectors)
head(res1$rotation)
## PC1 PC2 PC3 PC4 PC5
## [1,] 0.002955287 0.009469244 0.001791091 -0.004472325 0.0008397064
## [2,] 0.039091388 0.002317985 -0.016254929 -0.030014596 -0.0037889699
## [3,] 0.011089497 -0.014628014 0.024707183 -0.022815585 0.0142715032
## [4,] 0.019378279 0.003213703 0.011944642 0.005600938 0.0256555170
## [5,] 0.011824996 -0.025775262 -0.008988052 -0.008098504 0.0393693188
## [6,] -0.001385724 -0.014513030 0.035388546 -0.011516888 0.0003066008
## PC6 PC7 PC8 PC9 PC10
## [1,] 0.015071818 0.00461462 0.018306793 -0.005177352 -0.017118890
## [2,] 0.010304461 -0.01921043 -0.015591374 -0.012862728 0.019834698
## [3,] -0.001323470 -0.01548052 0.009874774 -0.006201647 0.035196107
## [4,] 0.019699390 0.01027901 0.008836943 0.020698740 -0.003105694
## [5,] 0.011788053 -0.02337262 0.058207087 -0.010045558 0.006709223
## [6,] -0.003844848 0.01498251 0.012312399 -0.011870723 0.032744347
head(res2$v)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.002955287 -0.009469244 -0.001791091 -0.004472325 -0.0008397064
## [2,] 0.039091388 -0.002317985 0.016254929 -0.030014596 0.0037889699
## [3,] 0.011089497 0.014628014 -0.024707183 -0.022815585 -0.0142715032
## [4,] 0.019378279 -0.003213703 -0.011944642 0.005600938 -0.0256555170
## [5,] 0.011824996 0.025775262 0.008988052 -0.008098504 -0.0393693188
## [6,] -0.001385724 0.014513030 -0.035388546 -0.011516888 -0.0003066008
## [,6] [,7] [,8] [,9] [,10]
## [1,] 0.015071818 -0.00461462 -0.018306793 -0.005177352 -0.017118890
## [2,] 0.010304461 0.01921043 0.015591374 -0.012862728 0.019834698
## [3,] -0.001323470 0.01548052 -0.009874774 -0.006201647 0.035196107
## [4,] 0.019699390 -0.01027901 -0.008836943 0.020698740 -0.003105694
## [5,] 0.011788053 0.02337262 -0.058207087 -0.010045558 0.006709223
## [6,] -0.003844848 -0.01498251 -0.012312399 -0.011870723 0.032744347
We can see that the two methods generate the same results (note that the eigenvectors are identical up to signs), but svds() is much faster than prcomp(), since it only computes the leading singular values.
The performance advantage of svds() is even more evident if the input matrix is also sparse. We repeat the experiment above, but on a different input:
# Simulate a sparse data matrix: same dimensions, but 90% zeros
set.seed(123)
n <- 2000
p <- 5000
k <- 10
xsp <- rnorm(n * p)
# 90% of the values are zero
xsp[sample(n * p, n * p * 0.9)] <- 0
xsp <- Matrix(xsp, n, p, sparse = TRUE)
# R's built-in function
system.time(res1 <- prcomp(xsp, center = TRUE, scale. = FALSE, rank. = k))
## user system elapsed
## 6.656 0.159 6.815
# svds() exploits sparsity; implicit centering keeps the matrix sparse
system.time(res2 <- svds(xsp, k, nu = 0, opts = list(center = TRUE, scale = FALSE)))
## user system elapsed
## 0.47 0.00 0.47
# Check explained variances
head(res1$sdev, k)
## [1] 0.8185087 0.8153621 0.8149581 0.8130531 0.8114726 0.8096282 0.8080887
## [8] 0.8076944 0.8058630 0.8052317
# Here we need to normalize the result by `sqrt(n-1)` if `scale = FALSE`
res2$d / sqrt(n - 1)
## [1] 0.8185087 0.8153621 0.8149581 0.8130531 0.8114726 0.8096282 0.8080887
## [8] 0.8076944 0.8058630 0.8052317
# Check factor loadings (eigenvectors)
head(res1$rotation)
## PC1 PC2 PC3 PC4 PC5
## [1,] -0.014078976 0.009977389 -0.0048860045 0.036143696 0.020856379
## [2,] 0.005891397 -0.006337634 0.0086567191 -0.024204012 0.006019543
## [3,] -0.004839200 0.006800550 0.0057230097 0.001902374 0.009481416
## [4,] 0.012187115 0.007566681 -0.0007558701 0.037469119 -0.016597866
## [5,] 0.017071907 -0.010494435 -0.0074782229 0.015558749 0.004890078
## [6,] -0.013842080 0.007694404 0.0018808309 -0.013330489 -0.017356824
## PC6 PC7 PC8 PC9 PC10
## [1,] -6.246730e-03 0.001227333 0.005002885 -0.019607028 0.005792201
## [2,] 1.020680e-02 0.007968355 -0.028889050 0.008189175 -0.006268807
## [3,] -4.108971e-02 0.011208912 0.005501149 -0.007212123 -0.021556508
## [4,] 2.665231e-02 -0.004031854 -0.010210344 -0.011379201 0.004734222
## [5,] 7.927843e-04 -0.003725163 -0.005354887 0.004972582 0.005005228
## [6,] -5.537012e-06 0.010373699 -0.019280452 0.019368409 -0.014930834
head(res2$v)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.014078976 -0.009977389 0.0048860045 -0.036143696 -0.020856379
## [2,] -0.005891397 0.006337634 -0.0086567191 0.024204012 -0.006019543
## [3,] 0.004839200 -0.006800550 -0.0057230097 -0.001902374 -0.009481416
## [4,] -0.012187115 -0.007566681 0.0007558701 -0.037469119 0.016597866
## [5,] -0.017071907 0.010494435 0.0074782229 -0.015558749 -0.004890078
## [6,] 0.013842080 -0.007694404 -0.0018808309 0.013330489 0.017356824
## [,6] [,7] [,8] [,9] [,10]
## [1,] 6.246730e-03 0.001227333 0.005002885 0.019607028 0.005792201
## [2,] -1.020680e-02 0.007968355 -0.028889050 -0.008189175 -0.006268807
## [3,] 4.108971e-02 0.011208912 0.005501149 0.007212123 -0.021556508
## [4,] -2.665231e-02 -0.004031854 -0.010210344 0.011379201 0.004734222
## [5,] -7.927843e-04 -0.003725163 -0.005354887 -0.004972582 0.005005228
## [6,] 5.537012e-06 0.010373699 -0.019280452 -0.019368409 -0.014930834
The above code demonstrates a roughly 15x speedup.
To perform PCA based on the correlation matrix, simply specify scale = TRUE. Below is an example of two different ways that lead to the same output.
# Simulate the same sparse data matrix as in the previous example
set.seed(123)
n <- 2000
p <- 5000
k <- 10
xsp <- rnorm(n * p)
# 90% of the values are zero
xsp[sample(n * p, n * p * 0.9)] <- 0
xsp <- Matrix(xsp, n, p, sparse = TRUE)
# R's built-in function, now standardizing columns (correlation-matrix PCA)
system.time(res1 <- prcomp(xsp, center = TRUE, scale. = TRUE, rank. = k))
## user system elapsed
## 7.084 0.205 7.291
# svds() with implicit centering and scaling
system.time(res2 <- svds(xsp, k, nu = 0, opts = list(center = TRUE, scale = TRUE)))
## user system elapsed
## 0.587 0.000 0.587
# Check explained variances
head(res1$sdev, k)
## [1] 2.576689 2.566099 2.562755 2.560604 2.551354 2.547781 2.546969 2.540066
## [9] 2.536446 2.534298
res2$d
## [1] 2.576689 2.566099 2.562755 2.560604 2.551354 2.547781 2.546969 2.540066
## [9] 2.536446 2.534298
# Check factor loadings (eigenvectors)
head(res1$rotation)
## PC1 PC2 PC3 PC4 PC5
## [1,] -0.006483247 -0.0009410683 -0.019834553 0.032506226 -0.002916057
## [2,] 0.012595033 0.0011791356 0.010536472 -0.020026913 0.025265776
## [3,] -0.001758559 0.0074203003 -0.009177566 -0.001530424 0.007236185
## [4,] 0.010485835 0.0102749040 -0.017666064 0.018482750 -0.027623466
## [5,] 0.019636988 -0.0051696089 -0.004308241 0.019228107 -0.001763454
## [6,] -0.013690684 -0.0025113870 -0.003252875 -0.030508385 -0.003294067
## PC6 PC7 PC8 PC9 PC10
## [1,] -0.001658184 -7.242177e-03 -0.0008240643 -0.007626584 -0.0006768329
## [2,] -0.014348978 7.286501e-03 0.0121723469 0.007891901 0.0271362784
## [3,] -0.012574501 -1.400828e-02 -0.0353942993 -0.009328341 0.0040393007
## [4,] 0.007793619 4.733246e-03 0.0230159209 -0.006839061 -0.0027831289
## [5,] -0.002541560 -2.382465e-05 0.0130737963 0.003924594 -0.0073279903
## [6,] -0.008978843 -4.013950e-03 0.0138156553 0.013757868 0.0160054867
head(res2$v)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.006483247 -0.0009410683 0.019834553 0.032506226 0.002916057
## [2,] -0.012595033 0.0011791356 -0.010536472 -0.020026913 -0.025265776
## [3,] 0.001758559 0.0074203003 0.009177566 -0.001530424 -0.007236185
## [4,] -0.010485835 0.0102749040 0.017666064 0.018482750 0.027623466
## [5,] -0.019636988 -0.0051696089 0.004308241 0.019228107 0.001763454
## [6,] 0.013690684 -0.0025113870 0.003252875 -0.030508385 0.003294067
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.001658184 -7.242177e-03 -0.0008240643 0.007626584 0.0006768329
## [2,] -0.014348978 7.286501e-03 0.0121723469 -0.007891901 -0.0271362784
## [3,] -0.012574501 -1.400828e-02 -0.0353942993 0.009328341 -0.0040393007
## [4,] 0.007793619 4.733246e-03 0.0230159209 0.006839061 0.0027831289
## [5,] -0.002541560 -2.382465e-05 0.0130737963 -0.003924594 0.0073279902
## [6,] -0.008978843 -4.013950e-03 0.0138156553 -0.013757868 -0.0160054867