Updates on RSpectra: new "center" and "scale" parameters for svds()
Per the suggestion by @robmaz, RSpectra::svds() now has two new parameters, center and scale, to support implicit centering and scaling of matrices in partial SVD. The minimum version for this new feature is RSpectra >= 0.16-0.
These two parameters are very useful for principal component analysis (PCA)
based on the covariance or correlation matrix, without actually forming them.
Below we simulate a random data matrix, and use both R's built-in prcomp() and the svds() function in RSpectra to compute PCA.
library(RSpectra)
library(Matrix)
# Simulate a dense data matrix (2000 x 5000) of standard normal draws
set.seed(123)
n <- 2000
p <- 5000
k <- 10
x <- matrix(rnorm(n * p), n)
# R's built-in function
system.time(res1 <- prcomp(x, center = TRUE, scale. = FALSE, rank. = k))
## user system elapsed
## 6.918 0.135 7.053
# svds() with implicit centering (no centered copy of x is formed)
system.time(res2 <- svds(x, k, nu = 0, opts = list(center = TRUE, scale = FALSE)))
## user system elapsed
## 1.432 0.000 1.432
# Check explained variances
head(res1$sdev, k)
## [1] 2.581690 2.569330 2.562992 2.560865 2.558364 2.552452 2.547646 2.545435
## [9] 2.542844 2.538125
# Here we need to normalize the result by `sqrt(n-1)` if `scale = FALSE`
res2$d / sqrt(n - 1)
## [1] 2.581690 2.569330 2.562992 2.560865 2.558364 2.552452 2.547646 2.545435
## [9] 2.542844 2.538125
# Check factor loadings (eigenvectors)
head(res1$rotation)
## PC1 PC2 PC3 PC4 PC5
## [1,] 0.002955287 0.009469244 0.001791091 -0.004472325 0.0008397064
## [2,] 0.039091388 0.002317985 -0.016254929 -0.030014596 -0.0037889699
## [3,] 0.011089497 -0.014628014 0.024707183 -0.022815585 0.0142715032
## [4,] 0.019378279 0.003213703 0.011944642 0.005600938 0.0256555170
## [5,] 0.011824996 -0.025775262 -0.008988052 -0.008098504 0.0393693188
## [6,] -0.001385724 -0.014513030 0.035388546 -0.011516888 0.0003066008
## PC6 PC7 PC8 PC9 PC10
## [1,] 0.015071818 0.00461462 0.018306793 -0.005177352 -0.017118890
## [2,] 0.010304461 -0.01921043 -0.015591374 -0.012862728 0.019834698
## [3,] -0.001323470 -0.01548052 0.009874774 -0.006201647 0.035196107
## [4,] 0.019699390 0.01027901 0.008836943 0.020698740 -0.003105694
## [5,] 0.011788053 -0.02337262 0.058207087 -0.010045558 0.006709223
## [6,] -0.003844848 0.01498251 0.012312399 -0.011870723 0.032744347
head(res2$v)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.002955287 -0.009469244 -0.001791091 -0.004472325 -0.0008397064
## [2,] 0.039091388 -0.002317985 0.016254929 -0.030014596 0.0037889699
## [3,] 0.011089497 0.014628014 -0.024707183 -0.022815585 -0.0142715032
## [4,] 0.019378279 -0.003213703 -0.011944642 0.005600938 -0.0256555170
## [5,] 0.011824996 0.025775262 0.008988052 -0.008098504 -0.0393693188
## [6,] -0.001385724 0.014513030 -0.035388546 -0.011516888 -0.0003066008
## [,6] [,7] [,8] [,9] [,10]
## [1,] 0.015071818 -0.00461462 -0.018306793 -0.005177352 -0.017118890
## [2,] 0.010304461 0.01921043 0.015591374 -0.012862728 0.019834698
## [3,] -0.001323470 0.01548052 -0.009874774 -0.006201647 0.035196107
## [4,] 0.019699390 -0.01027901 -0.008836943 0.020698740 -0.003105694
## [5,] 0.011788053 0.02337262 -0.058207087 -0.010045558 0.006709223
## [6,] -0.003844848 -0.01498251 -0.012312399 -0.011870723 0.032744347
We can see that the two methods generate the same results (note that the eigenvectors are identical up to signs), but svds() is much faster than prcomp(), since it only computes the leading singular values.
The performance advantage of svds() is even more evident if the input matrix is also sparse. We repeat the experiment above, but on a different input:
# Simulate a sparse data matrix: same dimensions, but 90% zeros
set.seed(123)
n <- 2000
p <- 5000
k <- 10
xsp <- rnorm(n * p)
# 90% of the values are zero
xsp[sample(n * p, n * p * 0.9)] <- 0
xsp <- Matrix(xsp, n, p, sparse = TRUE)
# R's built-in function
system.time(res1 <- prcomp(xsp, center = TRUE, scale. = FALSE, rank. = k))
## user system elapsed
## 6.656 0.159 6.815
# svds() exploits sparsity; implicit centering keeps the matrix sparse
system.time(res2 <- svds(xsp, k, nu = 0, opts = list(center = TRUE, scale = FALSE)))
## user system elapsed
## 0.47 0.00 0.47
# Check explained variances
head(res1$sdev, k)
## [1] 0.8185087 0.8153621 0.8149581 0.8130531 0.8114726 0.8096282 0.8080887
## [8] 0.8076944 0.8058630 0.8052317
# Here we need to normalize the result by `sqrt(n-1)` if `scale = FALSE`
res2$d / sqrt(n - 1)
## [1] 0.8185087 0.8153621 0.8149581 0.8130531 0.8114726 0.8096282 0.8080887
## [8] 0.8076944 0.8058630 0.8052317
# Check factor loadings (eigenvectors)
head(res1$rotation)
## PC1 PC2 PC3 PC4 PC5
## [1,] -0.014078976 0.009977389 -0.0048860045 0.036143696 0.020856379
## [2,] 0.005891397 -0.006337634 0.0086567191 -0.024204012 0.006019543
## [3,] -0.004839200 0.006800550 0.0057230097 0.001902374 0.009481416
## [4,] 0.012187115 0.007566681 -0.0007558701 0.037469119 -0.016597866
## [5,] 0.017071907 -0.010494435 -0.0074782229 0.015558749 0.004890078
## [6,] -0.013842080 0.007694404 0.0018808309 -0.013330489 -0.017356824
## PC6 PC7 PC8 PC9 PC10
## [1,] -6.246730e-03 0.001227333 0.005002885 -0.019607028 0.005792201
## [2,] 1.020680e-02 0.007968355 -0.028889050 0.008189175 -0.006268807
## [3,] -4.108971e-02 0.011208912 0.005501149 -0.007212123 -0.021556508
## [4,] 2.665231e-02 -0.004031854 -0.010210344 -0.011379201 0.004734222
## [5,] 7.927843e-04 -0.003725163 -0.005354887 0.004972582 0.005005228
## [6,] -5.537012e-06 0.010373699 -0.019280452 0.019368409 -0.014930834
head(res2$v)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.014078976 -0.009977389 0.0048860045 -0.036143696 -0.020856379
## [2,] -0.005891397 0.006337634 -0.0086567191 0.024204012 -0.006019543
## [3,] 0.004839200 -0.006800550 -0.0057230097 -0.001902374 -0.009481416
## [4,] -0.012187115 -0.007566681 0.0007558701 -0.037469119 0.016597866
## [5,] -0.017071907 0.010494435 0.0074782229 -0.015558749 -0.004890078
## [6,] 0.013842080 -0.007694404 -0.0018808309 0.013330489 0.017356824
## [,6] [,7] [,8] [,9] [,10]
## [1,] 6.246730e-03 0.001227333 0.005002885 0.019607028 0.005792201
## [2,] -1.020680e-02 0.007968355 -0.028889050 -0.008189175 -0.006268807
## [3,] 4.108971e-02 0.011208912 0.005501149 0.007212123 -0.021556508
## [4,] -2.665231e-02 -0.004031854 -0.010210344 0.011379201 0.004734222
## [5,] -7.927843e-04 -0.003725163 -0.005354887 -0.004972582 0.005005228
## [6,] 5.537012e-06 0.010373699 -0.019280452 -0.019368409 -0.014930834
The above code demonstrates a roughly 15x speedup.
To perform PCA based on the correlation matrix, simply specify scale = TRUE. Below is an example of two different ways that lead to the same output.
# Simulate the same sparse data matrix as in the previous example
set.seed(123)
n <- 2000
p <- 5000
k <- 10
xsp <- rnorm(n * p)
# 90% of the values are zero
xsp[sample(n * p, n * p * 0.9)] <- 0
xsp <- Matrix(xsp, n, p, sparse = TRUE)
# R's built-in function, now standardizing columns (correlation-matrix PCA)
system.time(res1 <- prcomp(xsp, center = TRUE, scale. = TRUE, rank. = k))
## user system elapsed
## 7.084 0.205 7.291
# svds() with implicit centering and scaling
system.time(res2 <- svds(xsp, k, nu = 0, opts = list(center = TRUE, scale = TRUE)))
## user system elapsed
## 0.587 0.000 0.587
# Check explained variances
head(res1$sdev, k)
## [1] 2.576689 2.566099 2.562755 2.560604 2.551354 2.547781 2.546969 2.540066
## [9] 2.536446 2.534298
res2$d
## [1] 2.576689 2.566099 2.562755 2.560604 2.551354 2.547781 2.546969 2.540066
## [9] 2.536446 2.534298
# Check factor loadings (eigenvectors)
head(res1$rotation)
## PC1 PC2 PC3 PC4 PC5
## [1,] -0.006483247 -0.0009410683 -0.019834553 0.032506226 -0.002916057
## [2,] 0.012595033 0.0011791356 0.010536472 -0.020026913 0.025265776
## [3,] -0.001758559 0.0074203003 -0.009177566 -0.001530424 0.007236185
## [4,] 0.010485835 0.0102749040 -0.017666064 0.018482750 -0.027623466
## [5,] 0.019636988 -0.0051696089 -0.004308241 0.019228107 -0.001763454
## [6,] -0.013690684 -0.0025113870 -0.003252875 -0.030508385 -0.003294067
## PC6 PC7 PC8 PC9 PC10
## [1,] -0.001658184 -7.242177e-03 -0.0008240643 -0.007626584 -0.0006768329
## [2,] -0.014348978 7.286501e-03 0.0121723469 0.007891901 0.0271362784
## [3,] -0.012574501 -1.400828e-02 -0.0353942993 -0.009328341 0.0040393007
## [4,] 0.007793619 4.733246e-03 0.0230159209 -0.006839061 -0.0027831289
## [5,] -0.002541560 -2.382465e-05 0.0130737963 0.003924594 -0.0073279903
## [6,] -0.008978843 -4.013950e-03 0.0138156553 0.013757868 0.0160054867
head(res2$v)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.006483247 -0.0009410683 0.019834553 0.032506226 0.002916057
## [2,] -0.012595033 0.0011791356 -0.010536472 -0.020026913 -0.025265776
## [3,] 0.001758559 0.0074203003 0.009177566 -0.001530424 -0.007236185
## [4,] -0.010485835 0.0102749040 0.017666064 0.018482750 0.027623466
## [5,] -0.019636988 -0.0051696089 0.004308241 0.019228107 0.001763454
## [6,] 0.013690684 -0.0025113870 0.003252875 -0.030508385 0.003294067
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.001658184 -7.242177e-03 -0.0008240643 0.007626584 0.0006768329
## [2,] -0.014348978 7.286501e-03 0.0121723469 -0.007891901 -0.0271362784
## [3,] -0.012574501 -1.400828e-02 -0.0353942993 0.009328341 -0.0040393007
## [4,] 0.007793619 4.733246e-03 0.0230159209 0.006839061 0.0027831289
## [5,] -0.002541560 -2.382465e-05 0.0130737963 -0.003924594 0.0073279902
## [6,] -0.008978843 -4.013950e-03 0.0138156553 -0.013757868 -0.0160054867