Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Catch variance equal to zero when scaling #177

Merged
merged 7 commits into from
Mar 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions .github/workflows/check-standard.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Workflow derived from https://github.com/r-lib/actions/tree/master/examples
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
Expand All @@ -18,7 +18,7 @@ jobs:
fail-fast: false
matrix:
config:
- {os: macOS-latest, r: 'release'}
- {os: macos-latest, r: 'release'}
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
Expand All @@ -32,21 +32,24 @@ jobs:
_R_CHECK_LENGTH_1_LOGIC2_: verbose

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- uses: r-lib/actions/setup-pandoc@v1
- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v1
- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: rcmdcheck
extra-packages: any::rcmdcheck
needs: check

- uses: r-lib/actions/check-r-package@v1
- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true

- name: Show testthat output
if: always()
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ Encoding: UTF-8
Package: bigstatsr
Type: Package
Title: Statistical Tools for Filebacked Big Matrices
Version: 1.5.13
Date: 2023-12-13
Version: 1.5.15
Date: 2024-03-24
Authors@R: c(
person("Florian", "Privé", email = "[email protected]",
role = c("aut", "cre")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ exportMethods(tcrossprod)
exportMethods(typeof)
import(foreach)
import(ggplot2)
import(rmio)
importFrom(Rcpp,sourceCpp)
importFrom(bigassertr,assert_01)
importFrom(bigassertr,assert_all)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## bigstatsr 1.5.14

- Error when variables with a zero scaling are used in e.g. `big_randomSVD()` and `big_crossprodSelf()` (#52).

## bigstatsr 1.5.13

- Add parameter `backingfile` to `big_crossprodSelf()` and `big_cor()` (#170).
Expand Down
1 change: 1 addition & 0 deletions R/FBM.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ sub_bk <- function(path, replacement = "", stop_if_not_ext = TRUE) {
#' X[] # access as standard R matrix
#'
#' @exportClass FBM
#' @import rmio
#'
FBM_RC <- methods::setRefClass(

Expand Down
2 changes: 2 additions & 0 deletions R/crossprodSelf.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ big_crossprodSelf <- function(
tmp1 <- X[ind.row, ind.col[ind1]]

ms <- fun.scaling(X, ind.row = ind.row, ind.col = ind.col[ind1])
if (any_near0(ms$scale)) stop2(MSG_ZERO_SCALE)

mu[ind1] <- ms$center
delta[ind1] <- ms$scale
sums[ind1] <- colSums(tmp1)
Expand Down
35 changes: 20 additions & 15 deletions R/plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
#'
#' @examples
#' library(ggplot2)
#' qplot(y = 1:10)
#' qplot(y = 1:10) + theme_bw()
#' qplot(y = 1:10) + theme_bigstatsr()
#' (p <- ggplot(mapping = aes(x = 1:10, y = 1:10)) + geom_point())
#' p + theme_bw()
#' p + theme_bigstatsr()
theme_bigstatsr <- function(size.rel = 1) {
theme_bw() +
theme(
Expand All @@ -31,10 +31,6 @@ theme_bigstatsr <- function(size.rel = 1) {
)
}

MY_THEME <- function(p, coeff = 1) {
p + theme_bigstatsr(size.rel = coeff)
}

#' @importFrom cowplot plot_grid
#' @export
cowplot::plot_grid
Expand Down Expand Up @@ -101,7 +97,9 @@ plot.big_SVD <- function(x, type = c("screeplot", "scores", "loadings"),

assert_lengths(nval, 1)

p <- MY_THEME(qplot(y = x$d[seq_len(nval)]), coeff = coeff) +
p <- ggplot(mapping = aes(x = seq_len(nval), y = x$d[seq_len(nval)])) +
theme_bigstatsr(size.rel = coeff) +
geom_point() +
geom_line() +
scale_y_log10() +
labs(title = "Scree Plot", x = "PC Index", y = "Singular Value")
Expand Down Expand Up @@ -133,7 +131,9 @@ plot.big_SVD <- function(x, type = c("screeplot", "scores", "loadings"),
nx <- scores[1]
ny <- scores[2]

MY_THEME(qplot(x = sc[, nx], y = sc[, ny]), coeff = coeff) +
ggplot(mapping = aes(x = sc[, nx], y = sc[, ny])) +
geom_point() +
theme_bigstatsr(size.rel = coeff) +
coord_fixed() +
labs(title = "Scores of PCA", x = paste0("PC", nx), y = paste0("PC", ny))

Expand All @@ -155,7 +155,9 @@ plot.big_SVD <- function(x, type = c("screeplot", "scores", "loadings"),

} else {

p <- MY_THEME(qplot(y = x$v[, loadings]), coeff = coeff) +
p <- ggplot(mapping = aes(x = rows_along(x$v), y = x$v[, loadings])) +
geom_point() +
theme_bigstatsr(size.rel = coeff) +
labs(title = paste0("Loadings of PC", loadings),
x = "Column index", y = NULL)

Expand Down Expand Up @@ -215,17 +217,20 @@ plot.mhtest <- function(x, type = c("hist", "Manhattan", "Q-Q", "Volcano"),
type <- match.arg(type)
main <- paste(type, "plot")

if (type == "Manhattan") {
qplot(y = -lpval) +
p <- if (type == "Manhattan") {
ggplot(mapping = aes(x = seq_along(lpval), y = -lpval)) +
geom_point() +
labs(title = main, x = "Column Index",
y = expression(-log[10](italic("p-value"))))
} else if (type == "Volcano") {
qplot(x = x[["estim"]], y = -lpval) +
ggplot(mapping = aes(x = x[["estim"]], y = -lpval)) +
geom_point() +
labs(title = main, x = "Estimate",
y = expression(-log[10](italic("p-value"))))
} else if (type == "Q-Q") {
unif.ranked <- stats::ppoints(length(lpval))[rank(lpval)]
qplot(x = -log10(unif.ranked), y = -lpval) +
ggplot(mapping = aes(x = -log10(unif.ranked), y = -lpval)) +
geom_point() +
labs(title = main,
x = expression(Expected~~-log[10](italic("p-value"))),
y = expression(Observed~~-log[10](italic("p-value")))) +
Expand All @@ -239,7 +244,7 @@ plot.mhtest <- function(x, type = c("hist", "Manhattan", "Q-Q", "Volcano"),
labs(x = "p-value")
}

last_plot() + theme_bigstatsr(size.rel = coeff)
p + theme_bigstatsr(size.rel = coeff)
}

################################################################################
Expand Down
27 changes: 20 additions & 7 deletions R/randomSVD.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

Ax <- FBM(n, ncores)
Atx <- FBM(m, 1)
calc <- FBM(ncores, 1, init = 0)
calc <- FBM(ncores, 1, init = -1)

Check warning on line 16 in R/randomSVD.R

View check run for this annotation

Codecov / codecov/patch

R/randomSVD.R#L16

Added line #L16 was not covered by tests

cluster_type <- getOption("bigstatsr.cluster.type")
if (verbose) {
Expand All @@ -26,24 +26,32 @@

if (ic == 0) { # I'm the master

# Wait for the slaves to finish computing the scalings
while (any(calc[] < 0)) {
if (any(calc[] == -2)) {
calc[] <- 3 # signal all slaves to stop
stop2(MSG_ZERO_SCALE)
} else Sys.sleep(TIME)

Check warning on line 34 in R/randomSVD.R

View check run for this annotation

Codecov / codecov/patch

R/randomSVD.R#L30-L34

Added lines #L30 - L34 were not covered by tests
}

printf <- function(...) cat(sprintf(...))
it <- 0
# A
A <- function(x, args) {
printf("%d - computing A * x\n", it <<- it + 1)
Atx[] <- x
calc[] <- 1 # make them work
# Master wait for its slaves to finish working
while (sum(calc[]) > 0) Sys.sleep(TIME)
# the master wait for its slaves to finish working
while (any(calc[] > 0)) Sys.sleep(TIME)

Check warning on line 45 in R/randomSVD.R

View check run for this annotation

Codecov / codecov/patch

R/randomSVD.R#L45

Added line #L45 was not covered by tests
rowSums(Ax[])
}
# Atrans
Atrans <- function(x, args) {
printf("%d - computing At * x\n", it <<- it + 1)
Ax[, 1] <- x
calc[] <- 2 # make them work
# Master wait for its slaves to finish working
while (sum(calc[]) > 0) Sys.sleep(TIME)
# the master wait for its slaves to finish working
while (any(calc[] > 0)) Sys.sleep(TIME)

Check warning on line 54 in R/randomSVD.R

View check run for this annotation

Codecov / codecov/patch

R/randomSVD.R#L54

Added line #L54 was not covered by tests
Atx[]
}

Expand All @@ -61,6 +69,12 @@

# Scaling
ms <- fun.scaling(X, ind.row = ind.row, ind.col = ind.col.part)
if (any_near0(ms$scale)) {
calc[ic] <- -2 # signal the master there is an issue with the scaling
return(ms)

Check warning on line 74 in R/randomSVD.R

View check run for this annotation

Codecov / codecov/patch

R/randomSVD.R#L72-L74

Added lines #L72 - L74 were not covered by tests
} else {
calc[ic] <- 0

Check warning on line 76 in R/randomSVD.R

View check run for this annotation

Codecov / codecov/patch

R/randomSVD.R#L76

Added line #L76 was not covered by tests
}

repeat {
# Slaves wait for their master to give them orders
Expand All @@ -78,8 +92,6 @@
} else if (c == 3) {
# End
break
} else {
stop("RandomSVD: unclear order from the master.")
}
calc[ic] <- 0
}
Expand Down Expand Up @@ -110,6 +122,7 @@

# scaling
ms <- fun.scaling(X, ind.row, ind.col)
if (any_near0(ms$scale)) stop2(MSG_ZERO_SCALE)

printf <- function(...) if (verbose) cat(sprintf(...))
it <- 0
Expand Down
2 changes: 2 additions & 0 deletions R/scaling.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ big_scale <- function(center = TRUE, scale = TRUE) {
sds <- rep(1, m)
}

if (any_near0(sds)) warning2(MSG_ZERO_SCALE)

data.frame(center = means, scale = sds)
}
}
Expand Down
2 changes: 2 additions & 0 deletions R/tcrossprodSelf.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ big_tcrossprodSelf <- function(
ind <- seq2(intervals[j, ])
ind.col.ind <- ind.col[ind]
ms <- fun.scaling(X, ind.row = ind.row, ind.col = ind.col.ind)
if (any_near0(ms$scale)) stop2(MSG_ZERO_SCALE)

means[ind] <- ms$center
sds[ind] <- ms$scale
increment_scaled_tcrossprod(K, X_part_temp, X,
Expand Down
11 changes: 11 additions & 0 deletions R/utils-assert.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@

################################################################################

MSG_ZERO_SCALE <-
"Some variables have zero scaling; remove them before attempting to scale."

################################################################################

any_near0 <- function(x) {
any(dplyr::near(x, 0))
}

################################################################################

as_vec <- function(x) {
x2 <- drop(x)
if (is.matrix(x2))
Expand Down
6 changes: 5 additions & 1 deletion docs/news/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,12 @@ bty
cbind
cdot
CMSA
Codecov
confounders
Crossprod
crossprods
didn
dir
doesn
doi
eigen
Expand Down
6 changes: 3 additions & 3 deletions man/theme_bigstatsr.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions tests/testthat/helper-init.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ diffPCs <- function(test, rot) {

################################################################################

custom_scaling <- function(X, ind.row = rows_along(X),
ind.col = cols_along(X), ncores = 1) {
stats <- big_colstats(X, ind.row, ind.col, ncores = ncores)
data.frame(center = stats$sum / length(ind.row), scale = sqrt(stats$var))
}

################################################################################

set.seed(NULL)
if (not_cran) {
# Seeds that won't work (because of bad luck)
Expand Down
12 changes: 12 additions & 0 deletions tests/testthat/test-crossprodSelf.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,15 @@ test_that("equality with crossprod with half of the data", {
})

################################################################################

test_that("we catch zero variance variables when scaling", {

X <- FBM(20, 20, init = rnorm(400))
expect_no_error(big_crossprodSelf(X, fun.scaling = custom_scaling))

X[, 1] <- 0
expect_error(big_crossprodSelf(X, fun.scaling = custom_scaling),
"Some variables have zero scaling")
})

################################################################################
19 changes: 19 additions & 0 deletions tests/testthat/test-randomSVD.R
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,22 @@ test_that("as_scaling_fun() works", {
})

################################################################################

test_that("zero variance is caught", {

X <- FBM(20, 20, init = rnorm(400))
expect_no_error(big_randomSVD(X, big_scale()))

X[, 1] <- 0 # set a variable to have zero variance
# this is also catched as a warning in big_scale()
expect_error(expect_warning(big_randomSVD(X, big_scale()),
"Some variables have zero scaling"),
"Some variables have zero scaling")

expect_error(big_randomSVD(X, custom_scaling),
"Some variables have zero scaling")
expect_error(big_randomSVD(X, custom_scaling, ncores = test_cores()),
"Some variables have zero scaling")
})

################################################################################
Loading
Loading