
Commit

solved error in calculation of mean in big_apply + set.seed for permutation
kollo97 committed May 7, 2024
1 parent 2be4e3c commit db71d86
Showing 3 changed files with 51 additions and 28 deletions.
65 changes: 45 additions & 20 deletions R/anglemanise_utils.R
@@ -1,3 +1,25 @@
# --------------------------------------------------------------------------------------------- #
# Utility functions for the anglemanise package
# --------------------------------------------------------------------------------------------- #
#' Convert a sparse matrix into an FBM
#'
#' Converts a sparse matrix into a file-backed FBM, copying it block by block so the
#' full dense matrix never has to be held in memory at once.
#'
#' @param s_mat A sparse matrix.
#' @return An FBM object from the bigstatsr package.
#' @importFrom bigstatsr FBM
sparse_to_fbm <- function(s_mat) {
n <- nrow(s_mat)
p <- ncol(s_mat)
X <- bigstatsr::FBM(n, p)
# copy the sparse matrix into the FBM block by block
bigstatsr::big_apply(X, a.FUN = function(X, ind) {
X[, ind] <- s_mat[, ind] %>% as.matrix()
NULL
}, a.combine = "c", block.size = 200)

return(X)
}
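A minimal usage sketch for sparse_to_fbm (illustration only, not part of the commit; it assumes the package defining sparse_to_fbm is loaded and that the Matrix and bigstatsr packages are installed, and uses a throwaway random matrix):

# build a toy sparse input
library(Matrix)
s <- Matrix::rsparsematrix(nrow = 1000, ncol = 500, density = 0.05)

fbm <- sparse_to_fbm(s)
dim(fbm)                                           # 1000 500
all.equal(fbm[1:5, 1:5], as.matrix(s[1:5, 1:5]))   # blocks were copied faithfully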

# --------------------------------------------------------------------------------------------- #
#' Permute counts in a Seurat object
#'
@@ -8,7 +30,7 @@
#' @examples
#' seu_permuted = permute_counts(seu, which = "columns")
permute_counts <- function(X.sub, which = "columns", seed=1) {

set.seed(seed)
if (which == "columns") {
X.sub <- apply(X.sub, 2, sample)
} else if (which == "rows") {
Expand All @@ -33,21 +55,24 @@ get_dstat <- function(corr_matrix) {
# calculate mean and standard deviation
# n = number of non-NA entries, so the manual mean/sd below only count non-NA values,
# matching mean(..., na.rm = TRUE) and sd(..., na.rm = TRUE)
n <- big_apply(corr_matrix,
n <- bigstatsr::big_apply(corr_matrix,
a.FUN = function(X, ind) {
sum(!is.na(X[, ind, drop = FALSE])) # this works because TRUE = 1 and FALSE = 0 in R
}, a.combine = "sum", block.size = 1000
}, a.combine = "sum", block.size = 200
)
mean <- big_apply(corr_matrix, a.FUN = function(X, ind) {
sum(X[, ind, drop = FALSE], na.rm = TRUE) / n
}, a.combine = "c", block.size = 1000) %>% mean()
sd <- big_apply(corr_matrix, a.FUN = function(X, ind) {
sum((X[, ind, drop = FALSE] - mean)^2, na.rm = TRUE) / n
}, a.combine = "sum", block.size = 1000) %>% sqrt()

mean <- bigstatsr::big_apply(corr_matrix, a.FUN = function(X, ind) {
sum(X[, ind, drop = FALSE], na.rm = TRUE)
}, a.combine = "sum", block.size = 200) / n

sd <- bigstatsr::big_apply(corr_matrix, a.FUN = function(X, ind) {
sum((X[, ind, drop = FALSE] - mean)^2, na.rm = TRUE) / (n - 1)
}, a.combine = "sum", block.size = 200) %>% sqrt()

dstat <- list(
mean = mean,
sd = sd,
sn = sd / mean
sn = sd / mean # signal-to-noise ratio
)
return(dstat)
}
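Why the mean fix in get_dstat matters (an illustrative sketch with made-up numbers, not package code): the old version divided every block sum by n and then called mean() on the per-block results, which divides by the number of blocks a second time. Summing across blocks first and dividing by n once recovers the true mean.

x <- c(1, 2, 3, 4, 5, 6, 7)            # 7 values, true mean = 4
blocks <- list(x[1:5], x[6:7])         # as if big_apply had split the data into two blocks
n <- length(x)

old_way <- mean(sapply(blocks, function(b) sum(b) / n))  # (28/7) / 2 = 2, wrong
new_way <- sum(sapply(blocks, sum)) / n                  # 28/7 = 4, correct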
@@ -76,14 +101,14 @@ big_mat_list_mean <- function(fbmList) {
}
n_col <- ncol(fbmList[[1]])
n_row <- nrow(fbmList[[1]])
mat_mean_zscore <- FBM(n_row, n_col)
mat_mean_zscore <- bigstatsr::FBM(n_row, n_col)

big_apply(mat_mean_zscore, a.FUN = function(X, ind) {
bigstatsr::big_apply(mat_mean_zscore, a.FUN = function(X, ind) {
X.sub <- X[, ind, drop = FALSE]
X.sub <- Reduce(function(x, y) x + y[, ind, drop = FALSE], fbmList, init = X.sub)
X[, ind] <- X.sub / length(fbmList)
NULL
}, a.combine = "c", block.size = 500)
}, a.combine = "c", block.size = 200)

return(mat_mean_zscore)
}
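A quick sanity check for big_mat_list_mean (hypothetical; m1 and m2 are throwaway FBMs created only for the illustration, and bigstatsr is assumed to be installed):

m1 <- bigstatsr::FBM(3, 3, init = 1)
m2 <- bigstatsr::FBM(3, 3, init = 3)

avg <- big_mat_list_mean(list(m1, m2))
avg[]   # every entry should be (1 + 3) / 2 = 2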
@@ -132,31 +157,31 @@ get_list_stats <- function(fbmList) {

n_col <- ncol(fbmList[[1]])
n_row <- nrow(fbmList[[1]])
mat_sds_zscore <- FBM(n_row, n_col)
mat_sds_zscore <- bigstatsr::FBM(n_row, n_col)


message("Calculating sds...")
big_apply(mat_sds_zscore, a.FUN = function(X, ind) {
bigstatsr::big_apply(mat_sds_zscore, a.FUN = function(X, ind) {
wrap_sds <- function(x, y) {
x <- x + (y[, ind, drop = FALSE] - mat_mean_zscore[, ind, drop = FALSE])^2
}
X.sub <- X[, ind, drop = FALSE]
X.sub <- Reduce(wrap_sds, fbmList, init = X.sub)
X[, ind] <- sqrt(X.sub / (length(fbmList) - 1))
NULL
}, a.combine = "c", block.size = 500)
}, a.combine = "c", block.size = 200)

mat_sn_zscore <- FBM(n_row, n_col)
mat_sn_zscore <- bigstatsr::FBM(n_row, n_col)
diag(mat_sds_zscore) <- NA # avoid dividing by zero: the diagonal sds are 0 because of the perfect self-correlation on the diagonal

big_apply(mat_sn_zscore, a.FUN = function(X, ind) {
bigstatsr::big_apply(mat_sn_zscore, a.FUN = function(X, ind) {
X[, ind] <- mat_mean_zscore[, ind, drop = FALSE] / mat_sds_zscore[, ind, drop = FALSE]
NULL
}, a.combine = "c", block.size = 500)
}, a.combine = "c", block.size = 200)
res <- list(
mean_zscore = mat_mean_zscore[],
sds_zscore = mat_sds_zscore[],
sn_zscore = mat_sn_zscore[] # signal-to-noise ratio ==> a number which we can use to say, correlation xy is 2 standard deviations away from the mean
sn_zscore = mat_sn_zscore[]
)

return(res)
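A hypothetical call to get_list_stats, assuming fbm_list is a list of equally sized z-score FBMs (for example one per permutation run):

stats <- get_list_stats(fbm_list)
names(stats)                           # "mean_zscore" "sds_zscore" "sn_zscore"
range(stats$sn_zscore, na.rm = TRUE)   # na.rm because the diagonal is set to NA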
2 changes: 0 additions & 2 deletions R/big_extract_corr.R
@@ -15,8 +15,6 @@
big_extract_corr <- function(
x_mat) {

df <- ncol(x_mat)-2 # important for the degrees of freedom when computing the p-values
# log normalize the data
bigstatsr::big_apply(x_mat, a.FUN = function(X, ind) {
X.sub <- X[, ind, drop = FALSE]
# normalize the data:
12 changes: 6 additions & 6 deletions R/big_factorise.R
@@ -25,18 +25,18 @@
#' @export big_factorise
big_factorise <- function(x_mat, # nolint
name) {
x_mat <- x_mat %>%
as.matrix() %>% # COMMENT: Unfortunately, FBM cannot handle sparse matrices yet ==> need for transformation into FBM from matrix
bigstatsr::as_FBM()
x_mat <- sparse_to_fbm(x_mat)
# initialize empty FBM with same dimensions as x_mat to store permuted correlation matrix
x_mat_perm <- bigstatsr::FBM(nrow = nrow(x_mat), ncol = ncol(x_mat))

# permute matrix
bigstatsr::big_apply(x_mat, a.FUN = function(X, ind) {
set.seed(1) # fixed seed so the permutation is reproducible
X.sub <- X[, ind, drop = FALSE]
X.sub <- permute_counts(X.sub)
X.sub <- apply(X.sub, 2, sample)
x_mat_perm[, ind] <- X.sub
}, a.combine = "c", block.size = 1000)
NULL
}, a.combine = "c", block.size = 200)

# compute correlation matrix for both original and permuted matrix
x_mat_corr <- big_extract_corr(x_mat)
@@ -49,7 +49,7 @@ big_factorise <- function(x_mat, # nolint
# transform original correlation matrix into zscores
bigstatsr::big_apply(x_mat_corr, a.FUN = function(X, ind) {
X[, ind] <- (X[, ind, drop = FALSE] - dstat$mean) / dstat$sd
}, a.combine = "c", block.size = 1000)
}, a.combine = "c", block.size = 200)

return(x_mat_corr)
}
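A minimal end-to-end sketch of big_factorise (hypothetical; counts stands in for a sparse expression matrix and is not defined in this commit):

z_corr <- big_factorise(counts, name = "example_run")
z_corr[1:5, 1:5]   # correlations expressed as z-scores against the permuted background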
