Skip to content

Commit

Permalink
[r] add tf-idf and log normalization functions
Browse files Browse the repository at this point in the history
  • Loading branch information
immanuelazn committed Dec 12, 2024
1 parent 290863d commit 1ef6091
Show file tree
Hide file tree
Showing 7 changed files with 147 additions and 0 deletions.
2 changes: 2 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ export(min_by_row)
export(min_scalar)
export(multiply_cols)
export(multiply_rows)
export(normalize_log)
export(normalize_tfidf)
export(nucleosome_counts)
export(open_fragments_10x)
export(open_fragments_dir)
Expand Down
1 change: 1 addition & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Contributions welcome :)
- Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices.
If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128)
- Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168)

## Improvements
- `trackplot_loop()` now accepts discrete color scales
Expand Down
54 changes: 54 additions & 0 deletions r/R/transforms.R
Original file line number Diff line number Diff line change
Expand Up @@ -923,3 +923,57 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) {
vars_to_regress = vars_to_regress
)
}

#################
# Normalizations
#################

#' Normalize a matrix using log normalization
#' @param mat (IterableMatrix) Matrix to normalize
#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization
#' @param add_one (logical) Add one to the matrix before log normalization
#' @returns log normalized matrix.
#' @export
normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE) {
assert_is(mat, "IterableMatrix")
assert_is_numeric(scale_factor)
assert_true(is.logical(add_one))
assert_greater_than_zero(scale_factor)
mat <- mat * scale_factor
if (!add_one) mat <- mat - 1
return(log1p(mat))
}


#' Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency
#' @param mat (IterableMatrix) to normalize
#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then
#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix.
#' Else, map each feature name to its mean value.
#' @returns tf-idf normalized matrix.
#' @export
normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) {
assert_is(mat, "IterableMatrix")
assert_is_wholenumber(threads)
# If feature means are passed in, only need to calculate term frequency
if (is.null(feature_means)) {
mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"))
feature_means <- mat_stats$row_stats["mean", ]
read_depth <- mat_stats$col_stats["mean", ] * nrow(mat)
} else {
assert_is_numeric(feature_means)
if (!is.null(names(feature_means)) && !is.null(rownames(mat))) {
# Make sure every name in feature means exists in rownames(mat)
# In the case there is a length mismatch but the feature names all exist in feature_means
# will not error out
assert_true(all(rownames(mat) %in% names(feature_means)))
feature_means <- feature_means[rownames(mat)]
} else {
assert_len(feature_means, nrow(mat))
}
read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean",] * nrow(mat)
}
tf <- mat %>% multiply_cols(1 / read_depth)
idf <- 1 / feature_means
return(tf %>% multiply_rows(idf))
}
21 changes: 21 additions & 0 deletions r/man/normalize_log.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions r/man/normalize_tfidf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions r/pkgdown/_pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ reference:
- checksum
- apply_by_row
- regress_out
- normalize_log
- normalize_tfidf
- IterableMatrix-methods
- pseudobulk_matrix

Expand Down
46 changes: 46 additions & 0 deletions r/tests/testthat/test-matrix_transforms.R
Original file line number Diff line number Diff line change
Expand Up @@ -346,3 +346,49 @@ test_that("linear regression works", {
expect_equal(as(m1, "matrix"), ans)
expect_equal(as(m1t, "matrix"), ans)
})

test_that("tf-idf normalization works", {
m <- generate_sparse_matrix(5, 5)
rownames(m) <- paste0("row", seq_len(nrow(m)))
rev_rownames <- rev(rownames(m))
# Create tf-idf normalization for dgCMatrix
res_dgc <- diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")

rownames(res_dgc) <- rownames(m)
m2 <- as(m, "IterableMatrix")
# Check that we can pass in row means as a (named) vector
row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",]
# Test that row means ordering does not matter as long as names exist
row_means_shuffled <- row_means[sample(1:length(row_means))]
# Test that row means can have an extra element as long as all rownames are in the vector
row_means_plus_one <- c(row_means, row6 = 1)


res <- normalize_tfidf(m2)
expect_equal(res %>% as("dgCMatrix"), res_dgc)
res_with_row_means <- normalize_tfidf(m2, feature_means = row_means)
expect_identical(res, res_with_row_means)

res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled)
expect_identical(res_with_row_means, res_with_shuffled_row_means, res)

res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one)
expect_identical(res, res_with_row_means_with_extra_element)
})

test_that("normalize_log works", {
m <- generate_sparse_matrix(5, 5)
m2 <- as(m, "IterableMatrix")
# Test that default params yield the same as log1p on dgCMatrix
res_1 <- as(normalize_log(m2), "dgCMatrix")
expect_equal(res_1, log1p(m*1e4))

# Test that changing scale factor works
res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix")
expect_identical(res_2, log1p(m*1e5))
# Test that removing the add_one works
# log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers.
res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix")
res_3@x[res_3@x < -700] <- -Inf
expect_identical(as(res_3, "dgeMatrix"), log(m*1e4))
})

0 comments on commit 1ef6091

Please sign in to comment.