Skip to content

Commit

Permalink
Remove melt_to_df, work on matrix instead
Browse files Browse the repository at this point in the history
  • Loading branch information
kollo97 committed Mar 20, 2024
1 parent a635025 commit b1eb021
Show file tree
Hide file tree
Showing 24 changed files with 245 additions and 75 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ functions_graph.svg
R/flippitise.R
^vignettes$
^LICENSE\.md$
^\.github$
1 change: 1 addition & 0 deletions .github/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.html
49 changes: 49 additions & 0 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]

name: R-CMD-check

jobs:
R-CMD-check:
runs-on: ${{ matrix.config.os }}

name: ${{ matrix.config.os }} (${{ matrix.config.r }})

strategy:
fail-fast: false
matrix:
config:
- {os: macos-latest, r: 'release'}
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
- {os: ubuntu-latest, r: 'oldrel-1'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v4

- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::rcmdcheck
needs: check

- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/tmp
/tmp_bm
/data
/results
/playground
Expand All @@ -18,3 +19,6 @@ prepare*
external_packages.tsv*
.Rhistory
/renv
bm_anglemana.Rmd
test_speed.R
Time_profile*
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ importFrom(Rcpp,evalCpp)
importFrom(Seurat,SCTransform)
importFrom(SeuratObject,LayerData)
importFrom(SeuratObject,Layers)
importFrom(cli,cli_progress_along)
importFrom(data.table,as.data.table)
importFrom(data.table,fread)
importFrom(data.table,fwrite)
Expand Down
6 changes: 5 additions & 1 deletion R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

dist2mat <- function(x, bf) {
.Call('_anglemania_dist2mat', PACKAGE = 'anglemania', x, bf)
.Call(`_anglemania_dist2mat`, x, bf)
}

matrixAddition <- function(A, B) {
.Call(`_anglemania_matrixAddition`, A, B)
}

15 changes: 13 additions & 2 deletions R/anglemanise.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ anglemanise <- function(seurat_list, #nolint
if (!is.list(seurat_list) || any(sapply(seurat_list, function(x) attr(class(x), "package") != "SeuratObject"))) {
stop("seurat_list needs to be a list of Seurat objects")
}
if (!is.integer(n_threads) || n_threads < 1) {
stop("n_threads has to be a positve integer")
if (!is.numeric(n_threads) || n_threads < 1) {
stop("n_threads has to be a positive integer")
}
if (!is.character(path_to_write_angles)) {
stop("path_to_write_angles has to be a string")
Expand Down Expand Up @@ -100,12 +100,23 @@ anglemanise <- function(seurat_list, #nolint
n_threads,
path_to_write_angles)
message(paste0("Adding ", x_name))
# rows_sharp <- rownames(x[["x_sharp"]])
# cols_sharp <- colnames(x[["x_sharp"]])
# rows_blunt <- rownames(x[["x_blunt"]])
# cols_blunt <- colnames(x[["x_blunt"]])
# l_added[["x_sharp"]] <- matrixAddition(l_added[["x_sharp"]], x[["x_sharp"]])
# l_added[["x_blunt"]] <- matrixAddition(l_added[["x_blunt"]], x[["x_blunt"]])
l_added[["x_sharp"]] <- l_added[["x_sharp"]] + x[["x_sharp"]]
l_added[["x_blunt"]] <- l_added[["x_blunt"]] + x[["x_blunt"]]
l_added[["l_angles"]][[x_name]] <- x[["l_angles"]]
l_added[["data_info"]][[x_name]] <- x[["data_info"]]
# rownames(l_added[["x_sharp"]]) <- rows_sharp
# colnames(l_added[["x_sharp"]]) <- cols_sharp
# rownames(l_added[["x_blunt"]]) <- rows_blunt
# colnames(l_added[["x_blunt"]]) <- cols_blunt
invisible(gc())
}

# Implementation of the parallel processing will introduce
# heavy memory load, which is now limited to ~4Gb of RAM for
# an average sized dataset. Consider the tradeoffs.
Expand Down
9 changes: 5 additions & 4 deletions R/approximate_angles.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,25 @@
#' from a minimal splitting unit provided by the user.
#'
#' @importFrom tidyr tibble
#' @param x_df_ang data.table. A long data.table with three columns:
#' @param x_mat_ang data.table. A long data.table with three columns: ###########
#' x - name of a gene in row, y - name of a gene in column,
#' angle - an angle between x and y.
#' @param quantile_split double. minimal splitting unit to
#' construct a vector of probabilities from 0 to 1.
#' @return list. List with multiple entries, one of which contains
#' an approxiamted distribution. Others contains NAs to be filled by
#' subsequent functions.
approximate_angles <- function(x_df_ang, #nolint
approximate_angles <- function(x_mat_ang, #nolint
quantile_split) {
##
if (quantile_split <= 0) {
stop("quantile split should be a positive number")
}
angles_dist <- tidyr::tibble(
angle = unname(
quantile(x_df_ang$angle,
seq(0, 1, by = quantile_split)
quantile(x_mat_ang,
seq(0, 1, by = quantile_split),
na.rm = TRUE
)
),
prob = seq(0, 1, by = quantile_split)
Expand Down
11 changes: 7 additions & 4 deletions R/assemble_cons_nodes.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,13 @@ assemble_cons_nodes <- function(l_processed,
message(paste0("Starting ", angle))
cons_edges <- l_processed[[paste0("x_", angle)]]
cons_edges <- melt_to_df(cons_edges)
cons_edges <- cons_edges[order(-angle)]
# cons_edges <- cons_edges[order(-angle)]
cons_edges <- cons_edges[angle >= fringe]
cons_edges <- data.table::as.data.table(cons_edges)
cons_edges <- cons_edges[, edge := paste0(x, "_", y)]
cons_edges <- cons_edges[, .(edge, angle)]
colnames(cons_edges) <- c("edge", "importance")
data.table::setkey(cons_edges, edge)
####
nodes <- names(l_processed$data_info)

Expand All @@ -42,11 +43,13 @@ assemble_cons_nodes <- function(l_processed,
critangs_paths <- get_critangs_paths(l_processed,
node,
angle)
dt_cons_samp <- data.table::fread(file = critangs_paths, drop = 2)
dt_cons_samp <- data.table::fread(file = critangs_paths,
drop = 2,
key = "edge")
# data.table::setkey(dt_cons_samp, edge)
# data.table::setkey(cons_edges, edge)
# dt_cons_samp <- dt_cons_samp[cons_edges, nomatch = NULL]
dt_cons_samp <- dt_cons_samp[edge %chin% cons_edges$edge]
dt_cons_samp <- dt_cons_samp[cons_edges, nomatch = NULL]
# dt_cons_samp <- dt_cons_samp[edge %chin% cons_edges$edge]
},
.progress = p
) -> l_nodes_samp
Expand Down
6 changes: 3 additions & 3 deletions R/estimate_critical_angles.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#' angles are extracted from the fitted gaussian.
#'
#' @importFrom magrittr %>%
#' @param x_df_ang data.table. A long data.table with three columns:
#' @param x_mat_ang data.table. A long data.table with three columns: ########
#' x - name of a gene in row, y - name of a gene in column,
#' angle - an angle between x and y.
#' @param s_dims integer. Number of samples in the input gene
Expand All @@ -26,12 +26,12 @@
#' @return list. Records on parameters of the gaussian,
#' goodness of fit, and critical angles.
#' @export estimate_critical_angles
estimate_critical_angles <- function(x_df_ang, #nolint
estimate_critical_angles <- function(x_mat_ang, #nolint
s_dims,
extrema) {
quantile_split <- min(extrema)
##
out <- approximate_angles(x_df_ang, quantile_split) %>%
out <- approximate_angles(x_mat_ang, quantile_split) %>%
fit_gaussian(.) %>%
test_gof_Ks(.) %>%
estimate_MOAV(., s_dims) %>%
Expand Down
41 changes: 26 additions & 15 deletions R/factorise.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,15 @@ factorise <- function(x_mat, #nolint
message("Computing cosine distances...")
x_mat_ang <- extract_angles(x_mat, n_threads = n_threads)
invisible(gc())
message("Melting to data table...")
x_df_ang <- melt_to_df(x_mat_ang)
# NOTE: For now, don't use melt_to_df, filter_angles, and write_anlges because it's slow
# NOTE: they belong to the assemble cons nodes etc. code and bascially store the angle data.tables for each sample,
# then later, the data.tables are fioltered by the conserved gene pairs of the processed x_mat_ang. Then it is basically
# computes which data.tables/matrices are the most similar, i.e., how many blunt/sharp gene pairs each sample pair share
# message("Melting to data table...")
# x_df_ang <- melt_to_df(x_mat_ang)
invisible(gc())
message("Estimating critical angles...")
l_angles <- estimate_critical_angles(x_df_ang, s_dims = s_dims, extrema)
l_angles <- estimate_critical_angles(x_mat_ang, s_dims = s_dims, extrema)
if (
any(
l_angles$critical_angles <= 0
Expand All @@ -68,26 +72,33 @@ factorise <- function(x_mat, #nolint
)
stop("Extrema is too thin, consider taking a larger margin")
}
l_flippity <- filter_angles(x_df_ang, l_angles)
if (!is.na(path_to_write_angles) & is.character(path_to_write_angles)) {
message("Writing tables with angles to compute integration metric...")
l_df_ang_md5 <- write_angles(l_flippity, path_to_write_angles)
}
message("Filtering angles...")
##### DEPRECATE WRITING ANGLES FOR NOW AND SEE HOW IT GOES?
# FIXME: taking too long.
# COMMENT: an idea was to instead of storing and tracing back which samples are the closest, one could also use a binary matrix
# COMMENT: https://chat.openai.com/share/34bc83e7-ad3a-49aa-a7cc-a08fb4e89861
# COMMENT: (based on bytes instead of 0 and 1) to record which samples contributed to which element in the x_mat_ang.
# l_flippity <- filter_angles(x_df_ang, l_angles)
# if (!is.na(path_to_write_angles) & is.character(path_to_write_angles)) {
# message("Writing tables with angles to compute integration metric...")
# l_df_ang_md5 <- write_angles(l_flippity, path_to_write_angles)
# }
message("Factorising angle matrices...")
x_mat_angfact_sharp <- ifelse(x_mat_ang <= l_angles$critical_angles[1], 1, 0)
x_mat_angfact_sharp <- as(x_mat_angfact_sharp, "dgCMatrix")
x_mat_angfact_blunt <- ifelse(x_mat_ang >= l_angles$critical_angles[2], 1, 0)
x_mat_angfact_blunt <- as(x_mat_angfact_blunt, "dgCMatrix")
x_mat_angfact_sharp <- ((x_mat_ang <= l_angles$critical_angles[1]) * 1) |> as("dgCMatrix")
x_mat_angfact_blunt <- ((x_mat_ang >= l_angles$critical_angles[2]) * 1) |> as("dgCMatrix")

invisible(gc())
message("test")
gib <- list(
x_sharp = x_mat_angfact_sharp,
x_blunt = x_mat_angfact_blunt,
l_angles = l_angles,
data_info = list(
samples = s_names,
path_to_df_ang = path_to_write_angles,
name_df_ang_sharp = l_df_ang_md5[["sharp"]],
name_df_ang_blunt = l_df_ang_md5[["blunt"]]
path_to_df_ang = path_to_write_angles
# COMMENT: remove l_df_ang_md5... (which is the file name of the stored data.table)
# name_df_ang_sharp = l_df_ang_md5[["sharp"]],
# name_df_ang_blunt = l_df_ang_md5[["blunt"]]
)
)
return(gib)
Expand Down
13 changes: 6 additions & 7 deletions R/import_sl.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ import_sl <- function(seurat_list, min_mcells = 6) { #nolint
if (length(seurat_list)<2){
stop("Seurat list must at least contain two Seurat objects")
}
if (any(sapply(seurat_list, function(x) attr(class(x), "package") != "SeuratObject"))){
if (any(sapply(seurat_list, function(x) class(x) != "Seurat")) ||
any(sapply(seurat_list, function(x) attr(class(x), "package") != "SeuratObject"))){
stop("List contains elements that are not Seurat objects")
}
if (!is.integer(min_mcells) || min_mcells <= 5) {
if (!is.numeric(min_mcells) || min_mcells <= 5) {
stop("The argument 'min_mcells' must be an integer larger than 5.")
}

Expand All @@ -48,13 +49,11 @@ import_sl <- function(seurat_list, min_mcells = 6) { #nolint
## check if scale.data is present and SCTransform is necessary
if (!all(sapply(seurat_list, function(x) "scale.data" %in% SeuratObject::Layers(x)))){
message("scaled data is not present in all samples.
Applying SCTransform to all samples.")
Normalizing and scaling all samples.")
seurat_list <- lapply(seurat_list,
function(x) {
Seurat::SCTransform(x,
return.only.var.genes = FALSE,
min_cells = 2,
verbose = FALSE)
x <- Seurat::NormalizeData(x)
x <- Seurat::ScaleData(x)
}
)
}
Expand Down
18 changes: 10 additions & 8 deletions R/melt_to_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,15 @@
melt_to_df <- function(x_mat_ang) { #nolint
##
x_mat_ang[upper.tri(x_mat_ang, diag = TRUE)] <- NA
x_df_ang <- data.table::as.data.table(x_mat_ang, keep.rownames = "x")
x_df_ang <- data.table::melt(x_df_ang,
id.vars = "x",
variable.name = "y",
value.name = "angle")
x_df_ang <- na.omit(x_df_ang)
x_df_ang <- x_df_ang[, y := as.character(y)]
class(x_df_ang) <- "data.table"

idx <- which(!is.na(x_mat_ang), arr.ind = TRUE)

# Create a data table from indices and values
x_df_ang <- data.table::data.table(
x = rownames(x_mat_ang)[idx[, 1]],
y = colnames(x_mat_ang)[idx[, 2]],
angle = x_mat_ang[idx]
)

return(x_df_ang)
}
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ Currently, all used functions are documented and ready to be roxygenised. Techni
## Ideas

Following [google doc](https://docs.google.com/document/d/10TEWmnfBOlW7SGFl70eb_26Z-kTeOK-bcNg6WgZJMsk/edit?usp=sharing)

<!-- badges: start -->
[![R-CMD-check](https://github.com/BIMSBbioinfo/anglemania/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/BIMSBbioinfo/anglemania/actions/workflows/R-CMD-check.yaml)
<!-- badges: end -->
2 changes: 1 addition & 1 deletion man/import_sl.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion man/integrate_by_features.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,22 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// matrixAddition
arma::sp_mat matrixAddition(arma::sp_mat A, arma::sp_mat B);
RcppExport SEXP _anglemania_matrixAddition(SEXP ASEXP, SEXP BSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< arma::sp_mat >::type A(ASEXP);
Rcpp::traits::input_parameter< arma::sp_mat >::type B(BSEXP);
rcpp_result_gen = Rcpp::wrap(matrixAddition(A, B));
return rcpp_result_gen;
END_RCPP
}

static const R_CallMethodDef CallEntries[] = {
{"_anglemania_dist2mat", (DL_FUNC) &_anglemania_dist2mat, 2},
{"_anglemania_matrixAddition", (DL_FUNC) &_anglemania_matrixAddition, 2},
{NULL, NULL, 0}
};

Expand Down
Binary file modified src/RcppExports.o
Binary file not shown.
Binary file modified src/anglemania.so
Binary file not shown.
Loading

0 comments on commit b1eb021

Please sign in to comment.