Remove melt_to_df, work on matrix instead

BIMSBbioinfo · Mar 20, 2024 · b1eb021 · b1eb021
1 parent a635025
commit b1eb021
Show file tree

Hide file tree

Showing 24 changed files with 245 additions and 75 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -11,3 +11,4 @@ functions_graph.svg
 R/flippitise.R
 ^vignettes$
 ^LICENSE\.md$
+^\.github$
diff --git a/.github/.gitignore b/.github/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -0,0 +1,49 @@
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+name: R-CMD-check
+
+jobs:
+  R-CMD-check:
+    runs-on: ${{ matrix.config.os }}
+
+    name: ${{ matrix.config.os }} (${{ matrix.config.r }})
+
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {os: macos-latest,   r: 'release'}
+          - {os: windows-latest, r: 'release'}
+          - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
+          - {os: ubuntu-latest,   r: 'release'}
+          - {os: ubuntu-latest,   r: 'oldrel-1'}
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      R_KEEP_PKG_SOURCE: yes
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          r-version: ${{ matrix.config.r }}
+          http-user-agent: ${{ matrix.config.http-user-agent }}
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::rcmdcheck
+          needs: check
+
+      - uses: r-lib/actions/check-r-package@v2
+        with:
+          upload-snapshots: true
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 /tmp
+/tmp_bm
 /data
 /results
 /playground
@@ -18,3 +19,6 @@ prepare*
 external_packages.tsv*
 .Rhistory
 /renv
+bm_anglemana.Rmd
+test_speed.R
+Time_profile*
diff --git a/NAMESPACE b/NAMESPACE
@@ -31,7 +31,6 @@ importFrom(Rcpp,evalCpp)
 importFrom(Seurat,SCTransform)
 importFrom(SeuratObject,LayerData)
 importFrom(SeuratObject,Layers)
-importFrom(cli,cli_progress_along)
 importFrom(data.table,as.data.table)
 importFrom(data.table,fread)
 importFrom(data.table,fwrite)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -2,6 +2,10 @@
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
 dist2mat <- function(x, bf) {
-    .Call('_anglemania_dist2mat', PACKAGE = 'anglemania', x, bf)
+    .Call(`_anglemania_dist2mat`, x, bf)
+}
+
+matrixAddition <- function(A, B) {
+    .Call(`_anglemania_matrixAddition`, A, B)
 }
 
diff --git a/R/anglemanise.R b/R/anglemanise.R
@@ -42,8 +42,8 @@ anglemanise <- function(seurat_list, #nolint
   if (!is.list(seurat_list) || any(sapply(seurat_list, function(x) attr(class(x), "package") != "SeuratObject"))) {
     stop("seurat_list needs to be a list of Seurat objects")
   }
-  if (!is.integer(n_threads) || n_threads < 1) {
-    stop("n_threads has to be a positve integer")
+  if (!is.numeric(n_threads) || n_threads < 1) {
+    stop("n_threads has to be a positive integer")
   }
   if (!is.character(path_to_write_angles)) {
     stop("path_to_write_angles has to be a string")
@@ -100,12 +100,23 @@ anglemanise <- function(seurat_list, #nolint
                               n_threads,
                               path_to_write_angles)
     message(paste0("Adding ", x_name))
+    # rows_sharp <- rownames(x[["x_sharp"]])
+    # cols_sharp <- colnames(x[["x_sharp"]])
+    # rows_blunt <- rownames(x[["x_blunt"]])
+    # cols_blunt <- colnames(x[["x_blunt"]])
+    # l_added[["x_sharp"]] <- matrixAddition(l_added[["x_sharp"]], x[["x_sharp"]])
+    # l_added[["x_blunt"]] <- matrixAddition(l_added[["x_blunt"]], x[["x_blunt"]])
     l_added[["x_sharp"]] <- l_added[["x_sharp"]] + x[["x_sharp"]]
     l_added[["x_blunt"]] <- l_added[["x_blunt"]] + x[["x_blunt"]]
     l_added[["l_angles"]][[x_name]] <- x[["l_angles"]]
     l_added[["data_info"]][[x_name]] <- x[["data_info"]]
+    # rownames(l_added[["x_sharp"]]) <- rows_sharp
+    # colnames(l_added[["x_sharp"]]) <- cols_sharp
+    # rownames(l_added[["x_blunt"]]) <- rows_blunt
+    # colnames(l_added[["x_blunt"]]) <- cols_blunt
     invisible(gc())
   }
+
   # Implementation of the parallel processing will introduce
   # heavy memory load, which is now limited to ~4Gb of RAM for
   # an average sized dataset. Consider the tradeoffs.

diff --git a/R/approximate_angles.R b/R/approximate_angles.R
@@ -8,24 +8,25 @@
 #' from a minimal splitting unit provided by the user.
 #'
 #' @importFrom tidyr tibble
-#' @param x_df_ang data.table. A long data.table with three columns:
+#' @param x_mat_ang data.table. A long data.table with three columns: ###########
 #'   x - name of a gene in row, y - name of a gene in column,
 #'   angle - an angle between x and y.
 #' @param quantile_split double. minimal splitting unit to
 #'   construct a vector of probabilities from 0 to 1.
 #' @return list. List with multiple entries, one of which contains
 #' an approxiamted distribution. Others contains NAs to be filled by
 #' subsequent functions.
-approximate_angles <- function(x_df_ang, #nolint
+approximate_angles <- function(x_mat_ang, #nolint
                                quantile_split) {
   ##
   if (quantile_split <= 0) {
     stop("quantile split should be a positive number")
   }
   angles_dist <- tidyr::tibble(
     angle = unname(
-      quantile(x_df_ang$angle,
-        seq(0, 1, by = quantile_split)
+      quantile(x_mat_ang,
+        seq(0, 1, by = quantile_split),
+        na.rm = TRUE
       )
     ),
     prob = seq(0, 1, by = quantile_split)

diff --git a/R/assemble_cons_nodes.R b/R/assemble_cons_nodes.R
@@ -25,12 +25,13 @@ assemble_cons_nodes <- function(l_processed,
       message(paste0("Starting ", angle))
       cons_edges <- l_processed[[paste0("x_", angle)]]
       cons_edges <- melt_to_df(cons_edges)
-      cons_edges <- cons_edges[order(-angle)]
+      # cons_edges <- cons_edges[order(-angle)]
       cons_edges <- cons_edges[angle >= fringe]
       cons_edges <- data.table::as.data.table(cons_edges)
       cons_edges <- cons_edges[, edge := paste0(x, "_", y)]
       cons_edges <- cons_edges[, .(edge, angle)]
       colnames(cons_edges) <- c("edge", "importance")
+      data.table::setkey(cons_edges, edge)
       ####
       nodes <- names(l_processed$data_info)
 
@@ -42,11 +43,13 @@ assemble_cons_nodes <- function(l_processed,
           critangs_paths <- get_critangs_paths(l_processed,
                                                           node,
                                                           angle)
-          dt_cons_samp <- data.table::fread(file = critangs_paths, drop = 2)
+          dt_cons_samp <- data.table::fread(file = critangs_paths,
+                                            drop = 2,
+                                            key = "edge")
           # data.table::setkey(dt_cons_samp, edge)
           # data.table::setkey(cons_edges, edge)
-          # dt_cons_samp <- dt_cons_samp[cons_edges, nomatch = NULL]        
-          dt_cons_samp <- dt_cons_samp[edge %chin% cons_edges$edge]
+          dt_cons_samp <- dt_cons_samp[cons_edges, nomatch = NULL]
+          # dt_cons_samp <- dt_cons_samp[edge %chin% cons_edges$edge]
           },
         .progress = p
       ) -> l_nodes_samp

diff --git a/R/estimate_critical_angles.R b/R/estimate_critical_angles.R
@@ -15,7 +15,7 @@
 #' angles are extracted from the fitted gaussian.
 #'
 #' @importFrom magrittr %>%
-#' @param x_df_ang data.table. A long data.table with three columns:
+#' @param x_mat_ang data.table. A long data.table with three columns: ########
 #'   x - name of a gene in row, y - name of a gene in column,
 #'   angle - an angle between x and y.
 #' @param s_dims integer. Number of samples in the input gene
@@ -26,12 +26,12 @@
 #' @return list. Records on parameters of the gaussian,
 #' goodness of fit, and critical angles.
 #' @export estimate_critical_angles
-estimate_critical_angles <- function(x_df_ang, #nolint
+estimate_critical_angles <- function(x_mat_ang, #nolint
                                      s_dims,
                                      extrema) {
   quantile_split <- min(extrema)
   ##
-  out <- approximate_angles(x_df_ang, quantile_split) %>%
+  out <- approximate_angles(x_mat_ang, quantile_split) %>%
     fit_gaussian(.) %>%
     test_gof_Ks(.) %>%
     estimate_MOAV(., s_dims) %>%

diff --git a/R/factorise.R b/R/factorise.R
@@ -39,11 +39,15 @@ factorise <- function(x_mat, #nolint
   message("Computing cosine distances...")
   x_mat_ang <- extract_angles(x_mat, n_threads = n_threads)
   invisible(gc())
-  message("Melting to data table...")
-  x_df_ang <- melt_to_df(x_mat_ang)
+  # NOTE: For now, don't use melt_to_df, filter_angles, and write_anlges because it's slow
+  # NOTE: they belong to the assemble cons nodes etc. code and bascially store the angle data.tables for each sample, 
+  # then later, the data.tables  are fioltered by the conserved gene pairs of the processed x_mat_ang. Then it is basically 
+  # computes which data.tables/matrices are the most similar, i.e., how many blunt/sharp gene pairs each sample pair share
+  # message("Melting to data table...")
+  # x_df_ang <- melt_to_df(x_mat_ang)
   invisible(gc())
   message("Estimating critical angles...")
-  l_angles <- estimate_critical_angles(x_df_ang, s_dims = s_dims, extrema)
+  l_angles <- estimate_critical_angles(x_mat_ang, s_dims = s_dims, extrema)
   if (
     any(
       l_angles$critical_angles <= 0
@@ -68,26 +72,33 @@ factorise <- function(x_mat, #nolint
     )
     stop("Extrema is too thin, consider taking a larger margin")
   }
-  l_flippity <- filter_angles(x_df_ang, l_angles)
-  if (!is.na(path_to_write_angles) & is.character(path_to_write_angles)) {
-    message("Writing tables with angles to compute integration metric...")
-    l_df_ang_md5 <- write_angles(l_flippity, path_to_write_angles)
-  }
+  message("Filtering angles...")
+  ##### DEPRECATE WRITING ANGLES FOR NOW AND SEE HOW IT GOES?
+  # FIXME: taking too long.
+  # COMMENT: an idea was to instead of storing and tracing back which samples are the closest, one could also use a binary matrix 
+  # COMMENT: https://chat.openai.com/share/34bc83e7-ad3a-49aa-a7cc-a08fb4e89861 
+  # COMMENT: (based on bytes instead of 0 and 1) to record which samples contributed to which element in the x_mat_ang.
+  # l_flippity <- filter_angles(x_df_ang, l_angles)
+  # if (!is.na(path_to_write_angles) & is.character(path_to_write_angles)) {
+  #   message("Writing tables with angles to compute integration metric...")
+  #   l_df_ang_md5 <- write_angles(l_flippity, path_to_write_angles)
+  # }
   message("Factorising angle matrices...")
-  x_mat_angfact_sharp <- ifelse(x_mat_ang <= l_angles$critical_angles[1], 1, 0)
-  x_mat_angfact_sharp <- as(x_mat_angfact_sharp, "dgCMatrix")
-  x_mat_angfact_blunt <- ifelse(x_mat_ang >= l_angles$critical_angles[2], 1, 0)
-  x_mat_angfact_blunt <- as(x_mat_angfact_blunt, "dgCMatrix")
+  x_mat_angfact_sharp <- ((x_mat_ang <= l_angles$critical_angles[1]) * 1) |> as("dgCMatrix")
+  x_mat_angfact_blunt <- ((x_mat_ang >= l_angles$critical_angles[2]) * 1) |> as("dgCMatrix")
+
   invisible(gc())
+  message("test")
   gib <- list(
     x_sharp = x_mat_angfact_sharp,
     x_blunt = x_mat_angfact_blunt,
     l_angles = l_angles,
     data_info = list(
       samples = s_names,
-      path_to_df_ang = path_to_write_angles,
-      name_df_ang_sharp = l_df_ang_md5[["sharp"]],
-      name_df_ang_blunt = l_df_ang_md5[["blunt"]]
+      path_to_df_ang = path_to_write_angles
+      # COMMENT: remove l_df_ang_md5... (which is the file name of the stored data.table)
+      # name_df_ang_sharp = l_df_ang_md5[["sharp"]],
+      # name_df_ang_blunt = l_df_ang_md5[["blunt"]]
     )
   )
   return(gib)

diff --git a/R/import_sl.R b/R/import_sl.R
@@ -28,10 +28,11 @@ import_sl <- function(seurat_list, min_mcells = 6) { #nolint
   if (length(seurat_list)<2){
     stop("Seurat list must at least contain two Seurat objects")
   }
-  if (any(sapply(seurat_list, function(x) attr(class(x), "package") != "SeuratObject"))){
+  if (any(sapply(seurat_list, function(x) class(x) != "Seurat")) || 
+      any(sapply(seurat_list, function(x) attr(class(x), "package") != "SeuratObject"))){
     stop("List contains elements that are not Seurat objects")
   }
-  if (!is.integer(min_mcells) || min_mcells <= 5) {
+  if (!is.numeric(min_mcells) || min_mcells <= 5) {
     stop("The argument 'min_mcells' must be an integer larger than 5.")
   }
 
@@ -48,13 +49,11 @@ import_sl <- function(seurat_list, min_mcells = 6) { #nolint
   ##  check if scale.data is present and SCTransform is necessary
   if (!all(sapply(seurat_list, function(x) "scale.data" %in% SeuratObject::Layers(x)))){
     message("scaled data is not present in all samples. 
-  Applying SCTransform to all samples.")
+    Normalizing and scaling all samples.")
     seurat_list <- lapply(seurat_list,
                           function(x) {
-                            Seurat::SCTransform(x,
-                                                return.only.var.genes = FALSE,
-                                                min_cells = 2,
-                                                verbose = FALSE)
+                            x <- Seurat::NormalizeData(x)
+                            x <- Seurat::ScaleData(x)
                           }
     )
   }

diff --git a/R/melt_to_df.R b/R/melt_to_df.R
@@ -18,13 +18,15 @@
 melt_to_df <- function(x_mat_ang) { #nolint
   ##
   x_mat_ang[upper.tri(x_mat_ang, diag = TRUE)] <- NA
-  x_df_ang <- data.table::as.data.table(x_mat_ang, keep.rownames = "x")
-  x_df_ang <- data.table::melt(x_df_ang,
-                               id.vars = "x",
-                               variable.name = "y",
-                               value.name    = "angle")
-  x_df_ang <- na.omit(x_df_ang)
-  x_df_ang <- x_df_ang[, y := as.character(y)]
-  class(x_df_ang) <- "data.table"
+
+  idx <- which(!is.na(x_mat_ang), arr.ind = TRUE)
+
+  # Create a data table from indices and values
+  x_df_ang <- data.table::data.table(
+    x = rownames(x_mat_ang)[idx[, 1]],
+    y = colnames(x_mat_ang)[idx[, 2]],
+    angle = x_mat_ang[idx]
+  )
+
   return(x_df_ang)
 }
diff --git a/README.md b/README.md
@@ -19,3 +19,7 @@ Currently, all used functions are documented and ready to be roxygenised. Techni
 ## Ideas
 
 Following [google doc](https://docs.google.com/document/d/10TEWmnfBOlW7SGFl70eb_26Z-kTeOK-bcNg6WgZJMsk/edit?usp=sharing)
+
+<!-- badges: start -->
+  [![R-CMD-check](https://github.com/BIMSBbioinfo/anglemania/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/BIMSBbioinfo/anglemania/actions/workflows/R-CMD-check.yaml)
+<!-- badges: end -->
diff --git a/man/import_sl.Rd b/man/import_sl.Rd
diff --git a/man/integrate_by_features.Rd b/man/integrate_by_features.Rd
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -23,9 +23,22 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// matrixAddition
+arma::sp_mat matrixAddition(arma::sp_mat A, arma::sp_mat B);
+RcppExport SEXP _anglemania_matrixAddition(SEXP ASEXP, SEXP BSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< arma::sp_mat >::type A(ASEXP);
+    Rcpp::traits::input_parameter< arma::sp_mat >::type B(BSEXP);
+    rcpp_result_gen = Rcpp::wrap(matrixAddition(A, B));
+    return rcpp_result_gen;
+END_RCPP
+}
 
 static const R_CallMethodDef CallEntries[] = {
     {"_anglemania_dist2mat", (DL_FUNC) &_anglemania_dist2mat, 2},
+    {"_anglemania_matrixAddition", (DL_FUNC) &_anglemania_matrixAddition, 2},
     {NULL, NULL, 0}
 };
 

diff --git a/src/RcppExports.o b/src/RcppExports.o
diff --git a/src/anglemania.so b/src/anglemania.so