Merge branch 'dev' into 325-mixed-model-in-limma

nf-core · Nov 22, 2024 · b2a122b · b2a122b
2 parents ff75bb1 + 6307b48
commit b2a122b
Show file tree

Hide file tree

Showing 23 changed files with 1,413 additions and 44 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -5,11 +5,18 @@ on:
     branches:
       - dev
   pull_request:
+    branches:
+      - dev
+      - master
   release:
     types: [published]
   workflow_dispatch:
 
 env:
+  NFT_DIFF: "pdiff"
+  NFT_DIFF_ARGS: "--line-numbers --expand-tabs=2"
+  NFT_VER: "0.9.2"
+  NFT_WORKDIR: "~"
   NXF_ANSI_LOG: false
   NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity
   NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity
@@ -20,28 +27,26 @@ concurrency:
 
 jobs:
   test:
-    name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.test_profile }} | ${{ matrix.compute_profile }})"
+    name: "${{ matrix.NXF_VER }} | ${{ matrix.test_profile }} | ${{ matrix.compute_profile }}"
     # Only run on push if this is the nf-core dev branch (merged PRs)
     if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/differentialabundance') }}"
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         NXF_VER:
           - "24.04.2"
           - "latest-everything"
         test_profile:
           - "test"
-          - "test_nogtf"
-          - "test_affy"
-          - "test_maxquant"
-          - "test_soft"
-          - "test_rnaseq_limma"
+          - "nogtf"
+          - "affy"
+          - "maxquant"
+          - "soft"
+          - "rnaseq_limma"
         compute_profile:
-          - "conda"
           - "docker"
           - "singularity"
-        test_name:
-          - "test"
         isMaster:
           - ${{ github.base_ref == 'master' }}
         # Exclude conda and singularity on dev
@@ -53,6 +58,8 @@ jobs:
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
+        with:
+          fetch-depth: 0
 
       - name: Set up Nextflow
         uses: nf-core/setup-nextflow@v2
@@ -87,6 +94,32 @@ jobs:
       - name: Clean up Disk space
         uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
 
-      - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.compute_profile }}"
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+          architecture: "x64"
+
+      - name: Install pdiff to see diff between nf-test snapshots
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_profile }},${{ matrix.compute_profile }} --outdir ./results
+          python -m pip install --upgrade pip
+          pip install pdiff
+
+      - uses: nf-core/setup-nf-test@v1
+        with:
+          version: ${{ env.NFT_VER }}
+
+      - name: Run Tests (${{matrix.NXF_VER}} | ${{matrix.test_profile}} | ${{matrix.compute_profile}})
+        run: |
+          nf-test test \
+              --ci \
+              --tag ${{matrix.test_profile}} \
+              --profile "+${{ matrix.compute_profile }}" \
+              --junitxml=test.xml \
+              --debug
+
+      - name: Publish Test Report
+        uses: mikepenz/action-junit-report@v3
+        if: always() # always run even if the previous step fails
+        with:
+          report_paths: test.xml
+          annotate_only: true
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,5 @@ testing/
 testing*
 *.pyc
 null/
+.nf-test
+.nf-test.log
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,11 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- [[#358](https://github.com/nf-core/differentialabundance/pull/358)] - Added software version check in nf-tests and gene biotype column check in `.Rmd` ([@atrigila](https://github.com/atrigila), review by [@pinin4fjords](https://github.com/pinin4fjords))
+- [[#345](https://github.com/nf-core/differentialabundance/pull/345)] - Plot differentially expressed genes by gene biotype ([@atrigila](https://github.com/atrigila), review by [@grst](https://github.com/grst))
+- [[#343](https://github.com/nf-core/differentialabundance/pull/343)] - Add pipeline-level nf-tests ([@atrigila](https://github.com/atrigila), review by [@pinin4fjords](https://github.com/pinin4fjords) and [@nschcolnicov](https://github.com/nschcolnicov))
 - [[#286](https://github.com/nf-core/differentialabundance/pull/286)] - Integration of limma voom for rnaseq data ([@KamilMaliszArdigen](https://github.com/KamilMaliszArdigen), review by [@pinin4fjords](https://github.com/pinin4fjords))
+- [[#354](https://github.com/nf-core/differentialabundance/pull/354)] - Warning message within the R Markdown report to control when genes don't have annotation data ([@alanmmobbs93](https://github.com/alanmmobbs93)). Review by [@WackerO](https://github.com/WackerO) and [@pinin4fjords](https://github.com/pinin4fjords).
 - [[#325](https://github.com/nf-core/differentialabundance/pull/325)] - Integration of limma voom for rnaseq data with mixed models ([@KamilMaliszArdigen](https://github.com/KamilMaliszArdigen), review by [@pinin4fjords](https://github.com/pinin4fjords))
 
+
 ### Fixed
 
+- [[#358](https://github.com/nf-core/differentialabundance/pull/358)] - Fixed nf-tests not running due to `--changed-since HEAD^` ([@atrigila](https://github.com/atrigila), review by [@pinin4fjords](https://github.com/pinin4fjords))
+- [[#344](https://github.com/nf-core/differentialabundance/pull/344)] - Fixed replacement of NA sub-strings
+  ([@atrigila](https://github.com/atrigila), suggested by [@BEFH](https://github.com/BEFH), review by [@apeltzer](https://github.com/apeltzer) and [@nschcolnicov](https://github.com/nschcolnicov))
+- [[#342](https://github.com/nf-core/differentialabundance/pull/342)] - Fixed incorrectly colored dots in report volcano plots for logFC thresholds <1 ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
+- [[#330](https://github.com/nf-core/differentialabundance/pull/330)] - Fixed broken docs by removing g:profiler colons ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#304](https://github.com/nf-core/differentialabundance/pull/304)] - Removed TXT file options from nextflow_schema where they are equivalent to TSV to make the input files clearer ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#299](https://github.com/nf-core/differentialabundance/pull/299)] - Add exclusions for 3.0.1 template update ([@pinin4fjords](https://github.com/pinin4fjords))
 - [[#289](https://github.com/nf-core/differentialabundance/pull/289)] - Fix missing ch_gene_sets default for gprofiler2 ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
@@ -22,6 +32,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
+- [[#348](https://github.com/nf-core/differentialabundance/pull/348)] - Update test_full profile matrix file and indentation of profiles in nextflow.config ([@nschcolnicov](https://github.com/nschcolnicov), review by [@WackerO](https://github.com/WackerO) and [@pinin4fjords](https://github.com/pinin4fjords))
+
 ## v1.5.0 - 2024-05-08
 
 ### `Added`

diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd
@@ -370,10 +370,13 @@ differential_files <- lapply(contrasts$id, function(d){
     file.path(params$input_dir, paste0(gsub(' |;', '_', d), differential_file_suffix))
 })
 
-differential_results <- lapply(differential_files, function(diff_file){
-    if (! file.exists(diff_file)){
-        stop(paste("Differential file", diff_file, "does not exist"))
-    }
+# Initialize vector to store warning messages before merging tables
+warnings_list <- c()
+
+# Read differential results and merge with features table
+results <- lapply(differential_files, function(diff_file) {
+    if (!file.exists(diff_file)) stop(paste("Differential file", diff_file, "does not exist"))
+
     diff <- read_differential(
         diff_file,
         feature_id_column = params$differential_feature_id_column,
@@ -382,19 +385,49 @@ differential_results <- lapply(differential_files, function(diff_file){
         qval_column = params$differential_qval_column
     )
 
-    # If fold changes are not logged already, log them (we assume they're logged
-    # later on)
-
-    if (! params$differential_foldchanges_logged){
+    # Log transform fold changes if not already logged
+    if (!params$differential_foldchanges_logged) {
         diff[[params$differential_fc_column]] <- log2(diff[[params$differential_fc_column]])
     }
 
-    # Annotate differential tables if possible
-    if (! is.null(params$features)){
-        diff <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column)
+    # Annotate differential table if features table is provided
+    if (!is.null(params$features)) {
+        ## Merge Differential expression table on features table
+        merged <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column)
+
+        ## Get number of missing rows
+        n_missing <- length(setdiff(diff[[params$differential_feature_id_column]], merged[[params$features_id_col]]))
+
+        ## Create warnings if necessary
+        warnings <- c(
+            ## Missing IDs
+            if (n_missing > 0) sprintf(
+                '<p style="color:#DAA520;"><strong>WARNING:</strong> %d IDs from the differential table (%s) were lost on merge with features table (%s).</p>',
+                n_missing, basename(diff_file), basename(params$features)
+            ),
+            ## Check whether there are fewer rows, missing data
+            if (nrow(merged) < nrow(diff)) sprintf(
+                '<p style="color:#DAA520;"><strong>WARNING:</strong> Rows were lost on merge (%s -> %s). Original: %d, Merged: %d.</p>',
+                basename(diff_file), basename(params$features), nrow(diff), nrow(merged)
+            ),
+            ## Check whether there are more rows, possible duplications
+            if (nrow(merged) > nrow(diff)) sprintf(
+                '<p style="color:#DAA520;"><strong>WARNING:</strong> Rows were duplicated on merge (%s -> %s). Original: %d, Merged: %d.</p>',
+                basename(diff_file), basename(params$features), nrow(diff), nrow(merged)
+            )
+        )
+    } else {
+        merged <- diff
+        warnings <- character(0)
     }
-    diff
+    ## Collect results
+    list(diff_features = merged, warnings = warnings)
 })
+
+# Separate differential_results and warnings_list from results
+differential_results <- lapply(results, `[[`, "diff_features")
+warnings_list <- unlist(lapply(results, `[[`, "warnings"))
+
 names(differential_results) <- contrasts$id
 ```
 
@@ -726,7 +759,7 @@ iv_min_group_sizes <- unlist(lapply(informative_variables, function(x) min(table
 
 if (any(iv_min_group_sizes > 2)){
     cat("\n### Outlier detection {.tabset}\n")
-    cat("\nOutlier detection based on [median absolute deviation](https://wiki.arrayserver.com/wiki/index.php?title=CorrelationQC.pdf) was undertaken, the outlier scoring is plotted below.\n")
+    cat("\nOutlier detection based on [median absolute deviation](https://archive.ph/o3thZ) was undertaken, the outlier scoring is plotted below. For more on MAD, see [this wiki article](https://en.wikipedia.org/wiki/Median_absolute_deviation).\n")
 }
 
 foo <- lapply(informative_variables[iv_min_group_sizes > 2], function(iv){
@@ -787,7 +820,6 @@ foo <- lapply(names(p_value_types), function(pvt){
 ```
 
 ```{r, echo=FALSE, results='asis', eval = FALSE}
-
 differential_summary_string <- paste(
     paste(
     lapply(
@@ -806,7 +838,13 @@ cat(differential_summary_string)
 
 ### Differential `r params$features_type` details
 
-```{r, echo=FALSE, results='asis'}
+```{r, echo=FALSE, results='asis', warning=FALSE, message=FALSE}
+
+# Display all warnings related to number of rows
+if (length(warnings_list) > 0) {
+    for (warning in warnings_list) { cat(warning) }
+}
+
 for (i in 1:nrow(contrasts)){
     cat("\n#### ", contrast_descriptions[i], "  {.tabset}\n")
 
@@ -833,7 +871,7 @@ for (i in 1:nrow(contrasts)){
         cat("\n##### ", pvt, " p values\n")
         pval_column <- p_value_types[[pvt]]
 
-        de_fc <- abs(full_de[[params$differential_fc_column]]) >= log2(params$differential_min_fold_change)
+        de_fc <- abs(full_de[[params$differential_fc_column]]) >= abs(log2(params$differential_min_fold_change))
         de_fc_label <- paste("abs(logFC) >=", log2(params$differential_min_fold_change))
 
         de_pval <- full_de[[pval_column]] <= p_value_thresholds[[pvt]]
@@ -895,6 +933,28 @@ for (i in 1:nrow(contrasts)){
         if (nrow(contrast_de) > 0){
             contrast_de <- round_dataframe_columns(contrast_de, digits=params$report_round_digits)
             print( htmltools::tagList(datatable(contrast_de, caption = paste('Differential genes', dir, 'in', contrast_descriptions[i], " (check", differential_files[[i]], "for more detail)"), rownames = FALSE) ))
+
+        if ("Gene biotype" %in% colnames(contrast_de)) {
+            # Plot Differentially Expressed Genes by Gene Biotype
+            gene_biotype_table <- contrast_de %>%
+                group_by(`Gene biotype`) %>%
+                summarise(count = dplyr::n(), .groups = 'drop') %>%
+                filter(count > 0) %>%
+                arrange(desc(count))
+
+            gene_biotype_plot <- ggplot(gene_biotype_table, aes(x = reorder(`Gene biotype`, -count), y = count)) +
+                geom_bar(stat = "identity", position = position_dodge()) +
+                labs(
+                    title = paste0("Differentially Expressed Genes by Gene Biotype (", dir, ")"),
+                    x = "Gene Biotype",
+                    y = "Number of Differentially Expressed Genes"  ) +
+                theme_minimal() +
+                theme(axis.text.x = element_text(angle = 45, hjust = 1))
+
+            print(gene_biotype_plot)
+        } else {
+            cat("Column 'Gene biotype' does not exist. Skipping plot.\n")
+        }
         }else{
             cat(paste0("No significantly differential '", dir, "' genes.\n\n"))
         }

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -17,7 +17,7 @@ params {
     // Input data
     input = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/rnaseq_featurecounts_sample_preparations.tsv'
     contrasts = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/rnaseq_featurecounts_contrast_file.csv'
-    matrix = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/rnaseq_featurecounts_merged_gene_counts.txt'
+    matrix = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/rnaseq_featurecounts_merged_gene_counts.tsv'
     gtf = 'https://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/Mus_musculus.GRCm38.81.gtf.gz'
 
     // Configure inputs

diff --git a/docs/usage.md b/docs/usage.md
@@ -286,16 +286,16 @@ Currently, two tools can be used to do gene set enrichment analysis.
 --gene_sets_files gene_sets.gmt
 ```
 
-### g:Profiler
+### gProfiler2
 
-The [gprofiler2](https://cran.r-project.org/web/packages/gprofiler2/vignettes/gprofiler2.html) package can be used to test which pathways are enriched in the sets of differential genes produced by the the DESeq2 or limma modules. It is an R interface for the g:Profiler webtool. In the simplest form, this feature can be enabled with the parameters from the following example:
+The [gprofiler2](https://cran.r-project.org/web/packages/gprofiler2/vignettes/gprofiler2.html) package can be used to test which pathways are enriched in the sets of differential genes produced by the DESeq2 or limma modules. It is an R interface for the gprofiler webtool. In the simplest form, this feature can be enabled with the parameters from the following example:
 
 ```bash
 --gprofiler2_run true \
 --gprofiler2_organism mmusculus
 ```
 
-If gene sets have been specified to the workflow via `--gene_sets_files` these are used by default. Specifying `--gprofiler2_organism` (mmusculus for Mus musculus, hsapiens for Homo sapiens etc.) will override those gene sets with g:profiler's own for the relevant species. `--gprofiler2_token` will override both options and use gene sets from a previous g:profiler run.
+If gene sets have been specified to the workflow via `--gene_sets_files` these are used by default. Specifying `--gprofiler2_organism` (mmusculus for Mus musculus, hsapiens for Homo sapiens etc.) will override those gene sets with gprofiler's own for the relevant species. `--gprofiler2_token` will override both options and use gene sets from a previous gprofiler run.
 
 By default the analysis will be run with a background list of genes that passed the abundance filter (i.e. those genes that actually had some expression); see for example https://doi.org/10.1186/s13059-015-0761-7 for why this is advisable. You can provide your own background list with `--gprofiler2_background_file background.txt` or, if you do not want to use any background, set `--gprofiler2_background_file false`.
 

diff --git a/nextflow.config b/nextflow.config
@@ -324,18 +324,18 @@ profiles {
         executor.cpus           = 4
         executor.memory         = 8.GB
     }
-    test      { includeConfig 'conf/test.config'      }
-    test_nogtf { includeConfig 'conf/test_nogtf.config' }
-    test_full { includeConfig 'conf/test_full.config' }
-    affy { includeConfig 'conf/affy.config' }
-    maxquant { includeConfig 'conf/maxquant.config' }
-    rnaseq { includeConfig 'conf/rnaseq.config' }
-    rnaseq_limma { includeConfig 'conf/rnaseq_limma.config' }
+    test              { includeConfig 'conf/test.config'              }
+    test_nogtf        { includeConfig 'conf/test_nogtf.config'        }
+    test_full         { includeConfig 'conf/test_full.config'         }
+    affy              { includeConfig 'conf/affy.config'              }
+    maxquant          { includeConfig 'conf/maxquant.config'          }
+    rnaseq            { includeConfig 'conf/rnaseq.config'            }
+    rnaseq_limma      { includeConfig 'conf/rnaseq_limma.config'      }
     test_rnaseq_limma { includeConfig 'conf/test_rnaseq_limma.config' }
-    soft {includeConfig 'conf/soft.config'}
-    test_affy { includeConfig 'conf/test_affy.config' }
-    test_maxquant { includeConfig 'conf/test_maxquant.config' }
-    test_soft {includeConfig 'conf/test_soft.config' }
+    soft              { includeConfig 'conf/soft.config'              }
+    test_affy         { includeConfig 'conf/test_affy.config'         }
+    test_maxquant     { includeConfig 'conf/test_maxquant.config'     }
+    test_soft         { includeConfig 'conf/test_soft.config'         }
 }
 
 // Load nf-core custom profiles from different Institutions

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -373,7 +373,7 @@
                 "exploratory_mad_threshold": {
                     "type": "integer",
                     "default": -5,
-                    "help_text": "MAD = median absolute deviation. A threshold on this value is used to define observations (samples) as outliers, or not, in exploratory plots. Based on the definition at https://wiki.arrayserver.com/wiki/index.php?title=CorrelationQC.pdf. ",
+                    "help_text": "MAD = median absolute deviation. A threshold on this value is used to define observations (samples) as outliers, or not, in exploratory plots. Based on the definition at https://archive.ph/o3thZ. For more on MAD, see https://en.wikipedia.org/wiki/Median_absolute_deviation.",
                     "description": "Threshold on MAD score for outlier identification",
                     "fa_icon": "fas fa-angry"
                 },

diff --git a/nf-test.config b/nf-test.config
@@ -0,0 +1,20 @@
+config {
+    // location for all nf-tests
+    testsDir "tests"
+
+    // nf-test directory including temporary files for each test
+    workDir ".nf-test"
+
+    // location of library folder that is added automatically to the classpath
+    libDir "tests/pipeline/lib/"
+
+    // location of an optional nextflow.config file specific for executing tests
+    configFile "nextflow.config"
+
+    // run all tests with the defined docker profile from the main nextflow.config
+    profile "docker"
+
+    plugins {
+        load "nft-utils@0.0.3"
+    }
+}