nf-core · nschcolnicov · Dec 9, 2024 · nschcolnicov · Dec 9, 2024
diff --git a/bin/validate_fom_components_yaml.R b/bin/validate_fom_components_yaml.R
@@ -0,0 +1,208 @@
+#!/usr/bin/env Rscript
+
+# Call shinyngs parsing functions to validate simple matrix inputs
+
+library(optparse)
+library(yaml)  # Load the yaml package to parse YAML files
+
+# Command-line interface.
+#
+# Options declared with `default = NULL` are optional at the parser level; the
+# script decides afterwards which of them it actually requires.
+#
+# NOTE(review): --pval_column and --qval_column both default to "padj" --
+# confirm that the p value default is intended.
+
+option_list <- list(
+    make_option(
+        c("-s", "--sample_metadata"),
+        type = "character",
+        default = NULL,
+        help = "CSV-format sample metadata file."
+    ),
+    make_option(
+        c("-i", "--sample_id_col"),
+        type = "character",
+        default = "sample",
+        help = "Column in sample metadata used as sample identifier. Should be used to name columns of expression matrices, and duplicate rows will be removed based on this column."
+    ),
+    make_option(
+        c("-f", "--feature_metadata"),
+        type = "character",
+        default = NULL,
+        help = "TSV-format feature (often gene) metadata file."
+    ),
+    make_option(
+        c("-j", "--feature_id_col"),
+        type = "character",
+        default = "gene_id",
+        # Feature identifiers are the row names of each matrix (they are written
+        # back out as the first column, named after this option).
+        help = "Column in feature metadata used as feature identifier. Should be used to name rows of expression matrices."
+    ),
+    make_option(
+        c("-e", "--assay_files"),
+        type = "character",
+        default = NULL,
+        help = "Comma-separated list of TSV-format expression matrix files."
+    ),
+    make_option(
+        c("-c", "--contrasts_file"),
+        type = "character",
+        default = NULL,
+        help = "YAML-format contrast file with model and contrast details."
+    ),
+    make_option(
+        c("-d", "--differential_results"),
+        type = "character",
+        default = NULL,
+        help = "Tab-separated files containing at least fold change and p value, one for each contrast in the contrasts file."
+    ),
+    make_option(
+        c("-k", "--fold_change_column"),
+        type = "character",
+        default = "log2FoldChange",
+        help = "Column in differential results files holding fold changes."
+    ),
+    make_option(
+        c("-u", "--unlog_foldchanges"),
+        action = "store_true",
+        default = FALSE,
+        help = "Set this option if fold changes should be unlogged."
+    ),
+    make_option(
+        c("-p", "--pval_column"),
+        type = "character",
+        default = "padj",
+        help = "Column in differential results files holding p values."
+    ),
+    make_option(
+        c("-q", "--qval_column"),
+        type = "character",
+        default = "padj",
+        help = "Column in differential results files holding q values/ adjusted p values."
+    ),
+    make_option(
+        c("-o", "--output_directory"),
+        type = "character",
+        default = NULL,
+        # This is a directory (created on demand), not a serialized object: the
+        # validated tables are re-written into it.
+        help = "Directory to which the validated, consistently formatted files are written."
+    ),
+    make_option(
+        c("-t", "--separator"),
+        type = "character",
+        default = "\t",
+        help = "Consistent separator for re-written files."
+    )
+)
+
+# Parse the command line into a named list of option values
+opt_parser <- OptionParser(option_list = option_list)
+opt <- parse_args(opt_parser)
+
+# Fail early when a required option was not supplied. An option counts as
+# supplied when its name is present in the parsed option list (options with a
+# NULL default are expected to be absent from that list unless given).
+
+required_opts <- c("sample_metadata", "assay_files")
+absent_opts <- setdiff(required_opts, names(opt))
+
+if (length(absent_opts) > 0) {
+    absent_label <- paste(absent_opts, collapse = ", ")
+    stop(paste("Missing mandatory arguments:", absent_label))
+}
+
+library(shinyngs)
+
+# The contrasts YAML is required: abort when no path was given, otherwise
+# parse the file into a nested list
+if (is.null(opt$contrasts_file)) {
+    stop("Contrasts file not provided.")
+}
+contrasts_data <- yaml.load_file(opt$contrasts_file)
+
+# Convert the contrast entries parsed from the YAML file into a contrasts table.
+#
+# contrasts: list of contrast entries. Each entry must carry an `id` and a
+#   three-element `comparison` (variable, reference, target, in that order) and
+#   may carry `blocking_factors`.
+#
+# Returns a data frame with one row per entry and the columns id, variable,
+# reference, target and blocking (NA when an entry has no blocking factors).
+# An empty or NULL input yields a zero-row data frame with the same columns.
+# Stops with an error naming the offending entry when `id` is missing or
+# `comparison` does not have exactly three elements.
+process_contrasts <- function(contrasts) {
+    if (length(contrasts) == 0) {
+        return(data.frame(
+            id = character(0),
+            variable = character(0),
+            reference = character(0),
+            target = character(0),
+            blocking = character(0),
+            stringsAsFactors = FALSE
+        ))
+    }
+
+    contrasts_list <- lapply(seq_along(contrasts), function(i) {
+        # `[[` instead of `$` so that list names are matched exactly
+        contrast <- contrasts[[i]]
+        contrast_id <- contrast[["id"]]
+
+        # A sequence mixing strings and numbers (e.g. a numeric reference level)
+        # may be parsed as a list rather than a character vector; flatten it so
+        # that every column below is a plain string.
+        comparison <- as.character(unlist(contrast[["comparison"]]))
+
+        if (is.null(contrast_id) || length(comparison) != 3) {
+            stop(
+                sprintf(
+                    "Contrast entry %d must have an 'id' and a three-element 'comparison' (variable, reference, target).",
+                    i
+                ),
+                call. = FALSE
+            )
+        }
+
+        # NOTE(review): blocking factors are joined with ", " -- confirm this is
+        # the delimiter that consumers of the contrasts table expect.
+        blocking_factors <- contrast[["blocking_factors"]]
+        blocking <- if (is.null(blocking_factors)) {
+            NA_character_
+        } else {
+            paste(unlist(blocking_factors), collapse = ", ")
+        }
+
+        data.frame(
+            id = contrast_id,
+            variable = comparison[1],
+            reference = comparison[2],
+            target = comparison[3],
+            blocking = blocking,
+            stringsAsFactors = FALSE
+        )
+    })
+    do.call(rbind, contrasts_list)
+}
+
+# Flatten the `contrasts` entries of the parsed YAML into a contrasts data frame
+contrasts_df <- process_contrasts(contrasts_data$contrasts)
+
+# Now validate the inputs.
+# validate_inputs() just wraps the parsing functions of shinyngs, used by e.g.
+# eselistfromConfig(). These functions are good for ensuring the consistency of
+# FOM (feature/ observation matrix) data.
+#
+# NOTE(review): no --assay_names option is declared in option_list, so
+# opt$assay_names is presumably always NULL here -- confirm that is intended.
+# NOTE(review): the contrasts argument is commented out below, so the contrasts
+# table does not appear to be checked by validate_inputs() -- confirm.
+
+validated_parts <- validate_inputs(
+    samples_metadata = opt$sample_metadata,
+    features_metadata = opt$feature_metadata,
+    assay_files = opt$assay_files,
+    assay_names = opt$assay_names,
+    #   contrasts_file = contrasts_df,  # Pass the processed contrasts dataframe
+    sample_id_col = opt$sample_id_col,
+    feature_id_col = opt$feature_id_col,
+    differential_results = opt$differential_results,
+    pval_column = opt$pval_column,
+    qval_column = opt$qval_column,
+    fc_column = opt$fold_change_column,
+    unlog_foldchanges = opt$unlog_foldchanges
+)
+
+# If an output path is provided we can re-write the data, ensuring consistency
+# of output formatting
+
+if (! is.null(opt$output_directory)){
+
+    dir.create(opt$output_directory, showWarnings = FALSE, recursive = TRUE)
+
+    # Write a table to
+    #   <output_directory>/<base name of infile without extension>.<suffix>.tsv
+    # using the supplied separator.
+    #
+    # x:      data frame (or matrix) to write
+    # infile: path/name the data came from; only its base name is used
+    # suffix: label inserted before the 'tsv' extension
+    # na:     string written in place of missing values
+
+    write_table <- function(x, infile, suffix, na = 'NA'){
+        file_basename <- tools::file_path_sans_ext(basename(infile))
+        outfile <- file.path(opt$output_directory, paste(file_basename, suffix, 'tsv', sep = '.'))
+
+        print(paste("...... writing", outfile))
+        write.table(x, file = outfile, sep = opt$separator, quote = FALSE, row.names = FALSE, na = na)
+    }
+
+    # Write back the sample sheet and feature metadata. The contrasts are not
+    # part of this loop: they are always written from contrasts_df just below,
+    # to the same output file name a loop iteration would have used.
+
+    print("Writing basic data...")
+    for (infile in c('sample_metadata', 'feature_metadata')){
+        filename <- opt[[infile]]
+        if ((! is.null(filename)) && filename %in% names(validated_parts)){
+            # print(), not write(): write() without a `file` argument writes
+            # to a file called "data" in the working directory
+            print(paste("...", infile))
+            write_table(validated_parts[[filename]], filename, infile)
+        }
+    }
+
+    # Write contrasts file with empty strings for NAs in blocking
+    write_table(contrasts_df, opt$contrasts_file, 'contrasts_file', na = '')
+
+    # Write back the matrices
+
+    print("Writing matrices...")
+    if ('assays' %in% names(validated_parts)){
+        for (assay in names(validated_parts[['assays']])){
+            mat <- validated_parts[['assays']][[assay]]
+
+            # Add a column for row names, named after the feature id column
+            mat <- data.frame(feature_name = rownames(mat), mat, check.names = FALSE)
+            colnames(mat)[1] <- opt$feature_id_col
+
+            write_table(mat, assay, 'assay')
+        }
+    }
+
+    # Write back the simplified differential results (if supplied)
+
+    if ('differential_stats' %in% names(validated_parts)){
+        for (ds in names(validated_parts[['differential_stats']])){
+            # write_table() has no default suffix, so one must be given here
+            write_table(validated_parts[['differential_stats']][[ds]], ds, 'differential_stats')
+        }
+    }
+
+}
+
diff --git a/modules/nf-core/shinyngs/validatefomcomponents/main.nf b/modules/nf-core/shinyngs/validatefomcomponents/main.nf
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -42,7 +42,7 @@
                     "type": "string",
                     "description": "A CSV file describing sample contrasts",
                     "help_text": "This file is used to define groups of samples from 'input' to compare.  It must contain at least the columns 'variable', 'reference', 'target' and 'blocking', where 'variable' is a column in the input sample sheet, 'reference' and 'target' are values in that column, and blocking is a colon-separated list of additional 'blocking' variables (can be an empty string)",
-                    "pattern": "^\\S+\\.(csv|tsv)$",
+                    "pattern": "^\\S+\\.(yaml|yml)$",
                     "format": "file-path",
                     "mimetype": "text/csv",
                     "fa_icon": "fas fa-adjust"

diff --git a/tests/test_maxquant.nf.test b/tests/test_maxquant.nf.test
@@ -11,6 +11,7 @@ nextflow_pipeline {
         when {
             params {
                 outdir        = "$outputDir"
+                contrasts = "/workspace/differentialabundance/testing/files/MaxQuant_contrasts.yaml"
             }
         }
 

diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf
@@ -187,12 +187,19 @@ workflow DIFFERENTIALABUNDANCE {
         // TODO: there should probably be a separate plotting module in proteus to simplify this
 
         ch_contrast_variables = ch_contrasts_file
-            .splitCsv(header:true, sep:(params.contrasts.endsWith('csv') ? ',' : '\t'))
-            .map{ it.tail().first() }
-            .map{
-                tuple('id': it.variable)
+            .map { entry ->
+                def yaml_file = entry[1]
+                def yaml_data = new groovy.yaml.YamlSlurper().parse(yaml_file)
+
+                yaml_data.contrasts.collect { contrast ->
+                    tuple('id': contrast.comparison[0])
+                }
             }
-            .unique()   // uniquify to keep each contrast variable only once (in case it exists in multiple lines for blocking etc.)
+            .flatten()
+            .unique() // Uniquify to keep each contrast variable only once (in case it exists in multiple lines for blocking etc.)
+
+        ch_contrast_variables.view()
+
 
         // Run proteus to import protein abundances
         PROTEUS(
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ nextflow_pipeline { @@
             when {
                 params {
                     outdir        = "$outputDir"
+                    contrasts = "/workspace/differentialabundance/testing/files/MaxQuant_contrasts.yaml"
                 }
             }
@@ Expand Down @@