From 5f25fb3ba6ccb47a7e44653878ee60e1d7c71c63 Mon Sep 17 00:00:00 2001 From: nschcolnicov Date: Mon, 9 Dec 2024 14:06:45 +0000 Subject: [PATCH] POC contrasts csv -> yaml --- bin/validate_fom_components_yaml.R | 208 ++++++++++++++++++ .../shinyngs/validatefomcomponents/main.nf | 6 +- nextflow_schema.json | 2 +- tests/test_maxquant.nf.test | 1 + workflows/differentialabundance.nf | 17 +- 5 files changed, 224 insertions(+), 10 deletions(-) create mode 100755 bin/validate_fom_components_yaml.R diff --git a/bin/validate_fom_components_yaml.R b/bin/validate_fom_components_yaml.R new file mode 100755 index 00000000..7c7c764c --- /dev/null +++ b/bin/validate_fom_components_yaml.R @@ -0,0 +1,208 @@
+#!/usr/bin/env Rscript
+
+# Call shinyngs parsing functions to validate simple matrix inputs
+
+library(optparse)
+library(yaml) # Load the yaml package to parse YAML files
+
+option_list <- list(
+    make_option(
+        c("-s", "--sample_metadata"),
+        type = "character",
+        default = NULL,
+        help = "CSV-format sample metadata file."
+    ),
+    make_option(
+        c("-i", "--sample_id_col"),
+        type = "character",
+        default = "sample",
+        help = "Column in sample metadata used as sample identifier. Should be used to name columns of expression matrices, and duplicate rows will be removed based on this column."
+    ),
+    make_option(
+        c("-f", "--feature_metadata"),
+        type = "character",
+        default = NULL,
+        help = "TSV-format feature (often gene) metadata file."
+    ),
+    make_option(
+        c("-j", "--feature_id_col"),
+        type = "character",
+        default = "gene_id",
+        help = "Column in feature metadata used as feature identifier. Should be used to name rows of expression matrices."
+    ),
+    make_option(
+        c("-e", "--assay_files"),
+        type = "character",
+        default = NULL,
+        help = "Comma-separated list of TSV-format file expression matrix files."
+    ),
+    make_option(
+        c("-c", "--contrasts_file"),
+        type = "character",
+        default = NULL,
+        help = "YAML-format contrast file with model and contrast details."
+    ),
+    make_option(
+        c("-d", "--differential_results"),
+        type = "character",
+        default = NULL,
+        help = "Tab-separated files containing at least fold change and p value, one for each row of the contrast file."
+    ),
+    make_option(
+        c("-k", "--fold_change_column"),
+        type = "character",
+        default = "log2FoldChange",
+        help = "Column in differential results files holding fold changes."
+    ),
+    make_option(
+        c("-u", "--unlog_foldchanges"),
+        action = "store_true",
+        default = FALSE,
+        help = "Set this option if fold changes should be unlogged."
+    ),
+    make_option(
+        c("-p", "--pval_column"),
+        type = "character",
+        default = "padj",
+        help = "Column in differential results files holding p values."
+    ),
+    make_option(
+        c("-q", "--qval_column"),
+        type = "character",
+        default = "padj",
+        help = "Column in differential results files holding q values/ adjusted p values."
+    ),
+    make_option(
+        c("-o", "--output_directory"),
+        type = "character",
+        default = NULL,
+        help = "Directory to which the validated inputs are re-written with consistent formatting."
+    ),
+    make_option(
+        c("-t", "--separator"),
+        type = "character",
+        default = "\t",
+        help = "Consistent separator for re-written files."
+    )
+)
+
+opt_parser <- OptionParser(option_list = option_list)
+opt <- parse_args(opt_parser)
+
+# Check mandatory arguments (options left at a NULL default are absent from names(opt))
+
+mandatory <-
+    c(
+        "sample_metadata",
+        "assay_files"
+    )
+
+missing_args <- mandatory[!mandatory %in% names(opt)]
+if (length(missing_args) > 0) {
+    stop(paste("Missing mandatory arguments:", paste(missing_args, collapse = ", ")))
+}
+
+library(shinyngs)
+
+# Load and parse the YAML contrasts file; the script stops if none was supplied
+if (!is.null(opt$contrasts_file)) {
+    contrasts_data <- yaml.load_file(opt$contrasts_file, eval.expr = FALSE) # never evaluate !expr tags from the input file
+} else {
+    stop("Contrasts file not provided.")
+}
+
+# Flatten the YAML contrasts list into a data frame with one row per contrast (id, variable, reference, target, blocking)
+process_contrasts <- function(contrasts) {
+    contrasts_list <- lapply(contrasts, function(contrast) {
+        data.frame(
+            id = contrast$id,
+            variable = as.character(contrast$comparison[[1]]), # [[ ]] also works when a mixed-type sequence is loaded as a list
+            reference = as.character(contrast$comparison[[2]]), # errors (instead of yielding NA) if the comparison is too short
+            target = as.character(contrast$comparison[[3]]), # as.character keeps column types consistent for rbind
+            blocking = if (is.null(contrast$blocking_factors)) NA else paste(contrast$blocking_factors, collapse = ", ")
+        )
+    })
+    do.call(rbind, contrasts_list) # NULL when the contrasts list is empty
+}
+
+# Process the contrasts data
+contrasts_df <- process_contrasts(contrasts_data$contrasts)
+
+# Now validate the inputs; note that the parsed contrasts are not currently passed on.
+# validate_inputs() just wraps the parsing functions of shinyngs, used by e.g.
+# eselistfromConfig(). These functions are good for ensuring the consistency of
+# FOM (feature/observation matrix) data.
+
+validated_parts <- validate_inputs(
+    samples_metadata = opt$sample_metadata,
+    features_metadata = opt$feature_metadata,
+    assay_files = opt$assay_files,
+    assay_names = opt$assay_names, # NOTE(review): no --assay_names option is declared in this script, so this is always NULL
+    # contrasts_file = contrasts_df, # Disabled: the processed contrasts dataframe is not passed on for validation
+    sample_id_col = opt$sample_id_col,
+    feature_id_col = opt$feature_id_col,
+    differential_results = opt$differential_results,
+    pval_column = opt$pval_column,
+    qval_column = opt$qval_column,
+    fc_column = opt$fold_change_column,
+    unlog_foldchanges = opt$unlog_foldchanges
+)
+
+# If an output path is provided we can re-write the data, ensuring consistency
+# of output formatting
+
+if (! is.null(opt$output_directory)){
+
+    dir.create(opt$output_directory, showWarnings = FALSE, recursive = TRUE)
+
+    # Write x to <output_directory>/<basename of infile without extension>.<suffix>.tsv using the supplied separator
+
+    write_table <- function(x, infile, suffix, na = 'NA'){
+        file_basename <- tools::file_path_sans_ext(basename(infile))
+        outfile <- file.path(opt$output_directory, paste(file_basename, suffix, 'tsv', sep = '.'))
+
+        print(paste("...... writing", outfile))
+        write.table(x, file = outfile, sep = opt$separator, quote = FALSE, row.names = FALSE, na = na)
+    }
+
+    # Write back the sample sheet, feature metadata and contrasts
+
+    print("Writing basic data...")
+    for (infile in c('sample_metadata', 'feature_metadata', 'contrasts_file')){
+        filename <- opt[[infile]]
+        if ((! is.null(filename)) && filename %in% names(validated_parts)){
+            print(paste("...", infile)) # print() rather than write(), whose default target is a file named "data"
+
+            # Write contrasts file with empty strings for NAs in blocking
+            write_table(validated_parts[[filename]], filename, infile, na = if (infile == 'contrasts_file') '' else 'NA')
+        }
+    }
+
+    # Always write the contrasts parsed from YAML, with empty strings for NAs in blocking
+    write_table(contrasts_df, opt$contrasts_file, 'contrasts_file', na = '')
+
+    # Write back the matrices
+
+    print("Writing matrices...")
+    if ('assays' %in% names(validated_parts)){
+        for (assay in names(validated_parts[['assays']])){
+            mat <- validated_parts[['assays']][[assay]]
+
+            # Add a column for row names
+            mat <- data.frame(feature_name = rownames(mat), mat, check.names = FALSE)
+            colnames(mat)[1] <- opt$feature_id_col
+
+            write_table(mat, assay, 'assay')
+        }
+    }
+
+    # Write back the simplified differential results (if supplied)
+
+    if ('differential_stats' %in% names(validated_parts)){
+        for (ds in names(validated_parts[['differential_stats']])){
+            write_table(validated_parts[['differential_stats']][[ds]], ds, 'differential_stats') # suffix has no default and must be supplied
+        }
+    }
+
+}
+
diff --git a/modules/nf-core/shinyngs/validatefomcomponents/main.nf b/modules/nf-core/shinyngs/validatefomcomponents/main.nf index bedab3e6..87dcc704 100644 --- a/modules/nf-core/shinyngs/validatefomcomponents/main.nf +++ b/modules/nf-core/shinyngs/validatefomcomponents/main.nf @@ -3,9 +3,7 @@ process SHINYNGS_VALIDATEFOMCOMPONENTS { label 'process_single' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/r-shinyngs:2.0.0--r43hdfd78af_0' : - 'biocontainers/r-shinyngs:2.0.0--r43hdfd78af_0' }" + container "community.wave.seqera.io/library/r-shinyngs_r-yaml:aa63537f6db6190c" input: tuple val(meta), path(sample), path(assay_files) @@ -30,7 +28,7 @@ process SHINYNGS_VALIDATEFOMCOMPONENTS { def feature = feature_meta ? 
"--feature_metadata '$feature_meta'" : '' """ - validate_fom_components.R \\ + validate_fom_components_yaml.R \\ --sample_metadata "$sample" \\ $feature \\ --assay_files "${assay_files.join(',')}" \\ diff --git a/nextflow_schema.json b/nextflow_schema.json index 70af98c8..86ee139e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -42,7 +42,7 @@ "type": "string", "description": "A CSV file describing sample contrasts", "help_text": "This file is used to define groups of samples from 'input' to compare. It must contain at least the columns 'variable', 'reference', 'target' and 'blocking', where 'variable' is a column in the input sample sheet, 'reference' and 'target' are values in that column, and blocking is a colon-separated list of additional 'blocking' variables (can be an empty string)", - "pattern": "^\\S+\\.(csv|tsv)$", + "pattern": "^\\S+\\.(yaml|yml)$", "format": "file-path", "mimetype": "text/csv", "fa_icon": "fas fa-adjust" diff --git a/tests/test_maxquant.nf.test b/tests/test_maxquant.nf.test index a9c7cc7f..5ec93cb0 100644 --- a/tests/test_maxquant.nf.test +++ b/tests/test_maxquant.nf.test @@ -11,6 +11,7 @@ nextflow_pipeline { when { params { outdir = "$outputDir" + contrasts = "/workspace/differentialabundance/testing/files/MaxQuant_contrasts.yaml" } } diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index e49bebda..69f0aece 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -187,12 +187,19 @@ workflow DIFFERENTIALABUNDANCE { // TODO: there should probably be a separate plotting module in proteus to simplify this ch_contrast_variables = ch_contrasts_file - .splitCsv(header:true, sep:(params.contrasts.endsWith('csv') ? 
',' : '\t')) - .map{ it.tail().first() } - .map{ - tuple('id': it.variable) + .map { entry -> + def yaml_file = entry[1] + def yaml_data = new groovy.yaml.YamlSlurper().parse(yaml_file) + + yaml_data.contrasts.collect { contrast -> + tuple('id': contrast.comparison[0]) + } } - .unique() // uniquify to keep each contrast variable only once (in case it exists in multiple lines for blocking etc.) + .flatten() + .unique() // Uniquify to keep each contrast variable only once (in case it exists in multiple lines for blocking etc.) + + ch_contrast_variables.view() + // Run proteus to import protein abundances PROTEUS(