Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

POC contrasts csv -> yaml #382

Draft
wants to merge 1 commit into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 208 additions & 0 deletions bin/validate_fom_components_yaml.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
#!/usr/bin/env Rscript
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


# Call shinyngs parsing functions to validate simple matrix inputs

library(optparse)
library(yaml) # Load the yaml package to parse YAML files

# Command-line interface. Options whose default is NULL are absent from the
# parsed `opt` list unless the user supplies them; the mandatory-argument
# check that follows relies on this.
option_list <- list(
  make_option(
    c("-s", "--sample_metadata"),
    type = "character",
    default = NULL,
    help = "CSV-format sample metadata file."
  ),
  make_option(
    c("-i", "--sample_id_col"),
    type = "character",
    default = "sample",
    help = "Column in sample metadata used as sample identifier. Should be used to name columns of expression matrices, and duplicate rows will be removed based on this column."
  ),
  make_option(
    c("-f", "--feature_metadata"),
    type = "character",
    default = NULL,
    help = "TSV-format feature (often gene) metadata file."
  ),
  make_option(
    c("-j", "--feature_id_col"),
    type = "character",
    default = "gene_id",
    # Features are the rows of the assay matrices (the matrices are written
    # back out with their row names placed in this column).
    help = "Column in feature metadata used as feature identifier. Should be used to name rows of expression matrices."
  ),
  make_option(
    c("-e", "--assay_files"),
    type = "character",
    default = NULL,
    help = "Comma-separated list of TSV-format expression matrix files."
  ),
  make_option(
    c("-c", "--contrasts_file"),
    type = "character",
    default = NULL,
    help = "YAML-format contrast file with model and contrast details."
  ),
  make_option(
    c("-d", "--differential_results"),
    type = "character",
    default = NULL,
    help = "Tab-separated files containing at least fold change and p value, one for each row of the contrast file."
  ),
  make_option(
    c("-k", "--fold_change_column"),
    type = "character",
    default = "log2FoldChange",
    help = "Column in differential results files holding fold changes."
  ),
  make_option(
    c("-u", "--unlog_foldchanges"),
    action = "store_true",
    default = FALSE,
    help = "Set this option if fold changes should be unlogged."
  ),
  make_option(
    c("-p", "--pval_column"),
    type = "character",
    default = "padj",
    help = "Column in differential results files holding p values."
  ),
  make_option(
    c("-q", "--qval_column"),
    type = "character",
    default = "padj",
    help = "Column in differential results files holding q values/ adjusted p values."
  ),
  make_option(
    c("-o", "--output_directory"),
    type = "character",
    default = NULL,
    # This value is used as a directory (created if needed) into which the
    # validated tables are re-written.
    help = "Directory to which validated inputs are re-written with consistent formatting."
  ),
  make_option(
    c("-t", "--separator"),
    type = "character",
    default = "\t",
    help = "Consistent separator for re-written files."
  )
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# Fail early when a required option is absent. Options with a NULL default
# do not appear in `opt` unless supplied, so a name lookup is sufficient.

mandatory <- c("sample_metadata", "assay_files")
missing_args <- setdiff(mandatory, names(opt))

if (length(missing_args) != 0) {
  stop(paste("Missing mandatory arguments:", paste(missing_args, collapse = ", ")))
}

library(shinyngs)

# The YAML contrasts file is required: stop if it was not given, otherwise
# parse it into a nested R list.
if (is.null(opt$contrasts_file)) {
  stop("Contrasts file not provided.")
}
contrasts_data <- yaml.load_file(opt$contrasts_file)

# Flatten contrasts parsed from YAML into a data frame, one row per contrast.
#
# contrasts: list of contrast entries. Each entry must provide
#   - id: a single identifier
#   - comparison: a sequence of at least three items, read positionally as
#     variable, reference, target
#   - blocking_factors (optional): sequence of blocking variable names
#
# Returns a data frame with character columns id, variable, reference, target
# and blocking (NA when no blocking factors are given, otherwise the factors
# joined with ", "). An empty or NULL input gives a zero-row data frame with
# the same columns. Stops with an error naming the offending entry when `id`
# or `comparison` is missing or incomplete.
#
# NOTE(review): blocking factors are joined with ", "; confirm that downstream
# consumers of the re-written contrasts table expect this delimiter.
process_contrasts <- function(contrasts) {
  columns <- c("id", "variable", "reference", "target", "blocking")

  if (length(contrasts) == 0) {
    return(as.data.frame(
      matrix(
        character(0),
        nrow = 0,
        ncol = length(columns),
        dimnames = list(NULL, columns)
      ),
      stringsAsFactors = FALSE
    ))
  }

  contrasts_list <- lapply(seq_along(contrasts), function(i) {
    contrast <- contrasts[[i]]

    # `[[` avoids the partial name matching that `$` performs on lists.
    id <- contrast[["id"]]

    # A YAML sequence with mixed scalar types (e.g. [dose, 0, 10]) is parsed
    # as a list rather than a character vector, so flatten and coerce before
    # indexing by position.
    comparison <- as.character(unlist(contrast[["comparison"]]))

    if (length(id) != 1 || length(comparison) < 3) {
      stop(
        sprintf(
          "Contrast %d must define a single 'id' and a 'comparison' of [variable, reference, target].",
          i
        ),
        call. = FALSE
      )
    }

    blocking_factors <- contrast[["blocking_factors"]]
    blocking <- if (is.null(blocking_factors)) {
      NA_character_
    } else {
      paste(unlist(blocking_factors), collapse = ", ")
    }

    data.frame(
      id = as.character(id),
      variable = comparison[1],
      reference = comparison[2],
      target = comparison[3],
      blocking = blocking,
      stringsAsFactors = FALSE
    )
  })

  do.call(rbind, contrasts_list)
}

# Flatten the parsed YAML contrasts into a table
contrasts_df <- process_contrasts(contrasts_data$contrasts)

# Validate the remaining inputs.
# validate_inputs() just wraps the parsing functions of shinyngs, used by e.g.
# eselistfromConfig(). These functions are good for ensuring the consistency of
# FOM (feature/ observation matrix) data.
#
# NOTE(review): no --assay_names option is declared for this script, so
# opt$assay_names evaluates to NULL here.

validated_parts <- validate_inputs(
  # Input files
  samples_metadata = opt$sample_metadata,
  features_metadata = opt$feature_metadata,
  assay_files = opt$assay_files,
  assay_names = opt$assay_names,
  differential_results = opt$differential_results,
  # The contrasts are not handed to validate_inputs() at present:
  # contrasts_file = contrasts_df, # Pass the processed contrasts dataframe

  # Identifier columns
  sample_id_col = opt$sample_id_col,
  feature_id_col = opt$feature_id_col,

  # Differential result columns and fold change handling
  fc_column = opt$fold_change_column,
  pval_column = opt$pval_column,
  qval_column = opt$qval_column,
  unlog_foldchanges = opt$unlog_foldchanges
)

# If an output directory is provided, re-write the validated data there so
# that every output shares the same formatting and separator.

if (!is.null(opt$output_directory)) {
  dir.create(opt$output_directory, showWarnings = FALSE, recursive = TRUE)

  # Write a table into the output directory using the supplied separator.
  #
  # x:      data frame (or matrix) to write.
  # infile: path or name the output file name is derived from; its directory
  #         and extension are stripped.
  # suffix: label placed between the base name and the 'tsv' extension.
  # na:     string written in place of missing values.
  #
  # The output is named <basename>.<suffix>.tsv whatever separator is in use.
  write_table <- function(x, infile, suffix, na = "NA") {
    file_basename <- tools::file_path_sans_ext(basename(infile))
    outfile <- file.path(
      opt$output_directory,
      paste(file_basename, suffix, "tsv", sep = ".")
    )

    print(paste("...... writing", outfile))
    write.table(
      x,
      file = outfile,
      sep = opt$separator,
      quote = FALSE,
      row.names = FALSE,
      na = na
    )
  }

  # Write back the sample sheet and feature metadata. Validated tables are
  # looked up in validated_parts by the path of the file they came from.

  print("Writing basic data...")
  for (infile in c("sample_metadata", "feature_metadata")) {
    filename <- opt[[infile]]
    if (!is.null(filename) && filename %in% names(validated_parts)) {
      # print() rather than write(): write() called without a `file` argument
      # writes to a file named "data" in the working directory.
      print(paste("...", infile))
      write_table(validated_parts[[filename]], filename, infile)
    }
  }

  # Write the contrasts parsed from YAML, with empty strings for NAs in
  # blocking. This is the only place the contrasts table is written.
  write_table(contrasts_df, opt$contrasts_file, "contrasts_file", na = "")

  # Write back the matrices

  print("Writing matrices...")
  if ("assays" %in% names(validated_parts)) {
    for (assay in names(validated_parts[["assays"]])) {
      mat <- validated_parts[["assays"]][[assay]]

      # Move the row names into a leading column named after the feature
      # identifier, since the table is written without row names.
      mat <- data.frame(feature_name = rownames(mat), mat, check.names = FALSE)
      colnames(mat)[1] <- opt$feature_id_col

      write_table(mat, assay, "assay")
    }
  }

  # Write back the simplified differential results (if supplied)

  if ("differential_stats" %in% names(validated_parts)) {
    for (ds in names(validated_parts[["differential_stats"]])) {
      # `suffix` has no default, so it must be given explicitly; omitting it
      # makes write_table() fail when it builds the output file name.
      write_table(
        validated_parts[["differential_stats"]][[ds]],
        ds,
        "differential_stats"
      )
    }
  }
}

6 changes: 2 additions & 4 deletions modules/nf-core/shinyngs/validatefomcomponents/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
"type": "string",
"description": "A CSV file describing sample contrasts",
"help_text": "This file is used to define groups of samples from 'input' to compare. It must contain at least the columns 'variable', 'reference', 'target' and 'blocking', where 'variable' is a column in the input sample sheet, 'reference' and 'target' are values in that column, and blocking is a colon-separated list of additional 'blocking' variables (can be an empty string)",
"pattern": "^\\S+\\.(csv|tsv)$",
"pattern": "^\\S+\\.(yaml|yml)$",
"format": "file-path",
"mimetype": "text/csv",
"fa_icon": "fas fa-adjust"
Expand Down
1 change: 1 addition & 0 deletions tests/test_maxquant.nf.test
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ nextflow_pipeline {
when {
params {
outdir = "$outputDir"
contrasts = "/workspace/differentialabundance/testing/files/MaxQuant_contrasts.yaml"
}
}

Expand Down
17 changes: 12 additions & 5 deletions workflows/differentialabundance.nf
Original file line number Diff line number Diff line change
Expand Up @@ -187,12 +187,19 @@ workflow DIFFERENTIALABUNDANCE {
// TODO: there should probably be a separate plotting module in proteus to simplify this

ch_contrast_variables = ch_contrasts_file
.splitCsv(header:true, sep:(params.contrasts.endsWith('csv') ? ',' : '\t'))
.map{ it.tail().first() }
.map{
tuple('id': it.variable)
.map { entry ->
def yaml_file = entry[1]
def yaml_data = new groovy.yaml.YamlSlurper().parse(yaml_file)

yaml_data.contrasts.collect { contrast ->
tuple('id': contrast.comparison[0])
}
}
.unique() // uniquify to keep each contrast variable only once (in case it exists in multiple lines for blocking etc.)
.flatten()
.unique() // Uniquify to keep each contrast variable only once (in case it exists in multiple lines for blocking etc.)

ch_contrast_variables.view()


// Run proteus to import protein abundances
PROTEUS(
Expand Down
Loading