From 85803fb5755124dddae282f36b44b716b133e53d Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Thu, 18 Jul 2024 15:13:49 -0400 Subject: [PATCH 01/26] intial commit i# Please enter the commit message for your changes. Lines starting --- main.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/main.nf b/main.nf index 31549e6..f1a6f0c 100644 --- a/main.nf +++ b/main.nf @@ -48,6 +48,9 @@ else if (params.workflow == "gbrs"){ else if (params.workflow == "amplicon"){ include {AMPLICON} from './workflows/amplicon' } +else if (params.workflow == "cnv_array"){ + include {CNV} from './workflows/cnv_array' +} else { // if workflow name is not supported: exit 1, "ERROR: No valid pipeline called. '--workflow ${params.workflow}' is not a valid workflow name." From e39d4f1bdb4b528c3bfd5fcd2b63e042bed1ee49 Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Thu, 18 Jul 2024 15:49:32 -0400 Subject: [PATCH 02/26] initial commit --- bin/help/cnv.nf | 13 +++++++++++++ bin/log/cnv.nf | 46 ++++++++++++++++++++++++++++++++++++++++++++++ workflows/cnv.nf | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+) create mode 100644 bin/help/cnv.nf create mode 100644 bin/log/cnv.nf create mode 100644 workflows/cnv.nf diff --git a/bin/help/cnv.nf b/bin/help/cnv.nf new file mode 100644 index 0000000..45e9c16 --- /dev/null +++ b/bin/help/cnv.nf @@ -0,0 +1,13 @@ +def help() { + println ''' +Parameter | Default | Description + +--idat_folder | / | The directory containing IDAT files. +--output_dir | / | The directory to store the output files. +--bpm_file | / | The path to the BPM file. +--egt_file | / | The path to the EGT file. +-w | / | The directory for intermediary files and Nextflow processes. This directory can become quite large. Ensure ample storage. +--help | false | Print this help message and exit. +''' +} + diff --git a/bin/log/cnv.nf b/bin/log/cnv.nf new file mode 100644 index 0000000..156c1e3 --- /dev/null +++ b/bin/log/cnv.nf @@ -0,0 +1,46 @@ +import Logos + +logo = new Logo() +println '\n' +println logo.show() + +def param_log(){ + +if (!params.idat_folder) { + error "'--idat_folder': is not provided, it is a required parameter." +} + +if (!params.output_dir) { + error "'--output_dir': is not provided, it is a required parameter." +} + +if (!params.bpm_file) { + error "'--bpm_file': is not provided, it is a required parameter." +} + +if (!params.egt_file) { + error "'--egt_file': is not provided, it is a required parameter." +} + +log.info """ +IAAP_CLI PARAMETER LOG + +--comment: ${params.comment} + +Results Published to: ${params.output_dir} +______________________________________________________ +--idat_folder ${params.idat_folder} +--output_dir ${params.output_dir} +--bpm_file ${params.bpm_file} +--egt_file ${params.egt_file} +-w ${workDir} +--keep_intermediate ${params.keep_intermediate} +-c ${params.config} + +Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} +______________________________________________________ +""" +} \ No newline at end of file diff --git a/workflows/cnv.nf b/workflows/cnv.nf new file mode 100644 index 0000000..d2b9a0d --- /dev/null +++ b/workflows/cnv.nf @@ -0,0 +1,44 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + + +// import modules +// include {help} from "${projectDir}/bin/help/cnv.nf" +// include {param_log} from "${projectDir}/bin/log/cnv.nf" + +// Parameter validation +if (!params.idat_folder || !params.output_dir || !params.bpm_file || !params.egt_file) { + exit 1, "All parameters (idat_folder, output_dir, bpm_file, egt_file) are required." +} + +// main workflow +process IAAP_CLI { + + // container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' + // errorStrategy 'finish' + + input: + path idat_folder from params.idat_folder + path output_dir from params.output_dir + path bpm_file from params.bpm_file + path egt_file from params.egt_file + + script: + """ + mkdir -p $output_dir + chmod a+w $output_dir + + echo "Running IAAP_CLI with BPM file: $bpm_file and EGT file: $egt_file" > $output_dir/iaap_cli.log + + /usr/local/bin/iaap-cli/iaap-cli gencall \ + $bpm_file \ + $egt_file \ + $output_dir \ + --idat-folder $idat_folder \ + --output-gtc >> $output_dir/iaap_cli.log 2>&1 + """ +} + +workflow { + IAAP_CLI() +} \ No newline at end of file From a0cb6180c618412e9ed23c2de0c000eaaf2a5178 Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Mon, 22 Jul 2024 11:54:05 -0400 Subject: [PATCH 03/26] adding log files and main.nf --- bin/help/cnv.nf | 13 ----------- main.nf | 5 +++++ workflows/cnv.nf | 58 +++++++++++++++++------------------------------- 3 files changed, 25 insertions(+), 51 deletions(-) delete mode 100644 bin/help/cnv.nf diff --git a/bin/help/cnv.nf b/bin/help/cnv.nf deleted file mode 100644 index 45e9c16..0000000 --- a/bin/help/cnv.nf +++ /dev/null @@ -1,13 +0,0 @@ -def help() { - println ''' -Parameter | Default | Description - ---idat_folder | / | The directory containing IDAT files. ---output_dir | / | The directory to store the output files. ---bpm_file | / | The path to the BPM file. ---egt_file | / | The path to the EGT file. --w | / | The directory for intermediary files and Nextflow processes. This directory can become quite large. Ensure ample storage. ---help | false | Print this help message and exit. -''' -} - diff --git a/main.nf b/main.nf index f1a6f0c..f8b2143 100644 --- a/main.nf +++ b/main.nf @@ -103,4 +103,9 @@ workflow{ if (params.workflow == "amplicon"){ AMPLICON() } + if (params.workflow == "cnv_array"){ + + CNV_ARRAY() + + } } diff --git a/workflows/cnv.nf b/workflows/cnv.nf index d2b9a0d..ec53547 100644 --- a/workflows/cnv.nf +++ b/workflows/cnv.nf @@ -1,44 +1,26 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 - -// import modules -// include {help} from "${projectDir}/bin/help/cnv.nf" -// include {param_log} from "${projectDir}/bin/log/cnv.nf" - -// Parameter validation -if (!params.idat_folder || !params.output_dir || !params.bpm_file || !params.egt_file) { - exit 1, "All parameters (idat_folder, output_dir, bpm_file, egt_file) are required." +// Import modules +include { IAAP_CLI } from "${projectDir}/modules/illumina/iaap_cli" +include { help } from "${projectDir}/bin/help/cnv.nf" +include { param_log } from "${projectDir}/bin/log/cnv.nf" + +// Help if needed +if (params.help) { + help() + exit 0 } - -// main workflow -process IAAP_CLI { - - // container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' - // errorStrategy 'finish' - - input: - path idat_folder from params.idat_folder - path output_dir from params.output_dir - path bpm_file from params.bpm_file - path egt_file from params.egt_file - - script: - """ - mkdir -p $output_dir - chmod a+w $output_dir - - echo "Running IAAP_CLI with BPM file: $bpm_file and EGT file: $egt_file" > $output_dir/iaap_cli.log - - /usr/local/bin/iaap-cli/iaap-cli gencall \ - $bpm_file \ - $egt_file \ - $output_dir \ - --idat-folder $idat_folder \ - --output-gtc >> $output_dir/iaap_cli.log 2>&1 - """ +// Log parameter info +param_log() +// Parameter validation +if (!params.idat_folder || !params.bpm_file || !params.egt_file) { + exit 1, "All parameters (idat_folder, bpm_file, egt_file) are required." } - -workflow { - IAAP_CLI() +errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} +// Main workflow +workflow CNV_ARRAY { + IAAP_CLI( + idat_folder: params.idat_folder + ) } \ No newline at end of file From 3e53bec2e3d41f560ad0a70c0990081d3da45f97 Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Mon, 22 Jul 2024 11:58:00 -0400 Subject: [PATCH 04/26] config files and log files --- bin/help/cnv_array.nf | 13 +++++++ config/cnv_array.config | 75 ++++++++++++++++++++++++++++++++++++ modules/illumina/iaap_cli.nf | 39 +++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 bin/help/cnv_array.nf create mode 100644 config/cnv_array.config create mode 100644 modules/illumina/iaap_cli.nf diff --git a/bin/help/cnv_array.nf b/bin/help/cnv_array.nf new file mode 100644 index 0000000..45e9c16 --- /dev/null +++ b/bin/help/cnv_array.nf @@ -0,0 +1,13 @@ +def help() { + println ''' +Parameter | Default | Description + +--idat_folder | / | The directory containing IDAT files. +--output_dir | / | The directory to store the output files. +--bpm_file | / | The path to the BPM file. +--egt_file | / | The path to the EGT file. +-w | / | The directory for intermediary files and Nextflow processes. This directory can become quite large. Ensure ample storage. +--help | false | Print this help message and exit. +''' +} + diff --git a/config/cnv_array.config b/config/cnv_array.config new file mode 100644 index 0000000..0bc3be5 --- /dev/null +++ b/config/cnv_array.config @@ -0,0 +1,75 @@ +//==================== Nextflow/Container Config ========== + +manifest { + name = "iaap_cli" + description = 'Pipeline for processing of IAAP CLI samples.' + author = 'Your Name, Your Organization' + version = "0.1.0" +} + +params { + + // Shared params + idat_folder = null + bpm_file = null + egt_file = null + output_dir = './' // Default to current directory + organize_by = 'idat' // Organize by idat folder + pubdir = './results' // Default publication directory + + // Tool-specific params + iaap_cli_version = 'v2' + container = "quay.io/jaxcompsci/gtc2vcf_with_tools:${params.iaap_cli_version}" + mem_threshold = 60.GB + low_memory = 8.GB + high_memory = 24.GB + low_time = '03:00:00' + high_time = '12:00:00' +} + +process { + withName: IAAP_CLI { + cpus = 4 + memory { params.idat_folder.size() < params.mem_threshold ? params.low_memory : params.high_memory } + time { params.idat_folder.size() < params.mem_threshold ? params.low_time : params.high_time } + container = params.container + errorStrategy { + (task.exitStatus == 140) ? { + log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n" + return 'finish' + }.call() : 'finish' + } + } +} + +// Default configuration +executor { + name = 'local' + cpus = 4 + memory = '8 GB' +} + +docker { + enabled = true +} + +report { + enabled = true + file = "${params.pubdir}/report.html" +} + +timeline { + enabled = true + file = "${params.pubdir}/timeline.html" +} + +trace { + enabled = true + file = "${params.pubdir}/trace.txt" +} + +params { + // Log parameter information + def param_log = { log.info("Parameter: ${it.key} = ${it.value}") } + params.each(param_log) +} \ No newline at end of file diff --git a/modules/illumina/iaap_cli.nf b/modules/illumina/iaap_cli.nf new file mode 100644 index 0000000..c4927f8 --- /dev/null +++ b/modules/illumina/iaap_cli.nf @@ -0,0 +1,39 @@ +process IAAP_CLI { + tag "$idat_folder" + cpus = 4 + memory { idat_folder.size() < 60.GB ? 8.GB : 24.GB } + time { idat_folder.size() < 60.GB ? '03:00:00' : '12:00:00' } + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' + + publishDir "${params.pubdir}/${params.organize_by=='idat' ? "$idat_folder" + '/results' : 'iaap_cli'}", pattern:"*.log", mode:'copy' + + input: + path idat_folder from params.idat_folder + path output_dir from params.output_dir + path bpm_file from params.bpm_file + path egt_file from params.egt_file + + output: + path "$output_dir/iaap_cli.log", emit: iaap_cli_log + + script: + """ + mkdir -p $output_dir + chmod a+w $output_dir + + echo "Running IAAP_CLI with BPM file: $bpm_file and EGT file: $egt_file" > $output_dir/iaap_cli.log + + /usr/local/bin/iaap-cli/iaap-cli gencall \ + $bpm_file \ + $egt_file \ + $output_dir \ + --idat-folder $idat_folder \ + --output-gtc >> $output_dir/iaap_cli.log 2>&1 + """ +} + +workflow { + IAAP_CLI() +} \ No newline at end of file From 9a02b9f72e4d6db36e6a93d752d2f27a9d037119 Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Mon, 22 Jul 2024 12:08:33 -0400 Subject: [PATCH 05/26] config changes --- config/cnv_array.config | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/config/cnv_array.config b/config/cnv_array.config index 0bc3be5..b928349 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -3,18 +3,17 @@ manifest { name = "iaap_cli" description = 'Pipeline for processing of IAAP CLI samples.' - author = 'Your Name, Your Organization' + author = 'Tejas Temker, Copyright Jackson Laboratory 2024' version = "0.1.0" } params { // Shared params - idat_folder = null + idat_folder = '.idat' bpm_file = null egt_file = null output_dir = './' // Default to current directory - organize_by = 'idat' // Organize by idat folder pubdir = './results' // Default publication directory // Tool-specific params From b2b59dc8bb6ac19865badabae72bcacd2ba59734 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Mon, 22 Jul 2024 14:40:07 -0400 Subject: [PATCH 06/26] csv input added, iaap_cli working --- bin/help/cnv_array.nf | 2 - bin/log/{cnv.nf => cnv_array.nf} | 13 +---- bin/shared/extract_cnv_array_csv.nf | 83 +++++++++++++++++++++++++++++ config/cnv_array.config | 71 ++---------------------- main.nf | 4 +- modules/illumina/iaap_cli.nf | 41 ++++++-------- nextflow.config | 41 +++++++------- workflows/cnv.nf | 26 --------- workflows/cnv_array.nf | 33 ++++++++++++ 9 files changed, 161 insertions(+), 153 deletions(-) rename bin/log/{cnv.nf => cnv_array.nf} (69%) create mode 100644 bin/shared/extract_cnv_array_csv.nf delete mode 100644 workflows/cnv.nf create mode 100644 workflows/cnv_array.nf diff --git a/bin/help/cnv_array.nf b/bin/help/cnv_array.nf index 45e9c16..1345dd2 100644 --- a/bin/help/cnv_array.nf +++ b/bin/help/cnv_array.nf @@ -2,8 +2,6 @@ def help() { println ''' Parameter | Default | Description ---idat_folder | / | The directory containing IDAT files. ---output_dir | / | The directory to store the output files. --bpm_file | / | The path to the BPM file. --egt_file | / | The path to the EGT file. -w | / | The directory for intermediary files and Nextflow processes. This directory can become quite large. Ensure ample storage. diff --git a/bin/log/cnv.nf b/bin/log/cnv_array.nf similarity index 69% rename from bin/log/cnv.nf rename to bin/log/cnv_array.nf index 156c1e3..0184cf3 100644 --- a/bin/log/cnv.nf +++ b/bin/log/cnv_array.nf @@ -6,14 +6,6 @@ println logo.show() def param_log(){ -if (!params.idat_folder) { - error "'--idat_folder': is not provided, it is a required parameter." -} - -if (!params.output_dir) { - error "'--output_dir': is not provided, it is a required parameter." -} - if (!params.bpm_file) { error "'--bpm_file': is not provided, it is a required parameter." } @@ -27,10 +19,9 @@ IAAP_CLI PARAMETER LOG --comment: ${params.comment} -Results Published to: ${params.output_dir} +Results Published to: ${params.pubdir} ______________________________________________________ ---idat_folder ${params.idat_folder} ---output_dir ${params.output_dir} +--csv_input ${params.csv_input} --bpm_file ${params.bpm_file} --egt_file ${params.egt_file} -w ${workDir} diff --git a/bin/shared/extract_cnv_array_csv.nf b/bin/shared/extract_cnv_array_csv.nf new file mode 100644 index 0000000..0411a57 --- /dev/null +++ b/bin/shared/extract_cnv_array_csv.nf @@ -0,0 +1,83 @@ +// Function to extract information (meta data + file(s)) from csv file(s) +// https://github.com/nf-core/sarek/blob/master/workflows/sarek.nf#L1084 + +ANSI_RED = "\u001B[31m"; +ANSI_RESET = "\u001B[0m"; + +def extract_csv(csv_file) { + // check that the sample sheet is not 1 line or less, because it'll skip all subsequent checks if so. + file(csv_file).withReader('UTF-8') { reader -> + def line, numberOfLinesInSampleSheet = 0; + while ((line = reader.readLine()) != null) {numberOfLinesInSampleSheet++} + if (numberOfLinesInSampleSheet < 2) { + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "Samplesheet had less than two lines. The sample sheet must be a csv file with a header, so at least two lines." + ANSI_RESET) + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.exit(1) + } + } + + Channel.from(csv_file).splitCsv(header: true) + .map{ row -> + if (!(row.sampleID) || !(row.idat_red || !(row.idat_green))){ + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "Missing field in csv file header. The csv file must have fields: 'sampleID', 'idat_red', 'idat_green'." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.exit(1) + } + [row.sampleID.toString(), row] + }.groupTuple() + .map{ meta, rows -> + size = rows.size() + [rows, size] + }.transpose() + .map{ row, numLanes -> //from here do the usual thing for csv parsing + + + // Metadata to identify samplesheet + def meta = [:] + + if (row.sampleID) meta.sampleID = row.sampleID.toString() + + if (row.gender != "XY" && row.gender != "XX" && row.gender != ""){ + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "Geneder must be 'XX', 'XY' or empty. " + row.gender + " was provided, and isn't valid." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.exit(1) + } + + if (row.gender == "") { + meta.gender = 'NA' + } else { + meta.gender = row.gender.toString() + } + + // join meta to idat, and check file existence. + try { + file(row.idat_red, checkIfExists: true) + } + catch (Exception e) { + System.err.println(ANSI_RED + "---------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "The file: " + row.idat_red + " does not exist. Use absolute paths, and check for correctness." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "---------------------------------------------" + ANSI_RESET) + System.exit(1) + } + try { + file(row.idat_green, checkIfExists: true) + } + catch (Exception e) { + System.err.println(ANSI_RED + "---------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "The file: " + row.idat_green + " does not exist. Use absolute paths, and check for correctness." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "---------------------------------------------" + ANSI_RESET) + System.exit(1) + } + + return [meta.sampleID, meta, row.idat_red, row.idat_green] + + + } +} diff --git a/config/cnv_array.config b/config/cnv_array.config index b928349..26122e2 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -1,74 +1,13 @@ //==================== Nextflow/Container Config ========== manifest { - name = "iaap_cli" - description = 'Pipeline for processing of IAAP CLI samples.' - author = 'Tejas Temker, Copyright Jackson Laboratory 2024' + name = "cnv_array" + description = 'Pipeline for processing Copy Number Variation from Illumina Genotype Array.' + author = 'Tejas Temker, Michael Lloyd, Copyright Jackson Laboratory 2024' version = "0.1.0" } params { - - // Shared params - idat_folder = '.idat' - bpm_file = null - egt_file = null - output_dir = './' // Default to current directory - pubdir = './results' // Default publication directory - - // Tool-specific params - iaap_cli_version = 'v2' - container = "quay.io/jaxcompsci/gtc2vcf_with_tools:${params.iaap_cli_version}" - mem_threshold = 60.GB - low_memory = 8.GB - high_memory = 24.GB - low_time = '03:00:00' - high_time = '12:00:00' -} - -process { - withName: IAAP_CLI { - cpus = 4 - memory { params.idat_folder.size() < params.mem_threshold ? params.low_memory : params.high_memory } - time { params.idat_folder.size() < params.mem_threshold ? params.low_time : params.high_time } - container = params.container - errorStrategy { - (task.exitStatus == 140) ? { - log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n" - return 'finish' - }.call() : 'finish' - } - } -} - -// Default configuration -executor { - name = 'local' - cpus = 4 - memory = '8 GB' -} - -docker { - enabled = true + bpm_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' + egt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' } - -report { - enabled = true - file = "${params.pubdir}/report.html" -} - -timeline { - enabled = true - file = "${params.pubdir}/timeline.html" -} - -trace { - enabled = true - file = "${params.pubdir}/trace.txt" -} - -params { - // Log parameter information - def param_log = { log.info("Parameter: ${it.key} = ${it.value}") } - params.each(param_log) -} \ No newline at end of file diff --git a/main.nf b/main.nf index f8b2143..73a303a 100644 --- a/main.nf +++ b/main.nf @@ -49,7 +49,7 @@ else if (params.workflow == "amplicon"){ include {AMPLICON} from './workflows/amplicon' } else if (params.workflow == "cnv_array"){ - include {CNV} from './workflows/cnv_array' + include {CNV_ARRAY} from './workflows/cnv_array' } else { // if workflow name is not supported: @@ -104,8 +104,6 @@ workflow{ AMPLICON() } if (params.workflow == "cnv_array"){ - CNV_ARRAY() - } } diff --git a/modules/illumina/iaap_cli.nf b/modules/illumina/iaap_cli.nf index c4927f8..ad57ed5 100644 --- a/modules/illumina/iaap_cli.nf +++ b/modules/illumina/iaap_cli.nf @@ -1,39 +1,32 @@ process IAAP_CLI { - tag "$idat_folder" + + tag "$sampleID" + cpus = 4 - memory { idat_folder.size() < 60.GB ? 8.GB : 24.GB } - time { idat_folder.size() < 60.GB ? '03:00:00' : '12:00:00' } - errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.mem} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + memory 24.GB + time '01:30:00' + + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' - publishDir "${params.pubdir}/${params.organize_by=='idat' ? "$idat_folder" + '/results' : 'iaap_cli'}", pattern:"*.log", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'iaap_cli' }", pattern: "*.gtc", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'iaap_cli' }", pattern: "*.log", mode:'copy' input: - path idat_folder from params.idat_folder - path output_dir from params.output_dir - path bpm_file from params.bpm_file - path egt_file from params.egt_file + tuple val(sampleID), val(meta), path(red_idat), path(green_idat) output: - path "$output_dir/iaap_cli.log", emit: iaap_cli_log + tuple val(sampleID), val(meta), path("*.gtc"), emit: gtc + path "iaap_cli.log", emit: iaap_cli_log script: """ - mkdir -p $output_dir - chmod a+w $output_dir - - echo "Running IAAP_CLI with BPM file: $bpm_file and EGT file: $egt_file" > $output_dir/iaap_cli.log - /usr/local/bin/iaap-cli/iaap-cli gencall \ - $bpm_file \ - $egt_file \ - $output_dir \ - --idat-folder $idat_folder \ - --output-gtc >> $output_dir/iaap_cli.log 2>&1 + ${params.bpm_file} \ + ${params.egt_file} \ + ./ \ + --idat-folder ./ \ + --output-gtc >> iaap_cli.log 2>&1 """ } - -workflow { - IAAP_CLI() -} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 3fdcb69..dcb0c13 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,29 +1,27 @@ -/*___________________________________________________ - - Nextflow DSL2 Main Config - - Authors: Anuj Srivastava, Carolyn Paisie, Barry Guglielmo, Michael Lloyd, Brian Sanderson, Sai Lek, Harshpreet Chandok, Peter Fields - Copyright of Jackson Laboratories 2022 - -_____________________________________________________*/ - params { - // Select workflow - workflow = 'rnaseq' + // set workflow + pipeline = 'Not_Specified' + workflow = params.pipeline + + // define reference_cache directory + reference_cache='/projects/omics_share' - // select config from config folder to use + // select config from config folder to use based on workflow config = "config/${params.workflow}.config" // set publish directory for data to save (easier to follow) - pubdir = "../${workflow}" + pubdir = "/flashscratch/${USER}" + + profile = 'sumner2' // organize output: // by sample folders (with many analysis in one sample folder) or by // analysis folder (with many samples in one folder per analysis) - organize_by = 'sample' // analysis keep_intermediate = false // true - + fastq2 = true // default is PE for workflows + tmpdir = "/flashscratch/${USER}" // generic param + // get help help = null @@ -31,13 +29,12 @@ params { comment = '' } -// specific config for the pipeline - try { includeConfig params.config } catch (Exception e) { - System.err.println("ERROR: Could not load ${params.config} check that you are using a valid pipeline name") + System.err.println("ERROR: Could not load ${params.config} check that you are using a valid workflow name") + System.exit(1) } // work directory is important as it will be large, plan accordingly @@ -47,13 +44,15 @@ manifest { name = "The Jackson Laboratory Computational Sciences Nextflow based analysis pipelines" homePage = "https://github.com/TheJacksonLaboratory/cs-nf-pipelines" mainScript = "main.nf" - nextflowVersion = "!>=20.10.0" - version = "0.4.1" + nextflowVersion = "!>=22.04.3" + version = "PIVOT" + author = 'Michael Lloyd, Brian Sanderson, Barry Guglielmo, Sai Lek, Peter Fields, Harshpreet Chandok, Carolyn Paisie, Gabriel Rech, Ardian Ferraj, Tejas Temker, Anuj Srivastava. Copyright Jackson Laboratory 2024' } + profiles { sumner { includeConfig "config/profiles/sumner.config" } - sumner2 { includeConfig "config/profiles/sumner2.config" } + sumner2 { includeConfig "config/profiles/sumner2.config" } elion { includeConfig "config/profiles/elion.config" } } diff --git a/workflows/cnv.nf b/workflows/cnv.nf deleted file mode 100644 index ec53547..0000000 --- a/workflows/cnv.nf +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -// Import modules -include { IAAP_CLI } from "${projectDir}/modules/illumina/iaap_cli" -include { help } from "${projectDir}/bin/help/cnv.nf" -include { param_log } from "${projectDir}/bin/log/cnv.nf" - -// Help if needed -if (params.help) { - help() - exit 0 -} -// Log parameter info -param_log() -// Parameter validation -if (!params.idat_folder || !params.bpm_file || !params.egt_file) { - exit 1, "All parameters (idat_folder, bpm_file, egt_file) are required." -} -errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} -// Main workflow -workflow CNV_ARRAY { - IAAP_CLI( - idat_folder: params.idat_folder - ) -} \ No newline at end of file diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf new file mode 100644 index 0000000..4ae7dc4 --- /dev/null +++ b/workflows/cnv_array.nf @@ -0,0 +1,33 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +// Import modules +include {help} from "${projectDir}/bin/help/cnv_array.nf" +include {param_log} from "${projectDir}/bin/log/cnv_array.nf" +include {extract_csv} from "${projectDir}/bin/shared/extract_cnv_array_csv.nf" +include {IAAP_CLI} from "${projectDir}/modules/illumina/iaap_cli" + +// Help if needed +if (params.help) { + help() + exit 0 +} + +// Log parameter info +param_log() +// Parameter validation +if (!params.bpm_file || !params.egt_file) { + exit 1, "All parameters (idat_folder, bpm_file, egt_file) are required." +} + +if (params.csv_input) { + ch_input = extract_csv(file(params.csv_input, checkIfExists: true)) +} else { + exit 1, "Workflow requires a CSV manifest. See `--help` for information." +} + +// Main workflow +workflow CNV_ARRAY { + IAAP_CLI(ch_input) + IAAP_CLI.out.gtc.view() +} From 2a7e256e22670513e893e3706b823ea5b5ca2580 Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Wed, 24 Jul 2024 11:47:41 -0400 Subject: [PATCH 07/26] BCFtools workflow --- bin/help/cnv_array.nf | 17 +++++++++++++++++ bin/log/cnv_array.nf | 39 ++++++++++++++++++++++++++++++++++++++- config/cnv_array.config | 5 +++++ workflows/cnv_array.nf | 20 ++++++++++++++++++++ 4 files changed, 80 insertions(+), 1 deletion(-) diff --git a/bin/help/cnv_array.nf b/bin/help/cnv_array.nf index 1345dd2..55da977 100644 --- a/bin/help/cnv_array.nf +++ b/bin/help/cnv_array.nf @@ -6,6 +6,23 @@ Parameter | Default | Description --egt_file | / | The path to the EGT file. -w | / | The directory for intermediary files and Nextflow processes. This directory can become quite large. Ensure ample storage. --help | false | Print this help message and exit. + ++gtc2vcf --no-version -Ou \ +--bpm | / | The path to the BPM file +--csv | / | The path to csv file +--egt | / | The patht to egt file +--gtcs | / | The path to gtgc output +--fasta-ref | / | The path to reference +--extra | / | The path to output directory + +bcftools sort -Ou -T ./bcftools. | \ +bcftools norm --no-version -Ob -c x -f ${fasta} | \ +tee bcftools_convert.bcf | \ + +bcftools index --force --output bcftools_convert.bcf.csi +bcftools convert -O v -o bcftools_convert.vcf bcftools_convert.bcf + """ + ''' } diff --git a/bin/log/cnv_array.nf b/bin/log/cnv_array.nf index 0184cf3..27194f7 100644 --- a/bin/log/cnv_array.nf +++ b/bin/log/cnv_array.nf @@ -14,6 +14,18 @@ if (!params.egt_file) { error "'--egt_file': is not provided, it is a required parameter." } +if (!params.csv_file) { + error "'--csv_file': is not provided, it is a required parameter." +} + +if (!params.fasta_file) { + error "'--fasta_file': is not provided, it is a required parameter." +} + +if (!params.tsv_file) { + error "'--tsv_file': is not provided, it is a required parameter." +} + log.info """ IAAP_CLI PARAMETER LOG @@ -34,4 +46,29 @@ Command line call: ${workflow.commandLine} ______________________________________________________ """ -} \ No newline at end of file +} + + +log.info """ +BCFTOOLS_GTC2VCF PARAMETER LOG + +--comment: ${params.comment} + +Results Published to: ${params.pubdir} +______________________________________________________ +--bpm_file ${params.bpm_file} +--csv_file ${params.csv_file} +--egt_file ${params.egt_file} +--fasta_file ${params.fasta_file} +--tsv_file ${params.tsv_file} +-w ${workDir} +--keep_intermediate ${params.keep_intermediate} +-c ${params.config} + +Project Directory: ${projectDir} + +Command line call: +${workflow.commandLine} +______________________________________________________ +""" +} diff --git a/config/cnv_array.config b/config/cnv_array.config index 26122e2..2fc65ef 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -10,4 +10,9 @@ manifest { params { bpm_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' egt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' + + csv_file = 'path/to/default.csv' + fasta_file = 'path/to/default.fasta' + tsv_file = 'path/to/default.tsv' + pubdir = 'path/to/output' } diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index 4ae7dc4..46a2ad2 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -6,6 +6,7 @@ include {help} from "${projectDir}/bin/help/cnv_array.nf" include {param_log} from "${projectDir}/bin/log/cnv_array.nf" include {extract_csv} from "${projectDir}/bin/shared/extract_cnv_array_csv.nf" include {IAAP_CLI} from "${projectDir}/modules/illumina/iaap_cli" +include {BCFTOOLS_GTC2VCF} from "${projectDir}/modules/bcftools_gtc2vcf.nf" // Help if needed if (params.help) { @@ -30,4 +31,23 @@ if (params.csv_input) { workflow CNV_ARRAY { IAAP_CLI(ch_input) IAAP_CLI.out.gtc.view() + + // Define paths for BCFTOOLS_GTC2VCF inputs + bpm_file = file(params.bpm_file) + csv_file = file(params.csv_file) + egt_file = file(params.egt_file) + gtcs_dir = IAAP_CLI.out.gtc + fasta_file = file(params.fasta_file) + tsv_file = file(params.tsv_file) + + // Call BCFTOOLS_GTC2VCF process + BCFTOOLS_GTC2VCF(bpm_file, csv_file, egt_file, gtcs_dir, fasta_file, tsv_file) + + // result.view { files -> + // println "BCF: ${files[0]}" + // println "CSI: ${files[1]}" + // println "VCF: ${files[2]}" + // println "TSV: ${files[3]}" + // } + BCFTOOLS_GTC2VCF.out.gtc.view() } From 49e7a63ab6c050c2b6867f54db575576872596aa Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Wed, 24 Jul 2024 16:15:35 +0000 Subject: [PATCH 08/26] cnv_array.nf edited online with Bitbucket --- workflows/cnv_array.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index 46a2ad2..b234524 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -6,7 +6,7 @@ include {help} from "${projectDir}/bin/help/cnv_array.nf" include {param_log} from "${projectDir}/bin/log/cnv_array.nf" include {extract_csv} from "${projectDir}/bin/shared/extract_cnv_array_csv.nf" include {IAAP_CLI} from "${projectDir}/modules/illumina/iaap_cli" -include {BCFTOOLS_GTC2VCF} from "${projectDir}/modules/bcftools_gtc2vcf.nf" +include {BCFTOOLS_GTC2VCF} from "${projectDir}/modules/bcftools/bcftools_gtct2vcf.nf" // Help if needed if (params.help) { From edbbb1c7657de67076206831d7c5bf151dd108e1 Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Wed, 24 Jul 2024 12:22:03 -0400 Subject: [PATCH 09/26] adding bcftools_gtct2vcf.nf file --- modules/bcftools/bcftools_gtct2vcf.nf | 34 +++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 modules/bcftools/bcftools_gtct2vcf.nf diff --git a/modules/bcftools/bcftools_gtct2vcf.nf b/modules/bcftools/bcftools_gtct2vcf.nf new file mode 100644 index 0000000..70c37df --- /dev/null +++ b/modules/bcftools/bcftools_gtct2vcf.nf @@ -0,0 +1,34 @@ +// bcftools_gtc2vcf.nf + +process BCFTOOLS_GTC2VCF { + + cpus = 4 + memory 24.GB + time '01:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' + publishDir "${params.pubdir}", mode: 'copy' + + input: + tuple path(bpm), path(csv), path(egt), path(gtcs_dir), path(fasta), path(tsv) + + output: + tuple path('bcftools_convert.bcf'), path('bcftools_convert.bcf.csi'), path('bcftools_convert.vcf'), path('bcftools_convert.tsv') + + script: + """ + bcftools +gtc2vcf --no-version -Ou \ + --bpm ${bpm} \ + --csv ${csv} \ + --egt ${egt} \ + --gtcs ${gtcs_dir} \ + --fasta-ref ${fasta} \ + --extra ${tsv} | \ + bcftools sort -Ou -T ./bcftools. | \ + bcftools norm --no-version -Ob -c x -f ${fasta} | \ + tee bcftools_convert.bcf | \ + bcftools index --force --output bcftools_convert.bcf.csi + bcftools convert -O v -o bcftools_convert.vcf bcftools_convert.bcf + """ +} \ No newline at end of file From 6a276cc40feccb538fd355ce1323ff42a703dea1 Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Fri, 26 Jul 2024 12:55:03 -0400 Subject: [PATCH 10/26] gtc2vcf files --- bin/help/cnv_array.nf | 41 +++++---- bin/log/cnv_array.nf | 118 ++++++++++---------------- config/cnv_array.config | 11 +-- modules/bcftools/bcftools_gtct2vcf.nf | 18 +++- workflows/cnv_array.nf | 25 ++---- 5 files changed, 87 insertions(+), 126 deletions(-) diff --git a/bin/help/cnv_array.nf b/bin/help/cnv_array.nf index 55da977..2024b45 100644 --- a/bin/help/cnv_array.nf +++ b/bin/help/cnv_array.nf @@ -1,28 +1,25 @@ def help() { println ''' -Parameter | Default | Description +Parameter | Default | Description +-----------------|---------|--------------------------------------------------------------------------- +--bpm_file | / | The path to the BPM file. +--egt_file | / | The path to the EGT file. +-w | / | The directory for intermediary files and Nextflow processes. Ensure ample storage. +--help | false | Print this help message and exit. ---bpm_file | / | The path to the BPM file. ---egt_file | / | The path to the EGT file. --w | / | The directory for intermediary files and Nextflow processes. This directory can become quite large. Ensure ample storage. ---help | false | Print this help message and exit. - -+gtc2vcf --no-version -Ou \ ---bpm | / | The path to the BPM file ---csv | / | The path to csv file ---egt | / | The patht to egt file ---gtcs | / | The path to gtgc output ---fasta-ref | / | The path to reference ---extra | / | The path to output directory - -bcftools sort -Ou -T ./bcftools. | \ -bcftools norm --no-version -Ob -c x -f ${fasta} | \ -tee bcftools_convert.bcf | \ - -bcftools index --force --output bcftools_convert.bcf.csi -bcftools convert -O v -o bcftools_convert.vcf bcftools_convert.bcf - """ +--bpm | / | The path to the BPM file. +--csv | / | The path to the CSV file. +--egt | / | The path to the EGT file. +--gtcs | / | The path to GTC output. +--fasta-ref | / | The path to the reference FASTA file. +--extra | / | The path to the output directory. +// Example usage of BCFTOOLS: +// --------------------------- +// bcftools sort -Ou -T ./bcftools. | \\ +// bcftools norm --no-version -Ob -c x -f | \\ +// tee bcftools_convert.bcf | \\ +// bcftools index --force --output bcftools_convert.bcf.csi +// bcftools convert -O v -o bcftools_convert.vcf bcftools_convert.bcf ''' } - diff --git a/bin/log/cnv_array.nf b/bin/log/cnv_array.nf index 27194f7..2f9dfbf 100644 --- a/bin/log/cnv_array.nf +++ b/bin/log/cnv_array.nf @@ -1,74 +1,46 @@ -import Logos - -logo = new Logo() -println '\n' -println logo.show() - -def param_log(){ - -if (!params.bpm_file) { - error "'--bpm_file': is not provided, it is a required parameter." -} - -if (!params.egt_file) { - error "'--egt_file': is not provided, it is a required parameter." -} - -if (!params.csv_file) { - error "'--csv_file': is not provided, it is a required parameter." -} - -if (!params.fasta_file) { - error "'--fasta_file': is not provided, it is a required parameter." -} - -if (!params.tsv_file) { - error "'--tsv_file': is not provided, it is a required parameter." -} - -log.info """ -IAAP_CLI PARAMETER LOG - ---comment: ${params.comment} - -Results Published to: ${params.pubdir} -______________________________________________________ ---csv_input ${params.csv_input} ---bpm_file ${params.bpm_file} ---egt_file ${params.egt_file} --w ${workDir} ---keep_intermediate ${params.keep_intermediate} --c ${params.config} - -Project Directory: ${projectDir} - -Command line call: -${workflow.commandLine} -______________________________________________________ -""" -} - - -log.info """ -BCFTOOLS_GTC2VCF PARAMETER LOG - ---comment: ${params.comment} - -Results Published to: ${params.pubdir} -______________________________________________________ ---bpm_file ${params.bpm_file} ---csv_file ${params.csv_file} ---egt_file ${params.egt_file} ---fasta_file ${params.fasta_file} ---tsv_file ${params.tsv_file} --w ${workDir} ---keep_intermediate ${params.keep_intermediate} --c ${params.config} - -Project Directory: ${projectDir} - -Command line call: -${workflow.commandLine} -______________________________________________________ -""" +def param_log() { + // Check required parameters + if (!params.bpm_file) { + error "'--bpm_file': is not provided, it is a required parameter." + } + + if (!params.egt_file) { + error "'--egt_file': is not provided, it is a required parameter." + } + + if (!params.csv_file) { + error "'--csv_file': is not provided, it is a required parameter." + } + + if (!params.fasta_file) { + error "'--fasta_file': is not provided, it is a required parameter." + } + + if (!params.tsv_file) { + error "'--tsv_file': is not provided, it is a required parameter." + } + + // Log parameter information + log.info """ + CNV_ARRAY PARAMETER LOG + + --comment: ${params.comment ?: 'N/A'} + + Results Published to: ${params.pubdir ?: 'N/A'} + ______________________________________________________ + --idat_folder ${params.idat_folder ?: 'N/A'} + --bpm_file ${params.bpm_file} + --egt_file ${params.egt_file} + --csv_file ${params.csv_file} + --fasta_file ${params.fasta_file} + --tsv_file ${params.tsv_file} + -w ${workDir} + --keep_intermediate ${params.keep_intermediate ?: 'N/A'} + -c ${params.config ?: 'N/A'} + + Project Directory: ${projectDir} + Command line call: + ${workflow.commandLine} + ______________________________________________________ + """ } diff --git a/config/cnv_array.config b/config/cnv_array.config index 2fc65ef..7bd7d28 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -1,5 +1,3 @@ -//==================== Nextflow/Container Config ========== - manifest { name = "cnv_array" description = 'Pipeline for processing Copy Number Variation from Illumina Genotype Array.' @@ -10,9 +8,8 @@ manifest { params { bpm_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' egt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' - - csv_file = 'path/to/default.csv' - fasta_file = 'path/to/default.fasta' - tsv_file = 'path/to/default.tsv' - pubdir = 'path/to/output' + idat_folder = '/home/temket/cnv_workflow/data/raw_idat' + gtc_file = '/flashscratch/lloydm/CNV_test/example_sample_input.csv' + ref_fa = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.csv' + tsv_file = '/home/temket/cnv_workflow/output/bcftools_convert.tsv' } diff --git a/modules/bcftools/bcftools_gtct2vcf.nf b/modules/bcftools/bcftools_gtct2vcf.nf index 70c37df..a0aecf4 100644 --- a/modules/bcftools/bcftools_gtct2vcf.nf +++ b/modules/bcftools/bcftools_gtct2vcf.nf @@ -1,7 +1,17 @@ -// bcftools_gtc2vcf.nf +def prepare_bcftools_inputs(ch_input) { + bpm_file = file(params.bpm_file) + csv_file = file(params.csv_input) + egt_file = file(params.egt_file) + gtcs_dir = IAAP_CLI.out.gtc + fasta_file = file(params.fasta_file) + tsv_file = file(params.tsv_file) + return tuple(bpm_file, csv_file, egt_file, gtcs_dir, fasta_file, tsv_file) +} + + +// Define BCFTOOLS_GTC2VCF process process BCFTOOLS_GTC2VCF { - cpus = 4 memory 24.GB time '01:30:00' @@ -11,7 +21,7 @@ process BCFTOOLS_GTC2VCF { publishDir "${params.pubdir}", mode: 'copy' input: - tuple path(bpm), path(csv), path(egt), path(gtcs_dir), path(fasta), path(tsv) + tuple path(bpm_file), path(csv_file), path(egt_file), path(gtcs_dir), path(fasta_file), path(tsv_file) from prepare_bcftools_inputs output: tuple path('bcftools_convert.bcf'), path('bcftools_convert.bcf.csi'), path('bcftools_convert.vcf'), path('bcftools_convert.tsv') @@ -31,4 +41,4 @@ process BCFTOOLS_GTC2VCF { bcftools index --force --output bcftools_convert.bcf.csi bcftools convert -O v -o bcftools_convert.vcf bcftools_convert.bcf """ -} \ No newline at end of file +} diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index b234524..177872d 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -5,7 +5,7 @@ nextflow.enable.dsl=2 include {help} from "${projectDir}/bin/help/cnv_array.nf" include {param_log} from "${projectDir}/bin/log/cnv_array.nf" include {extract_csv} from "${projectDir}/bin/shared/extract_cnv_array_csv.nf" -include {IAAP_CLI} from "${projectDir}/modules/illumina/iaap_cli" +include {IAAP_CLI} from "${projectDir}/modules/illumina/iaap_cli.nf" include {BCFTOOLS_GTC2VCF} from "${projectDir}/modules/bcftools/bcftools_gtct2vcf.nf" // Help if needed @@ -27,27 +27,12 @@ if (params.csv_input) { exit 1, "Workflow requires a CSV manifest. See `--help` for information." } +// Extract CSV input +ch_input = extract_csv(file(params.csv_input, checkIfExists: true)) + // Main workflow workflow CNV_ARRAY { IAAP_CLI(ch_input) IAAP_CLI.out.gtc.view() - - // Define paths for BCFTOOLS_GTC2VCF inputs - bpm_file = file(params.bpm_file) - csv_file = file(params.csv_file) - egt_file = file(params.egt_file) - gtcs_dir = IAAP_CLI.out.gtc - fasta_file = file(params.fasta_file) - tsv_file = file(params.tsv_file) - - // Call BCFTOOLS_GTC2VCF process - BCFTOOLS_GTC2VCF(bpm_file, csv_file, egt_file, gtcs_dir, fasta_file, tsv_file) - - // result.view { files -> - // println "BCF: ${files[0]}" - // println "CSI: ${files[1]}" - // println "VCF: ${files[2]}" - // println "TSV: ${files[3]}" - // } - BCFTOOLS_GTC2VCF.out.gtc.view() + BCFTOOLS_GTC2VCF(prepare_bcftools_inputs) } From cf89bbfc0e7cdf9efe9e243e21607c9e33d9b5c8 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Fri, 26 Jul 2024 14:03:21 -0400 Subject: [PATCH 11/26] working through gtc2vcf --- bin/log/cnv_array.nf | 12 ++++---- config/cnv_array.config | 4 +-- modules/bcftools/bcftools_gtct2vcf.nf | 43 ++++++++++----------------- modules/illumina/iaap_cli.nf | 2 +- workflows/cnv_array.nf | 4 +-- 5 files changed, 27 insertions(+), 38 deletions(-) diff --git a/bin/log/cnv_array.nf b/bin/log/cnv_array.nf index 2f9dfbf..75d2990 100644 --- a/bin/log/cnv_array.nf +++ b/bin/log/cnv_array.nf @@ -8,12 +8,12 @@ def param_log() { error "'--egt_file': is not provided, it is a required parameter." } - if (!params.csv_file) { - error "'--csv_file': is not provided, it is a required parameter." + if (!params.gtc_csv) { + error "'--gtc_csv': is not provided, it is a required parameter." } - if (!params.fasta_file) { - error "'--fasta_file': is not provided, it is a required parameter." + if (!params.ref_fa) { + error "'--ref_fa': is not provided, it is a required parameter." } if (!params.tsv_file) { @@ -31,8 +31,8 @@ def param_log() { --idat_folder ${params.idat_folder ?: 'N/A'} --bpm_file ${params.bpm_file} --egt_file ${params.egt_file} - --csv_file ${params.csv_file} - --fasta_file ${params.fasta_file} + --gtc_csv ${params.gtc_csv} + --ref_fa ${params.ref_fa} --tsv_file ${params.tsv_file} -w ${workDir} --keep_intermediate ${params.keep_intermediate ?: 'N/A'} diff --git a/config/cnv_array.config b/config/cnv_array.config index 7bd7d28..eb03404 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -9,7 +9,7 @@ params { bpm_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' egt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' idat_folder = '/home/temket/cnv_workflow/data/raw_idat' - gtc_file = '/flashscratch/lloydm/CNV_test/example_sample_input.csv' - ref_fa = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.csv' + gtc_csv = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.csv' + ref_fa = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' tsv_file = '/home/temket/cnv_workflow/output/bcftools_convert.tsv' } diff --git a/modules/bcftools/bcftools_gtct2vcf.nf b/modules/bcftools/bcftools_gtct2vcf.nf index a0aecf4..f047a36 100644 --- a/modules/bcftools/bcftools_gtct2vcf.nf +++ b/modules/bcftools/bcftools_gtct2vcf.nf @@ -1,44 +1,33 @@ -def prepare_bcftools_inputs(ch_input) { - bpm_file = file(params.bpm_file) - csv_file = file(params.csv_input) - egt_file = file(params.egt_file) - gtcs_dir = IAAP_CLI.out.gtc - fasta_file = file(params.fasta_file) - tsv_file = file(params.tsv_file) - - return tuple(bpm_file, csv_file, egt_file, gtcs_dir, fasta_file, tsv_file) -} - - // Define BCFTOOLS_GTC2VCF process process BCFTOOLS_GTC2VCF { - cpus = 4 + cpus = 1 memory 24.GB time '01:30:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' - publishDir "${params.pubdir}", mode: 'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'bcftools' }", mode: 'copy' input: - tuple path(bpm_file), path(csv_file), path(egt_file), path(gtcs_dir), path(fasta_file), path(tsv_file) from prepare_bcftools_inputs + tuple val(sampleID), val(meta), path(gtc) output: - tuple path('bcftools_convert.bcf'), path('bcftools_convert.bcf.csi'), path('bcftools_convert.vcf'), path('bcftools_convert.tsv') - + tuple val(sampleID), val(meta), path('*_convert.bcf'), path('*_convert.bcf.csi'), path('*_convert.vcf'), path('*_convert.tsv'), emit: gtc2vcf + // tuple val(sampleID), val(meta), path('*.BAF'), path('*.LRR'), emit: baf_lrr + script: """ bcftools +gtc2vcf --no-version -Ou \ - --bpm ${bpm} \ - --csv ${csv} \ - --egt ${egt} \ - --gtcs ${gtcs_dir} \ - --fasta-ref ${fasta} \ - --extra ${tsv} | \ + --bpm ${params.bpm_file} \ + --csv ${params.gtc_csv} \ + --egt ${params.egt_file} \ + --gtcs ./ \ + --fasta-ref ${params.ref_fa} \ + --extra ${sampleID}_convert.tsv | \ bcftools sort -Ou -T ./bcftools. | \ - bcftools norm --no-version -Ob -c x -f ${fasta} | \ - tee bcftools_convert.bcf | \ - bcftools index --force --output bcftools_convert.bcf.csi - bcftools convert -O v -o bcftools_convert.vcf bcftools_convert.bcf + bcftools norm --no-version -Ob -c x -f ${params.ref_fa} | \ + tee ${sampleID}_convert.bcf | \ + bcftools index --force --output ${sampleID}_convert.bcf.csi + bcftools convert -O v -o ${sampleID}_convert.vcf ${sampleID}_convert.bcf """ } diff --git a/modules/illumina/iaap_cli.nf b/modules/illumina/iaap_cli.nf index ad57ed5..8bea6aa 100644 --- a/modules/illumina/iaap_cli.nf +++ b/modules/illumina/iaap_cli.nf @@ -2,7 +2,7 @@ process IAAP_CLI { tag "$sampleID" - cpus = 4 + cpus = 1 memory 24.GB time '01:30:00' diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index 177872d..ec996a5 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -33,6 +33,6 @@ ch_input = extract_csv(file(params.csv_input, checkIfExists: true)) // Main workflow workflow CNV_ARRAY { IAAP_CLI(ch_input) - IAAP_CLI.out.gtc.view() - BCFTOOLS_GTC2VCF(prepare_bcftools_inputs) + BCFTOOLS_GTC2VCF(IAAP_CLI.out.gtc) + BCFTOOLS_GTC2VCF.out.gtc2vcf.view() } From 5acfdc0a614b806a27c2629e87d1ba19994fe39b Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Tue, 30 Jul 2024 09:47:27 -0400 Subject: [PATCH 12/26] adding bcftools query module --- bin/help/cnv_array.nf | 7 ------- config/cnv_array.config | 5 +++-- modules/bcftools/bcftools_gtct2vcf.nf | 2 +- workflows/cnv_array.nf | 5 +++++ 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/bin/help/cnv_array.nf b/bin/help/cnv_array.nf index 2024b45..5eef671 100644 --- a/bin/help/cnv_array.nf +++ b/bin/help/cnv_array.nf @@ -14,12 +14,5 @@ Parameter | Default | Description --fasta-ref | / | The path to the reference FASTA file. --extra | / | The path to the output directory. -// Example usage of BCFTOOLS: -// --------------------------- -// bcftools sort -Ou -T ./bcftools. | \\ -// bcftools norm --no-version -Ob -c x -f | \\ -// tee bcftools_convert.bcf | \\ -// bcftools index --force --output bcftools_convert.bcf.csi -// bcftools convert -O v -o bcftools_convert.vcf bcftools_convert.bcf ''' } diff --git a/config/cnv_array.config b/config/cnv_array.config index eb03404..7ebb716 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -6,10 +6,11 @@ manifest { } params { + gtc_csv = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.csv' bpm_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' egt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' - idat_folder = '/home/temket/cnv_workflow/data/raw_idat' - gtc_csv = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.csv' + idat_folder = '/home/temket/cnv_workflow/data/raw_idat/' ref_fa = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' tsv_file = '/home/temket/cnv_workflow/output/bcftools_convert.tsv' + } diff --git a/modules/bcftools/bcftools_gtct2vcf.nf b/modules/bcftools/bcftools_gtct2vcf.nf index f047a36..ece9c2c 100644 --- a/modules/bcftools/bcftools_gtct2vcf.nf +++ b/modules/bcftools/bcftools_gtct2vcf.nf @@ -13,7 +13,7 @@ process BCFTOOLS_GTC2VCF { output: tuple val(sampleID), val(meta), path('*_convert.bcf'), path('*_convert.bcf.csi'), path('*_convert.vcf'), path('*_convert.tsv'), emit: gtc2vcf - // tuple val(sampleID), val(meta), path('*.BAF'), path('*.LRR'), emit: baf_lrr + script: """ diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index ec996a5..1f23504 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -7,6 +7,8 @@ include {param_log} from "${projectDir}/bin/log/cnv_array.nf" include {extract_csv} from "${projectDir}/bin/shared/extract_cnv_array_csv.nf" include {IAAP_CLI} from "${projectDir}/modules/illumina/iaap_cli.nf" include {BCFTOOLS_GTC2VCF} from "${projectDir}/modules/bcftools/bcftools_gtct2vcf.nf" +include {BCFTOOLS_QUERY_ASCAT} from "${projectDir}/modules/bcftools/bcftools_query_ascat.nf" + // Help if needed if (params.help) { @@ -35,4 +37,7 @@ workflow CNV_ARRAY { IAAP_CLI(ch_input) BCFTOOLS_GTC2VCF(IAAP_CLI.out.gtc) BCFTOOLS_GTC2VCF.out.gtc2vcf.view() + BCFTOOLS_QUERY_ASCAT(BCFTOOLS_GTC2VCF.out.gtc2vcf) + BCFTOOLS_QUERY_ASCAT.out.bcftools_query.view() + } From bab489c4c53b2efe9fc3f75ec151cfe0a48f1266 Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Tue, 30 Jul 2024 10:12:35 -0400 Subject: [PATCH 13/26] module --- modules/bcftools/bcftools_query_ascat.nf | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 modules/bcftools/bcftools_query_ascat.nf diff --git a/modules/bcftools/bcftools_query_ascat.nf b/modules/bcftools/bcftools_query_ascat.nf new file mode 100644 index 0000000..426af84 --- /dev/null +++ b/modules/bcftools/bcftools_query_ascat.nf @@ -0,0 +1,25 @@ +process BCFTOOLS_QUERY_ASCAT { + cpus 1 + memory 8.GB + time '01:00:00' + errorStrategy 'finish' + + container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' + publishDir "${params.pubdir}/${params.organize_by == 'sample' ? sampleID : 'bcftools'}", mode: 'copy' + + input: + tuple val(sampleID), val(meta), path(bcf), path(csi), path(vcf), path(tsv) + + output: + tuple val(sampleID), val(meta), path('*_convert.BAF'), path('*_convert.LRR'), emit: bcftools_query + + script: + """ + (bcftools query -l ${sampleID}_convert.bcf | awk 'BEGIN{printf("\\tCHROM\\tPOS");} {printf("\\t%s",\$1);} END{printf("\\n");}' && bcftools query -f '%ID\\t%CHROM\\t%POS[\\t%BAF]\\n' ${sampleID}_convert.bcf) > ${sampleID}_convert.BAF + + (bcftools query -l ${sampleID}_convert.bcf | awk 'BEGIN{printf("\\tCHROM\\tPOS");} {printf("\\t%s",\$1);} END{printf("\\n");}' && bcftools query -f '%ID\\t%CHROM\\t%POS[\\t%LRR]\\n' ${sampleID}_convert.bcf) > ${sampleID}_convert.LRR + + sed -i s/chr// ${sampleID}_convert.BAF + sed -i s/chr// ${sampleID}_convert.LRR + """ +} From 0d583e9181ef93c181853e0575c077929472cef0 Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Tue, 30 Jul 2024 15:52:54 -0400 Subject: [PATCH 14/26] commit ascat module --- config/cnv_array.config | 5 ++++- modules/bcftools/bcftools_query_ascat.nf | 1 + modules/illumina/iaap_cli.nf | 1 + workflows/cnv_array.nf | 8 ++++++-- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/config/cnv_array.config b/config/cnv_array.config index 7ebb716..09ab81b 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -7,10 +7,13 @@ manifest { params { gtc_csv = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.csv' + bpm_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' egt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' idat_folder = '/home/temket/cnv_workflow/data/raw_idat/' ref_fa = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' tsv_file = '/home/temket/cnv_workflow/output/bcftools_convert.tsv' - + snp_platform = 'IlluminaCytoSNP' + GC_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' + RT_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg19.txt' } diff --git a/modules/bcftools/bcftools_query_ascat.nf b/modules/bcftools/bcftools_query_ascat.nf index 426af84..8916726 100644 --- a/modules/bcftools/bcftools_query_ascat.nf +++ b/modules/bcftools/bcftools_query_ascat.nf @@ -12,6 +12,7 @@ process BCFTOOLS_QUERY_ASCAT { output: tuple val(sampleID), val(meta), path('*_convert.BAF'), path('*_convert.LRR'), emit: bcftools_query + tuple path('*_convert.BAF'), path('*_convert.LRR'), emit: bafnlrr script: """ diff --git a/modules/illumina/iaap_cli.nf b/modules/illumina/iaap_cli.nf index 8bea6aa..c1f46b7 100644 --- a/modules/illumina/iaap_cli.nf +++ b/modules/illumina/iaap_cli.nf @@ -18,6 +18,7 @@ process IAAP_CLI { output: tuple val(sampleID), val(meta), path("*.gtc"), emit: gtc + tuple val(sampleID), val(meta), emit: ascat2r path "iaap_cli.log", emit: iaap_cli_log script: diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index 1f23504..28354e7 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -8,6 +8,7 @@ include {extract_csv} from "${projectDir}/bin/shared/extract_cnv_array_csv.nf" include {IAAP_CLI} from "${projectDir}/modules/illumina/iaap_cli.nf" include {BCFTOOLS_GTC2VCF} from "${projectDir}/modules/bcftools/bcftools_gtct2vcf.nf" include {BCFTOOLS_QUERY_ASCAT} from "${projectDir}/modules/bcftools/bcftools_query_ascat.nf" +include {ASCAT} from "${projectDir}/modules/r/ASCAT.nf" // Help if needed @@ -28,6 +29,8 @@ if (params.csv_input) { } else { exit 1, "Workflow requires a CSV manifest. See `--help` for information." } +GC_file = file(params.gc_file, checkIfExists: true) +RT_file = file(params.rt_file, checkIfExists: true) // Extract CSV input ch_input = extract_csv(file(params.csv_input, checkIfExists: true)) @@ -38,6 +41,7 @@ workflow CNV_ARRAY { BCFTOOLS_GTC2VCF(IAAP_CLI.out.gtc) BCFTOOLS_GTC2VCF.out.gtc2vcf.view() BCFTOOLS_QUERY_ASCAT(BCFTOOLS_GTC2VCF.out.gtc2vcf) - BCFTOOLS_QUERY_ASCAT.out.bcftools_query.view() - + BCFTOOLS_QUERY_ASCAT.out.bcftools_query.view() + ASCAT(IAAP_CLI.out.ascat2r,BCFTOOLS_QUERY_ASCAT.out.bafnlrr, params.platform,GC_file,RT_file) + ASCAT.out.ascat.view() } From b8cf928730f6c2de26c6df88e0cbe39e7983088d Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Tue, 30 Jul 2024 15:55:53 -0400 Subject: [PATCH 15/26] ASCAT --- bin/cnv_array/ASCAT_run.R | 103 ++++++++++++++++++++++++++++++++++++++ modules/r/ASCAT.nf | 37 ++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 bin/cnv_array/ASCAT_run.R create mode 100644 modules/r/ASCAT.nf diff --git a/bin/cnv_array/ASCAT_run.R b/bin/cnv_array/ASCAT_run.R new file mode 100644 index 0000000..85f7c29 --- /dev/null +++ b/bin/cnv_array/ASCAT_run.R @@ -0,0 +1,103 @@ +suppressMessages(library(ASCAT)) + +### ASCAT Run ###### + +# Note: this script expects ASCAT is running on single sample BAF/LRR files. + +args=(commandArgs(TRUE)) + +sampleID = args[1] +LRR_file = args[2] +BAF_file = args[3] +gender = args[4] +platform = args[5] +GC_file = args[6] +RT_file = args[7] + +###### + +# Expected SNP POS file: +# Probe Set ID Chromosome Physical Position +# CN_473963 1 61736 +# CN_473964 1 61808 + +## the above can be taken from the BAF file. The BAF file contains positions for all valid SNPs. +SNPpos <- read.table(BAF_file, sep = "\t", header = TRUE)[ ,1:3] +colnames(SNPpos) <- c('Probe_Set_ID', 'Chromosome', 'Physical_Position') + +## + +ascat.bc = ascat.loadData(Tumor_LogR_file = LRR_file, Tumor_BAF_file = BAF_file, gender = gender, genomeVersion = "hg38") + +ascat.bc$samples[1] <- sampleID +colnames(ascat.bc[["Tumor_LogR"]]) <- sampleID +colnames(ascat.bc[["Tumor_BAF"]]) <- sampleID + +ascat.plotRawData(ascat.bc, img.prefix = "Before_correction_") + +ascat.bc = ascat.correctLogR(ascat.bc, GCcontentfile = GC_file, replictimingfile = RT_file) + +ascat.plotRawData(ascat.bc, img.prefix = "After_correction_") + +gg = ascat.predictGermlineGenotypes(ascat.bc, platform = platform) + +ascat.bc = ascat.aspcf(ascat.bc, ascat.gg = gg) + +ascat.plotSegmentedData(ascat.bc) + +ascat.output = ascat.runAscat(ascat.bc, write_segments = T) + +## + +QC = ascat.metrics(ascat.bc, ascat.output) + +write.table(as.data.frame(QC), file = paste0(sampleID, "_sample.QC.txt"), sep="\t", quote=F, row.names=F, col.names=T) + +save(ascat.bc, ascat.output, QC, file = paste0(sampleID, "_ASCAT_objects.Rdata")) + +## + +if ( length(ascat.output$failedarrays) == 0 ) { + + num_probes <- vector(mode="numeric", length=nrow(ascat.output$segments_raw)) + for (i in 1:nrow(ascat.output$segments_raw)) { + L1 = which(SNPpos$Chromosome == ascat.output$segments_raw$chr[i] & SNPpos$Physical_Position == ascat.output$segments_raw$startpos[i]) + L2 = which(SNPpos$Chromosome == ascat.output$segments_raw$chr[i] & SNPpos$Physical_Position == ascat.output$segments_raw$endpos[i]) + num_probes[i] = L2[length(L2)] - L1[1] + 1 + } + seg_raw = cbind(ascat.output$segments_raw,num_probes) + + num_probes <- vector(mode="numeric", length=nrow(ascat.output$segments)) + for (i in 1:nrow(ascat.output$segments)) { + + #print(i) + L1 = which(SNPpos$Chromosome == ascat.output$segments$chr[i] & SNPpos$Physical_Position == ascat.output$segments$startpos[i]) + L2 = which(SNPpos$Chromosome == ascat.output$segments$chr[i] & SNPpos$Physical_Position == ascat.output$segments$endpos[i]) + num_probes[i] = L2[length(L2)] - L1[1] + 1 + + } + seg = cbind(ascat.output$segments,num_probes) + + seg_raw_dfs <- split(seg_raw, seg_raw$sample) + seg_dfs <- split(seg, seg$sample) + + for (samp in names(seg_raw_dfs)){ + write.table(seg_raw_dfs[[samp]], file = paste0(samp, ".segments_raw.txt"), sep="\t", quote=F, row.names=F) + write.table(seg_dfs[[samp]], file = paste0(samp, ".segments.txt"), sep="\t", quote=F, row.names=F) + write.table(as.data.frame(ascat.output$aberrantcellfraction)[row.names(as.data.frame(ascat.output$aberrantcellfraction)) %in% samp,], file=paste(samp,".aberrantcellfraction.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F) + write.table(as.data.frame(ascat.output$ploidy)[row.names(as.data.frame(ascat.output$ploidy)) %in% samp,], file=paste(samp,".ploidy.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F) + } + +} else { + + write.table(as.data.frame(ascat.output$failedarrays), file="ASCAT.failedarrays.txt", sep="\t", quote=F, row.names=F, col.names=F) + +} + +if ( !is.null(ascat.output$nonaberrantarrays) ) { + + write.table(as.data.frame(ascat.output$nonaberrantarrays), file="ASCAT.nonaberrantarrays.txt", sep="\t", quote=F, row.names=F, col.names=F) + +} + +sessionInfo() \ No newline at end of file diff --git a/modules/r/ASCAT.nf b/modules/r/ASCAT.nf new file mode 100644 index 0000000..449f1ef --- /dev/null +++ b/modules/r/ASCAT.nf @@ -0,0 +1,37 @@ +process ASCAT { + tag "$sampleID" + + cpus 1 + memory 24.GB + time '01:30:00' + errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + + + container 'quay.io/biocontainers/ascat:3.1.1--r43hdfd78af_1' + + input: + val(sampleID),path(LRR),path(BAF),val(gender),val(platform),path(GC_file),path(RT_file) + + output: + tuple val(sampleID), + path("${sampleID}_sample.QC.txt"), + path("${sampleID}_ASCAT_objects.Rdata"), + path("${sampleID}.segments_raw.txt"), + path("${sampleID}.segments.txt"), + path("${sampleID}.aberrantcellfraction.txt"), + path("${sampleID}.ploidy.txt"), + path("ASCAT.failedarrays.txt", optional: true), + path("ASCAT.nonaberrantarrays.txt", optional: true), emit: ascat + + script: + """ + Rscript ${projectDir}/bin/cnv_array/ASCAT_run.R \ + ${sampleID} \ + ${LRR} \ + ${BAF} \ + ${meta.gender} \ + ${params.snp_platform} \ + ${params.GC_file} \ + ${params.RT_file} + """ +} \ No newline at end of file From 6c90a3547c50999794bcbfc240e035b365150b62 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Tue, 30 Jul 2024 16:19:00 -0400 Subject: [PATCH 16/26] ascat script error yet to be resolved --- config/cnv_array.config | 4 ++-- modules/bcftools/bcftools_query_ascat.nf | 3 +-- modules/r/ASCAT.nf | 8 ++++---- workflows/cnv_array.nf | 4 +--- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/config/cnv_array.config b/config/cnv_array.config index 09ab81b..3612960 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -14,6 +14,6 @@ params { ref_fa = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' tsv_file = '/home/temket/cnv_workflow/output/bcftools_convert.tsv' snp_platform = 'IlluminaCytoSNP' - GC_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' - RT_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg19.txt' + gc_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' + rt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg19.txt' } diff --git a/modules/bcftools/bcftools_query_ascat.nf b/modules/bcftools/bcftools_query_ascat.nf index 8916726..4981fde 100644 --- a/modules/bcftools/bcftools_query_ascat.nf +++ b/modules/bcftools/bcftools_query_ascat.nf @@ -11,8 +11,7 @@ process BCFTOOLS_QUERY_ASCAT { tuple val(sampleID), val(meta), path(bcf), path(csi), path(vcf), path(tsv) output: - tuple val(sampleID), val(meta), path('*_convert.BAF'), path('*_convert.LRR'), emit: bcftools_query - tuple path('*_convert.BAF'), path('*_convert.LRR'), emit: bafnlrr + tuple val(sampleID), val(meta), path('*_convert.BAF'), path('*_convert.LRR'), emit: baf_lrr script: """ diff --git a/modules/r/ASCAT.nf b/modules/r/ASCAT.nf index 449f1ef..fca6bd9 100644 --- a/modules/r/ASCAT.nf +++ b/modules/r/ASCAT.nf @@ -10,7 +10,7 @@ process ASCAT { container 'quay.io/biocontainers/ascat:3.1.1--r43hdfd78af_1' input: - val(sampleID),path(LRR),path(BAF),val(gender),val(platform),path(GC_file),path(RT_file) + tuple val(sampleID), val(meta), path(LRR), path(BAF) output: tuple val(sampleID), @@ -31,7 +31,7 @@ process ASCAT { ${BAF} \ ${meta.gender} \ ${params.snp_platform} \ - ${params.GC_file} \ - ${params.RT_file} + ${params.gc_file} \ + ${params.rt_file} """ -} \ No newline at end of file +} diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index 28354e7..1156cf1 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -39,9 +39,7 @@ ch_input = extract_csv(file(params.csv_input, checkIfExists: true)) workflow CNV_ARRAY { IAAP_CLI(ch_input) BCFTOOLS_GTC2VCF(IAAP_CLI.out.gtc) - BCFTOOLS_GTC2VCF.out.gtc2vcf.view() BCFTOOLS_QUERY_ASCAT(BCFTOOLS_GTC2VCF.out.gtc2vcf) - BCFTOOLS_QUERY_ASCAT.out.bcftools_query.view() - ASCAT(IAAP_CLI.out.ascat2r,BCFTOOLS_QUERY_ASCAT.out.bafnlrr, params.platform,GC_file,RT_file) + ASCAT(BCFTOOLS_QUERY_ASCAT.out.baf_lrr) ASCAT.out.ascat.view() } From 232f294be3e24d697ff968dec8b915bc994b2428 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Wed, 31 Jul 2024 11:07:29 -0400 Subject: [PATCH 17/26] ascat update --- config/cnv_array.config | 2 +- modules/bcftools/bcftools_gtct2vcf.nf | 4 +++- modules/bcftools/bcftools_query_ascat.nf | 3 +++ modules/r/ASCAT.nf | 16 ++++++---------- workflows/cnv_array.nf | 2 +- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/config/cnv_array.config b/config/cnv_array.config index 3612960..4b2cfb3 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -15,5 +15,5 @@ params { tsv_file = '/home/temket/cnv_workflow/output/bcftools_convert.tsv' snp_platform = 'IlluminaCytoSNP' gc_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' - rt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg19.txt' + rt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg38.txt' } diff --git a/modules/bcftools/bcftools_gtct2vcf.nf b/modules/bcftools/bcftools_gtct2vcf.nf index ece9c2c..a2c8155 100644 --- a/modules/bcftools/bcftools_gtct2vcf.nf +++ b/modules/bcftools/bcftools_gtct2vcf.nf @@ -1,5 +1,7 @@ -// Define BCFTOOLS_GTC2VCF process process BCFTOOLS_GTC2VCF { + + tag "$sampleID" + cpus = 1 memory 24.GB time '01:30:00' diff --git a/modules/bcftools/bcftools_query_ascat.nf b/modules/bcftools/bcftools_query_ascat.nf index 4981fde..8d44824 100644 --- a/modules/bcftools/bcftools_query_ascat.nf +++ b/modules/bcftools/bcftools_query_ascat.nf @@ -1,4 +1,7 @@ process BCFTOOLS_QUERY_ASCAT { + + tag "$sampleID" + cpus 1 memory 8.GB time '01:00:00' diff --git a/modules/r/ASCAT.nf b/modules/r/ASCAT.nf index fca6bd9..a9399aa 100644 --- a/modules/r/ASCAT.nf +++ b/modules/r/ASCAT.nf @@ -6,22 +6,18 @@ process ASCAT { time '01:30:00' errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} + container 'quay.io/jaxcompsci/ascat:v3.1.3' - container 'quay.io/biocontainers/ascat:3.1.1--r43hdfd78af_1' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'ascat' }", mode: 'copy' input: tuple val(sampleID), val(meta), path(LRR), path(BAF) output: - tuple val(sampleID), - path("${sampleID}_sample.QC.txt"), - path("${sampleID}_ASCAT_objects.Rdata"), - path("${sampleID}.segments_raw.txt"), - path("${sampleID}.segments.txt"), - path("${sampleID}.aberrantcellfraction.txt"), - path("${sampleID}.ploidy.txt"), - path("ASCAT.failedarrays.txt", optional: true), - path("ASCAT.nonaberrantarrays.txt", optional: true), emit: ascat + tuple val(sampleID), val(meta), path("*.txt"), emit: all_txt + tuple val(sampleID), val(meta), path("*.png"), emit: all_png + tuple val(sampleID), val(meta), path("*.Rdata"), emit: ascat_rdata + tuple val(sampleID), val(meta), path("*segments_raw.txt"), path("*.ploidy.txt"), emit: seg_ploidy script: """ diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index 1156cf1..31bce92 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -41,5 +41,5 @@ workflow CNV_ARRAY { BCFTOOLS_GTC2VCF(IAAP_CLI.out.gtc) BCFTOOLS_QUERY_ASCAT(BCFTOOLS_GTC2VCF.out.gtc2vcf) ASCAT(BCFTOOLS_QUERY_ASCAT.out.baf_lrr) - ASCAT.out.ascat.view() + ASCAT.out.seg_ploidy.view() } From 85195700bce4bde78eecb9be89e7dfd8b692cb33 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Wed, 31 Jul 2024 11:29:58 -0400 Subject: [PATCH 18/26] ascat working --- bin/cnv_array/ASCAT_run.R | 4 ++-- modules/r/ASCAT.nf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/cnv_array/ASCAT_run.R b/bin/cnv_array/ASCAT_run.R index 85f7c29..bbbf444 100644 --- a/bin/cnv_array/ASCAT_run.R +++ b/bin/cnv_array/ASCAT_run.R @@ -7,8 +7,8 @@ suppressMessages(library(ASCAT)) args=(commandArgs(TRUE)) sampleID = args[1] -LRR_file = args[2] -BAF_file = args[3] +BAF_file = args[2] +LRR_file = args[3] gender = args[4] platform = args[5] GC_file = args[6] diff --git a/modules/r/ASCAT.nf b/modules/r/ASCAT.nf index a9399aa..4725ade 100644 --- a/modules/r/ASCAT.nf +++ b/modules/r/ASCAT.nf @@ -11,7 +11,7 @@ process ASCAT { publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'ascat' }", mode: 'copy' input: - tuple val(sampleID), val(meta), path(LRR), path(BAF) + tuple val(sampleID), val(meta), path(BAF), path(LRR) output: tuple val(sampleID), val(meta), path("*.txt"), emit: all_txt @@ -23,8 +23,8 @@ process ASCAT { """ Rscript ${projectDir}/bin/cnv_array/ASCAT_run.R \ ${sampleID} \ - ${LRR} \ ${BAF} \ + ${LRR} \ ${meta.gender} \ ${params.snp_platform} \ ${params.gc_file} \ From 109ad21c940e3aa609bfecb423b572d2d3fafaff Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Thu, 1 Aug 2024 15:44:26 -0400 Subject: [PATCH 19/26] Annotations module files --- config/cnv_array.config | 6 +++-- modules/utility_modules/ascat_annotation.nf | 25 +++++++++++++++++++++ workflows/cnv_array.nf | 5 ++++- 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 modules/utility_modules/ascat_annotation.nf diff --git a/config/cnv_array.config b/config/cnv_array.config index 4b2cfb3..ee2b95d 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -14,6 +14,8 @@ params { ref_fa = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' tsv_file = '/home/temket/cnv_workflow/output/bcftools_convert.tsv' snp_platform = 'IlluminaCytoSNP' - gc_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' - rt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg38.txt' + GC_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' + RT_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg38.txt' + chrArm = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/GRCh38_chromosome_arm.txt' + cnvGeneFile = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/biomaRt_GRCh38_ensemblv102_CNVgeneAnnotations_primaryChroms.txt' } diff --git a/modules/utility_modules/ascat_annotation.nf b/modules/utility_modules/ascat_annotation.nf new file mode 100644 index 0000000..289c1eb --- /dev/null +++ b/modules/utility_modules/ascat_annotation.nf @@ -0,0 +1,25 @@ +process ASCAT_ANNOTATION { + + tag "$sampleID" + + cpus = 1 + memory = 24.GB + time = '01:30:00' + errorStrategy = { (task.exitStatus == 140) ? { log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish' }.call() : 'finish' } + + container 'quay.io/biocontainers/ascat:3.1.1--r43hdfd78af_1' + publishDir "${params.pubdir}/${params.organize_by == 'sample' ? sampleID : 'ascat_annotation'}", mode: 'copy' + + input: + tuple val(sampleID), val(meta), path(segments_raw), path(ploidy) + + output: + tuple val(sampleID), val(meta), path("${sampleID}.segments_raw.extend.txt"), path("${sampleID}.*"), emit: ascat_annotated + + script: + """ + perl \${projectDir}/bin/cnv_array/${sampleID}.segment_raw_extend.pl ${segments_raw} ${ploidy} ${params.chrArm} ${meta} + perl \${projectDir}/bin/cnv_array/annotate_ensembl_genes.pl ${sampleID}.segments_raw.extend.txt ${params.cnvGeneFile} + R CMD BATCH --slave "--args ${sampleID}.segments_raw.extend.txt ${sampleID} ./ " \${projectDir}/seg_plot.R + """ +} diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index 31bce92..4151078 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -9,6 +9,7 @@ include {IAAP_CLI} from "${projectDir}/modules/illumina/iaap_cli.nf" include {BCFTOOLS_GTC2VCF} from "${projectDir}/modules/bcftools/bcftools_gtct2vcf.nf" include {BCFTOOLS_QUERY_ASCAT} from "${projectDir}/modules/bcftools/bcftools_query_ascat.nf" include {ASCAT} from "${projectDir}/modules/r/ASCAT.nf" +include {ASCAT_ANNOTATION} from "${projectDir}/modules/utility_modules/ascat_annotation.nf" // Help if needed @@ -29,6 +30,7 @@ if (params.csv_input) { } else { exit 1, "Workflow requires a CSV manifest. See `--help` for information." } + GC_file = file(params.gc_file, checkIfExists: true) RT_file = file(params.rt_file, checkIfExists: true) @@ -41,5 +43,6 @@ workflow CNV_ARRAY { BCFTOOLS_GTC2VCF(IAAP_CLI.out.gtc) BCFTOOLS_QUERY_ASCAT(BCFTOOLS_GTC2VCF.out.gtc2vcf) ASCAT(BCFTOOLS_QUERY_ASCAT.out.baf_lrr) - ASCAT.out.seg_ploidy.view() + ASCAT_ANNOTATION(ASCAT.out.seg_ploidy) + ASCAT_ANNOTATION.out.ascat_annotated.view() } From 103925867fa8a5d109493b179e62a1e0fdf442b3 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Fri, 2 Aug 2024 12:06:25 -0400 Subject: [PATCH 20/26] cnv array working --- bin/cnv_array/ASCAT_run.R | 4 + bin/cnv_array/annotate_ensembl_genes.pl | 132 +++++++++ bin/cnv_array/seg_plot.R | 117 ++++++++ bin/cnv_array/segment_raw_extend.pl | 288 ++++++++++++++++++++ config/cnv_array.config | 6 +- modules/utility_modules/ascat_annotation.nf | 14 +- workflows/cnv_array.nf | 1 - 7 files changed, 552 insertions(+), 10 deletions(-) create mode 100644 bin/cnv_array/annotate_ensembl_genes.pl create mode 100644 bin/cnv_array/seg_plot.R create mode 100644 bin/cnv_array/segment_raw_extend.pl diff --git a/bin/cnv_array/ASCAT_run.R b/bin/cnv_array/ASCAT_run.R index bbbf444..6911201 100644 --- a/bin/cnv_array/ASCAT_run.R +++ b/bin/cnv_array/ASCAT_run.R @@ -27,6 +27,10 @@ colnames(SNPpos) <- c('Probe_Set_ID', 'Chromosome', 'Physical_Position') ## +if (gender == 'NA') { + gender = 'XY' +} + ascat.bc = ascat.loadData(Tumor_LogR_file = LRR_file, Tumor_BAF_file = BAF_file, gender = gender, genomeVersion = "hg38") ascat.bc$samples[1] <- sampleID diff --git a/bin/cnv_array/annotate_ensembl_genes.pl b/bin/cnv_array/annotate_ensembl_genes.pl new file mode 100644 index 0000000..7b96f7b --- /dev/null +++ b/bin/cnv_array/annotate_ensembl_genes.pl @@ -0,0 +1,132 @@ +#!/usr/bin/perl -w +use POSIX; +use File::Basename; + +# This script annotates ensembl genes with copy number and breakpoints +# perl ensemblegenes_cnv_break.pl *.segments_raw.extend.txt mart_export_gene_chr1-Y.hg19ensembl75-85.08232016.txt + +if ($#ARGV != 1) { + print "This scripts requires: \n"; + exit(-1); +} + +$file_cn = $ARGV[0]; +$file_gene = $ARGV[1]; + +$file_output = basename($file_cn,".txt").".ensgene_cnvbreak.txt"; +open(OUTFILE, ">$file_output"); + +open(GENEFILE, "$file_gene") or die "can't open $file_gene: $!"; +$gene = ; +chomp($gene); + +open(CNFILE, "$file_cn") or die "can't open $file_cn: $!"; +@data = ; +close(CNFILE); +chomp(@data); + +#print OUTFILE "$tmp\tstartext\tendext\tstartext_desc\tendext_desc\tCN_raw\tLOH\tparm_fraction\tqarm_fraction\tploidy\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\n"; +print OUTFILE "$gene\tnum_cnv_seg\tseg_desc\tploidy\tnMajor\tnMinor\tnAraw\tnBraw\tCN_raw\tLOH\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\tnMajor_max\tnMinor_max\tnAraw_max\tnBraw_max\tCN_raw_max\tLOH_max\tcopydiff_2_max\tcopydiff_ploidy_max\tlogratio_2_max\tlogratio_ploidy_max\n"; + +while ($gene = ) { + + chomp($gene); + @line = split(/\t/, $gene); + $chr = $line[2]; + $start = $line[3]; + $end = $line[4]; + + #$cnraw1=999; + $numseg=0; + $region=""; + %segline = (); + @n = (); + + for ($j=1; $j<=$#data; $j++) { + @segment = split(/\t/, $data[$j]); + + $chr_cn = $segment[1]; + $pos1 = $segment[2]; + $pos2 = $segment[3]; + $pos1ext = $segment[9]; + $pos2ext = $segment[10]; + $left = $segment[11]; + $right = $segment[12]; + $cnraw = $segment[13]; + + if (($chr_cn eq $chr) && ($start <= $pos2ext) && ($end >= $pos1ext)) { #overlap + #$numseg++; + push(@n, $cnraw); + $segline{$cnraw} = [ @segment ]; + + #check if overlap with regions with no call + if (($start <= $pos1) && ($end >= $pos1ext)) { + $region = $region.$left.";"; + } + if (($start <= $pos2ext) && ($end >= $pos2)) { + $region = $region.$right.";"; + } + + #if ($cnraw < $cnraw1) { + # $cnraw1 = $cnraw; + # $count = $j; + #} + } + } + + if ($region eq "") { + $region = "NA"; + } + + if ($#n >= 0) { + + $numseg = $#n +1; + @sortn = sort{ $a <=> $b } @n; + + $nA = $segline{$sortn[0]}[4]; + $nB = $segline{$sortn[0]}[5]; + $rawA = $segline{$sortn[0]}[6]; + $rawB = $segline{$sortn[0]}[7]; + $cnraw = $segline{$sortn[0]}[13]; + $loh = $segline{$sortn[0]}[14]; + $ploidy= $segline{$sortn[0]}[17]; + $copydiff1 = $segline{$sortn[0]}[18]; + $copydiff2 = $segline{$sortn[0]}[19]; + $logratio1 = $segline{$sortn[0]}[20]; + $logratio2 = $segline{$sortn[0]}[21]; + + $outline = "$gene\t$numseg\t$region\t$ploidy\t$nA\t$nB\t$rawA\t$rawB\t$cnraw\t$loh\t$copydiff1\t$copydiff2\t$logratio1\t$logratio2\t"; + + if ($numseg > 1 ) { + $nA = $segline{$sortn[$#sortn]}[4]; + $nB = $segline{$sortn[$#sortn]}[5]; + $rawA = $segline{$sortn[$#sortn]}[6]; + $rawB = $segline{$sortn[$#sortn]}[7]; + $cnraw = $segline{$sortn[$#sortn]}[13]; + $loh = $segline{$sortn[$#sortn]}[14]; + $copydiff1 = $segline{$sortn[$#sortn]}[18]; + $copydiff2 = $segline{$sortn[$#sortn]}[19]; + $logratio1 = $segline{$sortn[$#sortn]}[20]; + $logratio2 = $segline{$sortn[$#sortn]}[21]; + } + else { + $nA = "NA"; + $nB = "NA"; + $rawA = "NA"; + $rawB = "NA"; + $cnraw = "NA"; + $loh = "NA"; + $copydiff1 = "NA"; + $copydiff2 = "NA"; + $logratio1 = "NA"; + $logratio2 = "NA"; + + } + + $outline = $outline."$nA\t$nB\t$rawA\t$rawB\t$cnraw\t$loh\t$copydiff1\t$copydiff2\t$logratio1\t$logratio2"; + print OUTFILE "$outline\n"; + } +} + +close (GENEFILE); +close (OUTFILE); diff --git a/bin/cnv_array/seg_plot.R b/bin/cnv_array/seg_plot.R new file mode 100644 index 0000000..f2bb5a2 --- /dev/null +++ b/bin/cnv_array/seg_plot.R @@ -0,0 +1,117 @@ +# updated to have Graces Sept '19 tweaks + +options(scipen = 999) + +args=(commandArgs(TRUE)) + +# filename is name of the *segments_raw.extend.txt file +filename <- args[1] + +sampleID <- args[2] + +# outdir is the dir where png result will be written ... use "./" for current dir +outdir <- args[3] + + +CNS <-read.table(filename,header=T,sep="\t") + +gender <- 'female' +sex <- 'female' + +if (sex == "female") { + CNS=CNS[CNS$chr!="Y",] +} + +#title of plot +ploidy=round(CNS$ploidy[1], digits=2) +plottitle=paste( gsub("_"," ", sampleID), " ploidy=",ploidy,sep="") + +chromo <- unique(CNS$chr) +chromo +xx=0 +y = c() +start=CNS$startpos +end=CNS$endpos + +for (x in chromo) { + start[CNS$chr == x]=start[CNS$chr == x]+xx + end[CNS$chr == x]=end[CNS$chr == x]+xx + tmp = CNS$endpos[CNS$chr == x] + xx=tail(tmp,1)+xx + y <- c(y, xx) +} + +png(paste(outdir,sampleID,"_segmentsgenomeplot.copydiffploidy.png",sep=""), width=1300,height=600) + +par(mar = c(5, 5, 8, 4)) + +val=CNS$copydiff_ploidy + +# in the following stmt, family="serif" changes font to times-roman; cex.main=1.8 scales up the title font size +# formerly also used: ylim=c(-7,20), +plot( c(start,end), c(val,val), col="white", main=plottitle, xlab="Chromosome", ylab="Delta from Ploidy", + ylim=c(-8, max( c(val,val) ) )) + +for (i in 1:length(start)) { + if (CNS$LOH[i]==1) { + polygon(c(start[i],end[i],end[i],start[i]),c(min(-7),min(-7),max(-6),max(-6)),col="lightsteelblue",border="lightsteelblue",lwd=2) + } + +} + +segments(start,val,end,val,col="tomato",lwd=5) +abline(v=y,col="grey") +posy=c(-8) +i=1 +l=0 +for (x in chromo) { + posx=(l+y[i])/2 + text(posx,posy,x,cex=1.2,srt=45) + l=y[i] + i=i+1 +} +abline(h=0,col="black",lty=2,lwd=1.5) + +# formerly also used: inset=c(0,-0.1), +legend("topright", inset=-0.1, c("Difference from sample ploidy ", "LOH"), xpd=TRUE, horiz=T, + bty="n", lty=c(1,1), lwd=6, col=c("tomato", "lightsteelblue"), cex=1.5 ) + +dev.off() + + +png(paste(outdir,sampleID,"_segmentsgenomeplot.CNraw_loh.png",sep=""), width=1300,height=600) + +par(mar = c(5, 5, 8, 4)) + +val=CNS$CN_raw +d=max(val)/100 +plot(c(start,end),c(val,val),col="white",ylab="CN, CN Major, CN Minor",main=plottitle,ylim=c(-d,max(val)+4*d),xaxt='n',xlab="chromosomes") + +for (i in 1:length(start)) { + if (CNS$LOH[i]==1) { + polygon(c(start[i],end[i],end[i],start[i]),c(min(val),min(val),max(val),max(val)),col="lightsteelblue",border="lightsteelblue",lwd=2) + } + +} + +val=CNS$nBraw-d +segments(start,val,end,val,col="blue",lwd=4) +val=CNS$nAraw +segments(start,val,end,val,col="red",lwd=4) +val=CNS$CN_raw+d +segments(start,val,end,val,col="purple",lwd=4) +abline(v=y,col="grey") +posy=max(val)+2*d +i=1 +l=0 +for (x in chromo) { + posx=(l+y[i])/2 + text(posx,posy,x,cex=1,srt=45) + l=y[i] + i=i+1 +} +abline(h=CNS$ploidy[1],col="black",lty=2,lwd=1.5) + +legend("topright", c("CN Total", "CN Major", "CN Minor", "LOH"),xpd=TRUE,horiz=T, inset=c(0,-0.1), bty = "n", lty=c(1,1,1,1), lwd=6, col = c("purple", "red", "blue", "lightsteelblue"), cex = 1) +dev.off() + diff --git a/bin/cnv_array/segment_raw_extend.pl b/bin/cnv_array/segment_raw_extend.pl new file mode 100644 index 0000000..04cd570 --- /dev/null +++ b/bin/cnv_array/segment_raw_extend.pl @@ -0,0 +1,288 @@ +#!/usr/bin/perl -w +use POSIX; +use File::Basename; + +# This script adds to segment file the arm fraction, LOH and CN diff and log ratio relative to 2 and ploidy +# The segments are extended + +# perl segment_raw_annotate.pl *segments_raw.txt *ploidy.txt hg38_chromosome_arm.txt [male, female, unknown] + +if ($#ARGV != 3) { + print "This scripts requires: \n"; + exit(-1); +} + +$file_cn = $ARGV[0]; +$file_ploidy = $ARGV[1]; +$file_arm = $ARGV[2]; +$gender = $ARGV[3]; + +$file_output = basename($file_cn,".txt").".extend.txt"; + +$ploidy = `cat $file_ploidy`; +chomp($ploidy); + +# $gender = `cat $file_gender`; +# chomp($gender); + +if (($gender eq "female") || ($gender eq "unknown")) { + $cn_factor = 1; +} +elsif ($gender eq "male") { + $cn_factor= 0.5; +} + +$tmp = `cat $file_arm | awk 'NR>1'`; +@arm = split(/\n/,$tmp); +chomp(@arm); + +open(CN, "$file_cn") or die "can't open $file_cn: $!"; +$tmp = ; +chomp($tmp); + +open(OUTFILE, ">$file_output"); +print OUTFILE "$tmp\tstartext\tendext\tstartext_desc\tendext_desc\tCN_raw\tLOH\tparm_fraction\tqarm_fraction\tploidy\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\n"; + +open(TMPFILE, ">tmp.txt"); + +#merge segments +$tmp = ; +chomp($tmp); +@line = split(/\t/,$tmp); +print "@line\n"; +$sample = $line[0]; +$chromo = $line[1]; +$n1 = $line[4]; +$n2 = $line[5]; +$cn1 = $line[6]; +$cn2 = $line[7]; +$start = $line[2]; +$end = $line[3]; +$num = $line[8]; + +print "$num\n"; + +while ($tmp = ) { + chomp($tmp); + @line = split(/\t/,$tmp); + + if (($chromo eq $line[1]) && ($cn1 == $line[6]) && ($cn2 == $line[7])) { + $end = $line[3]; + $num = $num + $line[8]; + } + else { + print TMPFILE "$sample\t$chromo\t$start\t$end\t$n1\t$n2\t$cn1\t$cn2\t$num\n"; + $sample = $line[0]; + $chromo = $line[1]; + $n1 = $line[4]; + $n2 = $line[5]; + $cn1 = $line[6]; + $cn2 = $line[7]; + $start = $line[2]; + $end = $line[3]; + $num = $line[8]; + } +} +#lastline +print TMPFILE "$sample\t$chromo\t$start\t$end\t$n1\t$n2\t$cn1\t$cn2\t$num\n"; + +close (CN); +close (TMPFILE); + +open(CN, "tmp.txt") or die "can't open tmp.txt: $!"; +@seg = ; +chomp(@seg); +close (CN); +$n = 0; + +for ($j=0; $j<$#seg; $j++) { + + @array1 = split(/\t/,$seg[$j]); + @array2 = split(/\t/,$seg[$j+1]); + #$x1 = $array1[2]; + $x2 = $array1[3]; + $y1 = $array2[2]; + #$y2 = $array2[3]; + + if ($array1[1] ne $n) { #first line for chr + + $n = $array1[1]; + $left = 0; + $left1 = "telomere"; + + for ($i=1; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + $a = $line[1]; + $b = $line[2]; + } + } + + if ($array2[1] ne $n) { #last line for chr + $right = $b; + $right1 = "telomere"; + } + elsif (($x2 < $a) && ($y1 > $a)) { + $right = $a; + $right1 = "centromere"; + } + else { + $right = floor(($x2 + $y1)/2); + $right1 = "no_probe"; + } + } + else { + + $left = $right + 1; + $left1 = $right1; + + if ($array2[1] ne $n) { #last line for chr + + $right = $b; + $right1 = "telomere"; + } + elsif (($x2 < $a) && ($y1 > $a)) { + $right = $a; + $right1 = "centromere"; + } + else { + $right = floor(($x2 + $y1)/2); + $right1 = "no_probe"; + } + } + + $copy = $array1[6] + $array1[7]; + if ($array1[6] >= 0.5 && $array1[7] <= 0.1) { + $loh=1; + } + else { + $loh=0; + } + + for ($i=0; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + if (($right>=$line[1]) && ($left<=$line[2])) { + @tmp = ($left,$right,$line[1],$line[2]); + @sorttmp = sort{ $a <=> $b } @tmp; + $overlap1=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]); + } + else { + $overlap1=0; + } + } + } + + for ($i=1; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + if (($right>=$line[1]) && ($left<=$line[2])) { + @tmp = ($left,$right,$line[1],$line[2]); + @sorttmp = sort{ $a <=> $b } @tmp; + $overlap2=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]); + } + else { + $overlap2=0; + } + } + } + + if (($n eq "X") || ($n eq "Y")) { + $diff1=$copy - ($cn_factor * 2); + $diff2=$copy- ($cn_factor * $ploidy); + $logratio1 = log(($copy+0.01)/($cn_factor * 2))/log(2); + $logratio2 = log(($copy+0.01)/($cn_factor * $ploidy))/log(2); + } + else { + $diff1=$copy-2; + $diff2=$copy-$ploidy; + $logratio1 = log(($copy+0.01)/2)/log(2); + $logratio2 = log(($copy+0.01)/$ploidy)/log(2); + } + + print OUTFILE "$seg[$j]\t$left\t$right\t$left1\t$right1\t$copy\t$loh\t$overlap1\t$overlap2\t$ploidy\t$diff1\t$diff2\t$logratio1\t$logratio2\n"; +} + +@array1 = split(/\t/,$seg[$#seg]); + +if ($array1[1] ne $n) { #first line for chr + + $n = $array1[1]; + $left = 0; + $left1 = "telomere"; + + for ($i=1; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + $a = $line[1]; + $b = $line[2]; + } + } + + $right = $b; + $right1 = "telomere"; + +} +else { + + $left = $right + 1; + $left1 = $right1; + + $right = $b; + $right1 = "telomere"; + +} + +$copy = $array1[6] + $array1[7]; +if ($array1[6] >= 0.5 && $array1[7] <= 0.1) { + $loh=1; +} +else { + $loh=0; +} + +for ($i=0; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + if (($right>=$line[1]) && ($left<=$line[2])) { + @tmp = ($left,$right,$line[1],$line[2]); + @sorttmp = sort{ $a <=> $b } @tmp; + $overlap1=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]); + } + else { + $overlap1=0; + } + } +} + +for ($i=1; $i<=$#arm; $i+=2) { + @line = split(/\t/,$arm[$i]); + if ($n eq substr($line[0],3)) { + if (($right>=$line[1]) && ($left<=$line[2])) { + @tmp = ($left,$right,$line[1],$line[2]); + @sorttmp = sort{ $a <=> $b } @tmp; + $overlap2=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]); + } + else { + $overlap2=0; + } + } +} + +if (($n eq "X") || ($n eq "Y")) { + $diff1=$copy - ($cn_factor * 2); + $diff2=$copy- ($cn_factor * $ploidy); + $logratio1 = log(($copy+0.01)/($cn_factor * 2))/log(2); + $logratio2 = log(($copy+0.01)/($cn_factor * $ploidy))/log(2); +} +else { + $diff1=$copy-2; + $diff2=$copy-$ploidy; + $logratio1 = log(($copy+0.01)/2)/log(2); + $logratio2 = log(($copy+0.01)/$ploidy)/log(2); +} + +print OUTFILE "$seg[$j]\t$left\t$right\t$left1\t$right1\t$copy\t$loh\t$overlap1\t$overlap2\t$ploidy\t$diff1\t$diff2\t$logratio1\t$logratio2\n"; + +close(CN); +close (OUTFILE); diff --git a/config/cnv_array.config b/config/cnv_array.config index ee2b95d..f8c82f0 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -7,15 +7,13 @@ manifest { params { gtc_csv = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.csv' - bpm_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' egt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' - idat_folder = '/home/temket/cnv_workflow/data/raw_idat/' ref_fa = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' tsv_file = '/home/temket/cnv_workflow/output/bcftools_convert.tsv' snp_platform = 'IlluminaCytoSNP' - GC_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' - RT_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg38.txt' + gc_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' + rt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg38.txt' chrArm = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/GRCh38_chromosome_arm.txt' cnvGeneFile = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/biomaRt_GRCh38_ensemblv102_CNVgeneAnnotations_primaryChroms.txt' } diff --git a/modules/utility_modules/ascat_annotation.nf b/modules/utility_modules/ascat_annotation.nf index 289c1eb..3eda34f 100644 --- a/modules/utility_modules/ascat_annotation.nf +++ b/modules/utility_modules/ascat_annotation.nf @@ -7,19 +7,23 @@ process ASCAT_ANNOTATION { time = '01:30:00' errorStrategy = { (task.exitStatus == 140) ? { log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish' }.call() : 'finish' } - container 'quay.io/biocontainers/ascat:3.1.1--r43hdfd78af_1' + container 'quay.io/jaxcompsci/ascat:v3.1.3' + publishDir "${params.pubdir}/${params.organize_by == 'sample' ? sampleID : 'ascat_annotation'}", mode: 'copy' input: tuple val(sampleID), val(meta), path(segments_raw), path(ploidy) output: - tuple val(sampleID), val(meta), path("${sampleID}.segments_raw.extend.txt"), path("${sampleID}.*"), emit: ascat_annotated + tuple val(sampleID), val(meta), path("*.segments_raw.extend.txt"), emit: seg_extended + tuple val(sampleID), val(meta), path("*.ensgene_cnvbreak.txt"), emit: ensembl_annot + tuple val(sampleID), val(meta), path("*.png"), emit: png script: + gender = meta.gender == 'XX' ? 'female' : 'male' """ - perl \${projectDir}/bin/cnv_array/${sampleID}.segment_raw_extend.pl ${segments_raw} ${ploidy} ${params.chrArm} ${meta} - perl \${projectDir}/bin/cnv_array/annotate_ensembl_genes.pl ${sampleID}.segments_raw.extend.txt ${params.cnvGeneFile} - R CMD BATCH --slave "--args ${sampleID}.segments_raw.extend.txt ${sampleID} ./ " \${projectDir}/seg_plot.R + perl ${projectDir}/bin/cnv_array/segment_raw_extend.pl ${segments_raw} ${ploidy} ${params.chrArm} ${gender} + perl ${projectDir}/bin/cnv_array/annotate_ensembl_genes.pl ${sampleID}.segments_raw.extend.txt ${params.cnvGeneFile} + R CMD BATCH --slave "--args ${sampleID}.segments_raw.extend.txt ${sampleID} ./ " ${projectDir}/bin/cnv_array/seg_plot.R """ } diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index 4151078..38b087e 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -44,5 +44,4 @@ workflow CNV_ARRAY { BCFTOOLS_QUERY_ASCAT(BCFTOOLS_GTC2VCF.out.gtc2vcf) ASCAT(BCFTOOLS_QUERY_ASCAT.out.baf_lrr) ASCAT_ANNOTATION(ASCAT.out.seg_ploidy) - ASCAT_ANNOTATION.out.ascat_annotated.view() } From 773d29a023a064a0fff356a082387e1022695b99 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Mon, 5 Aug 2024 14:20:22 -0400 Subject: [PATCH 21/26] added nf-test dir and config --- nf-test.config | 13 +++++++++++++ tests/nextflow.config | 5 +++++ 2 files changed, 18 insertions(+) create mode 100644 nf-test.config create mode 100644 tests/nextflow.config diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 0000000..9d571fd --- /dev/null +++ b/nf-test.config @@ -0,0 +1,13 @@ +config { + + testsDir "tests" + workDir ".nf-test" + configFile "tests/nextflow.config" + profile "sumner2" + stage { + symlink "subworkflows/" + symlink "modules/" + symlink "test/" + symlink "workflows/" + } +} diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 0000000..c99eca5 --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,5 @@ +/* +======================================================================================== + Nextflow config file for running tests +======================================================================================== +*/ \ No newline at end of file From 5a45c9cb822a7dd2dae96d88b49bd53e412e58ac Mon Sep 17 00:00:00 2001 From: Tejas Temker Date: Thu, 15 Aug 2024 13:47:33 -0400 Subject: [PATCH 22/26] test file --- tests/cnv.nf.test | 130 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 tests/cnv.nf.test diff --git a/tests/cnv.nf.test b/tests/cnv.nf.test new file mode 100644 index 0000000..da36883 --- /dev/null +++ b/tests/cnv.nf.test @@ -0,0 +1,130 @@ +nextflow_workflow { + + name "Test Workflow CNV_ARRAY" + script "workflows/cnv_array.nf" + workflow "CNV_ARRAY" + + test("Full Workflow -- Required Params") { + tag "RequiredParams" + tag "primary" + when { + params { + csv_input = "${baseDir}/test/cnv/data/example_sample_input.csv" + } + } + + then { + assert workflow.success + } + } + + test("IAAP_CLI Process") { + tag "IAAP_CLI" + tag "process" + when { + params { + outdir = "tests/results" + idat_folder = "${baseDir}/test/cnv/idat_folder" + bpm_file = "${baseDir}/test/cnv/bpm_file" + egt_file = "${baseDir}/test/cnv/egt_file" + csv_input = "${baseDir}/test/cnv/csv_input" + gc_file = "${baseDir}/test/cnv/gc_file" + rt_file = "${baseDir}/test/cnv/rt_file" + } + } + + then { + assert workflow.success + // assert GTC format files + assert file("tests/results/iaap_cli_output.csv").exists() + } + } + + test("BCFTOOLS_GTC2VCF Process") { + tag "BCFTOOLS_GTC2VCF" + tag "process" + when { + params { + outdir = "tests/results" + idat_folder = "${baseDir}/test/cnv/idat_folder" + bpm_file = "${baseDir}/test/cnv/bpm_file" + egt_file = "${baseDir}/test/cnv/egt_file" + csv_input = "${baseDir}/test/cnv/csv_input" + gc_file = "${baseDir}/test/cnv/gc_file" + rt_file = "${baseDir}/test/cnv/rt_file" + } + } + + then { + assert workflow.success + // assert the tsv, bcf, csi and vcf files + assert file("tests/results/bcftools_gtct2vcf_output.vcf").exists() + } + } + + test("BCFTOOLS_QUERY_ASCAT Process") { + tag "BCFTOOLS_QUERY_ASCAT" + tag "process" + when { + params { + outdir = "tests/results" + idat_folder = "${baseDir}/test/cnv/idat_folder" + bpm_file = "${baseDir}/test/cnv/bpm_file" + egt_file = "${baseDir}/test/cnv/egt_file" + csv_input = "${baseDir}/test/cnv/csv_input" + gc_file = "${baseDir}/test/cnv/gc_file" + rt_file = "${baseDir}/test/cnv/rt_file" + } + } + + then { + assert workflow.success + // assert the files BAF and LRR + assert file("tests/results/bcftools_query_ascat_output.csv").exists() + } + } + + test("ASCAT Process") { + tag "ASCAT" + tag "process" + when { + params { + outdir = "tests/results" + idat_folder = "${baseDir}/test/cnv/idat_folder" + bpm_file = "${baseDir}/test/cnv/bpm_file" + egt_file = "${baseDir}/test/cnv/egt_file" + csv_input = "${baseDir}/test/cnv/csv_input" + gc_file = "${baseDir}/test/cnv/gc_file" + rt_file = "${baseDir}/test/cnv/rt_file" + } + } + + then { + assert workflow.success + // Assert the files + assert file("tests/results/ascat_output.csv").exists() + } + } + + test("ASCAT_ANNOTATION Process") { + tag "ASCAT_ANNOTATION" + tag "process" + when { + params { + outdir = "tests/results" + idat_folder = "${baseDir}/test/cnv/idat_folder" + bpm_file = "${baseDir}/test/cnv/bpm_file" + egt_file = "${baseDir}/test/cnv/egt_file" + csv_input = "${baseDir}/test/cnv/csv_input" + gc_file = "${baseDir}/test/cnv/gc_file" + rt_file = "${baseDir}/test/cnv/rt_file" + } + } + + then { + assert workflow.success + // assert the files .txt file and ploly file + assert file("tests/results/ascat_annotation_output.csv").exists() + } + } +} From a91198c3b6fd41ef62aea3511d6d0ba0f12de3f9 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Fri, 16 Aug 2024 10:25:24 -0400 Subject: [PATCH 23/26] polish and test update --- .gitignore | 2 + bin/help/cnv_array.nf | 26 ++--- bin/log/cnv_array.nf | 5 - bin/shared/extract_cnv_array_csv.nf | 25 ++++- config/cnv_array.config | 1 - nextflow.config | 2 +- test/cnv_array/example_sample_input.csv | 4 + test/cnv_array/fail_example_input.csv | 2 + tests/cnv.nf.test | 130 ---------------------- tests/workflows/cnv_array.nf.test | 139 ++++++++++++++++++++++++ workflows/cnv_array.nf | 5 +- 11 files changed, 183 insertions(+), 158 deletions(-) create mode 100644 test/cnv_array/example_sample_input.csv create mode 100644 test/cnv_array/fail_example_input.csv delete mode 100644 tests/cnv.nf.test create mode 100644 tests/workflows/cnv_array.nf.test diff --git a/.gitignore b/.gitignore index 6bfdf18..a295e6f 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ design.csv sv_input.csv test.csv test2.csv +.nf-test/ +.nf-test* \ No newline at end of file diff --git a/bin/help/cnv_array.nf b/bin/help/cnv_array.nf index 5eef671..3f27cb8 100644 --- a/bin/help/cnv_array.nf +++ b/bin/help/cnv_array.nf @@ -1,18 +1,18 @@ def help() { println ''' -Parameter | Default | Description ------------------|---------|--------------------------------------------------------------------------- ---bpm_file | / | The path to the BPM file. ---egt_file | / | The path to the EGT file. --w | / | The directory for intermediary files and Nextflow processes. Ensure ample storage. ---help | false | Print this help message and exit. - ---bpm | / | The path to the BPM file. ---csv | / | The path to the CSV file. ---egt | / | The path to the EGT file. ---gtcs | / | The path to GTC output. ---fasta-ref | / | The path to the reference FASTA file. ---extra | / | The path to the output directory. +Parameter | Default | Description +--pubdir | / | The directory that the saved outputs will be stored. +--organize_by | sample | How to organize the output folder structure. Options: sample or analysis. +-w | / | The directory that all intermediary files and nextflow processes utilize. This directory can become quite large. This should be a location on /fastscratch or other directory with ample storage. +--gtc_csv | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.csv' | Genotype Call (GTC) manifest for IDAT conversion. Provided by Illumina. +--bpm_file | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' | Manifest file describing the SNP or probe content on a BeadChip. Provided by Illumina. +--egt_file | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' | Cluster file describing the cluster positions for the Illumina genotyping array. Provided by Illumina. +--ref_fa | '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' | The reference fasta file. Reference FASTA build should match Illumina provided files. +--snp_platform | 'IlluminaCytoSNP' | SNP platform supported by ASCAT. See full supported list here: https://github.com/VanLoo-lab/ascat?tab=readme-ov-file#supported-arrays-without-matched-germline +--gc_file | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' | ASCAT’s GC correction file, generated from scripts at https://github.com/VanLoo-lab/ascat/tree/master/LogRcorrection +--rt_file | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg38.txt' | ASCAT’s replication timing file, generated from scripts at https://github.com/VanLoo-lab/ascat/tree/master/LogRcorrection +--chrArm | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/GRCh38_chromosome_arm.txt' | Chromosome arm locations, used in CNV segment annotation. +--cnvGeneFile | '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/biomaRt_GRCh38_ensemblv102_CNVgeneAnnotations_primaryChroms.txt' ''' } diff --git a/bin/log/cnv_array.nf b/bin/log/cnv_array.nf index 75d2990..4213e53 100644 --- a/bin/log/cnv_array.nf +++ b/bin/log/cnv_array.nf @@ -16,9 +16,6 @@ def param_log() { error "'--ref_fa': is not provided, it is a required parameter." } - if (!params.tsv_file) { - error "'--tsv_file': is not provided, it is a required parameter." - } // Log parameter information log.info """ @@ -28,12 +25,10 @@ def param_log() { Results Published to: ${params.pubdir ?: 'N/A'} ______________________________________________________ - --idat_folder ${params.idat_folder ?: 'N/A'} --bpm_file ${params.bpm_file} --egt_file ${params.egt_file} --gtc_csv ${params.gtc_csv} --ref_fa ${params.ref_fa} - --tsv_file ${params.tsv_file} -w ${workDir} --keep_intermediate ${params.keep_intermediate ?: 'N/A'} -c ${params.config ?: 'N/A'} diff --git a/bin/shared/extract_cnv_array_csv.nf b/bin/shared/extract_cnv_array_csv.nf index 0411a57..68dfb6a 100644 --- a/bin/shared/extract_cnv_array_csv.nf +++ b/bin/shared/extract_cnv_array_csv.nf @@ -19,13 +19,13 @@ def extract_csv(csv_file) { Channel.from(csv_file).splitCsv(header: true) .map{ row -> - if (!(row.sampleID) || !(row.idat_red || !(row.idat_green))){ + if (!(row.sampleID) || !(row.idat_red) || !(row.idat_green)) { System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) System.err.println(ANSI_RED + "Missing field in csv file header. The csv file must have fields: 'sampleID', 'idat_red', 'idat_green'." + ANSI_RESET) System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) System.exit(1) - } + } [row.sampleID.toString(), row] }.groupTuple() .map{ meta, rows -> @@ -34,6 +34,24 @@ def extract_csv(csv_file) { }.transpose() .map{ row, numLanes -> //from here do the usual thing for csv parsing + if (row.idat_red.substring(row.idat_red.lastIndexOf(System.getProperty("file.separator")) + 1).count("_") > 2){ + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "The file: " + row.idat_red + " containes more than 2 underscores in the name." + ANSI_RESET) + System.err.println(ANSI_RED + "IDAT files must have only 2 underscores (i.e., xxx_xxx_Grn.idat and xxx_xxx_Red.idat)." + ANSI_RESET) + System.err.println(ANSI_RED + "GEO (and others) rename files to have more than 2 (i.e., GSMxxx_xxx_xxx_Red.idat). File names must be adjusted prior to running." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.exit(1) + } + if (row.idat_green.substring(row.idat_green.lastIndexOf(System.getProperty("file.separator")) + 1).count("_") > 2){ + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.err.println(ANSI_RED + "The file: " + row.idat_green + " containes more than 2 underscores in the name." + ANSI_RESET) + System.err.println(ANSI_RED + "IDAT files must have only 2 underscores (i.e., xxx_xxx_Grn.idat and xxx_xxx_Red.idat)." + ANSI_RESET) + System.err.println(ANSI_RED + "GEO (and others) rename files to have more than 2 (i.e., GSMxxx_xxx_xxx_Red.idat). File names must be adjusted prior to running." + ANSI_RESET) + System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) + System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) + System.exit(1) + } // Metadata to identify samplesheet def meta = [:] @@ -42,7 +60,7 @@ def extract_csv(csv_file) { if (row.gender != "XY" && row.gender != "XX" && row.gender != ""){ System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) - System.err.println(ANSI_RED + "Geneder must be 'XX', 'XY' or empty. " + row.gender + " was provided, and isn't valid." + ANSI_RESET) + System.err.println(ANSI_RED + "Gender must be 'XX', 'XY' or empty. " + row.gender + " was provided, and isn't valid." + ANSI_RESET) System.err.println(ANSI_RED + "Exiting now." + ANSI_RESET) System.err.println(ANSI_RED + "-----------------------------------------------------------------------" + ANSI_RESET) System.exit(1) @@ -78,6 +96,5 @@ def extract_csv(csv_file) { return [meta.sampleID, meta, row.idat_red, row.idat_green] - } } diff --git a/config/cnv_array.config b/config/cnv_array.config index f8c82f0..18eebf6 100644 --- a/config/cnv_array.config +++ b/config/cnv_array.config @@ -10,7 +10,6 @@ params { bpm_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L1.bpm' egt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_L.egt' ref_fa = '/projects/compsci/omics_share/human/GRCh38/genome/sequence/gatk/Homo_sapiens_assembly38.fasta' - tsv_file = '/home/temket/cnv_workflow/output/bcftools_convert.tsv' snp_platform = 'IlluminaCytoSNP' gc_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_GCcontent_validSNPloci.txt' rt_file = '/projects/compsci/omics_share/human/GRCh38/supporting_files/cnv_array/HumanCytoSNP-12v2-1_ReplicationTiming_SNPloci_hg38.txt' diff --git a/nextflow.config b/nextflow.config index dcb0c13..5bac0e2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -38,7 +38,7 @@ try { } // work directory is important as it will be large, plan accordingly -workDir = "/fastscratch/${USER}/${params.workflow}" +workDir = "/flashscratch/${USER}/${params.workflow}" manifest { name = "The Jackson Laboratory Computational Sciences Nextflow based analysis pipelines" diff --git a/test/cnv_array/example_sample_input.csv b/test/cnv_array/example_sample_input.csv new file mode 100644 index 0000000..2df939c --- /dev/null +++ b/test/cnv_array/example_sample_input.csv @@ -0,0 +1,4 @@ +sampleID,gender,idat_red,idat_green +Test_Sample_XY,XY,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Red.idat,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Grn.idat +Test_Sample_XX,XX,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Red.idat,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Grn.idat +Test_Sample_NA,,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Red.idat,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018R01C01_Grn.idat diff --git a/test/cnv_array/fail_example_input.csv b/test/cnv_array/fail_example_input.csv new file mode 100644 index 0000000..af206bf --- /dev/null +++ b/test/cnv_array/fail_example_input.csv @@ -0,0 +1,2 @@ +sampleID,gender,idat_red,idat_green +Test_Sample_XY,XY,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018_R01C01_Red.idat,/projects/compsci/omics_share/human/GRCh38/supporting_files/benchmarking_data/CNV_ARRAY/raw_idat/GSM7177504_205848650018_R01C01_Grn.idat \ No newline at end of file diff --git a/tests/cnv.nf.test b/tests/cnv.nf.test deleted file mode 100644 index da36883..0000000 --- a/tests/cnv.nf.test +++ /dev/null @@ -1,130 +0,0 @@ -nextflow_workflow { - - name "Test Workflow CNV_ARRAY" - script "workflows/cnv_array.nf" - workflow "CNV_ARRAY" - - test("Full Workflow -- Required Params") { - tag "RequiredParams" - tag "primary" - when { - params { - csv_input = "${baseDir}/test/cnv/data/example_sample_input.csv" - } - } - - then { - assert workflow.success - } - } - - test("IAAP_CLI Process") { - tag "IAAP_CLI" - tag "process" - when { - params { - outdir = "tests/results" - idat_folder = "${baseDir}/test/cnv/idat_folder" - bpm_file = "${baseDir}/test/cnv/bpm_file" - egt_file = "${baseDir}/test/cnv/egt_file" - csv_input = "${baseDir}/test/cnv/csv_input" - gc_file = "${baseDir}/test/cnv/gc_file" - rt_file = "${baseDir}/test/cnv/rt_file" - } - } - - then { - assert workflow.success - // assert GTC format files - assert file("tests/results/iaap_cli_output.csv").exists() - } - } - - test("BCFTOOLS_GTC2VCF Process") { - tag "BCFTOOLS_GTC2VCF" - tag "process" - when { - params { - outdir = "tests/results" - idat_folder = "${baseDir}/test/cnv/idat_folder" - bpm_file = "${baseDir}/test/cnv/bpm_file" - egt_file = "${baseDir}/test/cnv/egt_file" - csv_input = "${baseDir}/test/cnv/csv_input" - gc_file = "${baseDir}/test/cnv/gc_file" - rt_file = "${baseDir}/test/cnv/rt_file" - } - } - - then { - assert workflow.success - // assert the tsv, bcf, csi and vcf files - assert file("tests/results/bcftools_gtct2vcf_output.vcf").exists() - } - } - - test("BCFTOOLS_QUERY_ASCAT Process") { - tag "BCFTOOLS_QUERY_ASCAT" - tag "process" - when { - params { - outdir = "tests/results" - idat_folder = "${baseDir}/test/cnv/idat_folder" - bpm_file = "${baseDir}/test/cnv/bpm_file" - egt_file = "${baseDir}/test/cnv/egt_file" - csv_input = "${baseDir}/test/cnv/csv_input" - gc_file = "${baseDir}/test/cnv/gc_file" - rt_file = "${baseDir}/test/cnv/rt_file" - } - } - - then { - assert workflow.success - // assert the files BAF and LRR - assert file("tests/results/bcftools_query_ascat_output.csv").exists() - } - } - - test("ASCAT Process") { - tag "ASCAT" - tag "process" - when { - params { - outdir = "tests/results" - idat_folder = "${baseDir}/test/cnv/idat_folder" - bpm_file = "${baseDir}/test/cnv/bpm_file" - egt_file = "${baseDir}/test/cnv/egt_file" - csv_input = "${baseDir}/test/cnv/csv_input" - gc_file = "${baseDir}/test/cnv/gc_file" - rt_file = "${baseDir}/test/cnv/rt_file" - } - } - - then { - assert workflow.success - // Assert the files - assert file("tests/results/ascat_output.csv").exists() - } - } - - test("ASCAT_ANNOTATION Process") { - tag "ASCAT_ANNOTATION" - tag "process" - when { - params { - outdir = "tests/results" - idat_folder = "${baseDir}/test/cnv/idat_folder" - bpm_file = "${baseDir}/test/cnv/bpm_file" - egt_file = "${baseDir}/test/cnv/egt_file" - csv_input = "${baseDir}/test/cnv/csv_input" - gc_file = "${baseDir}/test/cnv/gc_file" - rt_file = "${baseDir}/test/cnv/rt_file" - } - } - - then { - assert workflow.success - // assert the files .txt file and ploly file - assert file("tests/results/ascat_annotation_output.csv").exists() - } - } -} diff --git a/tests/workflows/cnv_array.nf.test b/tests/workflows/cnv_array.nf.test new file mode 100644 index 0000000..136e05c --- /dev/null +++ b/tests/workflows/cnv_array.nf.test @@ -0,0 +1,139 @@ +nextflow_workflow { + + name "Test Workflow CNV_ARRAY" + script "workflows/cnv_array.nf" + workflow "CNV_ARRAY" + + test("Full Workflow") { + tag "primary" + when { + params { + csv_input = "${baseDir}/test/cnv_array/example_sample_input.csv" + pipeline = 'cnv_array' + } + } + + then { + assert workflow.success + } + } + + test("Full Workflow -- GEO filename failure") { + tag "primary" + when { + params { + csv_input = "${baseDir}/test/cnv_array/fail_example_input.csv" + pipeline = 'cnv_array' + } + } + + then { + assert workflow.failed + } + } + // test("IAAP_CLI Process") { + // tag "IAAP_CLI" + // tag "process" + // when { + // params { + // outdir = "tests/results" + // bpm_file = "${baseDir}/test/cnv_array/bpm_file" + // egt_file = "${baseDir}/test/cnv_array/egt_file" + // csv_input = "${baseDir}/test/cnv_array/csv_input" + // gc_file = "${baseDir}/test/cnv_array/gc_file" + // rt_file = "${baseDir}/test/cnv_array/rt_file" + // pipeline = 'cnv_array' + // } + // } + + // then { + // assert workflow.success + // // assert GTC format files + // assert file("tests/results/iaap_cli_output.csv").exists() + // } + // } + + // test("BCFTOOLS_GTC2VCF Process") { + // tag "BCFTOOLS_GTC2VCF" + // tag "process" + // when { + // params { + // outdir = "tests/results" + // bpm_file = "${baseDir}/test/cnv_array/bpm_file" + // egt_file = "${baseDir}/test/cnv_array/egt_file" + // csv_input = "${baseDir}/test/cnv_array/csv_input" + // gc_file = "${baseDir}/test/cnv_array/gc_file" + // rt_file = "${baseDir}/test/cnv_array/rt_file" + // } + // } + + // then { + // assert workflow.success + // // assert the tsv, bcf, csi and vcf files + // assert file("tests/results/bcftools_gtct2vcf_output.vcf").exists() + // } + // } + + // test("BCFTOOLS_QUERY_ASCAT Process") { + // tag "BCFTOOLS_QUERY_ASCAT" + // tag "process" + // when { + // params { + // outdir = "tests/results" + // bpm_file = "${baseDir}/test/cnv_array/bpm_file" + // egt_file = "${baseDir}/test/cnv_array/egt_file" + // csv_input = "${baseDir}/test/cnv_array/csv_input" + // gc_file = "${baseDir}/test/cnv_array/gc_file" + // rt_file = "${baseDir}/test/cnv_array/rt_file" + // } + // } + + // then { + // assert workflow.success + // // assert the files BAF and LRR + // assert file("tests/results/bcftools_query_ascat_output.csv").exists() + // } + // } + + // test("ASCAT Process") { + // tag "ASCAT" + // tag "process" + // when { + // params { + // outdir = "tests/results" + // bpm_file = "${baseDir}/test/cnv_array/bpm_file" + // egt_file = "${baseDir}/test/cnv_array/egt_file" + // csv_input = "${baseDir}/test/cnv_array/csv_input" + // gc_file = "${baseDir}/test/cnv_array/gc_file" + // rt_file = "${baseDir}/test/cnv_array/rt_file" + // } + // } + + // then { + // assert workflow.success + // // Assert the files + // assert file("tests/results/ascat_output.csv").exists() + // } + // } + + // test("ASCAT_ANNOTATION Process") { + // tag "ASCAT_ANNOTATION" + // tag "process" + // when { + // params { + // outdir = "tests/results" + // bpm_file = "${baseDir}/test/cnv_array/bpm_file" + // egt_file = "${baseDir}/test/cnv_array/egt_file" + // csv_input = "${baseDir}/test/cnv_array/csv_input" + // gc_file = "${baseDir}/test/cnv_array/gc_file" + // rt_file = "${baseDir}/test/cnv_array/rt_file" + // } + // } + + // then { + // assert workflow.success + // // assert the files .txt file and ploly file + // assert file("tests/results/ascat_annotation_output.csv").exists() + // } + // } +} diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index 38b087e..6cf5f17 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -22,7 +22,7 @@ if (params.help) { param_log() // Parameter validation if (!params.bpm_file || !params.egt_file) { - exit 1, "All parameters (idat_folder, bpm_file, egt_file) are required." + exit 1, "All parameters (bpm_file, egt_file) are required." } if (params.csv_input) { @@ -34,9 +34,6 @@ if (params.csv_input) { GC_file = file(params.gc_file, checkIfExists: true) RT_file = file(params.rt_file, checkIfExists: true) -// Extract CSV input -ch_input = extract_csv(file(params.csv_input, checkIfExists: true)) - // Main workflow workflow CNV_ARRAY { IAAP_CLI(ch_input) From f8b7b229e591d88ca7bb649b760c5ed91d4c6307 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Fri, 16 Aug 2024 11:05:26 -0400 Subject: [PATCH 24/26] polish --- modules/{utility_modules => ascat}/ascat_annotation.nf | 0 modules/{r/ASCAT.nf => ascat/ascat_run.nf} | 0 workflows/cnv_array.nf | 4 ++-- 3 files changed, 2 insertions(+), 2 deletions(-) rename modules/{utility_modules => ascat}/ascat_annotation.nf (100%) rename modules/{r/ASCAT.nf => ascat/ascat_run.nf} (100%) diff --git a/modules/utility_modules/ascat_annotation.nf b/modules/ascat/ascat_annotation.nf similarity index 100% rename from modules/utility_modules/ascat_annotation.nf rename to modules/ascat/ascat_annotation.nf diff --git a/modules/r/ASCAT.nf b/modules/ascat/ascat_run.nf similarity index 100% rename from modules/r/ASCAT.nf rename to modules/ascat/ascat_run.nf diff --git a/workflows/cnv_array.nf b/workflows/cnv_array.nf index 6cf5f17..449ae25 100644 --- a/workflows/cnv_array.nf +++ b/workflows/cnv_array.nf @@ -8,8 +8,8 @@ include {extract_csv} from "${projectDir}/bin/shared/extract_cnv_array_csv.nf" include {IAAP_CLI} from "${projectDir}/modules/illumina/iaap_cli.nf" include {BCFTOOLS_GTC2VCF} from "${projectDir}/modules/bcftools/bcftools_gtct2vcf.nf" include {BCFTOOLS_QUERY_ASCAT} from "${projectDir}/modules/bcftools/bcftools_query_ascat.nf" -include {ASCAT} from "${projectDir}/modules/r/ASCAT.nf" -include {ASCAT_ANNOTATION} from "${projectDir}/modules/utility_modules/ascat_annotation.nf" +include {ASCAT} from "${projectDir}/modules/ascat/ascat_run.nf" +include {ASCAT_ANNOTATION} from "${projectDir}/modules/ascat/ascat_annotation.nf" // Help if needed From b8327f7aab464729e4e6b65d819cb54b506608e7 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Fri, 16 Aug 2024 12:00:26 -0400 Subject: [PATCH 25/26] polish --- bin/log/cnv_array.nf | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bin/log/cnv_array.nf b/bin/log/cnv_array.nf index 4213e53..e62d013 100644 --- a/bin/log/cnv_array.nf +++ b/bin/log/cnv_array.nf @@ -29,10 +29,14 @@ def param_log() { --egt_file ${params.egt_file} --gtc_csv ${params.gtc_csv} --ref_fa ${params.ref_fa} + --snp_platform ${params.snp_platform} + --gc_file ${params.gc_file} + --rt_file ${params.rt_file} + --chrArm ${params.chrArm} + --cnvGeneFile ${params.cnvGeneFile} -w ${workDir} - --keep_intermediate ${params.keep_intermediate ?: 'N/A'} - -c ${params.config ?: 'N/A'} - + -c ${params.config} + Project Directory: ${projectDir} Command line call: ${workflow.commandLine} From 477709bae2e64154988b3349b72d2873062202d4 Mon Sep 17 00:00:00 2001 From: Mike Lloyd Date: Fri, 16 Aug 2024 14:25:47 -0400 Subject: [PATCH 26/26] polish --- modules/bcftools/bcftools_gtct2vcf.nf | 7 +++---- modules/illumina/iaap_cli.nf | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/modules/bcftools/bcftools_gtct2vcf.nf b/modules/bcftools/bcftools_gtct2vcf.nf index a2c8155..8db2b94 100644 --- a/modules/bcftools/bcftools_gtct2vcf.nf +++ b/modules/bcftools/bcftools_gtct2vcf.nf @@ -8,15 +8,14 @@ process BCFTOOLS_GTC2VCF { errorStrategy {(task.exitStatus == 140) ? {log.info "\n\nError code: ${task.exitStatus} for task: ${task.name}. Likely caused by the task wall clock: ${task.time} or memory: ${task.memory} being exceeded.\nAttempting orderly shutdown.\nSee .command.log in: ${task.workDir} for more info.\n\n"; return 'finish'}.call() : 'finish'} container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'bcftools' }", mode: 'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'bcftools' }", pattern: "*.{vcf,tsv}", mode: 'copy' input: tuple val(sampleID), val(meta), path(gtc) output: - tuple val(sampleID), val(meta), path('*_convert.bcf'), path('*_convert.bcf.csi'), path('*_convert.vcf'), path('*_convert.tsv'), emit: gtc2vcf + tuple val(sampleID), val(meta), path('*_convert.bcf'), path('*_convert.bcf.csi'), path('*_convert.vcf'), path('*_convert_info.tsv'), emit: gtc2vcf - script: """ bcftools +gtc2vcf --no-version -Ou \ @@ -25,7 +24,7 @@ process BCFTOOLS_GTC2VCF { --egt ${params.egt_file} \ --gtcs ./ \ --fasta-ref ${params.ref_fa} \ - --extra ${sampleID}_convert.tsv | \ + --extra ${sampleID}_convert_info.tsv | \ bcftools sort -Ou -T ./bcftools. | \ bcftools norm --no-version -Ob -c x -f ${params.ref_fa} | \ tee ${sampleID}_convert.bcf | \ diff --git a/modules/illumina/iaap_cli.nf b/modules/illumina/iaap_cli.nf index c1f46b7..c882e96 100644 --- a/modules/illumina/iaap_cli.nf +++ b/modules/illumina/iaap_cli.nf @@ -10,8 +10,8 @@ process IAAP_CLI { container 'quay.io/jaxcompsci/gtc2vcf_with_tools:v2' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'iaap_cli' }", pattern: "*.gtc", mode:'copy' - publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'iaap_cli' }", pattern: "*.log", mode:'copy' + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'iaap_cli' }", pattern: "*.gtc", mode:'copy', enabled: params.keep_intermediate + publishDir "${params.pubdir}/${ params.organize_by=='sample' ? sampleID : 'iaap_cli' }", pattern: "*.log", mode:'copy', enabled: params.keep_intermediate input: tuple val(sampleID), val(meta), path(red_idat), path(green_idat)