Merge pull request #50 from rki-mf1/dev
Merge dev and push to version 0.3.0
Krannich479 authored May 16, 2024
2 parents 811b395 + 79bc341 commit 0d6b015
Showing 7 changed files with 172 additions and 21 deletions.
37 changes: 33 additions & 4 deletions .github/workflows/tests.yml
@@ -1,4 +1,4 @@
name: CI
name: tests

on:
push:
@@ -8,8 +8,8 @@ on:

# designed as in: https://github.com/marketplace/actions/setup-miniconda
jobs:
CI:
name: CI (Linux)
build:
name: build nf env
runs-on: "ubuntu-latest"
defaults:
run:
@@ -36,6 +36,25 @@ jobs:
run: |
nextflow -version
run-cievad:
name: Run cievad
needs: build
runs-on: "ubuntu-latest"
defaults:
run:
shell: bash -el {0}
steps:
- uses: actions/checkout@v4
- uses: conda-incubator/setup-miniconda@v3
with:
miniconda-version: "latest"
activate-environment: nextflow
environment-file: env/conda_nxf.yml
channels: conda-forge,bioconda,defaults
channel-priority: true
auto-activate-base: false

- name: Download reference
run: |
wget https://www.ebi.ac.uk/ena/browser/api/fasta/MN908947.3
@@ -47,7 +66,17 @@ jobs:
run: |
nextflow run hap.nf -profile local,conda
- name: Test callset evaluation
- name: Test callset evaluation with callset_dir
run: |
nextflow run eval.nf -profile local,conda --callsets_dir aux/ci_data/
- name: Test callset evaluation with sample_sheet
run: |
cwd=$(pwd)
echo "index,truthset,callset" > my_samples.csv
echo "1,${cwd}/results/simulated_hap1.vcf,${cwd}/aux/ci_data/callset_1.vcf.gz" >> my_samples.csv
echo "2,${cwd}/results/simulated_hap2.vcf,${cwd}/aux/ci_data/callset_2.vcf.gz" >> my_samples.csv
echo "3,${cwd}/results/simulated_hap3.vcf,${cwd}/aux/ci_data/callset_3.vcf.gz" >> my_samples.csv
nextflow run eval.nf -profile local,conda --sample_sheet my_samples.csv
7 changes: 5 additions & 2 deletions README.md
@@ -70,9 +70,12 @@ nextflow run eval.nf -profile local,conda --callsets_dir <path/to/callsets>
```
where `--callsets_dir` is the parameter to specify a folder containing the callset VCF files.
Currently, a callset within this folder has to follow the naming convention `callset_<X>.vcf[.gz]`, where _\<X\>_ is the index of the corresponding truthset.
Alternatively, one can provide a sample sheet of comma-separated values (CSV file) with the columns "index", "truthset" and "callset", where "index" is an integer from 1 to n (the number of samples) and "truthset"/"callset" are paths to the pairwise matching VCF files (see the example below).
Callsets can optionally be _gzip_ compressed.

🚧 For convenience, the `eval.nf` will get an option to provide a sample sheet as an alternative input format in the future.
The command for the sample sheet input is
```
nextflow run eval.nf -profile local,conda --sample_sheet <path/to/sample_sheet>
```
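
For illustration, a sample sheet for three samples could look like the following (the relative paths here are hypothetical; the CI workflow above builds an equivalent sheet with absolute paths):
```
index,truthset,callset
1,results/simulated_hap1.vcf,aux/ci_data/callset_1.vcf.gz
2,results/simulated_hap2.vcf,aux/ci_data/callset_2.vcf.gz
3,results/simulated_hap3.vcf,aux/ci_data/callset_3.vcf.gz
```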

<details><summary>⚠️ Run commands from the root directory </summary>
Without further ado, please run the commands from a terminal at the top folder (root directory) of this repository.
1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
'0.3.0'
49 changes: 36 additions & 13 deletions eval.nf
@@ -1,3 +1,10 @@
//load in help function
File helppages_class_file = new File("./src/Helppages.groovy");
Class HelppagesClass = new GroovyClassLoader(getClass().getClassLoader()).parseClass(helppages_class_file);
GroovyObject help = (GroovyObject) HelppagesClass.newInstance();

if (params.help) { exit 0, help.helpEval(workflow.manifest.version, params) }

// include modules - here, modules are single processes
include { SAMTOOLS_FAIDX } from './modules/samtools/faidx/main.nf'
include { HAPPY } from './modules/happy/main.nf'
@@ -11,22 +11,37 @@ workflow{
ch_ref = Channel.value("$baseDir/" + params.reference)
ch_ref_idx = SAMTOOLS_FAIDX(ch_ref)

ch_callsets = Channel.fromPath(params.callsets_dir + "/" + "*.{vcf,vcf.gz}")
ch_callsets
.map { it -> tuple(it.toString().split('/')[-1].tokenize('_')[1].replaceFirst('.vcf', '').replaceFirst('.gz', '').toInteger(), file(it)) }
.set {ch_callsets}
//ch_callsets.view()
if (params.callsets_dir != "" && params.sample_sheet == "") {

ch_callsets = Channel.fromPath(params.callsets_dir + "/" + "*.{vcf,vcf.gz}", checkIfExists: true)
ch_callsets
.map { it -> tuple(it.toString().split('/')[-1].tokenize('_')[1].replaceFirst('.vcf', '').replaceFirst('.gz', '').toInteger(), file(it)) }
.set {ch_callsets}
// ch_callsets.view()

ch_truthsets = Channel.fromPath(params.outdir + "/" + "simulated_hap*.vcf", checkIfExists: true)
ch_truthsets
.map { it -> tuple(it.toString().split('/')[-1].tokenize('_')[1].replaceFirst('hap', '').replaceFirst('.vcf', '').toInteger(), file(it)) }
.set {ch_truthsets}
// ch_truthsets.view()

ch_truthsets = Channel.fromPath(params.outdir + "/" + "simulated_hap*.vcf")
ch_truthsets
.map { it -> tuple(it.toString().split('/')[-1].tokenize('_')[1].replaceFirst('hap', '').replaceFirst('.vcf', '').toInteger(), file(it)) }
.set {ch_truthsets}
//ch_truthsets.view()
ch_truthsets.join(ch_callsets, by: 0)
.set {ch_variantsets_map}
// ch_variantsets_map.view()

ch_truthsets.join(ch_callsets, by: 0)
.set {ch_variantsets_map}
//ch_variantsets_map.view()
} else if (params.sample_sheet != "" && params.callsets_dir == "") {

ch_variantsets_map = Channel
.fromPath(params.sample_sheet, checkIfExists: true)
.splitCsv(header: true, sep: ",")
.map {row -> [row["index"] as Integer, row["truthset"], row["callset"]]}
// .view()

} else {

exit 1, "ERROR: Data input incorrect - please supply only one of the following parameters: sample_sheet, callsets_dir\n"

}

// ------------------
// | Main processes |
@@ -35,3 +57,4 @@

SOMPY_SUMMARY(ch_csv.collect())
}

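As a side note on the diff above: the `callsets_dir` branch keys every callset by the integer embedded in its file name. A minimal standalone Groovy sketch of that mapping, assuming a hypothetical path:

```groovy
// ".../callset_3.vcf.gz" -> 3: take the file name, split on "_",
// strip the ".vcf" and ".gz" suffixes, and cast the remainder to an Integer.
def path = '/work/aux/ci_data/callset_3.vcf.gz' // hypothetical path
def key = path.split('/')[-1]   // "callset_3.vcf.gz"
    .tokenize('_')[1]           // "3.vcf.gz"
    .replaceFirst('.vcf', '')   // "3.gz"
    .replaceFirst('.gz', '')    // "3"
    .toInteger()                // 3
assert key == 3
```

The truthset channel applies the same idea to `simulated_hap<X>.vcf`, and `join(..., by: 0)` then pairs each truthset with its callset on this integer key.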
7 changes: 7 additions & 0 deletions hap.nf
@@ -1,3 +1,10 @@
//load in help function
File helppages_class_file = new File("./src/Helppages.groovy");
Class HelppagesClass = new GroovyClassLoader(getClass().getClassLoader()).parseClass(helppages_class_file);
GroovyObject help = (GroovyObject) HelppagesClass.newInstance();

if (params.help) { exit 0, help.helpHap(workflow.manifest.version, params) }

// include modules - here, modules are single processes
//include { AMPLISIM } from './modules/amplisim/main.nf'
include { MASON_SIMULATOR } from './modules/mason/simulator/main.nf'
11 changes: 9 additions & 2 deletions nextflow.config
@@ -4,7 +4,7 @@ manifest {
description = 'A workflow for a simple, streamlined and rapid evaluation of variant callsets '
author = 'Thomas Krannich'
nextflowVersion = '>=20.04.0'
version = '0.2.0'
version = new File('./VERSION').text.trim()
}

// Parameters that are accessible in the pipeline script
@@ -15,6 +15,7 @@ params {
read_type = 'ngs'

// General parameters
help = false
seed = 479
outdir = 'results'

@@ -35,7 +36,8 @@
nb_reads = 180

// Evaluation parameters
callsets_dir = 'data'
callsets_dir = ''
sample_sheet = ''
}

// Enable execution report
@@ -63,4 +65,9 @@ profiles {
executor.name = "local"
executor.cpus = 4
}

slurm {
executor.name = "slurm"
executor.cpus = 4
}
}
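The new `slurm` profile mirrors the `local` profile with a Slurm executor. Assuming a Slurm scheduler is available on the host, a run could presumably be dispatched as:
```
nextflow run eval.nf -profile slurm,conda --sample_sheet my_samples.csv
```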
81 changes: 81 additions & 0 deletions src/Helppages.groovy
@@ -0,0 +1,81 @@
class Helper {
def helpEval(version,params){
String c_green = "\033[0;32m";
String c_reset = "\033[0m";
String c_yellow = "\033[0;33m";
String c_blue = "\033[0;34m";
String c_red = "\u001B[31m";
String c_dim = "\033[2m";
log.info """
____________________________________________________________________________________________
${c_blue}Robert Koch Institute, Genome Competence Center${c_reset}
Workflow: cievad (${version}) - evaluation of callsets
${c_yellow}Minimal Usage Examples:${c_reset}
nextflow run eval.nf -profile local,conda --callsets_dir <path/to/callsets>
or
nextflow run eval.nf -profile local,conda --sample_sheet <path/to/sample_sheet>
${c_yellow}Input parameter (required):${c_reset}
${c_green} --callsets_dir ${c_reset} Directory containing variant callsets for evaluation (files of format: callset_<X>.vcf[.gz]), where <X> is the index of the corresponding truthset.
OR
${c_green} --sample_sheet ${c_reset} Sample sheet (.csv) with the header "index,truthset,callset". Every following line contains an index and matching truth- and callset.
${c_yellow}Other workflow parameter:${c_reset}
${c_green} --outdir ${c_reset} directory to save results in [default: ${params.outdir}]
"""
}

def helpHap(version,params){
String c_green = "\033[0;32m";
String c_reset = "\033[0m";
String c_yellow = "\033[0;33m";
String c_blue = "\033[0;34m";
String c_red = "\u001B[31m";
String c_dim = "\033[2m";
log.info """
____________________________________________________________________________________________
${c_blue}Robert Koch Institute, Genome Competence Center${c_reset}
Workflow: cievad (${version}) - haplotype generation
${c_yellow}Minimal Usage Example:${c_reset}
nextflow run hap.nf -profile local,conda --reference <cievad/path/to/ref>
${c_yellow}Input parameter (required):${c_reset}
${c_green} --reference ${c_reset} reference genome (.fasta) used for the generation of synthetic sequencing data
${c_yellow}Other workflow parameter:${c_reset}
${c_green} --n ${c_reset} number of synthetic samples to be generated [default: ${params.n}]
${c_green} --read_type ${c_reset} type of synthetic reads to be generated (options: ngs, ont) [default: ${params.read_type}]
${c_green} --outdir ${c_reset} directory to save results in [default: ${params.outdir}]
${c_yellow}Next Generation Sequencing parameter, optional if [--read_type ngs] is supplied ${c_reset}
${c_green} --nb_frag ${c_reset} number of fragments per sample [default: ${params.nb_frag}]
${c_green} --fragment_min_size ${c_reset} minimum size of fragments [default: ${params.fragment_min_size}]
${c_green} --fragment_max_size ${c_reset} maximum size of fragments [default: ${params.fragment_max_size}]
${c_green} --fragment_mean_size ${c_reset} mean size of fragments [default: ${params.fragment_mean_size}]
${c_green} --fragment_size_std_dev ${c_reset} standard deviation for fragment size [default: ${params.fragment_size_std_dev}]
${c_green} --illumina_read_length ${c_reset} read length of synthetic illumina reads [default: ${params.illumina_read_length}]
${c_yellow}Nanopore Sequencing parameter, optional if [--read_type ont] is supplied ${c_reset}
${c_green} --dna_type ${c_reset} used DNA type (options: linear, circular) [default: ${params.dna_type}]
${c_green} --model_prefix ${c_reset} path and prefix of a NanoSim model [default: ${params.model_prefix}]
${c_green} --model_caller ${c_reset} algorithm to conduct the basecalling [default: ${params.model_caller}]
${c_green} --median_length ${c_reset} median length of the synthetic reads [default: ${params.median_length}]
${c_green} --sd_length ${c_reset} standard deviation of the synthetic read lengths [default: ${params.sd_length}]
${c_green} --nb_reads ${c_reset} number of synthetic reads per sample [default: ${params.nb_reads}]
"""
}
}
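Given the new `help = false` default in `nextflow.config`, these pages should be reachable by setting the flag on the command line, e.g.:
```
nextflow run hap.nf --help
nextflow run eval.nf --help
```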
