Merge pull request #57 from Eco-Flow/new_input_validation

Fixed genome only option not working
nf-core · Nov 6, 2024 · 49a21b3 · 49a21b3
2 parents 6613139 + c56b5e6
commit 49a21b3
Show file tree

Hide file tree

Showing 8 changed files with 161 additions and 107 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,10 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.2.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: check-yaml
+    -   id: check-added-large-files
diff --git a/modules/local/create_path.nf b/modules/local/create_path.nf
@@ -11,8 +11,7 @@ process CREATE_PATH {
     tuple val(meta), val(accession)
 
     output:
-    val meta                 , emit: meta
-    path "${meta.id}.txt"    , emit: accession
+    tuple val (meta), path("${meta.id}.txt"), emit: accession
 
     when:
     task.ext.when == null || task.ext.when

diff --git a/nextflow.config b/nextflow.config
@@ -27,7 +27,7 @@ params {
     groups                     = 'all'
 
     // merqury/meryl options
-    merqury_skip               = false
+    skip_merqury               = false
     kvalue                     = 21
 
     // BUSCO options

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -1,5 +1,5 @@
 {
-  "$schema": "http://json-schema.org/draft-07/schema",
+  "$schema": "https://json-schema.org/draft-07/schema",
   "$id": "https://raw.githubusercontent.com/ecoflow/genomeqc/master/nextflow_schema.json",
   "title": "ecoflow/genomeqc pipeline parameters",
   "description": "A pipeline to compare multiple genomes and annotations",
@@ -304,21 +304,21 @@
           "description": "A path to a BUSCO config file (optional)"
         },
         "genome_only": {
-            "type": "string"
+          "type": "boolean",
+          "description": "Run genomeqc on genomes only"
         },
         "kvalue": {
-            "type": "integer",
-            "default": 21,
-            "description": "k size for meryl (merqury)"
+          "type": "integer",
+          "default": 21,
+          "description": "k size for meryl (merqury)"
         },
-        "merqury_skip": {
-            "type": "boolean",
-            "default": true,
-            "description": "Skip meryl/merqury step?"
+        "skip_merqury": {
+          "type": "boolean",
+          "description": "Skip meryl/merqury step?"
         },
         "skip_tidk": {
           "type": "boolean",
-          "description": "Do not run TIDK.",
+          "description": "Do not run TIDK",
           "hidden": true,
           "help_text": "You may wish to turn off the tidk subworkflow"
         }

diff --git a/subworkflows/local/genome.nf b/subworkflows/local/genome.nf
@@ -20,7 +20,7 @@ workflow GENOME {
 
     BUSCO_BUSCO (
         ch_fasta,
-        "genome", // hard coded, other options ('prteins', 'transcriptome') make no sense
+        "genome", // hardcoded, other options ('proteins', 'transcriptome') make no sense
         params.busco_lineage,
         params.busco_lineages_path ?: [],
         params.busco_config ?: []

diff --git a/subworkflows/local/genome_and_annotation.nf b/subworkflows/local/genome_and_annotation.nf
@@ -4,7 +4,6 @@ include { LONGEST                             } from '../../modules/local/longes
 include { BUSCO_BUSCO                         } from '../../modules/nf-core/busco/busco/main'
 include { QUAST                               } from '../../modules/nf-core/quast/main'
 include { AGAT_SPSTATISTICS                   } from '../../modules/nf-core/agat/spstatistics/main'
-//include { GFFREAD                             } from '../../modules/nf-core/gffread/main'
 include { GFFREAD                             } from '../../modules/local/gffread'
 include { ORTHOFINDER                         } from '../../modules/nf-core/orthofinder/main'
 
@@ -17,6 +16,7 @@ workflow GENOME_AND_ANNOTATION {
     main:
 
     ch_versions = Channel.empty()
+
     // For tree plot
     ch_tree_data = Channel.empty()
 
@@ -41,44 +41,12 @@ workflow GENOME_AND_ANNOTATION {
     ch_tree_data = ch_tree_data.mix(QUAST.out.tsv.map { tuple -> tuple[1] })
 
     //
-    // Run AGAT Spstatistics
+    // Run GFFREAD
     //
 
-    AGAT_SPSTATISTICS (
-        ch_agat_gff
-    )
-    ch_versions = ch_versions.mix(AGAT_SPSTATISTICS.out.versions.first())
-
-    //
-    // Run AGAT longest isoform
-    //
-
-//    LONGEST (
-//        ch_ch_agat_gff
-//    )
-//    ch_versions = ch_versions.mix(LONGEST.out.versions.first())
-//
-//    //
-//    // Run GFFREAD
-//    //
-//
-//    ch_long_gff = LONGEST.out.longest_proteins
-//    
-    inputChannel = ch_agat_gff.combine(ch_fasta, by: 0)
-
-    // Split the input channel into two channels
-    gffChannel = inputChannel.map { tuple ->
-        // Extracting the GFF path and ID
-        [tuple[0], tuple[1]]
-    }
-    fnaChannel = inputChannel.map { tuple ->
-        // Extracting only the FNA path
-        [tuple[0], tuple[2]]
-    }
-
     GFFREAD ( 
-        fnaChannel,
-        gffChannel
+        ch_fasta,
+        ch_gff
     )
     ch_versions = ch_versions.mix(GFFREAD.out.versions.first())
 
@@ -87,7 +55,7 @@ workflow GENOME_AND_ANNOTATION {
     //
 
     ortho_ch = GFFREAD.out.longest.collect().map { it -> [[id:"orthofinder"], it] }
-
+    
     ORTHOFINDER (
         ortho_ch,
         [[],[]]
@@ -109,6 +77,16 @@ workflow GENOME_AND_ANNOTATION {
 
     ch_tree_data = ch_tree_data.mix(BUSCO_BUSCO.out.batch_summary.collect { meta, file -> file })
 
+    //
+    // Run AGAT Spstatistics
+    //
+
+    AGAT_SPSTATISTICS (
+        ch_gff
+    )
+    ch_versions = ch_versions.mix(AGAT_SPSTATISTICS.out.versions.first())
+
+
     emit:
     orthofinder = ORTHOFINDER.out.orthofinder // channel: [ val(meta), [folder] ]
     //busco = BUSCO_BUSCO.out.batch_summary.collect { meta, file -> file }

diff --git a/subworkflows/local/utils_nfcore_genomeqc_pipeline/main.nf b/subworkflows/local/utils_nfcore_genomeqc_pipeline/main.nf
@@ -149,22 +149,25 @@ def validateInputParameters() {
 def validateInputSamplesheet(input) {
     def (meta, refseq, fasta, gff, fastq) = input
     // As for now, there are only two input options: RefSeq ID or local files. The pipeline will throw an error if the sample sheet does not contain the proper information
-    // For the RefSeq ID option
-    if ( meta && refseq && !fasta && !gff ) {
-        return [ meta, refseq, fastq ]
-    // For the local files option
-    } else if ( meta && !refseq && fasta && gff) {
-        return [ meta, fasta, gff, fastq ]
+    // If the --genome_only parameter is set, local-file samples need only a FASTA (the GFF is optional)
+    // Check for genome-only mode
+    if (params.genome_only) {
+        if (meta && refseq && !fasta && !gff) {
+            return [meta, refseq, fastq]
+        } else if (meta && !refseq && fasta) {
+            return [meta, fasta, gff, fastq] // gff may be empty or set; either way it won't be used
+        } else {
+            error("You are running in --genome_only mode. Please check input samplesheet -> Incorrect samplesheet format")
+        }
     } else {
-        error("Please check input samplesheet -> Incorrent samplesheet format")
+        if (meta && refseq && !fasta && !gff) {
+            return [ meta, refseq, fastq ]
+        } else if ( meta && !refseq && fasta && gff ) {
+            return [ meta, fasta, gff, fastq ]
+        } else {
+            error("You are running on default mode. Please check input samplesheet -> Incorrent samplesheet format")
+        }
     }
-    // Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end
-    //def endedness_ok = metas.collect{ it.single_end }.unique().size == 1
-    //if (!endedness_ok) {
-    //    error("Please check input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end: ${metas[0].id}")
-    //}
-
-    //return [ metas[0], fastqs ]
 }
 //
 // Get attribute from genome config file e.g. fasta