Add viral (#46)
* add virus extraction without unclassified reads

* rename the viruses output to viral and remove the empty unclassified output

* fix the Wellcome logo

* Update some out-of-date conda definitions; minor formatting tweaks

* Update mappy container to the Python 3.10 build

---------

Co-authored-by: Biowilko <[email protected]>
rmcolq and BioWilko authored Jan 17, 2024
1 parent 2be273f commit 18c2996
Showing 10 changed files with 134 additions and 69 deletions.
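
The headline change sits in bin/extract_fraction_from_reads.py: the NCBI viruses fraction is renamed to viral, and fractions that bundle in unclassified reads gain an _and_unclassified suffix. A minimal sketch of the naming rule from the fastq_iterator hunk below, where the output_name helper is hypothetical and introduced only to isolate the logic:

def output_name(entry: dict, include_unclassified: bool) -> str:
    """Hypothetical helper mirroring the naming logic added to fastq_iterator."""
    taxon_name = entry["name"].lower()
    # Fractions that also absorb unclassified reads get a suffix;
    # the pure "unclassified" fraction keeps its plain name.
    if include_unclassified and taxon_name != "unclassified":
        taxon_name += "_and_unclassified"
    # The NCBI "Viruses" fraction is renamed to "viral".
    return taxon_name.replace("viruses", "viral")

# With include_unclassified=True:
#   {"name": "Viruses"}      -> "viral_and_unclassified" (files viral_and_unclassified_1.fastq, ...)
#   {"name": "unclassified"} -> "unclassified"
# With include_unclassified=False:
#   {"name": "Viruses"}      -> "viral"
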
19 changes: 12 additions & 7 deletions bin/extract_fraction_from_reads.py
@@ -29,12 +29,12 @@ def median(l):

def load_from_taxonomy(taxonomy_dir, taxids, include_unclassified):
sys.stderr.write("Loading hierarchy\n")
entries = {"0":{"name": "unclassified",
"taxon": "0",
"rank": None
}
}

entries = {}
if include_unclassified:
entries["0"] = {"name": "unclassified",
"taxon": "0",
"rank": None
}

taxid_map = defaultdict(set)
for key in taxids:
@@ -159,6 +159,7 @@ def trim_read_id(read_id):
def fastq_iterator(
prefix: str,
filetype: str,
include_unclassified: bool,
entries: dict,
read_map: dict,
fastq_1: Path,
@@ -169,6 +170,7 @@
Args:
prefix (str): Outfile prefix
filetype (str): output filetype (only affects naming)
include_unclassified (bool): true if includes unclassified reads
read_map (dict): dict of read_id: taxon_list (from kraken report)
subtaxa_map (dict): dict of subtaxa: output taxon (from load_from_taxonomy)
exclude (bool): if true, inverts the logic for including reads
@@ -192,6 +194,9 @@
print(entries)
for taxon, entry in entries.items():
taxon_name = entry["name"].lower()
if include_unclassified and taxon_name != "unclassified":
taxon_name += "_and_unclassified"
taxon_name = taxon_name.replace("viruses", "viral")
if fastq_2:
out_handles_1[taxon] = open(f"{taxon_name}_1.{filetype}", "w")
out_handles_2[taxon] = open(f"{taxon_name}_2.{filetype}", "w")
@@ -332,7 +337,7 @@ def extract_reads(
)
else:
out_counts, quals, lens, names = fastq_iterator(
prefix, filetype, entries, read_map, reads1, reads2
prefix, filetype, include_unclassified, entries, read_map, reads1, reads2
)

sys.stderr.write("Write summary\n")
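
The removal of the empty unclassified output comes from the load_from_taxonomy hunk above: the "unclassified" pseudo-taxon is now only seeded when requested. A self-contained sketch of that gating, with a hypothetical seed_entries helper standing in for the changed lines:

def seed_entries(include_unclassified: bool) -> dict:
    """Hypothetical reduction of the load_from_taxonomy change: the
    "unclassified" pseudo-taxon (taxid "0") is only seeded on request,
    so no empty unclassified output file is produced otherwise."""
    entries = {}
    if include_unclassified:
        entries["0"] = {"name": "unclassified", "taxon": "0", "rank": None}
    return entries

assert seed_entries(False) == {}   # previously entries always contained taxid "0"
assert "0" in seed_entries(True)
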
2 changes: 1 addition & 1 deletion bin/scylla.mako.html
@@ -1172,7 +1172,7 @@ <h3>Read statistics</h3>
<small class="text-muted">Developed by the ARTIC.network, funded by Wellcome</small>
<br>
<img class="artic_no_rings" src="https://raw.githubusercontent.com/artic-network/scylla/main/docs/artic_no_rings.svg" vertical-align="left" width="50" height="50"></img>
<img class="wt" src="https://raw.githubusercontent.com/artic-network/scylla/main/docs/wt.png" vertical-align="left" width="50" height="50"></img>
<img class="wt" src="https://raw.githubusercontent.com/artic-network/scylla/main/docs/wellcome-logo-black.png" vertical-align="left" width="50" height="50"></img>
</div>

<div class="col-sm-11" style="text-align: right;">
5 changes: 2 additions & 3 deletions dockerfiles/scylla/environment.yml
@@ -3,7 +3,7 @@ channels:
- bioconda
- conda-forge
- defaults
- epi2melabs
- nanoporetech
dependencies:
- python<=3.10
- pysam
@@ -15,5 +15,4 @@ dependencies:
- fastp
- tabix
- mako
- jq
- mappy
- jq
6 changes: 3 additions & 3 deletions modules/check_hcid_status.nf
@@ -2,10 +2,10 @@

process check_hcid {

label 'process_single'
label "process_single"

conda 'bioconda::biopython=1.78 bioconda::mappy=2.26'
container "${params.wf.container}:${params.wf.container_version}"
conda "bioconda::mappy=2.26"
container "biocontainers/mappy:2.26--py310h83093d7_1"

publishDir path: "${params.outdir}/${unique_id}/qc/", mode: 'copy'

130 changes: 94 additions & 36 deletions modules/extract_all.nf
@@ -3,19 +3,19 @@

// it is possible that no files would be extracted if there were no subsets of reads which matched the criteria
// also note that the reads extracted don't match up with bracken abundance reestimates, although we do use those
// as more accurate numbers when deciding what to pull out (bracken doesn't provide read break down)
// as more accurate numbers when deciding what to pull out (bracken doesn"t provide read break down)
// probably want to count up how many have been found here for run log
// ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified
// in config so could be good dehuman sanity check

process split_kreport {

label 'process_single'
label "process_single"

conda 'bioconda::biopython=1.78'
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"
conda "python=3.10"
container "biocontainers/python:3.10"

publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy', pattern: "*.json"
publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy", pattern: "*.json"

input:
tuple val(unique_id), path(kreport)
@@ -33,12 +33,12 @@ process split_kreport {

process extract_taxa_paired_reads {

label 'process_single'
label 'process_high_memory'
label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
@@ -75,12 +75,12 @@

process extract_taxa_reads {

label 'process_single'
label 'process_high_memory'
label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
@@ -116,12 +116,12 @@

process extract_paired_virus_and_unclassified {

label 'process_single'
label 'process_high_memory'
label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
@@ -145,12 +145,12 @@

process extract_virus_and_unclassified {

label 'process_single'
label 'process_high_memory'
label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
@@ -172,7 +172,7 @@
}


process extract_paired_dehumanized {
process extract_paired_virus {

label 'process_single'
label 'process_high_memory'
@@ -195,13 +195,12 @@ process extract_paired_dehumanized {
-s2 ${fastq2} \
-k ${kraken_assignments} \
-t ${taxonomy_dir} \
-p "dehumanized" \
--exclude \
--taxid ${params.taxid_human}
-p "virus" \
--taxid 10239
"""
}

process extract_dehumanized {
process extract_virus {

label 'process_single'
label 'process_high_memory'
@@ -211,6 +210,63 @@ process extract_dehumanized {
conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
tuple val(unique_id), path(fastq), path(kraken_assignments), path(kreport)
path taxonomy_dir
output:
tuple val(unique_id), path("*.fastq"), emit: reads
tuple val(unique_id), path("*_summary.json"), emit: summary
script:
"""
extract_fraction_from_reads.py \
-s ${fastq} \
-k ${kraken_assignments} \
-t ${taxonomy_dir} \
-p "virus" \
--taxid 10239
"""
}


process extract_paired_dehumanized {

label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
tuple val(unique_id), path(fastq1), path(fastq2), path(kraken_assignments), path(kreport)
path taxonomy_dir
output:
tuple val(unique_id), path("*.fastq"), emit: reads
tuple val(unique_id), path("*_summary.json"), emit: summary
script:
"""
extract_fraction_from_reads.py \
-s1 ${fastq1} \
-s2 ${fastq2} \
-k ${kraken_assignments} \
-t ${taxonomy_dir} \
-p "dehumanized" \
--exclude \
--taxid ${params.taxid_human}
"""
}

process extract_dehumanized {

label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
tuple val(unique_id), path(fastq), path(kraken_assignments), path(kreport)
path taxonomy_dir
@@ -231,19 +287,19 @@

process bgzip_extracted_taxa {

label 'process_medium'
label "process_medium"

publishDir path: "${params.outdir}/${unique_id}/${prefix}", mode: 'copy'
publishDir path: "${params.outdir}/${unique_id}/${prefix}", mode: "copy"

conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
conda "bioconda::tabix=1.11"
container "${params.wf.container}:${params.wf.container_version}"

input:
tuple val(unique_id), path(read_files)
val(prefix)
output:
tuple val(unique_id), path("*.f*q.gz")
tuple val(unique_id), path("virus*.f*q.gz"), emit: virus, optional:true
tuple val(unique_id), path("viral*_and_unclassified*.f*q.gz"), emit: virus, optional:true
script:
"""
for f in \$(ls *.f*q)
Expand All @@ -255,9 +311,9 @@ process bgzip_extracted_taxa {

process merge_read_summary {

label 'process_single'
label "process_single"

publishDir path: "${params.outdir}/${unique_id}/${prefix}", pattern: "reads_summary_combined.json", mode: 'copy'
publishDir path: "${params.outdir}/${unique_id}/${prefix}", pattern: "reads_summary_combined.json", mode: "copy"

container "${params.wf.container}:${params.wf.container_version}"

@@ -335,21 +391,23 @@ workflow extract_fractions {
if ( params.paired ){
extract_paired_dehumanized(full_extract_ch, taxonomy_dir)
extract_paired_virus_and_unclassified(full_extract_ch, taxonomy_dir)
extract_paired_virus(full_extract_ch, taxonomy_dir)
extract_paired_dehumanized.out.reads
.concat(extract_paired_virus_and_unclassified.out.reads)
.concat(extract_paired_virus_and_unclassified.out.reads, extract_paired_virus.out.reads)
.set {extracted_fractions}
extract_paired_dehumanized.out.summary
.concat(extract_paired_virus_and_unclassified.out.summary)
.concat(extract_paired_virus_and_unclassified.out.summary, extract_paired_virus.out.summary)
.groupTuple()
.set {fractions_summary_ch}
} else {
extract_dehumanized(full_extract_ch, taxonomy_dir)
extract_virus_and_unclassified(full_extract_ch, taxonomy_dir)
extract_virus(full_extract_ch, taxonomy_dir)
extract_dehumanized.out.reads
.concat(extract_virus_and_unclassified.out.reads)
.concat(extract_virus_and_unclassified.out.reads, extract_virus.out.reads)
.set {extracted_fractions}
extract_dehumanized.out.summary
.concat(extract_virus_and_unclassified.out.summary)
.concat(extract_virus_and_unclassified.out.summary, extract_virus.out.summary)
.groupTuple()
.set {fractions_summary_ch}
}
2 changes: 1 addition & 1 deletion modules/generate_report.nf
@@ -6,7 +6,7 @@ process make_report {

publishDir path: "${params.outdir}/${unique_id}/", mode: 'copy'

conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3"
conda "anaconda::Mako=1.2.3"
container "${params.wf.container}:${params.wf.container_version}"

input:
… (diffs for the remaining 4 changed files not loaded)
