Add viral (#46)
* add virus extraction without unclassified reads

* rename the viruses output to viral and remove the empty unclassified output

* fix the Wellcome logo

* Update some out-of-date conda definitions; minor formatting tweaks

* Update mappy container to the Python 3.10 build

---------

Co-authored-by: Biowilko <[email protected]>
rmcolq and BioWilko authored Jan 17, 2024
1 parent 2be273f commit 18c2996
Showing 10 changed files with 134 additions and 69 deletions.
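
The headline change sits in bin/extract_fraction_from_reads.py: the NCBI viruses fraction is renamed to viral, and fractions that bundle in unclassified reads gain an _and_unclassified suffix. A minimal sketch of the naming rule from the fastq_iterator hunk below, where the output_name helper is hypothetical and introduced only to isolate the logic:

def output_name(entry: dict, include_unclassified: bool) -> str:
    """Hypothetical helper mirroring the naming logic added to fastq_iterator."""
    taxon_name = entry["name"].lower()
    # Fractions that also absorb unclassified reads get a suffix;
    # the pure "unclassified" fraction keeps its plain name.
    if include_unclassified and taxon_name != "unclassified":
        taxon_name += "_and_unclassified"
    # The NCBI "Viruses" fraction is renamed to "viral".
    return taxon_name.replace("viruses", "viral")

# With include_unclassified=True:
#   {"name": "Viruses"}      -> "viral_and_unclassified" (files viral_and_unclassified_1.fastq, ...)
#   {"name": "unclassified"} -> "unclassified"
# With include_unclassified=False:
#   {"name": "Viruses"}      -> "viral"
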
19 changes: 12 additions & 7 deletions bin/extract_fraction_from_reads.py
@@ -29,12 +29,12 @@ def median(l):

def load_from_taxonomy(taxonomy_dir, taxids, include_unclassified):
sys.stderr.write("Loading hierarchy\n")
entries = {"0":{"name": "unclassified",
"taxon": "0",
"rank": None
}
}

entries = {}
if include_unclassified:
entries["0"] = {"name": "unclassified",
"taxon": "0",
"rank": None
}

taxid_map = defaultdict(set)
for key in taxids:
@@ -159,6 +159,7 @@ def trim_read_id(read_id):
def fastq_iterator(
prefix: str,
filetype: str,
include_unclassified: bool,
entries: dict,
read_map: dict,
fastq_1: Path,
@@ -169,6 +170,7 @@
Args:
prefix (str): Outfile prefix
filetype (str): output filetype (only affects naming)
include_unclassified (bool): true if includes unclassified reads
read_map (dict): dict of read_id: taxon_list (from kraken report)
subtaxa_map (dict): dict of subtaxa: output taxon (from load_from_taxonomy)
exclude (bool): if true, inverts the logic for including reads
@@ -192,6 +194,9 @@
print(entries)
for taxon, entry in entries.items():
taxon_name = entry["name"].lower()
if include_unclassified and taxon_name != "unclassified":
taxon_name += "_and_unclassified"
taxon_name = taxon_name.replace("viruses", "viral")
if fastq_2:
out_handles_1[taxon] = open(f"{taxon_name}_1.{filetype}", "w")
out_handles_2[taxon] = open(f"{taxon_name}_2.{filetype}", "w")
@@ -332,7 +337,7 @@ def extract_reads(
)
else:
out_counts, quals, lens, names = fastq_iterator(
prefix, filetype, entries, read_map, reads1, reads2
prefix, filetype, include_unclassified, entries, read_map, reads1, reads2
)

sys.stderr.write("Write summary\n")
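
The removal of the empty unclassified output comes from the load_from_taxonomy hunk above: the "unclassified" pseudo-taxon is now only seeded when requested. A self-contained sketch of that gating, with a hypothetical seed_entries helper standing in for the changed lines:

def seed_entries(include_unclassified: bool) -> dict:
    """Hypothetical reduction of the load_from_taxonomy change: the
    "unclassified" pseudo-taxon (taxid "0") is only seeded on request,
    so no empty unclassified output file is produced otherwise."""
    entries = {}
    if include_unclassified:
        entries["0"] = {"name": "unclassified", "taxon": "0", "rank": None}
    return entries

assert seed_entries(False) == {}   # previously entries always contained taxid "0"
assert "0" in seed_entries(True)
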
2 changes: 1 addition & 1 deletion bin/scylla.mako.html
@@ -1172,7 +1172,7 @@ <h3>Read statistics</h3>
<small class="text-muted">Developed by the ARTIC.network, funded by Wellcome</small>
<br>
<img class="artic_no_rings" src="https://raw.githubusercontent.com/artic-network/scylla/main/docs/artic_no_rings.svg" vertical-align="left" width="50" height="50"></img>
<img class="wt" src="https://raw.githubusercontent.com/artic-network/scylla/main/docs/wt.png" vertical-align="left" width="50" height="50"></img>
<img class="wt" src="https://raw.githubusercontent.com/artic-network/scylla/main/docs/wellcome-logo-black.png" vertical-align="left" width="50" height="50"></img>
</div>

<div class="col-sm-11" style="text-align: right;">
5 changes: 2 additions & 3 deletions dockerfiles/scylla/environment.yml
@@ -3,7 +3,7 @@ channels:
- bioconda
- conda-forge
- defaults
- epi2melabs
- nanoporetech
dependencies:
- python<=3.10
- pysam
@@ -15,5 +15,4 @@ dependencies:
- fastp
- tabix
- mako
- jq
- mappy
- jq
6 changes: 3 additions & 3 deletions modules/check_hcid_status.nf
@@ -2,10 +2,10 @@

process check_hcid {

label 'process_single'
label "process_single"

conda 'bioconda::biopython=1.78 bioconda::mappy=2.26'
container "${params.wf.container}:${params.wf.container_version}"
conda "bioconda::mappy=2.26"
container "biocontainers/mappy:2.26--py310h83093d7_1"

publishDir path: "${params.outdir}/${unique_id}/qc/", mode: 'copy'

130 changes: 94 additions & 36 deletions modules/extract_all.nf
@@ -3,19 +3,19 @@

// it is possible that no files would be extracted if there were no subsets of reads which matched the criteria
// also note that the reads extracted don't match up with bracken abundance reestimates, although we do use those
// as more accurate numbers when deciding what to pull out (bracken doesn't provide read break down)
// as more accurate numbers when deciding what to pull out (bracken doesn"t provide read break down)
// probably want to count up how many have been found here for run log
// ALSO this step will currently "fail" with exitcode 2 if the number of human reads found exceeds the number specified
// in config so could be good dehuman sanity check

process split_kreport {

label 'process_single'
label "process_single"

conda 'bioconda::biopython=1.78'
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"
conda "python=3.10"
container "biocontainers/python:3.10"

publishDir path: "${params.outdir}/${unique_id}/classifications", mode: 'copy', pattern: "*.json"
publishDir path: "${params.outdir}/${unique_id}/classifications", mode: "copy", pattern: "*.json"

input:
tuple val(unique_id), path(kreport)
@@ -33,12 +33,12 @@ process split_kreport {

process extract_taxa_paired_reads {

label 'process_single'
label 'process_high_memory'
label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
@@ -75,12 +75,12 @@

process extract_taxa_reads {

label 'process_single'
label 'process_high_memory'
label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
@@ -116,12 +116,12 @@

process extract_paired_virus_and_unclassified {

label 'process_single'
label 'process_high_memory'
label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
@@ -145,12 +145,12 @@

process extract_virus_and_unclassified {

label 'process_single'
label 'process_high_memory'
label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? 'ignore' : 'terminate'}
errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
@@ -172,7 +172,7 @@
}


process extract_paired_dehumanized {
process extract_paired_virus {

label 'process_single'
label 'process_high_memory'
@@ -195,13 +195,12 @@ process extract_paired_dehumanized {
-s2 ${fastq2} \
-k ${kraken_assignments} \
-t ${taxonomy_dir} \
-p "dehumanized" \
--exclude \
--taxid ${params.taxid_human}
-p "virus" \
--taxid 10239
"""
}

process extract_dehumanized {
process extract_virus {

label 'process_single'
label 'process_high_memory'
@@ -211,6 +210,63 @@ process extract_dehumanized {
conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
tuple val(unique_id), path(fastq), path(kraken_assignments), path(kreport)
path taxonomy_dir
output:
tuple val(unique_id), path("*.fastq"), emit: reads
tuple val(unique_id), path("*_summary.json"), emit: summary
script:
"""
extract_fraction_from_reads.py \
-s ${fastq} \
-k ${kraken_assignments} \
-t ${taxonomy_dir} \
-p "virus" \
--taxid 10239
"""
}


process extract_paired_dehumanized {

label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
tuple val(unique_id), path(fastq1), path(fastq2), path(kraken_assignments), path(kreport)
path taxonomy_dir
output:
tuple val(unique_id), path("*.fastq"), emit: reads
tuple val(unique_id), path("*_summary.json"), emit: summary
script:
"""
extract_fraction_from_reads.py \
-s1 ${fastq1} \
-s2 ${fastq2} \
-k ${kraken_assignments} \
-t ${taxonomy_dir} \
-p "dehumanized" \
--exclude \
--taxid ${params.taxid_human}
"""
}

process extract_dehumanized {

label "process_single"
label "process_high_memory"

errorStrategy {task.exitStatus in 2..3 ? "ignore" : "terminate"}

conda "bioconda::pyfastx=2.01"
container "biocontainers/pyfastx:2.0.1--py39h3d4b85c_0"

input:
tuple val(unique_id), path(fastq), path(kraken_assignments), path(kreport)
path taxonomy_dir
@@ -231,19 +287,19 @@

process bgzip_extracted_taxa {

label 'process_medium'
label "process_medium"

publishDir path: "${params.outdir}/${unique_id}/${prefix}", mode: 'copy'
publishDir path: "${params.outdir}/${unique_id}/${prefix}", mode: "copy"

conda 'bioconda::biopython=1.78 bioconda::tabix=1.11'
conda "bioconda::tabix=1.11"
container "${params.wf.container}:${params.wf.container_version}"

input:
tuple val(unique_id), path(read_files)
val(prefix)
output:
tuple val(unique_id), path("*.f*q.gz")
tuple val(unique_id), path("virus*.f*q.gz"), emit: virus, optional:true
tuple val(unique_id), path("viral*_and_unclassified*.f*q.gz"), emit: virus, optional:true
script:
"""
for f in \$(ls *.f*q)
Expand All @@ -255,9 +311,9 @@ process bgzip_extracted_taxa {

process merge_read_summary {

label 'process_single'
label "process_single"

publishDir path: "${params.outdir}/${unique_id}/${prefix}", pattern: "reads_summary_combined.json", mode: 'copy'
publishDir path: "${params.outdir}/${unique_id}/${prefix}", pattern: "reads_summary_combined.json", mode: "copy"

container "${params.wf.container}:${params.wf.container_version}"

@@ -335,21 +391,23 @@ workflow extract_fractions {
if ( params.paired ){
extract_paired_dehumanized(full_extract_ch, taxonomy_dir)
extract_paired_virus_and_unclassified(full_extract_ch, taxonomy_dir)
extract_paired_virus(full_extract_ch, taxonomy_dir)
extract_paired_dehumanized.out.reads
.concat(extract_paired_virus_and_unclassified.out.reads)
.concat(extract_paired_virus_and_unclassified.out.reads, extract_paired_virus.out.reads)
.set {extracted_fractions}
extract_paired_dehumanized.out.summary
.concat(extract_paired_virus_and_unclassified.out.summary)
.concat(extract_paired_virus_and_unclassified.out.summary, extract_paired_virus.out.summary)
.groupTuple()
.set {fractions_summary_ch}
} else {
extract_dehumanized(full_extract_ch, taxonomy_dir)
extract_virus_and_unclassified(full_extract_ch, taxonomy_dir)
extract_virus(full_extract_ch, taxonomy_dir)
extract_dehumanized.out.reads
.concat(extract_virus_and_unclassified.out.reads)
.concat(extract_virus_and_unclassified.out.reads, extract_virus.out.reads)
.set {extracted_fractions}
extract_dehumanized.out.summary
.concat(extract_virus_and_unclassified.out.summary)
.concat(extract_virus_and_unclassified.out.summary, extract_virus.out.summary)
.groupTuple()
.set {fractions_summary_ch}
}
2 changes: 1 addition & 1 deletion modules/generate_report.nf
@@ -6,7 +6,7 @@ process make_report {

publishDir path: "${params.outdir}/${unique_id}/", mode: 'copy'

conda "bioconda::biopython=1.78 anaconda::Mako=1.2.3"
conda "anaconda::Mako=1.2.3"
container "${params.wf.container}:${params.wf.container_version}"

input:
… (diffs for the remaining 4 changed files not loaded)
