Skip to content

Commit

Permalink
Re add nextclade assignment rules
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Dec 5, 2023
1 parent dacad11 commit aaf84f6
Show file tree
Hide file tree
Showing 3 changed files with 196 additions and 0 deletions.
1 change: 1 addition & 0 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ rule all:

include: "workflow/snakemake_rules/fetch_sequences.smk"
include: "workflow/snakemake_rules/transform.smk"
include: "workflow/snakemake_rules/nextclade.smk"


if config.get("upload", False):
Expand Down
16 changes: 16 additions & 0 deletions ingest/source-data/nextclade-field-map.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
key value
seqName seqName
clade clade
outbreak outbreak
lineage lineage
coverage coverage
totalMissing missing_data
totalSubstitutions divergence
totalNonACGTNs nonACGTN
qc.missingData.status QC_missing_data
qc.mixedSites.status QC_mixed_sites
qc.privateMutations.status QC_rare_mutations
qc.frameShifts.status QC_frame_shifts
qc.stopCodons.status QC_stop_codons
frameShifts frame_shifts
isReverseComplement is_reverse_complement
179 changes: 179 additions & 0 deletions ingest/workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
rule nextclade_all:
input:
sequences="results/sequences.fasta",
dataset="../nextclade_data/all",
output:
"results/nextclade_results/nextclade_all.tsv",
threads: 4
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output} \
--min-match-rate 0.01 \
--silent \
{input.sequences}
"""

# Might be able to parallelize this
rule split_dengue_sequences:
input:
sequences="results/sequences.fasta",
metadata="results/metadata.tsv",
nextclade_all_results="results/nextclade_results/nextclade_all.tsv",
output:
sequences_denv1="results/sequences_denv1.fasta",
sequences_denv2="results/sequences_denv2.fasta",
sequences_denv3="results/sequences_denv3.fasta",
sequences_denv4="results/sequences_denv4.fasta",
shell:
"""
augur filter \
--sequences results/sequences.fasta
--metadata {input.nextclade_all_results} \
--metadata-id-columns seqName \
--query "clade=='DENV1'" \
--output-sequences {output.sequences_denv1}
augur filter \
--sequences results/sequences.fasta
--metadata {input.nextclade_all_results} \
--metadata-id-columns seqName \
--query "clade=='DENV2'" \
--output-sequences {output.sequences_denv2}
augur filter \
--sequences results/sequences.fasta
--metadata {input.nextclade_all_results} \
--metadata-id-columns seqName \
--query "clade=='DENV3'" \
--output-sequences {output.sequences_denv3}
augur filter \
--sequences results/sequences.fasta
--metadata {input.nextclade_all_results} \
--metadata-id-columns seqName \
--query "clade=='DENV4'" \
--output-sequences {output.sequences_denv4}
"""

# The following rules should use wildcards
rule nextclade_denv1:
input:
sequences="results/sequences_denv1.fasta",
dataset="../nextclade_data/denv1",
output:
"results/nextclade_results/nextclade_denv1.tsv",
threads: 4
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output} \
--min-match-rate 0.01 \
--silent \
{input.sequences}
"""

rule nextclade_denv2:
input:
sequences="results/sequences_denv2.fasta",
dataset="../nextclade_data/denv2",
output:
"results/nextclade_results/nextclade_denv2.tsv",
threads: 4
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output} \
--min-match-rate 0.01 \
--silent \
{input.sequences}
"""

rule nextclade_denv3:
input:
sequences="results/sequences_denv3.fasta",
dataset="../nextclade_data/denv3",
output:
"results/nextclade_results/nextclade_denv3.tsv",
threads: 4
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output} \
--min-match-rate 0.01 \
--silent \
{input.sequences}
"""

rule nextclade_denv4:
input:
sequences="results/sequences_denv4.fasta",
dataset="../nextclade_data/denv4",
output:
"results/nextclade_results/nextclade_denv4.tsv",
threads: 4
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output} \
--min-match-rate 0.01 \
--silent \
{input.sequences}
"""

rule join_nextclade_clades:
input:
metadata="results/metadata.tsv",
nextclade_denv1="results/nextclade_results/nextclade_denv1.tsv",
nextclade_denv2="results/nextclade_results/nextclade_denv2.tsv",
nextclade_denv3="results/nextclade_results/nextclade_denv3.tsv",
nextclade_denv4="results/nextclade_results/nextclade_denv4.tsv",
output:
metadata_all="results/metadata_all.tsv",
metadata_denv1="results/metadata_denv1.tsv",
metadata_denv2="results/metadata_denv2.tsv",
metadata_denv3="results/metadata_denv3.tsv",
metadata_denv4="results/metadata_denv4.tsv",
shell:
"""
echo "genbank_accession,nextclade_subtype" \
| tr ',' '\t' \
> results/nextclade_subtype.tsv
tsv-filter -H -f "seqName,clade" {input.nextclade_denv1} \
| awk 'NR>1 {print}' \
>> results/nextclade_subtype.tsv
tsv-filter -H -f "seqName,clade" {input.nextclade_denv2} \
| awk 'NR>1 {print}' \
>> results/nextclade_subtype.tsv
tsv-filter -H -f "seqName,clade" {input.nextclade_denv3} \
| awk 'NR>1 {print}' \
>> results/nextclade_subtype.tsv
tsv-filter -H -f "seqName,clade" {input.nextclade_denv4} \
| awk 'NR>1 {print}' \
>> results/nextclade_subtype.tsv
tsv-join -H \
--filter-file results/nextclade_subtype.tsv \
--key-fields genbank_accession \
--data-fields nextclade_subtype \
--append-fields '*' \
--write-all ? \
{input.metadata} \
> {output.metadata_all}
tsv-filter -H -f "nextclade_subtype=='DENV1'" {output.metadata_all} > {output.metadata_denv1}
tsv-filter -H -f "nextclade_subtype=='DENV2'" {output.metadata_all} > {output.metadata_denv2}
tsv-filter -H -f "nextclade_subtype=='DENV3'" {output.metadata_all} > {output.metadata_denv3}
tsv-filter -H -f "nextclade_subtype=='DENV4'" {output.metadata_all} > {output.metadata_denv4}
"""

0 comments on commit aaf84f6

Please sign in to comment.