Skip to content

Commit

Permalink
Re add nextclade assignment rules
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Dec 5, 2023
1 parent dacad11 commit a867f26
Show file tree
Hide file tree
Showing 3 changed files with 200 additions and 0 deletions.
1 change: 1 addition & 0 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ rule all:

include: "workflow/snakemake_rules/fetch_sequences.smk"
include: "workflow/snakemake_rules/transform.smk"
include: "workflow/snakemake_rules/nextclade.smk"


if config.get("upload", False):
Expand Down
16 changes: 16 additions & 0 deletions ingest/source-data/nextclade-field-map.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
key value
seqName seqName
clade clade
outbreak outbreak
lineage lineage
coverage coverage
totalMissing missing_data
totalSubstitutions divergence
totalNonACGTNs nonACGTN
qc.missingData.status QC_missing_data
qc.mixedSites.status QC_mixed_sites
qc.privateMutations.status QC_rare_mutations
qc.frameShifts.status QC_frame_shifts
qc.stopCodons.status QC_stop_codons
frameShifts frame_shifts
isReverseComplement is_reverse_complement
183 changes: 183 additions & 0 deletions ingest/workflow/snakemake_rules/nextclade.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
rule nextclade_all:
input:
sequences="results/sequences.fasta",
dataset="../nextclade_data/all",
output:
"results/nextclade_results/nextclade_all.tsv",
threads: 4
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output} \
--min-match-rate 0.01 \
--silent \
{input.sequences}
"""

# Might be able to parallelize this
rule split_dengue_sequences:
input:
sequences="results/sequences.fasta",
metadata="results/metadata.tsv",
nextclade_all_results="results/nextclade_results/nextclade_all.tsv",
output:
sequences_denv1="results/sequences_denv1.fasta",
sequences_denv2="results/sequences_denv2.fasta",
sequences_denv3="results/sequences_denv3.fasta",
sequences_denv4="results/sequences_denv4.fasta",
shell:
"""
augur filter \
--sequences results/sequences.fasta
--metadata {input.nextclade_all_results} \
--metadata-id-columns seqName \
--query "clade=='DENV1'" \
--output-sequences {output.sequences_denv1}
augur filter \
--sequences results/sequences.fasta
--metadata {input.nextclade_all_results} \
--metadata-id-columns seqName \
--query "clade=='DENV2'" \
--output-sequences {output.sequences_denv2}
augur filter \
--sequences results/sequences.fasta
--metadata {input.nextclade_all_results} \
--metadata-id-columns seqName \
--query "clade=='DENV3'" \
--output-sequences {output.sequences_denv3}
augur filter \
--sequences results/sequences.fasta
--metadata {input.nextclade_all_results} \
--metadata-id-columns seqName \
--query "clade=='DENV4'" \
--output-sequences {output.sequences_denv4}
"""

# The following rules should use wildcards
rule nextclade_denv1:
input:
sequences="results/sequences_denv1.fasta",
dataset="../nextclade_data/denv1",
output:
"results/nextclade_results/nextclade_denv1.tsv",
threads: 4
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output} \
--min-match-rate 0.01 \
--silent \
{input.sequences}
"""

rule nextclade_denv2:
input:
sequences="results/sequences_denv2.fasta",
dataset="../nextclade_data/denv2",
output:
"results/nextclade_results/nextclade_denv2.tsv",
threads: 4
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output} \
--min-match-rate 0.01 \
--silent \
{input.sequences}
"""

rule nextclade_denv3:
input:
sequences="results/sequences_denv3.fasta",
dataset="../nextclade_data/denv3",
output:
"results/nextclade_results/nextclade_denv3.tsv",
threads: 4
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output} \
--min-match-rate 0.01 \
--silent \
{input.sequences}
"""

rule nextclade_denv4:
input:
sequences="results/sequences_denv4.fasta",
dataset="../nextclade_data/denv4",
output:
"results/nextclade_results/nextclade_denv4.tsv",
threads: 4
shell:
"""
nextclade run \
--input-dataset {input.dataset} \
-j {threads} \
--output-tsv {output} \
--min-match-rate 0.01 \
--silent \
{input.sequences}
"""

#rule join_nextclade_clades:
# input:
# metadata="results/metadata.tsv",
# nextclade_denv1="results/nextclade_results/nextclade_denv1.tsv",
# nextclade_denv2="results/nextclade_results/nextclade_denv2.tsv",
# nextclade_denv3="results/nextclade_results/nextclade_denv3.tsv",
# nextclade_denv4="results/nextclade_results/nextclade_denv4.tsv",
# output:
# metadata_all="results/metadata_all.tsv",
# metadata_denv1="results/metadata_denv1.tsv",
# metadata_denv2="results/metadata_denv2.tsv",
# metadata_denv3="results/metadata_denv3.tsv",
# metadata_denv4="results/metadata_denv4.tsv",
# shell:
# """
# """


# rule join_metadata_clades:
# input:
# nextclade="data/nextclade.tsv",
# metadata="data/metadata_raw.tsv",
# nextclade_field_map=config["nextclade"]["field_map"],
# output:
# metadata="results/metadata.tsv",
# params:
# id_field=config["transform"]["id_field"],
# nextclade_id_field=config["nextclade"]["id_field"],
# shell:
# """
# export SUBSET_FIELDS=`awk 'NR>1 {{print $1}}' {input.nextclade_field_map} | tr '\n' ',' | sed 's/,$//g'`
#
# csvtk -tl cut -f $SUBSET_FIELDS \
# {input.nextclade} \
# | csvtk -tl rename2 \
# -F \
# -f '*' \
# -p '(.+)' \
# -r '{{kv}}' \
# -k {input.nextclade_field_map} \
# | tsv-join -H \
# --filter-file - \
# --key-fields {params.nextclade_id_field} \
# --data-fields {params.id_field} \
# --append-fields '*' \
# --write-all ? \
# {input.metadata} \
# | tsv-select -H --exclude {params.nextclade_id_field} \
# > {output.metadata}
# """

0 comments on commit a867f26

Please sign in to comment.