diff --git a/.gitignore b/.gitignore
index e8a7bfe..ba9f4a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ nextclade/results
 nextclade/test_output
 phylogenetic/auspice
 phylogenetic/benchmarks
+phylogenetic/data
 phylogenetic/logs
 phylogenetic/results
 
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
index 7a52612..0659a0f 100644
--- a/phylogenetic/rules/annotate_phylogeny.smk
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -53,7 +53,7 @@ rule traits:
     """Inferring ancestral traits for {params.columns!s}"""
     input:
         tree = "results/{gene}/tree.nwk",
-        metadata = "../ingest/results/metadata.tsv",
+        metadata = "data/metadata.tsv",
     output:
         node_data = "results/{gene}/traits.json",
     params:
diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
index 6006d51..a299344 100644
--- a/phylogenetic/rules/construct_phylogeny.smk
+++ b/phylogenetic/rules/construct_phylogeny.smk
@@ -30,7 +30,7 @@ rule refine:
     input:
         tree = "results/{gene}/tree_raw.nwk",
         alignment = "results/{gene}/aligned_and_filtered.fasta",
-        metadata = "../ingest/results/metadata.tsv"
+        metadata = "data/metadata.tsv"
     output:
         tree = "results/{gene}/tree.nwk",
         node_data = "results/{gene}/branch_lengths.json"
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index 9514f23..d752260 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -7,7 +7,7 @@ rule export:
     """Exporting data files for for auspice"""
     input:
         tree = "results/{gene}/tree.nwk",
-        metadata = "../ingest/results/metadata.tsv",
+        metadata = "data/metadata.tsv",
         branch_lengths = "results/{gene}/branch_lengths.json",
         nt_muts = "results/{gene}/nt_muts.json",
         aa_muts = "results/{gene}/aa_muts.json",
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index 3d765a5..872cbd9 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -3,15 +3,42 @@ This part of the workflow prepares sequences for constructing the
 phylogenetic tree.
 """
 
+rule download:
+    """Downloading compressed sequences and metadata from data.nextstrain.org"""
+    output:
+        metadata="data/metadata.tsv.zst",
+        sequences="data/sequences.fasta.zst",
+    params:
+        sequences_url="https://data.nextstrain.org/files/workflows/yellow-fever/sequences.fasta.zst",
+        metadata_url="https://data.nextstrain.org/files/workflows/yellow-fever/metadata.tsv.zst",
+    shell:
+        r"""
+        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences:q}
+        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata:q}
+        """
+
+
+rule decompress:
+    """Decompressing the downloaded zstd sequences and metadata"""
+    input:
+        sequences="data/sequences.fasta.zst",
+        metadata="data/metadata.tsv.zst",
+    output:
+        sequences="data/sequences.fasta",
+        metadata="data/metadata.tsv",
+    shell:
+        r"""
+        zstd -d -c {input.sequences:q} > {output.sequences:q}
+        zstd -d -c {input.metadata:q} > {output.metadata:q}
+        """
+
+
 rule filter_genome:
     input:
         exclude = config["files"]["genome"]["exclude"],
         include = config["files"]["genome"]["include"],
-        # TODO once this repo is fully automated and uploading data to
-        # S3, this step should download data from there instead of
-        # depending on the ingest build
-        metadata = "../ingest/results/metadata.tsv",
-        sequences = "../ingest/results/sequences.fasta"
+        metadata = "data/metadata.tsv",
+        sequences = "data/sequences.fasta"
     output:
         sequences = "results/genome/filtered.fasta"
     params:
@@ -65,10 +92,7 @@ rule align_genome:
 rule align_and_extract_prME:
     input:
         reference=config["files"]["prM-E"]["reference"],
-        # TODO once this repo is fully automated and uploading data to
-        # S3, this step should download data from there instead of
-        # depending on the ingest build
-        sequences = "../ingest/results/sequences.fasta",
+        sequences = "data/sequences.fasta",
     output:
         alignment = "results/prM-E/aligned.fasta"
     params:
@@ -97,10 +121,7 @@ rule filter_prME:
     input:
         exclude = config["files"]["prM-E"]["exclude"],
         include = config["files"]["prM-E"]["include"],
-        # TODO once this repo is fully automated and uploading data to
-        # S3, this step should download data from there instead of
-        # depending on the ingest build
-        metadata = "../ingest/results/metadata.tsv",
+        metadata = "data/metadata.tsv",
         sequences = "results/prM-E/aligned.fasta"
     output:
         sequences = "results/prM-E/aligned_and_filtered.fasta"