Skip to content

Commit

Permalink
Convert phylogenetic workflow to download inputs from S3 [#2]
Browse files Browse the repository at this point in the history
  • Loading branch information
genehack committed Nov 16, 2024
1 parent 12865fc commit 52f3da5
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ nextclade/results
nextclade/test_output
phylogenetic/auspice
phylogenetic/benchmarks
phylogenetic/data
phylogenetic/logs
phylogenetic/results

Expand Down
2 changes: 1 addition & 1 deletion phylogenetic/rules/annotate_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ rule traits:
"""Inferring ancestral traits for {params.columns!s}"""
input:
tree = "results/{gene}/tree.nwk",
metadata = "../ingest/results/metadata.tsv",
metadata = "data/metadata.tsv",
output:
node_data = "results/{gene}/traits.json",
params:
Expand Down
2 changes: 1 addition & 1 deletion phylogenetic/rules/construct_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ rule refine:
input:
tree = "results/{gene}/tree_raw.nwk",
alignment = "results/{gene}/aligned_and_filtered.fasta",
metadata = "../ingest/results/metadata.tsv"
metadata = "data/metadata.tsv"
output:
tree = "results/{gene}/tree.nwk",
node_data = "results/{gene}/branch_lengths.json"
Expand Down
2 changes: 1 addition & 1 deletion phylogenetic/rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rule export:
"""Exporting data files for auspice"""
input:
tree = "results/{gene}/tree.nwk",
metadata = "../ingest/results/metadata.tsv",
metadata = "data/metadata.tsv",
branch_lengths = "results/{gene}/branch_lengths.json",
nt_muts = "results/{gene}/nt_muts.json",
aa_muts = "results/{gene}/aa_muts.json",
Expand Down
45 changes: 32 additions & 13 deletions phylogenetic/rules/prepare_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,40 @@ This part of the workflow prepares sequences for constructing the
phylogenetic tree.
"""

rule download:
    """Download compressed metadata and sequences from data.nextstrain.org."""
    output:
        metadata="data/metadata.tsv.zst",
        sequences="data/sequences.fasta.zst",
    params:
        sequences_url="https://data.nextstrain.org/files/workflows/yellow-fever/sequences.fasta.zst",
        metadata_url="https://data.nextstrain.org/files/workflows/yellow-fever/metadata.tsv.zst",
    shell:
        # curl flags: -f fail on HTTP errors, -sS silent but still report errors,
        # -L follow redirects. The :q modifier shell-quotes every interpolated
        # value — applied to outputs as well as params so paths with spaces or
        # shell metacharacters cannot break the command.
        r"""
        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences:q}
        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata:q}
        """


rule decompress:
    """Decompress the downloaded zstd archives into plain-text workflow inputs."""
    input:
        sequences="data/sequences.fasta.zst",
        metadata="data/metadata.tsv.zst",
    output:
        sequences="data/sequences.fasta",
        metadata="data/metadata.tsv",
    shell:
        # zstd -d decompresses, -c writes to stdout so we control the output
        # path via redirection. The :q modifier shell-quotes each interpolated
        # path, protecting against spaces and shell metacharacters.
        r"""
        zstd -d -c {input.sequences:q} > {output.sequences:q}
        zstd -d -c {input.metadata:q} > {output.metadata:q}
        """


rule filter_genome:
input:
exclude = config["files"]["genome"]["exclude"],
include = config["files"]["genome"]["include"],
# TODO once this repo is fully automated and uploading data to
# S3, this step should download data from there instead of
# depending on the ingest build
metadata = "../ingest/results/metadata.tsv",
sequences = "../ingest/results/sequences.fasta"
metadata = "data/metadata.tsv",
sequences = "data/sequences.fasta"
output:
sequences = "results/genome/filtered.fasta"
params:
Expand Down Expand Up @@ -65,10 +90,7 @@ rule align_genome:
rule align_and_extract_prME:
input:
reference=config["files"]["prM-E"]["reference"],
# TODO once this repo is fully automated and uploading data to
# S3, this step should download data from there instead of
# depending on the ingest build
sequences = "../ingest/results/sequences.fasta",
sequences = "data/sequences.fasta",
output:
alignment = "results/prM-E/aligned.fasta"
params:
Expand Down Expand Up @@ -97,10 +119,7 @@ rule filter_prME:
input:
exclude = config["files"]["prM-E"]["exclude"],
include = config["files"]["prM-E"]["include"],
# TODO once this repo is fully automated and uploading data to
# S3, this step should download data from there instead of
# depending on the ingest build
metadata = "../ingest/results/metadata.tsv",
metadata = "data/metadata.tsv",
sequences = "results/prM-E/aligned.fasta"
output:
sequences = "results/prM-E/aligned_and_filtered.fasta"
Expand Down

0 comments on commit 52f3da5

Please sign in to comment.