-
Notifications
You must be signed in to change notification settings - Fork 11
/
prepare_sequences.smk
100 lines (95 loc) · 3.62 KB
/
prepare_sequences.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
This part of the workflow prepares sequences for constructing the phylogenetic tree.
REQUIRED INPUTS:
metadata_url = url to metadata.tsv.zst
sequences_url = url to sequences.fasta.zst
reference = path to reference sequence or genbank
OUTPUTS:
prepared_sequences = results/aligned.fasta
This part of the workflow usually includes the following steps:
- augur index
- augur filter
- augur align
- augur mask
See Augur's usage docs for these commands for more details.
"""
rule download:
"""Downloading sequences and metadata from data.nextstrain.org"""
output:
sequences = "data/sequences_{serotype}.fasta.zst",
metadata = "data/metadata_{serotype}.tsv.zst"
params:
sequences_url = config["sequences_url"],
metadata_url = config["metadata_url"],
shell:
"""
curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
"""
rule decompress:
"""Parsing fasta into sequences and metadata"""
input:
sequences = "data/sequences_{serotype}.fasta.zst",
metadata = "data/metadata_{serotype}.tsv.zst"
output:
sequences = "data/sequences_{serotype}.fasta",
metadata = "data/metadata_{serotype}.tsv"
shell:
"""
zstd -d -c {input.sequences} > {output.sequences}
zstd -d -c {input.metadata} > {output.metadata}
"""
rule filter:
"""
Filtering to
- {params.sequences_per_group} sequence(s) per {params.group_by!s}
- excluding strains in {input.exclude}
- minimum genome length of {params.min_length}
- excluding strains with missing region, country or date metadata
"""
input:
sequences = lambda wildcard: "data/sequences_{serotype}.fasta" if wildcard.gene in ['genome'] else "results/{gene}/sequences_{serotype}.fasta",
metadata = "data/metadata_{serotype}.tsv",
exclude = config["filter"]["exclude"],
include = config["filter"]["include"],
output:
sequences = "results/{gene}/filtered_{serotype}.fasta"
params:
group_by = config['filter']['group_by'],
sequences_per_group = lambda wildcards: config['filter']['sequences_per_group'][wildcards.serotype],
min_length = lambda wildcard: config['filter']['min_length'][wildcard.gene],
strain_id = config.get("strain_id_field", "strain"),
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--include {input.include} \
--output {output.sequences} \
--group-by {params.group_by} \
--sequences-per-group {params.sequences_per_group} \
--min-length {params.min_length} \
--exclude-where country=? region=? date=? \
"""
rule align:
"""
Aligning sequences to {input.reference}
- filling gaps with N
"""
input:
sequences = "results/{gene}/filtered_{serotype}.fasta",
reference = lambda wildcard: "config/reference_{serotype}_genome.gb" if wildcard.gene in ['genome'] else "results/config/reference_{serotype}_{gene}.gb"
output:
alignment = "results/{gene}/aligned_{serotype}.fasta"
shell:
"""
augur align \
--sequences {input.sequences} \
--reference-sequence {input.reference} \
--output {output.alignment} \
--fill-gaps \
--remove-reference \
--nthreads 1
"""