Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reorganize config and rules around filter/subsampling #185

Merged
merged 3 commits into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions config/config_hmpxv1.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
exclude: "config/exclude_accessions_mpxv.txt"
reference: "config/reference.fasta"
genemap: "config/genemap.gff"
genbank_reference: "config/reference.gb"
Expand All @@ -17,14 +16,15 @@ display_strain_field: "strain"
build_name: "hmpxv1"
auspice_name: "mpox_clade-IIb"

## filter
min_date: 2017
min_length: 100000
filter:
exclude: "config/exclude_accessions_mpxv.txt"
min_date: 2017
min_length: 100000


### Set 1: Non-B.1 sequences: use all
### Set 2: B.1 sequences: small sample across year/country, maybe month
filter:
subsample:
non_b1:
group_by: "--group-by lineage year country"
sequences_per_group: "--sequences-per-group 50"
Expand Down
9 changes: 5 additions & 4 deletions config/config_hmpxv1_big.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
exclude: "config/exclude_accessions_mpxv.txt"
reference: "config/reference.fasta"
genemap: "config/genemap.gff"
genbank_reference: "config/reference.gb"
Expand All @@ -17,10 +16,12 @@ display_strain_field: "strain"
build_name: "hmpxv1_big"
auspice_name: "mpox_lineage-B.1"

## filter
min_date: 2022
min_length: 180000
filter:
exclude: "config/exclude_accessions_mpxv.txt"
min_date: 2022
min_length: 180000

subsample:
b1:
group_by: "--group-by year month country"
sequences_per_group: "--subsample-max-sequences 5000"
Expand Down
10 changes: 5 additions & 5 deletions config/config_mpxv.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
exclude: "config/exclude_accessions_mpxv.txt"
reference: "config/reference.fasta"
genemap: "config/genemap.gff"
genbank_reference: "config/reference.gb"
Expand All @@ -17,13 +16,14 @@ display_strain_field: "strain"
build_name: "mpxv"
auspice_name: "mpox_all-clades"

## filter
min_date: 1950
min_length: 100000
filter:
exclude: "config/exclude_accessions_mpxv.txt"
min_date: 1950
min_length: 100000

### Set 1: Non-B.1 sequences: use all
### Set 2: B.1 sequences: small sample across year/country, maybe month
filter:
subsample:
non_b1:
group_by: "--group-by clade year country"
sequences_per_group: "--sequences-per-group 50"
Expand Down
48 changes: 22 additions & 26 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,61 +13,57 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream.
"""


rule exclude_bad:
rule filter:
"""
Removing strains that violate one of
- from {params.min_date} onwards
- excluding strains in {input.exclude}
- minimum genome length of {params.min_length}
- QC_rare_mutations == bad
Removing strains that do not satisfy certain requirements.
"""
input:
sequences="data/sequences.fasta",
metadata="data/metadata.tsv",
exclude=config["exclude"],
output:
sequences=build_dir + "/{build_name}/good_sequences.fasta",
metadata=build_dir + "/{build_name}/good_metadata.tsv",
log=build_dir + "/{build_name}/good_filter.log",
params:
min_date=config["min_date"],
min_length=config["min_length"],
exclude=config["filter"]["exclude"],
min_date=config["filter"]["min_date"],
min_length=config["filter"]["min_length"],
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
--exclude {params.exclude} \
--min-date {params.min_date} \
--min-length {params.min_length} \
--exclude-where QC_rare_mutations=bad \
--output-log {output.log}
"""


rule filter:
rule subsample:
input:
sequences=rules.exclude_bad.output.sequences,
metadata=rules.exclude_bad.output.metadata,
metadata=rules.filter.output.metadata,
output:
strains=build_dir + "/{build_name}/{subset}_strains.txt",
log=build_dir + "/{build_name}/{subset}_filter.log",
strains=build_dir + "/{build_name}/{sample}_strains.txt",
log=build_dir + "/{build_name}/{sample}_filter.log",
params:
group_by=lambda w: config["filter"][w.subset]["group_by"],
sequences_per_group=lambda w: config["filter"][w.subset]["sequences_per_group"],
other_filters=lambda w: config["filter"][w.subset].get("other_filters", ""),
exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['filter'][w.subset]['exclude_lineages']])}"
if "exclude_lineages" in config["filter"][w.subset]
group_by=lambda w: config["subsample"][w.sample]["group_by"],
sequences_per_group=lambda w: config["subsample"][w.sample][
"sequences_per_group"
],
other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""),
exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}"
if "exclude_lineages" in config["subsample"][w.sample]
else "",
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output-strains {output.strains} \
Expand All @@ -79,14 +75,14 @@ rule filter:
"""


rule filter_merge:
rule combine_samples:
input:
strains=lambda w: [
f"{build_dir}/{w.build_name}/{subset}_strains.txt"
for subset in config["filter"]
f"{build_dir}/{w.build_name}/{sample}_strains.txt"
for sample in config["subsample"]
],
sequences=rules.exclude_bad.output.sequences,
metadata=rules.exclude_bad.output.metadata,
sequences=rules.filter.output.sequences,
metadata=rules.filter.output.metadata,
include="config/include_{build_name}.txt",
output:
sequences=build_dir + "/{build_name}/filtered.fasta",
Expand Down