From e0a236b4b5791323da5c58bbcb4603b950fd4ee6 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:56:47 -0700 Subject: [PATCH 1/3] Simplify wording of filter rule description The wording is a bit confusing since it is a mix of inclusive and exclusive. Easier to just let the reader look at the arguments passed to augur filter, which removes the need to update a list of filters in the docstring over time. --- workflow/snakemake_rules/core.smk | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index f8ae0197..262da22c 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -15,11 +15,7 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream. rule exclude_bad: """ - Removing strains that violate one of - - from {params.min_date} onwards - - excluding strains in {input.exclude} - - minimum genome length of {params.min_length} - - QC_rare_mutations == bad + Removing strains that do not satisfy certain requirements. """ input: sequences="data/sequences.fasta", From 91482e809ae4677b492c8420b93983fd0cc553de Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:48:26 -0700 Subject: [PATCH 2/3] Reorganize config and rules around filter/subsampling Subsampling is a form of filtering, but there is a conceptual distinction that is apparent in the rules and configs used in the ncov workflow, which I've mirrored here. Put all filtering-related rules in a "filter" config, which configures a rule with the same name. Put all subsampling-related rules in a "subsampling" config which also configures a rule with the same name. --- config/config_hmpxv1.yaml | 10 ++++---- config/config_hmpxv1_big.yaml | 9 ++++--- config/config_mpxv.yaml | 10 ++++---- workflow/snakemake_rules/core.smk | 42 ++++++++++++++++--------------- 4 files changed, 37 insertions(+), 34 deletions(-) diff --git a/config/config_hmpxv1.yaml b/config/config_hmpxv1.yaml index fc42dad8..0edb53c2 100644 --- a/config/config_hmpxv1.yaml +++ b/config/config_hmpxv1.yaml @@ -1,4 +1,3 @@ -exclude: "config/exclude_accessions_mpxv.txt" reference: "config/reference.fasta" genemap: "config/genemap.gff" genbank_reference: "config/reference.gb" @@ -17,14 +16,15 @@ display_strain_field: "strain" build_name: "hmpxv1" auspice_name: "mpox_clade-IIb" -## filter -min_date: 2017 -min_length: 100000 +filter: + exclude: "config/exclude_accessions_mpxv.txt" + min_date: 2017 + min_length: 100000 ### Set 1: Non-B.1 sequences: use all ### Set 2: B.1 sequences: small sample across year/country, maybe month -filter: +subsample: non_b1: group_by: "--group-by lineage year country" sequences_per_group: "--sequences-per-group 50" diff --git a/config/config_hmpxv1_big.yaml b/config/config_hmpxv1_big.yaml index 6844f8b4..2b32589c 100644 --- a/config/config_hmpxv1_big.yaml +++ b/config/config_hmpxv1_big.yaml @@ -1,4 +1,3 @@ -exclude: "config/exclude_accessions_mpxv.txt" reference: "config/reference.fasta" genemap: "config/genemap.gff" genbank_reference: "config/reference.gb" @@ -17,10 +16,12 @@ display_strain_field: "strain" build_name: "hmpxv1_big" auspice_name: "mpox_lineage-B.1" -## filter -min_date: 2022 -min_length: 180000 filter: + exclude: "config/exclude_accessions_mpxv.txt" + min_date: 2022 + min_length: 180000 + +subsample: b1: group_by: "--group-by year month country" sequences_per_group: "--subsample-max-sequences 5000" diff --git a/config/config_mpxv.yaml b/config/config_mpxv.yaml index 34bdebd7..ba93fccf 100644 --- a/config/config_mpxv.yaml +++ b/config/config_mpxv.yaml @@ -1,4 +1,3 @@ -exclude: "config/exclude_accessions_mpxv.txt" reference: "config/reference.fasta" genemap: "config/genemap.gff" genbank_reference: "config/reference.gb" @@ -17,13 +16,14 @@ display_strain_field: "strain" build_name: "mpxv" auspice_name: "mpox_all-clades" -## filter -min_date: 1950 -min_length: 100000 +filter: + exclude: "config/exclude_accessions_mpxv.txt" + min_date: 1950 + min_length: 100000 ### Set 1: Non-B.1 sequences: use all ### Set 2: B.1 sequences: small sample across year/country, maybe month -filter: +subsample: non_b1: group_by: "--group-by clade year country" sequences_per_group: "--sequences-per-group 50" diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 262da22c..dbe8aa7b 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -13,21 +13,21 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream. """ -rule exclude_bad: +rule filter: """ Removing strains that do not satisfy certain requirements. """ input: sequences="data/sequences.fasta", metadata="data/metadata.tsv", - exclude=config["exclude"], output: sequences=build_dir + "/{build_name}/good_sequences.fasta", metadata=build_dir + "/{build_name}/good_metadata.tsv", log=build_dir + "/{build_name}/good_filter.log", params: - min_date=config["min_date"], - min_length=config["min_length"], + exclude=config["filter"]["exclude"], + min_date=config["filter"]["min_date"], + min_length=config["filter"]["min_length"], strain_id=config["strain_id_field"], shell: """ @@ -35,9 +35,9 @@ rule exclude_bad: --sequences {input.sequences} \ --metadata {input.metadata} \ --metadata-id-columns {params.strain_id} \ - --exclude {input.exclude} \ --output-sequences {output.sequences} \ --output-metadata {output.metadata} \ + --exclude {params.exclude} \ --min-date {params.min_date} \ --min-length {params.min_length} \ --exclude-where QC_rare_mutations=bad \ @@ -45,19 +45,21 @@ rule exclude_bad: """ -rule filter: +rule subsample: input: - sequences=rules.exclude_bad.output.sequences, - metadata=rules.exclude_bad.output.metadata, + sequences=rules.filter.output.sequences, + metadata=rules.filter.output.metadata, output: - strains=build_dir + "/{build_name}/{subset}_strains.txt", - log=build_dir + "/{build_name}/{subset}_filter.log", + strains=build_dir + "/{build_name}/{sample}_strains.txt", + log=build_dir + "/{build_name}/{sample}_filter.log", params: - group_by=lambda w: config["filter"][w.subset]["group_by"], - sequences_per_group=lambda w: config["filter"][w.subset]["sequences_per_group"], - other_filters=lambda w: config["filter"][w.subset].get("other_filters", ""), - exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['filter'][w.subset]['exclude_lineages']])}" - if "exclude_lineages" in config["filter"][w.subset] + group_by=lambda w: config["subsample"][w.sample]["group_by"], + sequences_per_group=lambda w: config["subsample"][w.sample][ + "sequences_per_group" + ], + other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""), + exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}" + if "exclude_lineages" in config["subsample"][w.sample] else "", strain_id=config["strain_id_field"], shell: @@ -75,14 +77,14 @@ rule filter: """ -rule filter_merge: +rule combine_samples: input: strains=lambda w: [ - f"{build_dir}/{w.build_name}/{subset}_strains.txt" - for subset in config["filter"] + f"{build_dir}/{w.build_name}/{sample}_strains.txt" + for sample in config["subsample"] ], - sequences=rules.exclude_bad.output.sequences, - metadata=rules.exclude_bad.output.metadata, + sequences=rules.filter.output.sequences, + metadata=rules.filter.output.metadata, include="config/include_{build_name}.txt", output: sequences=build_dir + "/{build_name}/filtered.fasta", From 834dc220b9a521157370655057e0f9e23ace17eb Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Thu, 21 Sep 2023 15:16:59 -0700 Subject: [PATCH 3/3] Don't use sequences in metadata-based subsampling Removing unnecessary sequence I/O should improve workflow run times. --- workflow/snakemake_rules/core.smk | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index dbe8aa7b..77299730 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -47,7 +47,6 @@ rule filter: rule subsample: input: - sequences=rules.filter.output.sequences, metadata=rules.filter.output.metadata, output: strains=build_dir + "/{build_name}/{sample}_strains.txt", @@ -65,7 +64,6 @@ rule subsample: shell: """ augur filter \ - --sequences {input.sequences} \ --metadata {input.metadata} \ --metadata-id-columns {params.strain_id} \ --output-strains {output.strains} \