From e0a236b4b5791323da5c58bbcb4603b950fd4ee6 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 21 Sep 2023 14:56:47 -0700
Subject: [PATCH 1/3] Simplify wording of filter rule description

The wording is a bit confusing since it is a mix of inclusive and
exclusive. Easier to just let the reader look at the arguments passed to
augur filter, which removes the need to update a list of filters in the
docstring over time.
---
 workflow/snakemake_rules/core.smk | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk
index f8ae0197..262da22c 100644
--- a/workflow/snakemake_rules/core.smk
+++ b/workflow/snakemake_rules/core.smk
@@ -15,11 +15,7 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream.
 
 rule exclude_bad:
     """
-    Removing strains that violate one of
-        - from {params.min_date} onwards
-        - excluding strains in {input.exclude}
-        - minimum genome length of {params.min_length}
-        - QC_rare_mutations == bad
+    Removing strains that do not satisfy certain requirements.
     """
     input:
         sequences="data/sequences.fasta",

From 91482e809ae4677b492c8420b93983fd0cc553de Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 21 Sep 2023 14:48:26 -0700
Subject: [PATCH 2/3] Reorganize config and rules around filter/subsampling

Subsampling is a form of filtering, but there is a conceptual
distinction that is apparent in the rules and configs used in the ncov
workflow, which I've mirrored here.

Put all filtering-related rules in a "filter" config, which configures a
rule with the same name.

Put all subsampling-related rules in a "subsampling" config which also
configures a rule with the same name.
---
 config/config_hmpxv1.yaml         | 10 ++++----
 config/config_hmpxv1_big.yaml     |  9 ++++---
 config/config_mpxv.yaml           | 10 ++++----
 workflow/snakemake_rules/core.smk | 42 ++++++++++++++++---------------
 4 files changed, 37 insertions(+), 34 deletions(-)

diff --git a/config/config_hmpxv1.yaml b/config/config_hmpxv1.yaml
index fc42dad8..0edb53c2 100644
--- a/config/config_hmpxv1.yaml
+++ b/config/config_hmpxv1.yaml
@@ -1,4 +1,3 @@
-exclude: "config/exclude_accessions_mpxv.txt"
 reference: "config/reference.fasta"
 genemap: "config/genemap.gff"
 genbank_reference: "config/reference.gb"
@@ -17,14 +16,15 @@ display_strain_field: "strain"
 build_name: "hmpxv1"
 auspice_name: "mpox_clade-IIb"
 
-## filter
-min_date: 2017
-min_length: 100000
+filter:
+  exclude: "config/exclude_accessions_mpxv.txt"
+  min_date: 2017
+  min_length: 100000
 
 
 ### Set 1: Non-B.1 sequences: use all
 ### Set 2: B.1 sequences: small sample across year/country, maybe month
-filter:
+subsample:
   non_b1:
     group_by: "--group-by lineage year country"
     sequences_per_group: "--sequences-per-group 50"
diff --git a/config/config_hmpxv1_big.yaml b/config/config_hmpxv1_big.yaml
index 6844f8b4..2b32589c 100644
--- a/config/config_hmpxv1_big.yaml
+++ b/config/config_hmpxv1_big.yaml
@@ -1,4 +1,3 @@
-exclude: "config/exclude_accessions_mpxv.txt"
 reference: "config/reference.fasta"
 genemap: "config/genemap.gff"
 genbank_reference: "config/reference.gb"
@@ -17,10 +16,12 @@ display_strain_field: "strain"
 build_name: "hmpxv1_big"
 auspice_name: "mpox_lineage-B.1"
 
-## filter
-min_date: 2022
-min_length: 180000
 filter:
+  exclude: "config/exclude_accessions_mpxv.txt"
+  min_date: 2022
+  min_length: 180000
+
+subsample:
   b1:
     group_by: "--group-by year month country"
     sequences_per_group: "--subsample-max-sequences 5000"
diff --git a/config/config_mpxv.yaml b/config/config_mpxv.yaml
index 34bdebd7..ba93fccf 100644
--- a/config/config_mpxv.yaml
+++ b/config/config_mpxv.yaml
@@ -1,4 +1,3 @@
-exclude: "config/exclude_accessions_mpxv.txt"
 reference: "config/reference.fasta"
 genemap: "config/genemap.gff"
 genbank_reference: "config/reference.gb"
@@ -17,13 +16,14 @@ display_strain_field: "strain"
 build_name: "mpxv"
 auspice_name: "mpox_all-clades"
 
-## filter
-min_date: 1950
-min_length: 100000
+filter:
+  exclude: "config/exclude_accessions_mpxv.txt"
+  min_date: 1950
+  min_length: 100000
 
 ### Set 1: Non-B.1 sequences: use all
 ### Set 2: B.1 sequences: small sample across year/country, maybe month
-filter:
+subsample:
   non_b1:
     group_by: "--group-by clade year country"
     sequences_per_group: "--sequences-per-group 50"
diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk
index 262da22c..dbe8aa7b 100644
--- a/workflow/snakemake_rules/core.smk
+++ b/workflow/snakemake_rules/core.smk
@@ -13,21 +13,21 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream.
 """
 
 
-rule exclude_bad:
+rule filter:
     """
     Removing strains that do not satisfy certain requirements.
     """
     input:
         sequences="data/sequences.fasta",
         metadata="data/metadata.tsv",
-        exclude=config["exclude"],
     output:
         sequences=build_dir + "/{build_name}/good_sequences.fasta",
         metadata=build_dir + "/{build_name}/good_metadata.tsv",
         log=build_dir + "/{build_name}/good_filter.log",
     params:
-        min_date=config["min_date"],
-        min_length=config["min_length"],
+        exclude=config["filter"]["exclude"],
+        min_date=config["filter"]["min_date"],
+        min_length=config["filter"]["min_length"],
         strain_id=config["strain_id_field"],
     shell:
         """
@@ -35,9 +35,9 @@ rule exclude_bad:
             --sequences {input.sequences} \
             --metadata {input.metadata} \
             --metadata-id-columns {params.strain_id} \
-            --exclude {input.exclude} \
             --output-sequences {output.sequences} \
             --output-metadata {output.metadata} \
+            --exclude {params.exclude} \
             --min-date {params.min_date} \
             --min-length {params.min_length} \
             --exclude-where QC_rare_mutations=bad \
@@ -45,19 +45,21 @@ rule exclude_bad:
         """
 
 
-rule filter:
+rule subsample:
     input:
-        sequences=rules.exclude_bad.output.sequences,
-        metadata=rules.exclude_bad.output.metadata,
+        sequences=rules.filter.output.sequences,
+        metadata=rules.filter.output.metadata,
     output:
-        strains=build_dir + "/{build_name}/{subset}_strains.txt",
-        log=build_dir + "/{build_name}/{subset}_filter.log",
+        strains=build_dir + "/{build_name}/{sample}_strains.txt",
+        log=build_dir + "/{build_name}/{sample}_filter.log",
     params:
-        group_by=lambda w: config["filter"][w.subset]["group_by"],
-        sequences_per_group=lambda w: config["filter"][w.subset]["sequences_per_group"],
-        other_filters=lambda w: config["filter"][w.subset].get("other_filters", ""),
-        exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['filter'][w.subset]['exclude_lineages']])}"
-        if "exclude_lineages" in config["filter"][w.subset]
+        group_by=lambda w: config["subsample"][w.sample]["group_by"],
+        sequences_per_group=lambda w: config["subsample"][w.sample][
+            "sequences_per_group"
+        ],
+        other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""),
+        exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}"
+        if "exclude_lineages" in config["subsample"][w.sample]
         else "",
         strain_id=config["strain_id_field"],
     shell:
@@ -75,14 +77,14 @@ rule filter:
         """
 
 
-rule filter_merge:
+rule combine_samples:
     input:
         strains=lambda w: [
-            f"{build_dir}/{w.build_name}/{subset}_strains.txt"
-            for subset in config["filter"]
+            f"{build_dir}/{w.build_name}/{sample}_strains.txt"
+            for sample in config["subsample"]
         ],
-        sequences=rules.exclude_bad.output.sequences,
-        metadata=rules.exclude_bad.output.metadata,
+        sequences=rules.filter.output.sequences,
+        metadata=rules.filter.output.metadata,
         include="config/include_{build_name}.txt",
     output:
         sequences=build_dir + "/{build_name}/filtered.fasta",

From 834dc220b9a521157370655057e0f9e23ace17eb Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Thu, 21 Sep 2023 15:16:59 -0700
Subject: [PATCH 3/3] Don't use sequences in metadata-based subsampling

Removing unnecessary sequence I/O should improve workflow run times.
---
 workflow/snakemake_rules/core.smk | 2 --
 1 file changed, 2 deletions(-)

diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk
index dbe8aa7b..77299730 100644
--- a/workflow/snakemake_rules/core.smk
+++ b/workflow/snakemake_rules/core.smk
@@ -47,7 +47,6 @@ rule filter:
 
 rule subsample:
     input:
-        sequences=rules.filter.output.sequences,
         metadata=rules.filter.output.metadata,
     output:
         strains=build_dir + "/{build_name}/{sample}_strains.txt",
@@ -65,7 +64,6 @@ rule subsample:
     shell:
         """
         augur filter \
-            --sequences {input.sequences} \
             --metadata {input.metadata} \
             --metadata-id-columns {params.strain_id} \
             --output-strains {output.strains} \