Pre-filter the inputs to the 21L-rooted builds with a custom rule

Filter rules in the config are applied _after_ subsampling, which poses issues with reliably getting the desired number of sequences. As @trvrb wrote¹:

> In the workflow, the filter rule happens after the subsampling rules.
> This makes it so that if we ask for say 2560 in a sampling bucket, we'll
> lose >50% due to filtering out non-21L-descending clades.
>
> This could be solved by padding count targets to compensate, but this is
> hacky and the numbers will change as time goes on. Or the filter rule
> could be placed again before subsample, but we moved it afterwards for
> good reasons.

A few custom rules for the builds allow us to prefilter the full dataset before subsampling. Currently these rules are specific to our GISAID data source, but they could be easily expanded to our Open data sources too.

In the future we might also provide clade-partitioned subsets from ncov-ingest², which we could use here instead with some adaptation of the build config.

¹ <#1029 (comment)>
² e.g. <nextstrain/ncov-ingest#398>
nextstrain · Apr 6, 2023 · 2323afd · 2323afd
1 parent 2661951
commit 2323afd
Show file tree

Hide file tree

Showing 3 changed files with 98 additions and 29 deletions.
diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
@@ -3,6 +3,7 @@ auspice_json_prefix: ncov_gisaid_21L
 # Define custom rules for pre- or post-standard workflow processing of data.
 custom_rules:
   - workflow/snakemake_rules/export_for_nextstrain.smk
+  - nextstrain_profiles/nextstrain-gisaid-21L/prefilter.smk
 
 # These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified.
 # To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config.
@@ -26,12 +27,12 @@ files:
   include: "nextstrain_profiles/nextstrain-gisaid-21L/include.txt"
   description: "nextstrain_profiles/nextstrain-gisaid-21L/nextstrain_description.md"
 
-# Note: unaligned sequences are provided as "aligned" sequences to avoid an initial full-DB alignment
-# as we re-align everything after subsampling.
 inputs:
   - name: gisaid
-    metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz"
-    aligned: "s3://nextstrain-ncov-private/sequences.fasta.xz"
+    # These two files are produced by our custom gisaid_21L prefiltering rules
+    # for this build.
+    metadata: "results/gisaid_21L_metadata.tsv.zst"
+    aligned: "results/gisaid_21L_aligned.fasta.zst"
     skip_sanitize_metadata: true
   - name: references
     metadata: "data/references_metadata.tsv"
@@ -161,32 +162,8 @@ builds:
 
 # remove sequences without division label in US and sequences from prior to clade 21L
 filter:
+  exclude_where: "division='USA'"
   min_date: "2022-01-01"
-  exclude_where: "division='USA' \
-    Nextstrain_clade='19A' \
-    Nextstrain_clade='19B' \
-    Nextstrain_clade='20A' \
-    Nextstrain_clade='20B' \
-    Nextstrain_clade='20C' \
-    Nextstrain_clade='20D' \
-    Nextstrain_clade='20E (EU1)' \
-    Nextstrain_clade='20F' \
-    Nextstrain_clade='20G' \
-    Nextstrain_clade='20H (Beta, V2)' \
-    Nextstrain_clade='20I (Alpha, V1)' \
-    Nextstrain_clade='20J (Gamma, V3)' \
-    Nextstrain_clade='21A (Delta)' \
-    Nextstrain_clade='21B (Kappa)' \
-    Nextstrain_clade='21C (Epsilon)' \
-    Nextstrain_clade='21D (Eta)' \
-    Nextstrain_clade='21E (Theta)' \
-    Nextstrain_clade='21F (Iota)' \
-    Nextstrain_clade='21G (Lambda)' \
-    Nextstrain_clade='21H (Mu)' \
-    Nextstrain_clade='21I (Delta)' \
-    Nextstrain_clade='21J (Delta)' \
-    Nextstrain_clade='21K (Omicron)' \
-    Nextstrain_clade='21M (Omicron)'"
 
 subsampling:
 

diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv b/nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv
@@ -0,0 +1,25 @@
+clade
+19A
+19B
+20A
+20B
+20C
+20D
+20E (EU1)
+20F
+20G
+20H (Beta, V2)
+20I (Alpha, V1)
+20J (Gamma, V3)
+21A (Delta)
+21B (Kappa)
+21C (Epsilon)
+21D (Eta)
+21E (Theta)
+21F (Iota)
+21G (Lambda)
+21H (Mu)
+21I (Delta)
+21J (Delta)
+21K (Omicron)
+21M (Omicron)
diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/prefilter.smk b/nextstrain_profiles/nextstrain-gisaid-21L/prefilter.smk
@@ -0,0 +1,67 @@
+# Pre-filter the full GISAID metadata for the 21L-rooted builds.
+#
+# Streams the zstd-compressed metadata TSV through `tsv-join --exclude`, which
+# drops every row whose `Nextstrain_clade` value appears in the `clade` column
+# of exclude-clades.tsv, and recompresses the remainder with zstd.
+#
+# zstd is given the rule's thread count minus two (presumably leaving room for
+# unzstd and tsv-join), clamped to a minimum of one worker.  Snakemake scales
+# `threads` down to the number of available cores, so without the clamp a
+# 2-core run would pass `-T0` (zstd: use all detected cores) and a 1-core run
+# would pass the invalid `-T-1`.
+rule gisaid_21L_metadata:
+    input:
+        metadata = path_or_url("s3://nextstrain-ncov-private/metadata.tsv.zst", keep_local=True),
+        exclude_clades = "nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv",
+    output:
+        metadata = "results/gisaid_21L_metadata.tsv.zst",
+    log: "logs/gisaid_21L_metadata.txt"
+    benchmark: "benchmarks/gisaid_21L_metadata.txt"
+    conda: config["conda_environment"]
+    threads: 8
+    shell:
+        r"""
+        exec 2> {log:q}
+
+        < {input.metadata:q} \
+          unzstd \
+        | tsv-join \
+            --header \
+            --exclude \
+            --filter-file {input.exclude_clades:q} \
+            --key-fields clade \
+            --data-fields Nextstrain_clade \
+        | zstd -T$(( {threads} > 2 ? {threads} - 2 : 1 )) \
+        > {output.metadata:q}
+        """
+
+
+# List the strain names that survived the 21L metadata pre-filter.
+#
+# Decompresses the filtered metadata, keeps only the `strain` column, and
+# drops the header row so the output is one strain name per line.
+rule gisaid_21L_strains:
+    input:
+        metadata = "results/gisaid_21L_metadata.tsv.zst",
+    output:
+        strains = "results/gisaid_21L_strains.txt",
+    conda: config["conda_environment"]
+    log: "logs/gisaid_21L_strains.txt"
+    benchmark: "benchmarks/gisaid_21L_strains.txt"
+    shell:
+        r"""
+        exec 2> {log:q}
+
+        unzstd < {input.metadata:q} \
+        | tsv-select --header --fields strain \
+        | tail -n +2 \
+        > {output.strains:q}
+        """
+
+
+# Subset the full aligned GISAID sequences to the pre-filtered 21L strains.
+#
+# Streams the zstd-compressed alignment through `seqkit grep`, keeping only
+# records whose name is listed in the strains file, and recompresses the
+# result with zstd.
+#
+# zstd is given the rule's thread count minus two (presumably leaving room for
+# unzstd and seqkit), clamped to a minimum of one worker.  Snakemake scales
+# `threads` down to the number of available cores, so without the clamp a
+# 2-core run would pass `-T0` (zstd: use all detected cores) and a 1-core run
+# would pass the invalid `-T-1`.
+rule gisaid_21L_aligned:
+    input:
+        aligned = path_or_url("s3://nextstrain-ncov-private/aligned.fasta.zst", keep_local=True),
+        strains = "results/gisaid_21L_strains.txt",
+    output:
+        aligned = "results/gisaid_21L_aligned.fasta.zst",
+    log: "logs/gisaid_21L_aligned.txt"
+    benchmark: "benchmarks/gisaid_21L_aligned.txt"
+    conda: config["conda_environment"]
+    threads: 8
+    shell:
+        r"""
+        exec 2> {log:q}
+
+        < {input.aligned:q} \
+          unzstd \
+        | seqkit grep --by-name -f {input.strains:q} \
+        | zstd -T$(( {threads} > 2 ? {threads} - 2 : 1 )) \
+        > {output.aligned:q}
+        """