Support derived haplotypes at a subset of sites

Allows users to pass a distance map JSON to the derived haplotype calculation script to specify a subset of genes and positions to consider as part of each derived haplotype. The benefit of this approach is that we can define derived haplotypes for epitope sites or RBD sites, for example.
nextstrain · Sep 11, 2024 · 7bceaa9 · 7bceaa9
1 parent 78cdcb7
commit 7bceaa9
Showing 1 changed file with 26 additions and 2 deletions.
diff --git a/scripts/add_derived_haplotypes.py b/scripts/add_derived_haplotypes.py
@@ -2,20 +2,35 @@
 Annotate derived haplotypes per node from annotated clades and store as node data JSON.
 """
 import argparse
+import json
 import pandas as pd
 
 
-def create_haplotype_for_record(record, clade_column, mutations_column, genes=None, strip_genes=False):
+def create_haplotype_for_record(record, clade_column, mutations_column, genes=None, strip_genes=False, sites_by_gene=None):
     """Create a haplotype string for the given record based on the values in its
     clade and mutations column. If a list of genes is given, filter mutations to
     only those in the requested genes.
 
     """
     clade = record[clade_column]
+
+    if record[mutations_column] == "":
+        return clade
+
     mutations = record[mutations_column].split(",")
 
     # Filter mutations to requested genes.
-    if genes is not None:
+    if sites_by_gene is not None:
+        filtered_mutations = []
+        for mutation in mutations:
+            # mutation looks like "HA1:N145K"
+            gene, allele = mutation.split(":")
+            position = allele[1:-1]
+            if gene in sites_by_gene and position in sites_by_gene[gene]:
+                filtered_mutations.append(mutation)
+
+        mutations = filtered_mutations
+    elif genes is not None:
         mutations = [
             mutation
             for mutation in mutations
@@ -44,6 +59,7 @@ def create_haplotype_for_record(record, clade_column, mutations_column, genes=No
     parser.add_argument("--clade-column", help="name of the branch attribute for clade labels in the given Nextclade annotations", default="subclade")
     parser.add_argument("--mutations-column", help="name of the attribute for mutations relative to clades in the given Nextclade annotations", default="founderMuts['subclade'].aaSubstitutions")
     parser.add_argument("--genes", nargs="+", help="list of genes to filter mutations to. If not provided, all mutations will be used.")
+    parser.add_argument("--distance-map", help="distance map JSON of genes and positions to include in haplotypes")
     parser.add_argument("--strip-genes", action="store_true", help="strip gene names from coordinates in output haplotypes")
     parser.add_argument("--attribute-name", default="haplotype", help="name of attribute to store the derived haplotype in the output file")
     parser.add_argument("--output", help="TSV file of Nextclade annotations with derived haplotype column added", required=True)
@@ -60,6 +76,13 @@ def create_haplotype_for_record(record, clade_column, mutations_column, genes=No
         na_filter=False,
     )
 
+    # Load distance map.
+    sites_by_gene = None
+    if args.distance_map:
+        with open(args.distance_map, "r", encoding="utf-8") as fh:
+            distance_map = json.load(fh)
+            sites_by_gene = distance_map["map"]
+
     # Annotate derived haplotypes.
     df[args.attribute_name] = df.apply(
         lambda record: create_haplotype_for_record(
@@ -68,6 +91,7 @@ def create_haplotype_for_record(record, clade_column, mutations_column, genes=No
             args.mutations_column,
             args.genes,
             args.strip_genes,
+            sites_by_gene,
         ),
         axis=1
     )