Skip to content

Commit

Permalink
Support derived haplotypes at a subset of sites
Browse files Browse the repository at this point in the history
Allows users to pass a distance map JSON to the derived haplotype
calculation script to specify a subset of genes and positions to
consider as part of each derived haplotype. The benefit of this
approach is that we can define derived haplotypes for epitope sites or
RBD sites, for example.
  • Loading branch information
huddlej committed Sep 11, 2024
1 parent 78cdcb7 commit 7bceaa9
Showing 1 changed file with 26 additions and 2 deletions.
28 changes: 26 additions & 2 deletions scripts/add_derived_haplotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,35 @@
Annotate derived haplotypes per node from annotated clades and store as node data JSON.
"""
import argparse
import json
import pandas as pd


def create_haplotype_for_record(record, clade_column, mutations_column, genes=None, strip_genes=False):
def create_haplotype_for_record(record, clade_column, mutations_column, genes=None, strip_genes=False, sites_by_gene=None):
"""Create a haplotype string for the given record based on the values in its
clade and mutations column. If a list of genes is given, filter mutations to
only those in the requested genes.
"""
clade = record[clade_column]

if record[mutations_column] == "":
return clade

mutations = record[mutations_column].split(",")

# Filter mutations to requested genes.
if genes is not None:
if sites_by_gene is not None:
filtered_mutations = []
for mutation in mutations:
# mutation looks like "HA1:N145K"
gene, allele = mutation.split(":")
position = allele[1:-1]
if gene in sites_by_gene and position in sites_by_gene[gene]:
filtered_mutations.append(mutation)

mutations = filtered_mutations
elif genes is not None:
mutations = [
mutation
for mutation in mutations
Expand Down Expand Up @@ -44,6 +59,7 @@ def create_haplotype_for_record(record, clade_column, mutations_column, genes=No
parser.add_argument("--clade-column", help="name of the branch attribute for clade labels in the given Nextclade annotations", default="subclade")
parser.add_argument("--mutations-column", help="name of the attribute for mutations relative to clades in the given Nextclade annotations", default="founderMuts['subclade'].aaSubstitutions")
parser.add_argument("--genes", nargs="+", help="list of genes to filter mutations to. If not provided, all mutations will be used.")
parser.add_argument("--distance-map", help="distance map JSON of genes and positions to include in haplotypes")
parser.add_argument("--strip-genes", action="store_true", help="strip gene names from coordinates in output haplotypes")
parser.add_argument("--attribute-name", default="haplotype", help="name of attribute to store the derived haplotype in the output file")
parser.add_argument("--output", help="TSV file of Nextclade annotations with derived haplotype column added", required=True)
Expand All @@ -60,6 +76,13 @@ def create_haplotype_for_record(record, clade_column, mutations_column, genes=No
na_filter=False,
)

# Load distance map.
sites_by_gene = None
if args.distance_map:
with open(args.distance_map, "r", encoding="utf-8") as fh:
distance_map = json.load(fh)
sites_by_gene = distance_map["map"]

# Annotate derived haplotypes.
df[args.attribute_name] = df.apply(
lambda record: create_haplotype_for_record(
Expand All @@ -68,6 +91,7 @@ def create_haplotype_for_record(record, clade_column, mutations_column, genes=No
args.mutations_column,
args.genes,
args.strip_genes,
sites_by_gene,
),
axis=1
)
Expand Down

0 comments on commit 7bceaa9

Please sign in to comment.