From cd29467cb8bab174ce6a7c375edab86592702fd4 Mon Sep 17 00:00:00 2001
From: james hadfield <hadfield.james@gmail.com>
Date: Fri, 27 Sep 2024 13:01:17 +1200
Subject: [PATCH] [clade-I] hardcode clade assignment

This approach is necessary because the mutations assigned to the top
branches in the tree are random. I'd consider this approach temporary
and we should revisit it in the near-future.
---
 phylogenetic/defaults/clade-i/include.txt     |  7 +++
 phylogenetic/rules/annotate_phylogeny.smk     | 15 +++++++
 phylogenetic/scripts/assign-clade-I-clades.py | 45 +++++++++++++++++++
 3 files changed, 67 insertions(+)
 create mode 100644 phylogenetic/scripts/assign-clade-I-clades.py

diff --git a/phylogenetic/defaults/clade-i/include.txt b/phylogenetic/defaults/clade-i/include.txt
index e69de29b..f8ef05bd 100644
--- a/phylogenetic/defaults/clade-i/include.txt
+++ b/phylogenetic/defaults/clade-i/include.txt
@@ -0,0 +1,7 @@
+# Ensure we include 2 Ia and Ib samples so we can use them to check clade assignment
+# Clade Ia
+PP601197
+KJ642618
+# Clade Ib
+PP601222
+PP601209
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
index 6a156275..9d9661b7 100644
--- a/phylogenetic/rules/annotate_phylogeny.smk
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -116,6 +116,8 @@ rule rename_clades:
         build_dir + "/{build_name}/clades_raw.json",
     output:
         node_data=build_dir + "/{build_name}/clades.json",
+    wildcard_constraints:
+        build_name="^(?!clade-i)$",
     shell:
         """
         python scripts/clades_renaming.py \
@@ -124,6 +126,19 @@ rule rename_clades:
         """
 
 
+rule clades_for_clade_I:
+    input:
+        tree=build_dir + "/clade-i/tree.nwk",
+    output:
+        node_data=build_dir + "/clade-i/clades.json",
+    shell:
+        """
+        python scripts/assign-clade-I-clades.py \
+            < {input.tree} \
+            > {output.node_data}
+        """
+
+
 rule mutation_context:
     input:
         tree=build_dir + "/{build_name}/tree.nwk",
diff --git a/phylogenetic/scripts/assign-clade-I-clades.py b/phylogenetic/scripts/assign-clade-I-clades.py
new file mode 100644
index 00000000..d6687d3b
--- /dev/null
+++ b/phylogenetic/scripts/assign-clade-I-clades.py
@@ -0,0 +1,45 @@
+"""
+Labels the two child nodes of the root as clade Ia and Ib
+based on an expected tree structure. This approach is temporary and is
+necessary because the distribution of mutations at these two nodes
+(via augur ancestral) is random and thus we can't use our normal
+`augur clades` approach.
+
+This script expects certain tips to be present for each clade
+which are force-included in the analysis.
+
+Usage: provide the tree on STDIN, node-data JSON written to STDOUT
+"""
+
+import argparse
+from sys import stdin,stdout
+from Bio import Phylo
+from collections import defaultdict
+import json
+
+TIPS = {
+   "clade Ia": ["PP601197", "KJ642618"],
+   "clade Ib": ["PP601222", "PP601209"]
+}
+
+if __name__=="__main__":
+    parser = argparse.ArgumentParser(description = __doc__)
+    args = parser.parse_args()
+
+    t = Phylo.read(stdin, "newick")
+
+    node_data = { # node-data JSON
+        "nodes": defaultdict(dict),
+        "branches": defaultdict(dict),
+    }
+
+    for node in t.clade:
+        tips = set([n.name for n in node.get_terminals()])
+        for clade_name, defining_tips in TIPS.items():
+            if all([name in tips for name in defining_tips]):
+                node_data['branches'][node.name]['labels'] = {'clade': clade_name}
+                node_data['nodes'][node.name]["clade_membership"] = clade_name
+                for descendant in node.find_clades():
+                    node_data['nodes'][descendant.name]["clade_membership"] = clade_name
+
+    json.dump(node_data, stdout)