diff --git a/phylogenetic/defaults/clade-i/include.txt b/phylogenetic/defaults/clade-i/include.txt index e69de29..f8ef05b 100644 --- a/phylogenetic/defaults/clade-i/include.txt +++ b/phylogenetic/defaults/clade-i/include.txt @@ -0,0 +1,7 @@ +# Ensure we include 2 Ia and Ib samples so we can use them to check clade assignment +# Clade Ia +PP601197 +KJ642618 +# Clade Ib +PP601222 +PP601209 diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index 6a15627..9d9661b 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -116,6 +116,8 @@ rule rename_clades: build_dir + "/{build_name}/clades_raw.json", output: node_data=build_dir + "/{build_name}/clades.json", + wildcard_constraints: + build_name="^(?!clade-i)$", shell: """ python scripts/clades_renaming.py \ @@ -124,6 +126,19 @@ rule rename_clades: """ +rule clades_for_clade_I: + input: + tree=build_dir + "/clade-i/tree.nwk", + output: + node_data=build_dir + "/clade-i/clades.json", + shell: + """ + python scripts/assign-clade-I-clades.py \ + < {input.tree} \ + > {output.node_data} + """ + + rule mutation_context: input: tree=build_dir + "/{build_name}/tree.nwk", diff --git a/phylogenetic/scripts/assign-clade-I-clades.py b/phylogenetic/scripts/assign-clade-I-clades.py new file mode 100644 index 0000000..d6687d3 --- /dev/null +++ b/phylogenetic/scripts/assign-clade-I-clades.py @@ -0,0 +1,45 @@ +""" +Labels the two child nodes of the root as clade Ia and Ib +based on an expected tree structure. This approach is temporary and is +necessary because the distribution of mutations at these two nodes +(via augur ancestral) is random and thus we can't use our normal +`augur clades` approach. + +This script expects certain tips to be present for each clade +which are force-included in the analysis. + +Usage: provide the tree on STDIN, node-data JSON written to STDOUT +""" + +import argparse +from sys import stdin,stdout +from Bio import Phylo +from collections import defaultdict +import json + +TIPS = { + "clade Ia": ["PP601197", "KJ642618"], + "clade Ib": ["PP601222", "PP601209"] +} + +if __name__=="__main__": + parser = argparse.ArgumentParser(description = __doc__) + args = parser.parse_args() + + t = Phylo.read(stdin, "newick") + + node_data = { # node-data JSON + "nodes": defaultdict(dict), + "branches": defaultdict(dict), + } + + for node in t.clade: + tips = set([n.name for n in node.get_terminals()]) + for clade_name, defining_tips in TIPS.items(): + if all([name in tips for name in defining_tips]): + node_data['branches'][node.name]['labels'] = {'clade': clade_name} + node_data['nodes'][node.name]["clade_membership"] = clade_name + for descendant in node.find_clades(): + node_data['nodes'][descendant.name]["clade_membership"] = clade_name + + json.dump(node_data, stdout)