diff --git a/cmd/prune.go b/cmd/prune.go index bad04b4..7fa295e 100644 --- a/cmd/prune.go +++ b/cmd/prune.go @@ -150,5 +150,5 @@ func init() { pruneCmd.Flags().StringVarP(&outtreefile, "output", "o", "stdout", "Output tree") pruneCmd.Flags().StringVarP(&tipfile, "tipfile", "f", "none", "Tip file") pruneCmd.Flags().BoolVarP(&revert, "revert", "r", false, "If true, then revert the behavior: will keep only species given in the command line, or keep only the species that are specific to the input tree, or keep only randomly selected taxa") - pruneCmd.PersistentFlags().IntVar(&randomtips, "random", 0, "Number of tips to randomly sample") + pruneCmd.Flags().IntVar(&randomtips, "random", 0, "Number of tips to randomly sample") } diff --git a/cmd/prunedate.go b/cmd/prunedate.go new file mode 100644 index 0000000..0863040 --- /dev/null +++ b/cmd/prunedate.go @@ -0,0 +1,67 @@ +package cmd + +import ( + goio "io" + "os" + + "github.com/evolbioinfo/gotree/io" + "github.com/evolbioinfo/gotree/tree" + "github.com/spf13/cobra" +) + +var pruneMinDate float64 + +// resolveCmd represents the resolve command +var pruneDateCmd = &cobra.Command{ + Use: "date", + Short: "Cut the input tree by keeping only parts in date window", + Long: `Cut the input tree by keeping only parts in date window. + +This command will extract part of the tree corresponding to >= min-date and <= max-date. + +If min-date falls on an internal branch, it will create a new root node and will extract a tree starting at this node. +If max-date falls on an internal branch, we do not take this part of the tree, and we remove branches that end into these cases. + +`, + RunE: func(cmd *cobra.Command, args []string) (err error) { + var f *os.File + var treefile goio.Closer + var treechan <-chan tree.Trees + var forest []*tree.Tree + + if f, err = openWriteFile(outtreefile); err != nil { + io.LogError(err) + return + } + defer closeWriteFile(f, outtreefile) + + if treefile, treechan, err = readTrees(intreefile); err != nil { + io.LogError(err) + return + } + defer treefile.Close() + + for tr := range treechan { + if tr.Err != nil { + io.LogError(tr.Err) + return tr.Err + } + if forest, err = tr.Tree.CutTreeMinDate(pruneMinDate); err != nil { + io.LogError(err) + return + } + for _, t := range forest { + f.WriteString(t.Newick() + "\n") + } + } + + return + }, +} + +func init() { + pruneCmd.AddCommand(pruneDateCmd) + pruneDateCmd.PersistentFlags().StringVarP(&intreefile, "input", "i", "stdin", "Input tree(s) file") + pruneDateCmd.PersistentFlags().StringVarP(&outtreefile, "output", "o", "stdout", "Forest output file") + pruneDateCmd.PersistentFlags().Float64Var(&pruneMinDate, "min-date", 0, "Minimum date to cut the tree") +} diff --git a/tree/dates.go b/tree/dates.go index 2d864bc..4f33f54 100644 --- a/tree/dates.go +++ b/tree/dates.go @@ -2,7 +2,9 @@ package tree import ( "fmt" + "regexp" "sort" + "strconv" "github.com/evolbioinfo/gotree/io" ) @@ -13,6 +15,47 @@ type LTTData struct { Y int // Number of lineages } +// Get Node dates +// Returns a slice of float correspsponding to all node dates (internal and external) +// Node IDs are their index in the slice. +// If one node does not have date or a malformed date, returns an error +func (t *Tree) NodeDates() (ndates []float64, err error) { + var date float64 + var pattern *regexp.Regexp + var matches []string + + ndates = make([]float64, 0) + pattern = regexp.MustCompile(`(?i)&date="(.+)"`) + nnodes := 0 + t.PreOrder(func(cur *Node, prev *Node, e *Edge) (keep bool) { + keep = true + if cur.Id() != nnodes { + err = fmt.Errorf("node id does not correspond to postorder traversal: %d vs %d", cur.Id(), nnodes) + keep = false + } else if len(cur.Comments()) > 0 { + keep = false + for _, c := range cur.Comments() { + matches = pattern.FindStringSubmatch(c) + if len(matches) < 2 { + err = fmt.Errorf("no date found: %s", c) + } else if date, err = strconv.ParseFloat(matches[1], 64); err != nil { + err = fmt.Errorf("one of the node date is malformed: %s", c) + } else { + ndates = append(ndates, date) + err = nil + keep = true + } + } + } else { + err = fmt.Errorf("a node with no date found") + keep = false + } + nnodes += 1 + return + }) + return +} + // LTTData describes a Lineage to Time data point func (t *Tree) LTT() (lttdata []LTTData) { var lttdatadup []LTTData @@ -105,3 +148,76 @@ func (t *Tree) RTT() (rttdata []RTTData, err error) { return } + +// CutTreeMinDate traverses the tree, and only keep subtree starting at the given min date +// +// If a node has the exact same date as mindate: it becomes the root of a new tree +// If a node has a date > mindate and its parent has a date < mindate: a new node is added as a the root of a new tree, with one child, the currrent node. +// The output is a forest +func (t *Tree) CutTreeMinDate(mindate float64) (forest []*Tree, err error) { + var dates []float64 + forest = make([]*Tree, 0, 10) + var tmpforest []*Tree + + // If the field [&date=] exists, then takes it + // Otherwise, returns an error + if dates, err = t.NodeDates(); err != nil { + io.LogWarning(err) + err = fmt.Errorf("no dates provided in in the tree, of the form &date=") + io.LogWarning(err) + return + } + + if tmpforest, err = cutTreeMinDateRecur(t.Root(), nil, nil, mindate, dates); err != nil { + return + } + forest = append(forest, tmpforest...) + + return +} + +func cutTreeMinDateRecur(cur, prev *Node, e *Edge, mindate float64, dates []float64) (forest []*Tree, err error) { + // We take the branches/nodes >= min-date + var tmptree *Tree + var tmpnode *Node + var tmpedge *Edge + var tmpforest []*Tree + + forest = make([]*Tree, 0) + // The current node is at the exact min date: we keep the subtree starting at this node + // And disconnect the current node from its parent + if dates[cur.Id()] == mindate || (prev == nil && dates[cur.Id()] >= mindate) { + tmptree = NewTree() + tmptree.SetRoot(cur) + prev.delNeighbor(cur) + cur.delNeighbor(prev) + tmptree.ReinitIndexes() + forest = append(forest, tmptree) + return + } else if prev != nil && dates[cur.Id()] > mindate && dates[prev.Id()] < mindate { + tmptree = NewTree() + tmpnode = tmptree.NewNode() + tmptree.SetRoot(tmpnode) + prev.delNeighbor(cur) + cur.delNeighbor(prev) + tmpedge = tmptree.ConnectNodes(tmpnode, cur) + tmpnode.AddComment(fmt.Sprintf("&date=\"%f\"", mindate)) + tmpedge.SetLength(e.Length() * (dates[cur.Id()] - mindate) / (dates[cur.Id()] - dates[prev.Id()])) + //tmptree.ReinitIndexes() + forest = append(forest, tmptree) + return + } + + edges := make([]*Edge, len(cur.Edges())) + copy(edges, cur.Edges()) + neigh := make([]*Node, len(cur.neigh)) + copy(neigh, cur.neigh) + for i, n := range neigh { + if n != prev { + tmpforest, err = cutTreeMinDateRecur(n, cur, edges[i], mindate, dates) + forest = append(forest, tmpforest...) + } + } + + return +} diff --git a/tree/node.go b/tree/node.go index d25931c..cb87679 100644 --- a/tree/node.go +++ b/tree/node.go @@ -231,7 +231,7 @@ func (n *Node) RotateNeighbors() { // from the current node func (n *Node) Newick(parent *Node, newick *bytes.Buffer) { if len(n.neigh) > 0 { - if len(n.neigh) > 1 { + if len(n.neigh) > 1 || parent == nil { newick.WriteString("(") } nbchild := 0 @@ -268,7 +268,7 @@ func (n *Node) Newick(parent *Node, newick *bytes.Buffer) { nbchild++ } } - if len(n.neigh) > 1 { + if len(n.neigh) > 1 || parent == nil { newick.WriteString(")") } } diff --git a/tree/stats.go b/tree/stats.go index 606387f..67f66ae 100644 --- a/tree/stats.go +++ b/tree/stats.go @@ -70,7 +70,11 @@ func (t *Tree) MeanSupport() float64 { func (t *Tree) MedianSupport() float64 { edges := t.Edges() tips := t.Tips() - supports := make([]float64, len(edges)-len(tips)) + nsup := len(edges) - len(tips) + if nsup < 0 { + nsup = 0 + } + supports := make([]float64, nsup) if len(supports) == 0 { return math.NaN() } diff --git a/tree/tree.go b/tree/tree.go index cab2aa8..73a74ea 100644 --- a/tree/tree.go +++ b/tree/tree.go @@ -953,47 +953,6 @@ func (t *Tree) computeDepthUnRooted() { } } -// Get Node dates -// Returns a slice of float correspsponding to all node dates (internal and external) -// Node IDs are their index in the slice. -// If one node does not have date or a malformed date, returns an error -func (t *Tree) NodeDates() (ndates []float64, err error) { - var date float64 - var pattern *regexp.Regexp - var matches []string - - ndates = make([]float64, 0) - pattern = regexp.MustCompile(`(?i)&date="(.+)"`) - nnodes := 0 - t.PreOrder(func(cur *Node, prev *Node, e *Edge) (keep bool) { - keep = true - if cur.Id() != nnodes { - err = fmt.Errorf("node id does not correspond to postorder traversal: %d vs %d", cur.Id(), nnodes) - keep = false - } else if len(cur.Comments()) > 0 { - keep = false - for _, c := range cur.Comments() { - matches = pattern.FindStringSubmatch(c) - if len(matches) < 2 { - err = fmt.Errorf("no date found: %s", c) - } else if date, err = strconv.ParseFloat(matches[1], 64); err != nil { - err = fmt.Errorf("one of the node date is malformed: %s", c) - } else { - ndates = append(ndates, date) - err = nil - keep = true - } - } - } else { - err = fmt.Errorf("a node with no date found") - keep = false - } - nnodes += 1 - return - }) - return -} - // Computes distance of all nodes to root / pseudo root. // Indices of the nodes in the rdists slice correspond to node ids func (t *Tree) NodeRootDistance() (rdists []float64) {