Skip to content

Commit

Permalink
Added cut min date command (gotree prune date --min-date)
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed Sep 26, 2024
1 parent 1b33332 commit 69fcc18
Show file tree
Hide file tree
Showing 6 changed files with 191 additions and 45 deletions.
2 changes: 1 addition & 1 deletion cmd/prune.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,5 +150,5 @@ func init() {
pruneCmd.Flags().StringVarP(&outtreefile, "output", "o", "stdout", "Output tree")
pruneCmd.Flags().StringVarP(&tipfile, "tipfile", "f", "none", "Tip file")
pruneCmd.Flags().BoolVarP(&revert, "revert", "r", false, "If true, then revert the behavior: will keep only species given in the command line, or keep only the species that are specific to the input tree, or keep only randomly selected taxa")
pruneCmd.PersistentFlags().IntVar(&randomtips, "random", 0, "Number of tips to randomly sample")
pruneCmd.Flags().IntVar(&randomtips, "random", 0, "Number of tips to randomly sample")
}
67 changes: 67 additions & 0 deletions cmd/prunedate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package cmd

import (
goio "io"
"os"

"github.com/evolbioinfo/gotree/io"
"github.com/evolbioinfo/gotree/tree"
"github.com/spf13/cobra"
)

var pruneMinDate float64

// resolveCmd represents the resolve command
var pruneDateCmd = &cobra.Command{
Use: "date",
Short: "Cut the input tree by keeping only parts in date window",
Long: `Cut the input tree by keeping only parts in date window.
This command will extract part of the tree corresponding to >= min-date and <= max-date.
If min-date falls on an internal branch, it will create a new root node and will extract a tree starting at this node.
If max-date falls on an internal branch, we do not take this part of the tree, and we remove branches that end into these cases.
`,
RunE: func(cmd *cobra.Command, args []string) (err error) {
var f *os.File
var treefile goio.Closer
var treechan <-chan tree.Trees
var forest []*tree.Tree

if f, err = openWriteFile(outtreefile); err != nil {
io.LogError(err)
return
}
defer closeWriteFile(f, outtreefile)

if treefile, treechan, err = readTrees(intreefile); err != nil {
io.LogError(err)
return
}
defer treefile.Close()

for tr := range treechan {
if tr.Err != nil {
io.LogError(tr.Err)
return tr.Err
}
if forest, err = tr.Tree.CutTreeMinDate(pruneMinDate); err != nil {
io.LogError(err)
return
}
for _, t := range forest {
f.WriteString(t.Newick() + "\n")
}
}

return
},
}

func init() {
pruneCmd.AddCommand(pruneDateCmd)
pruneDateCmd.PersistentFlags().StringVarP(&intreefile, "input", "i", "stdin", "Input tree(s) file")
pruneDateCmd.PersistentFlags().StringVarP(&outtreefile, "output", "o", "stdout", "Forest output file")
pruneDateCmd.PersistentFlags().Float64Var(&pruneMinDate, "min-date", 0, "Minimum date to cut the tree")
}
116 changes: 116 additions & 0 deletions tree/dates.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package tree

import (
"fmt"
"regexp"
"sort"
"strconv"

"github.com/evolbioinfo/gotree/io"
)
Expand All @@ -13,6 +15,47 @@ type LTTData struct {
Y int // Number of lineages
}

// Get Node dates
// Returns a slice of float correspsponding to all node dates (internal and external)
// Node IDs are their index in the slice.
// If one node does not have date or a malformed date, returns an error
func (t *Tree) NodeDates() (ndates []float64, err error) {
var date float64
var pattern *regexp.Regexp
var matches []string

ndates = make([]float64, 0)
pattern = regexp.MustCompile(`(?i)&date="(.+)"`)
nnodes := 0
t.PreOrder(func(cur *Node, prev *Node, e *Edge) (keep bool) {
keep = true
if cur.Id() != nnodes {
err = fmt.Errorf("node id does not correspond to postorder traversal: %d vs %d", cur.Id(), nnodes)
keep = false
} else if len(cur.Comments()) > 0 {
keep = false
for _, c := range cur.Comments() {
matches = pattern.FindStringSubmatch(c)
if len(matches) < 2 {
err = fmt.Errorf("no date found: %s", c)
} else if date, err = strconv.ParseFloat(matches[1], 64); err != nil {
err = fmt.Errorf("one of the node date is malformed: %s", c)
} else {
ndates = append(ndates, date)
err = nil
keep = true
}
}
} else {
err = fmt.Errorf("a node with no date found")
keep = false
}
nnodes += 1
return
})
return
}

// LTTData describes a Lineage to Time data point
func (t *Tree) LTT() (lttdata []LTTData) {
var lttdatadup []LTTData
Expand Down Expand Up @@ -105,3 +148,76 @@ func (t *Tree) RTT() (rttdata []RTTData, err error) {

return
}

// CutTreeMinDate traverses the tree, and only keep subtree starting at the given min date
//
// If a node has the exact same date as mindate: it becomes the root of a new tree
// If a node has a date > mindate and its parent has a date < mindate: a new node is added as a the root of a new tree, with one child, the currrent node.
// The output is a forest
func (t *Tree) CutTreeMinDate(mindate float64) (forest []*Tree, err error) {
var dates []float64
forest = make([]*Tree, 0, 10)
var tmpforest []*Tree

// If the field [&date=] exists, then takes it
// Otherwise, returns an error
if dates, err = t.NodeDates(); err != nil {
io.LogWarning(err)
err = fmt.Errorf("no dates provided in in the tree, of the form &date=")
io.LogWarning(err)
return
}

if tmpforest, err = cutTreeMinDateRecur(t.Root(), nil, nil, mindate, dates); err != nil {
return
}
forest = append(forest, tmpforest...)

return
}

func cutTreeMinDateRecur(cur, prev *Node, e *Edge, mindate float64, dates []float64) (forest []*Tree, err error) {
// We take the branches/nodes >= min-date
var tmptree *Tree
var tmpnode *Node
var tmpedge *Edge
var tmpforest []*Tree

forest = make([]*Tree, 0)
// The current node is at the exact min date: we keep the subtree starting at this node
// And disconnect the current node from its parent
if dates[cur.Id()] == mindate || (prev == nil && dates[cur.Id()] >= mindate) {
tmptree = NewTree()
tmptree.SetRoot(cur)
prev.delNeighbor(cur)
cur.delNeighbor(prev)
tmptree.ReinitIndexes()
forest = append(forest, tmptree)
return
} else if prev != nil && dates[cur.Id()] > mindate && dates[prev.Id()] < mindate {
tmptree = NewTree()
tmpnode = tmptree.NewNode()
tmptree.SetRoot(tmpnode)
prev.delNeighbor(cur)
cur.delNeighbor(prev)
tmpedge = tmptree.ConnectNodes(tmpnode, cur)
tmpnode.AddComment(fmt.Sprintf("&date=\"%f\"", mindate))
tmpedge.SetLength(e.Length() * (dates[cur.Id()] - mindate) / (dates[cur.Id()] - dates[prev.Id()]))
//tmptree.ReinitIndexes()
forest = append(forest, tmptree)
return
}

edges := make([]*Edge, len(cur.Edges()))
copy(edges, cur.Edges())
neigh := make([]*Node, len(cur.neigh))
copy(neigh, cur.neigh)
for i, n := range neigh {
if n != prev {
tmpforest, err = cutTreeMinDateRecur(n, cur, edges[i], mindate, dates)
forest = append(forest, tmpforest...)
}
}

return
}
4 changes: 2 additions & 2 deletions tree/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ func (n *Node) RotateNeighbors() {
// from the current node
func (n *Node) Newick(parent *Node, newick *bytes.Buffer) {
if len(n.neigh) > 0 {
if len(n.neigh) > 1 {
if len(n.neigh) > 1 || parent == nil {
newick.WriteString("(")
}
nbchild := 0
Expand Down Expand Up @@ -268,7 +268,7 @@ func (n *Node) Newick(parent *Node, newick *bytes.Buffer) {
nbchild++
}
}
if len(n.neigh) > 1 {
if len(n.neigh) > 1 || parent == nil {
newick.WriteString(")")
}
}
Expand Down
6 changes: 5 additions & 1 deletion tree/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ func (t *Tree) MeanSupport() float64 {
func (t *Tree) MedianSupport() float64 {
edges := t.Edges()
tips := t.Tips()
supports := make([]float64, len(edges)-len(tips))
nsup := len(edges) - len(tips)
if nsup < 0 {
nsup = 0
}
supports := make([]float64, nsup)
if len(supports) == 0 {
return math.NaN()
}
Expand Down
41 changes: 0 additions & 41 deletions tree/tree.go
Original file line number Diff line number Diff line change
Expand Up @@ -953,47 +953,6 @@ func (t *Tree) computeDepthUnRooted() {
}
}

// Get Node dates
// Returns a slice of float correspsponding to all node dates (internal and external)
// Node IDs are their index in the slice.
// If one node does not have date or a malformed date, returns an error
func (t *Tree) NodeDates() (ndates []float64, err error) {
var date float64
var pattern *regexp.Regexp
var matches []string

ndates = make([]float64, 0)
pattern = regexp.MustCompile(`(?i)&date="(.+)"`)
nnodes := 0
t.PreOrder(func(cur *Node, prev *Node, e *Edge) (keep bool) {
keep = true
if cur.Id() != nnodes {
err = fmt.Errorf("node id does not correspond to postorder traversal: %d vs %d", cur.Id(), nnodes)
keep = false
} else if len(cur.Comments()) > 0 {
keep = false
for _, c := range cur.Comments() {
matches = pattern.FindStringSubmatch(c)
if len(matches) < 2 {
err = fmt.Errorf("no date found: %s", c)
} else if date, err = strconv.ParseFloat(matches[1], 64); err != nil {
err = fmt.Errorf("one of the node date is malformed: %s", c)
} else {
ndates = append(ndates, date)
err = nil
keep = true
}
}
} else {
err = fmt.Errorf("a node with no date found")
keep = false
}
nnodes += 1
return
})
return
}

// Computes distance of all nodes to root / pseudo root.
// Indices of the nodes in the rdists slice correspond to node ids
func (t *Tree) NodeRootDistance() (rdists []float64) {
Expand Down

0 comments on commit 69fcc18

Please sign in to comment.