From 27322be2709eca4bae8e7c29ed462d4bb423969c Mon Sep 17 00:00:00 2001 From: sdgamboa Date: Mon, 25 Sep 2023 13:51:26 -0400 Subject: [PATCH] update tree and phytools vignette --- R/trees.R | 85 ++++++++----- inst/scripts/tax_tree_poly.R | 45 +++++++ man/ltp.Rd | 7 +- vignettes/articles/phytools.Rmd | 208 +++++++++++++++----------------- 4 files changed, 201 insertions(+), 144 deletions(-) create mode 100644 inst/scripts/tax_tree_poly.R diff --git a/R/trees.R b/R/trees.R index 653e165..a4edcf6 100644 --- a/R/trees.R +++ b/R/trees.R @@ -37,51 +37,78 @@ mpa <- function(x = 'tree') { #' #' \code{ltp} gets the LTP tree or data. #' -#' @param x A character string. tree, tips, or nodes. Default is tree. #' @param remove_zero_tips Remove or not tips with zero branch lengths. #' These tips could cause trouble. #' -#' @return A phylo or data.frame (see the `x` param) +#' @return A list with a phylo (the ltp tree) and a data.frame (tip_data) +#' objects. #' @export #' -ltp <- function(x = 'tree', remove_zero_tips = TRUE) { +ltp <- function(remove_zero_tips = TRUE) { tree_fname <- system.file( 'extdata', 'livingTree.newick', package = 'taxPPro' ) + tip_data_fname <- system.file( + 'extdata', 'livingTree_tips.tsv', package = 'taxPPro' + ) tree <- ape::read.tree(tree_fname) + message('Initial number of tips in LTP tree: ', length(tree$tip.label)) + tip_data <- utils::read.table( + file = tip_data_fname, header = TRUE, sep = '\t', row.names = NULL + ) |> + purrr::modify(as.character) |> + as.data.frame() + rownames(tip_data) <- tip_data$tip_label + if (remove_zero_tips) { ntips1 <- length(tree$tip.label) zero_edges <- which(tree$edge.length == 0) nodes_with_zero_edges <- tree$edge[zero_edges, 2] remove_tips <- nodes_with_zero_edges[nodes_with_zero_edges <= length(tree$tip.label)] remove_tips_names <- tree$tip.label[remove_tips] - tree <- drop.tip(tree, remove_tips_names) + tree <- ape::drop.tip(phy = tree, tip = remove_tips_names) ntips2 <- length(tree$tip.label) + tip_data <- tip_data[tree$tip.label,] message('Dropping ', ntips1 - ntips2, ' tips with zero length branches.') - message(ntips2, ' tips remaining in the tree.') - } - if (x == 'tree') { - output <- tree - } else if (x == 'tips') { - fname <- system.file( - 'extdata', 'livingTree_tips.tsv', package = 'taxPPro' - ) - output <- utils::read.table( - file = fname, header = TRUE, sep = '\t', - row.names = NULL - ) |> - purrr::modify(as.character) - rownames(output) <- output$tip_label - output <- output[tree$tip.label,] - } else if (x == 'nodes') { - fname <- system.file( - 'extdata', 'livingTree_nodes.tsv', package = 'taxPPro' - ) - output <- utils::read.table( - file = fname, header = TRUE, sep = '\t', - row.names = NULL - ) |> - purrr::modify(as.character) } - return(output) + + ntips3 <- nrow(tip_data) + tip_data <- tip_data |> + dplyr::group_by(taxid) |> + dplyr::slice_head(n = 1) |> + dplyr::ungroup() |> + as.data.frame() + rownames(tip_data) <- tip_data$tip_label + ntips4 <- nrow(tip_data) + tree <- ape::keep.tip(phy = tree, tip = tip_data$tip_label) + message('Dropping ', ntips3 - ntips4, ' tips because of duplicated taxids.') + tip_data <- tip_data[tree$tip.label,] + message('Tips remaining: ', length(tree$tip.label)) + list( + tree = tree, tip_data = tip_data + ) + # if (x == 'tree') { + # output <- tree + # } else if (x == 'tips') { + # fname <- system.file( + # 'extdata', 'livingTree_tips.tsv', package = 'taxPPro' + # ) + # output <- utils::read.table( + # file = fname, header = TRUE, sep = '\t', + # row.names = NULL + # ) |> + # purrr::modify(as.character) + # rownames(output) <- output$tip_label + # output <- output[tree$tip.label,] + # } else if (x == 'nodes') { + # fname <- system.file( + # 'extdata', 'livingTree_nodes.tsv', package = 'taxPPro' + # ) + # output <- utils::read.table( + # file = fname, header = TRUE, sep = '\t', + # row.names = NULL + # ) |> + # purrr::modify(as.character) + # } + # return(output) } diff --git a/inst/scripts/tax_tree_poly.R b/inst/scripts/tax_tree_poly.R new file mode 100644 index 0000000..9d4778e --- /dev/null +++ b/inst/scripts/tax_tree_poly.R @@ -0,0 +1,45 @@ +phys_data_ready + + + +mat <- phys_data_ready |> + select(NCBI_ID, Attribute, Score) |> + filter(!is.na(Attribute)) |> + # complete(NCB_ID, Attribute, fill = list(Score = 0)) |> + pivot_wider( + names_from = Attribute, values_from = 'Score', values_fill = 0 + ) |> + tibble::column_to_rownames(var = 'NCBI_ID') |> + as.matrix() + + + + +data('tree_list') +data_tree <- as.Node(tree_list) +phylo_tree <- as.phylo.Node(data_tree, heightAttribute = NULL) + +phylo_tree$edge.length <- rep(1, nrow(phylo_tree$edge)) + + +myIndex <- which(!phylo_tree$tip.label %in% rownames(mat)) +no_annotated_tips <- phylo_tree$tip.label[myIndex] + + + +mat2 <- matrix( + data = rep(rep(1/ncol(mat), ncol(mat)), length(no_annotated_tips)), + nrow = length((no_annotated_tips)), + byrow = TRUE, + dimnames = list(rownames = no_annotated_tips, colnames = colnames(mat)) +) + +mat3 <- rbind(mat, mat2) +mat3 <- mat3[phylo_tree$tip.label,] +dim(mat3) + +myFit <- fitMk(tree = phylo_tree, x = mat3, model = 'ER') +myAsr <- ancr(object = myFit) + + + diff --git a/man/ltp.Rd b/man/ltp.Rd index 846c01b..b21e621 100644 --- a/man/ltp.Rd +++ b/man/ltp.Rd @@ -4,16 +4,15 @@ \alias{ltp} \title{Get living tree project (LTP) tree or data} \usage{ -ltp(x = "tree", remove_zero_tips = TRUE) +ltp(remove_zero_tips = TRUE) } \arguments{ -\item{x}{A character string. tree, tips, or nodes. Default is tree.} - \item{remove_zero_tips}{Remove or not tips with zero branch lengths. These tips could cause trouble.} } \value{ -A phylo or data.frame (see the `x` param) +A list with a phylo (the ltp tree) and a data.frame (tip_data) +objects. } \description{ \code{ltp} gets the LTP tree or data. diff --git a/vignettes/articles/phytools.Rmd b/vignettes/articles/phytools.Rmd index dc8a469..bad1afb 100644 --- a/vignettes/articles/phytools.Rmd +++ b/vignettes/articles/phytools.Rmd @@ -133,156 +133,106 @@ phys_data_ready <- left_join(set_with_ids, set_without_ids) |> ) ) |> mutate(taxid = sub('^\\w__', '', NCBI_ID)) +## Split in lists to use with the NCBI taxonoomic tree phys_data_list <- split(phys_data_ready, factor(phys_data_ready$NCBI_ID)) length(phys_data_list) ``` -## Tree (living tree project tree 2023) +## Tree data -Tree as phylo object: +### Taxonomic tree ```{r} -tree <- ltp() -tree +data('tree_list') +ncbi_tree <- as.Node(tree_list) +ncbi_tree_nodes <- unname(ncbi_tree$Get('name')) +print(ncbi_tree, limit = 10) ``` -Tip data: +### Phylogenecti tree ```{r} -tip_data <- ltp('tips') -``` - -About 24% of tips have annotations. Let's use this for ASR: - -```{r} -mean(tip_data$taxid %in% phys_data_ready$taxid) +ltp <- ltp() +tree <- ltp$tree +tree ``` +### Phylogenetic tree (tip data) ```{r} -tip <- left_join(tip_data, phys_data_ready, by = 'taxid') +tip_data <- ltp$tip_data +dim(tip_data) ``` +### Some completeness stats - - - -We need a matrix of probabilites: +Annotations from sources than can be mapped to the NCBI tree: ```{r} -m1 <- tip |> - select(tip_label, Attribute, Score) |> - filter(!is.na(Attribute)) |> - # complete(NCB_ID, Attribute, fill = list(Score = 0)) |> - pivot_wider( - names_from = Attribute, values_from = 'Score', values_fill = 0 - ) |> - tibble::column_to_rownames(var = 'tip_label') |> - as.matrix() - -no_annotated_tips <- tip |> - filter(!tip_label %in% rownames(m1)) |> - pull(tip_label) - -m2 <- matrix( - data = rep(rep(1/ncol(m1), ncol(m1)), length(no_annotated_tips)), - nrow = length((no_annotated_tips)), - byrow = TRUE, - dimnames = list(rownames = no_annotated_tips, colnames = colnames(m1)) -) - -m3 <- rbind(m1, m2) -m3 <- m3[mpa_tree$tip.label,] -dim(m3) +ncbi_tree_nodes_gst <- ncbi_tree_nodes[which(grepl('^[wst]__', ncbi_tree_nodes))] +mean(phys_data_ready$NCBI_ID %in% ncbi_tree_nodes_gst) ``` - - +Proportion of NCBI tree nodes (gst) with annotations: ```{r} -fit <- fitMk( - tree = tre, x = -) +mean(ncbi_tree_nodes_gst %in% phys_data_ready$NCBI_ID) ``` - - - - - - - - - - - - - - -## More taxonomic binning (ASR and Inhertiance) - -This time taxonomic binning and inheritance are done to increase the -proportion of tips with annotations in the phylogenetic tree - -### Map all NCBI IDs to the NCBI taxonomic tree. - -At this point only genus, species, and strain annotations are present. -I think we can safely use the scores based on taxonomic binning. +Annotations from sources than can me mapped to the phylogenetic tree tips: ```{r} -data("tree_list") -data_tree <- as.Node(tree_list) -print(data_tree, limit = 10) +mean(phys_data_ready$taxid %in% tip_data$taxid) ``` -Taxa with annotations in the NCBI taxonomic tree: +Proportion of tips in the phylogenetic tree with annotations: ```{r} -get_all_nodes <- unname(data_tree$Get('name')) -mean(names(phys_data_list) %in% get_all_nodes) +mean(tip_data$taxid %in% phys_data_ready$taxid) ``` -Porcentage of tips in the LTP tree with annotations: - -```{r} -tip_data <- ltp(x = 'tips') |> - modify(as.character) |> - as_tibble() -mean(tip_data$taxid %in% sub('^\\w__', '', phys_data_ready$NCBI_ID)) -``` +## Increase number of tips with annotations -So, only 24% of the tips have annoations. This could be affect the performance -of the ASR. Let's incrase the number of tips with annotations using the -NCBI taxonomy and taxonomic binnnig: +### Map NCBI IDs to the NCBI taxonomic tree -Add taxa annotations to the NCBI taxonomic tree: +At this point only genus, species, and strain annotations are present. ```{r} -data_tree$Do(function(node) { +ncbi_tree$Do(function(node) { if (node$name %in% names(phys_data_list)) { node$attribute_tbl <- phys_data_list[[node$name]] } else { NULL } }) -## An example: -data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$s__562$attribute_tbl +## An example (E. coli): +ncbi_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$s__562$attribute_tbl ``` -The example above clearly shows an example of a tip than can have more -annotations. For this example workflow, let's remove its annotations: + + +```{r} +ncbi_attr_gst <- ncbi_tree$Get( + 'attribute_tbl', filterFun = function(node) grepl('^[gst]__', node$name) +) +mean(!map_lgl(ncbi_attr_gst, ~ all(is.na(.x)))) +``` + + +Removing annotations just to compare with result after propagation ```{r} -data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl <- NULL -data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl +ncbi_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl <- NULL +ncbi_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl ``` -Lest perform some kind of ASR (acutally we could call this taxonomic binning): +Lest pool and normalize attribute scores at each taxonomic rank (gst) for +child taxa ```{r} ## ASR - post-order -asr1 <- function(node) { +taxPool <- function(node) { if (!node$isLeaf) { children_names <- names(node$children) attribute_tbls <- children_names |> @@ -330,16 +280,31 @@ asr1 <- function(node) { } } } -data_tree$Do(asr1, traversal = 'post-order') -data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl +ncbi_tree$Do(taxPool, traversal = 'post-order') +## Same E. coli example: +ncbi_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl +``` + +How many nodes family and above with attribute table? + +```{r} +ncbi_attr_kpcof <- ncbi_tree$Get( + 'attribute_tbl', filterFun = function(node) grepl('^[kpcor]__', node$name) +) +mean(!map_lgl(ncbi_attr_kpcof, ~ all(is.na(.x)))) ``` -From the output above, we can see that the E. coli example now has attribute -values. This won't happen at higer taxonomic ranks (familiy and above): + +How many nodes genus and below with attribute table? ```{r} -data_tree$d__2$p__1224$c__1236$o__91347$f__543$attribute_tbl +ncbi_attr_gst <- ncbi_tree$Get( + 'attribute_tbl', filterFun = function(node) grepl('^[gst]__', node$name) +) +## A slight increase 12 to 15% +mean(!map_lgl(ncbi_attr_gst, ~ all(is.na(.x)))) ``` + Child taxa of E. coli with unobserved annotations can inherit these values directly: @@ -352,30 +317,51 @@ inh1 <- function(node) { return(NULL) if (is.null(node$attribute_tbl) && grepl('^[st]__', node$name)) { df <- node$parent$attribute_tbl + n <- nrow(tbl) df <- df |> dplyr::mutate( + target_scores = rep(1 / n, n), + score_diff = Score - target_scores, + Score = Score - adjF * score_diff, NCBI_ID = node$name, Evidence = 'inh' ) node$attribute_tbl <- df } } -data_tree$Do(inh1, traversal = 'pre-order') +ncbi_tree$Do(inh1, traversal = 'pre-order') +``` + +Proportion of nodes at the family level and above with attributes: + +```{r} +ncbi_attr_kpcof <- ncbi_tree$Get( + 'attribute_tbl', filterFun = function(node) grepl('^[kpcor]__', node$name) +) +mean(!map_lgl(ncbi_attr_kpcof, ~ all(is.na(.x)))) ``` -It seems that about 0.78 percent of the nodes at the genus, species, and strain -level have annotations now: +Proportion of nodes at the genus, species, and strain levels with attributes: ```{r} -new_data_tree_nodes <- data_tree$Get('attribute_tbl', simplify = FALSE) -index <- grepl('^[gst]__', names(new_data_tree_nodes)) -new_data_tree_nodes <- new_data_tree_nodes[index] -lgl_vct <- map_lgl(new_data_tree_nodes, ~ !all(is.na(.x))) -mean(lgl_vct) +ncbi_attr_gst <- ncbi_tree$Get( + 'attribute_tbl', filterFun = function(node) grepl('^[gst]__', node$name) +) +## Major increase 15 to 78% +mean(!map_lgl(ncbi_attr_gst, ~ all(is.na(.x)))) ``` -Now the number of tip annotations has increased considerably -(for this physiology at least): + + + + + + + + + + + ```{r} new_tip_annotations <- new_data_tree_nodes |>