From 507b4ce7df617a5ae854096c9ec17747ed98049b Mon Sep 17 00:00:00 2001 From: sdgamboa Date: Tue, 19 Sep 2023 22:34:40 -0400 Subject: [PATCH] update phyloss rmarkdown --- vignettes/articles/phytools.Rmd | 260 ++++++++++---------------------- 1 file changed, 81 insertions(+), 179 deletions(-) diff --git a/vignettes/articles/phytools.Rmd b/vignettes/articles/phytools.Rmd index 506c148..08930f5 100644 --- a/vignettes/articles/phytools.Rmd +++ b/vignettes/articles/phytools.Rmd @@ -247,28 +247,30 @@ data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl ```{r} inh1 <- function(node) { + ## This one has no penalty + if (node$isRoot) { + return(NULL) + } -} + if (is.null(node$parent$attribute_tbl)) { + return(NULL) + } -data_tree$Do() + if (is.null(node$attribute_tbl) && grepl('^[st]__', node$name)) { + df <- node$parent$attribute_tbl + df <- df |> + dplyr::mutate( + NCBI_ID = node$name, + Evidence = 'inh' + ) + node$attribute_tbl <- df + } +} +data_tree$Do(inh1, traversal = 'pre-order') ``` - - - - - - - - - - - - - - ```{r} new_nodes <- data_tree$Get('attribute_tbl', simplify = FALSE) new_nodes_not_na <- map_lgl(new_nodes, ~ !all(is.na(.x))) @@ -281,10 +283,10 @@ new_data <- new_nodes |> bind_rows() |> arrange(NCBI_ID, Attribute) |> mutate(taxid = sub('^\\w__', '', NCBI_ID)) -## just a slight increase. More useful in datasets with only genus -## and strain information, but no species, I think. mean(mpa_tip_data$taxid %in% new_data$taxid) ``` + + ```{r} tip <- left_join( mpa_tip_data, new_data, by = 'taxid', relationship = 'many-to-many' @@ -327,6 +329,7 @@ m3 <- rbind(m1, m2) m3 <- m3[mpa_tree$tip.label,] dim(m3) ``` + ```{r} system.time( fit <- fitMk.parallel( @@ -339,7 +342,7 @@ system.time( ) ) ## Took close to 1 hour the fitMK -## Took about 4 min with parallel, but I got some NaNs warnings. +## A few seconds or minutes with fitMK.parellel ``` ```{r} @@ -350,10 +353,8 @@ system.time( ``` - - ```{r} -node_data <- mpa('nodes') +node_data <- as_tibble(modify(mpa('nodes'), as.character)) res <- ace$ace rownames(res) <- mpa_tree$node.label res <- res |> @@ -367,194 +368,95 @@ dim(res) ```{r} -select() +## It seems that at least half of the nodes would have new annotations, +## which I think it's ok, since this should be the nodes of higher +## ranks in the taxonomic tree +mean(!res$taxid %in% new_data$taxid) ``` - - - - - - - - - - - - - - ```{r} -data('tree_list') -data_tree <- as.Node(tree_list) -data_tree$Do(function(node) { - node$name <- sub('\\w__', '', node$name) -}) - -data_tree_node_names <- unname(data_tree$Get('name')) -mean(res$taxid %in% data_tree_node_names) +data_for_inh2 <- filter(res, !taxid %in% new_data$taxid) +dim(data_for_inh2) ``` ```{r} -l <- split(res, factor(res$taxid)) -l <- map(l, ~ { - .x |> - select( - aerobic, anaerobic, `facultatively anaerobic`, `facultative aerobe` +input_for_inh2 <- data_for_inh2 |> + select(-ends_with('_taxid'), -node_label, -node, -n_labels) |> + relocate(NCBI_ID = taxid) |> + pivot_longer( + names_to = 'Attribute', values_to = 'Score', cols = 2:last_col() ) |> - pivot_longer( - names_to = 'attribute', values_to = 'score', - cols = c('aerobic', 'anaerobic', 'facultatively anaerobic', 'facultative aerobe') - ) -}) + complete(Attribute, Score, fill = list(Score = 0)) |> + relocate(NCBI_ID) |> + arrange(NCBI_ID, Attribute) |> + mutate(Evidence = 'asr2') |> + {\(y) split(y, factor(y$NCBI_ID))}() +length(input_for_inh2) ``` +Add more nodes + ```{r} data_tree$Do(function(node) { - if (node$name %in% names(l)) { - node$attribute_tbl <- l[[node$name]] - } else { - NULL + cond1 <- is.null(node$attribute_tbl) + taxid <- sub('^\\w__', '', node$name) + cond2 <- taxid %in% names(input_for_inh2) + if (cond1 && cond2) { + # message('Adding ', node$name, ' to the tree') + node$attribute_tbl <- input_for_inh2[[taxid]] } }) - +data_tree$d__2$attribute_tbl ``` - +Now, let's proceed to inheritance (this time let's add a penalty) ```{r} -plot(ace, args.plotTree = list(direction = "upwards")) -# tips <- sapply(labs, function(x, y) which(y == x), y = tree$tip.label) -# add.arrow(tree, tips, arrl = 3, offset = 2, lwd = 2, col = palette()[4]) - -``` - - - - - - - - - - - - -I need a matrix of prior probabilites - -```{r} -prior_prob_mat <- phys_data_ready |> - select(NCBI_ID, Attribute, Score) |> - pivot_wider( - names_from = Attribute, values_from = Score, values_fill = 0 - ) |> - tibble::column_to_rownames(var = 'NCBI_ID') |> - as.matrix() -sum(rowSums(prior_prob_mat)) == nrow(prior_prob_mat) +inh2 <- function(node, adjF = 0.1) { + cond1 <- !node$isRoot # don't apply this to the root + cond2 <- is.null(node$attribute_tbl) # attribute_tbl must be empty + cond3 <- !is.null(node$parent$attribute_tbl) # parent must have data + if (cond1 && cond2 && cond3) { + # message('Adding ', node$name, '.') + tbl <- node$parent$attribute_tbl + n <- nrow(tbl) + res <- tbl |> + mutate( + target_scores = rep(1 / n, n), + score_diff = Score - target_scores, + Score = Score - adjF * score_diff, + NCBI_ID = node$name, + Evidence = 'inh2' + ) |> + select(-target_scores, -score_diff) |> + relocate(NCBI_ID) + node$attribute_tbl <- res + } +} +data_tree$Do(inh2, traversal = 'pre-order') ``` -Import phylogenetic tree (living tree project): - ```{r} -# fname <- system.file( -# 'extdata', 'livingTree.newick', package = 'taxPPro', mustWork = TRUE -# ) -# tree <- read.tree(fname) -# tree -data('tree_list') -data_tree <- as.Node(tree_list) -tree <- as.phylo.Node(data_tree, heightAttribute = NULL) -tree <- multi2di(tree) -tree <- collapse.singles(tree) -tree$tip.label <- sub('\\w__', '', tree$tip.label) -tree$node.label <- sub('\\w__', '', tree$node.label) -tree +data_tree$d__2$p__1224$c__28211$o__356$f__335928$attribute_tbl ``` +Get all the tables ```{r} -is.binary(tree) -``` - - -Add internal nodes (if any): - -```{r add internal nodes} -temp <- tree -temp$edge.length[which(temp$edge.length == 0)] <- head(sort(unique(temp$edge.length)))[2] -node_labels <- unique(temp$node.label) -node_labels <- node_labels[node_labels != ''] -node_labels <- unlist(strsplit(node_labels, '\\+')) -mat <- prior_prob_mat[which(rownames(prior_prob_mat) %in% node_labels), , drop = TRUE] -nodes <- rownames(mat) -system.time( -for(i in seq_along(nodes)){ - regex <- paste0('\\b', nodes[i], '\\b') - tipsAndNodes <- c(temp$tip.label, temp$node.label) - pos <- grep(regex, tipsAndNodes) - if (!length(pos)) - next - temp <- bind.tip( - tree = temp, tip.label = nodes[i], edge.length = 0, where = pos +output <- data_tree$Get( + attribute = 'attribute_tbl', simplify = FALSE, + filterFun = function(node) node$name != 'ArcBac' ) -} -) +output <- bind_rows(output) +head(output) ``` -Let's fit a model of evolution - -```{r fit model} -missing_tips <- temp$tip.label[!(temp$tip.label %in% rownames(mat))] -values <- rep(1/ncol(mat), ncol(mat)) -mat2 <- matrix(rep(values, length(missing_tips)), ncol = ncol(mat), byrow = TRUE) -rownames(mat2) <- missing_tips -colnames(mat2) <- colnames(mat) -mat3 <- rbind(mat, mat2) -mat3 <- mat3[temp$tip.label,] -fit <- fitMk( - # extended Mk (hidden markov) model (used for discrete traits) - tree = temp, - x = mat3, - model = 'ARD', # all rates different - pi = "fitzjohn", # pror distribution at the root this seems to have a significant effect in the root - logscale = TRUE, - lik.func = "pruning", -) -``` - -Load data: +## Session information ```{r} -data("primate.tree") -data("primate.data") -tree <- primate.tree -data <- primate.data -rm(primate.tree) -rm(primate.data) -activity <- data$Activity_pattern -names(activity) <- rownames(data) -m <- to.matrix(activity, levels(activity)) -m[,] <- rep(1/ncol(m), ncol(m)) -labs <- rownames(m)[which(grepl('Galago', rownames(m)))] -for (i in seq_along(labs)) { - m[labs[i],] <- c(0, 0, 1) -} -fit <- fitMk( - tree = tree, x = m, model = 'ARD', pi = "fitzjohn", logscale = TRUE, - lik.func = "pruning" -) -ace <- ancr(fit, tips = TRUE) -res <- ace$ace - - +sessioninfo::session_info() ``` - -```{r} -plot(ace, args.plotTree = list(direction = "upwards")) -tips <- sapply(labs, function(x, y) which(y == x), y = tree$tip.label) -add.arrow(tree, tips, arrl = 3, offset = 2, lwd = 2, col = palette()[4]) -```