diff --git a/inst/scripts/phagorn.R b/inst/scripts/phagorn.R index e1217ca..c5ff8e8 100644 --- a/inst/scripts/phagorn.R +++ b/inst/scripts/phagorn.R @@ -1,5 +1,8 @@ +library(phangorn) x <- m3 + + x[x > 0] <- 1 x <- x[,sort(colnames(x))] rownames(x) <- NULL @@ -13,6 +16,7 @@ for (i in seq_along(output)) { } rownames(y) <- output + trait_mat <- m3[, sort(colnames(m3))] trait_mat[trait_mat > 0] <- 1 trait_v <- vector('character', nrow(trait_mat)) @@ -46,10 +50,29 @@ system.time( { ## This one takes too long fit <- pml(mpa_tree, myData) - fit <- optim.pml(fit, model = NULL, control = pml.control(trace=0)) + # fit <- optim.pml(fit, model = NULL, control = pml.control(trace=0)) } ) +system.time( + anc.ml <- ancestral.pml(fit, "ml") +) + + +system.time( + anc.ml <- ancestral.pml(fit, "ml") +) + +system.time( + anc.bayes <- ancestral.pml(fit, "bayes") +) + + + + + + + anc_pars_data <- vector('list', length(anc_pars)) for (i in seq_along(anc_pars_data)) { anc_pars_data[[i]] <- as.data.frame(unname(anc_pars[[i]])) diff --git a/vignettes/articles/ape.Rmd b/vignettes/articles/ape.Rmd new file mode 100644 index 0000000..bc08d7c --- /dev/null +++ b/vignettes/articles/ape.Rmd @@ -0,0 +1,463 @@ +--- +title: "A propagation workflow" +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup, message=FALSE} +library(bugphyzz) +library(taxPPro) +library(data.tree) +library(phytools) +library(dplyr) +library(purrr) +library(tidyr) +``` + +Import physiology data (one physiology - aerophilicity): + +```{r import physiology, message=FALSE} +phys <- physiologies('aerophilicity') +``` + +Filter data: + +```{r filter data} +select_cols <- c( + ## Columns with information needed for propagation + 'NCBI_ID', 'Taxon_name', 'Attribute', 'Attribute_source', + 'Frequency', 'Score', 'Parent_NCBI_ID', 'Confidence_in_curation', + 'Rank', 'Evidence', + + ## Columns only used for controlling the behavior of the code + 'Attribute_type', 'Attribute_group' +) +valid_ranks <- c('genus', 'species', 'strain') +phys_data <- phys[[1]] |> + as_tibble() |> + filter(Attribute_value == TRUE) |> # only for discrete traits with multiple states + filter(!is.na(Rank), Rank %in% valid_ranks) |> + filter( + ## Filter rows with NCBI_ID. + ## If NCBI_ID is missing, Parent_NCBI_ID must be present + !((is.na(NCBI_ID) | NCBI_ID == 'unknown') & is.na(Parent_NCBI_ID)) + ) |> + filter(!is.na('Attribute_source'), !is.na('Frequency')) |> + mutate( + Score = case_when( + Frequency == 'always' ~ 1, + Frequency == 'usually' ~ 0.9, + Frequency == 'sometimes' ~ 0.5, + Frequency == 'unknown' ~ 0.1 ## arbitrary value + ) + ) |> + select(all_of(select_cols)) |> + distinct() +n_dropped_rows <- nrow(phys[[1]]) - nrow(phys_data) +message(format(n_dropped_rows, big.mark = ','), ' rows were dropped.') +``` + +#> TODO - add code for dealing with duplicates, etc. + +The phylogenetic and taxonomic trees used for propagation with +ASR and inheritance have NCBI IDs at the node and tip labels. +Therefore, only annotations with NCBI IDs can be mapped for propagation +(ASR and Inheritance). + +The annotations of taxa without NCBI ID (usually strains) can be used to infer +the annotations of their parents (with NCBI ID and usually species) through +an 'early' version of ASR (before applying a formal ASR method). + +This early version of ASR is just the normalization of scores among the +child nodes of a parent node. + +```{r divide data in sets 1 and 2} +lgl_vct <- is.na(phys_data$NCBI_ID) | phys_data$NCBI_ID == 'unknown' +set_with_ids <- phys_data |> + filter(!lgl_vct) |> + ## I need to do this here (when rows with NCBI IDs have been + ## separated from rows without NCBI_IDs). Maybe I can do this before + ## separated with mutate_if or other code. + group_by(NCBI_ID) |> + mutate(Taxon_name = paste0(sort(unique(Taxon_name)), collapse = '|')) |> + ungroup() |> + distinct() |> + group_by(NCBI_ID, Attribute) |> + slice_head(n = 1) |> + ungroup() |> + group_by(NCBI_ID) |> + mutate( + Total_score = sum(Score), + Score = Score / Total_score + ) |> + ungroup() |> + distinct() +dim(set_with_ids) # these data don't need to go through early ASR +``` + +```{r set2 for asr} +## this will be used for early ASR +set_without_ids <- phys_data |> + filter(lgl_vct) |> + select(-NCBI_ID, -Taxon_name, -Frequency) |> + relocate(NCBI_ID = Parent_NCBI_ID) |> + distinct() + +if (nrow(set_with_ids) > 0) { + set_without_ids <- set_without_ids |> + filter(!NCBI_ID %in% unique(set_with_ids$NCBI_ID)) +} +## add some code here in case all of the NCBI_IDs were alredy present in +## the original data +dim(set_without_ids) +``` + +Let's just add up the scores + +#> set_without_ids |> filter(NCBI_ID %in% set_without_ids$NCBI_ID[which(duplicated(set_without_ids$NCBI_ID))]) +#> x |> filter(NCBI_ID %in% x$NCBI_ID[which(duplicated(x$NCBI_ID))]) + +```{r} +## 55565 +set_without_ids <- set_without_ids |> + group_by(NCBI_ID, Attribute) |> + slice_head(n = 1) |> + ungroup() |> + group_by(NCBI_ID) |> + mutate( + Total_score = sum(Score), + Score = Score / Total_score + ) |> + ungroup() |> + distinct() +phys_data_ready <- left_join(set_with_ids, set_without_ids) |> + select(-Total_score, -Parent_NCBI_ID) |> + mutate( + NCBI_ID = case_when( + Rank == 'strain' ~ paste0('t__', NCBI_ID), + Rank == 'species' ~ paste0('s__', NCBI_ID), + Rank == 'genus' ~ paste0('g__', NCBI_ID) + ) +) +phys_data_list <- split(phys_data_ready, factor(phys_data_ready$NCBI_ID)) +length(phys_data_list) +``` + +## First part of ASR/INH + +Map all nodes to the data_tree file + +```{r} +data("tree_list") +data_tree <- as.Node(tree_list) +print(data_tree, limit = 10) +``` + +How many tips/nodes included in the NCBI taxonomy + +```{r} +get_all_nodes <- unname(data_tree$Get('name')) +mean(names(phys_data_list) %in% get_all_nodes) +``` + +How many tips included in the mpa tree + +```{r} +mpa_tip_data <- mpa(x = 'tips') |> + modify(as.character) |> + as_tibble() +mean(mpa_tip_data$taxid %in% sub('^\\w__', '', phys_data_ready$NCBI_ID)) +``` +Add nodes to the tree + +```{r} +data_tree$Do(function(node) { + if (node$name %in% names(phys_data_list)) { + node$attribute_tbl <- phys_data_list[[node$name]] + } else { + NULL + } +}) +data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$s__562$attribute_tbl +``` + + +```{r} +data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl <- NULL +data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl +``` + +ASR - post-order + +```{r} +asr1 <- function(node) { + if (!node$isLeaf) { + children_names <- names(node$children) + attribute_tbls <- children_names |> + purrr::map(~ node[[.x]]$attribute_tbl) |> + purrr::discard(is.null) + not_all_children_tbls_are_null <- length(attribute_tbls) > 0 + node_attribute_tbl_is_null <- is.null(node$table) + node_is_gst <- grepl('^[gst]__', node$name) + conds <- node_attribute_tbl_is_null & + not_all_children_tbls_are_null & + node_is_gst + if (conds) { + res_tbl <- purrr::discard(attribute_tbls, is.null) |> + dplyr::bind_rows() |> + dplyr::select( + NCBI_ID, Attribute, Score, + Attribute_type, Attribute_group + ) |> + dplyr::mutate( + NCBI_ID = node$name, + Evidence = 'asr1' + ) |> + group_by(NCBI_ID) |> + mutate( + Total_score = sum(Score), + Score = Score / Total_score + ) |> + ungroup() |> + select(-Total_score) |> + group_by(NCBI_ID, Attribute) |> + mutate(Score = sum(Score)) |> + ungroup() |> + distinct() |> + mutate( + Frequency = case_when( + Score == 1 ~ 'always', + Score > 0.9 ~ 'usually', + Score >= 0.5 ~ 'sometimes', + Score > 0 & Score < 0.5 ~ 'rarely', + Score == 0 ~ 'never' + ) + ) |> + distinct() + node[['attribute_tbl']] <- res_tbl + } + } +} +data_tree$Do(asr1, traversal = 'post-order') +data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl +``` + +```{r} +inh1 <- function(node) { + ## This one has no penalty + if (node$isRoot) { + return(NULL) + } + + + if (is.null(node$parent$attribute_tbl)) { + return(NULL) + } + + if (is.null(node$attribute_tbl) && grepl('^[st]__', node$name)) { + df <- node$parent$attribute_tbl + df <- df |> + dplyr::mutate( + NCBI_ID = node$name, + Evidence = 'inh' + ) + node$attribute_tbl <- df + } +} +data_tree$Do(inh1, traversal = 'pre-order') +``` + + +```{r} +new_nodes <- data_tree$Get('attribute_tbl', simplify = FALSE) +new_nodes_not_na <- map_lgl(new_nodes, ~ !all(is.na(.x))) +mean(new_nodes_not_na) +``` + +```{r} +new_data <- new_nodes |> + discard(~ all(is.na(.x))) |> + bind_rows() |> + arrange(NCBI_ID, Attribute) |> + mutate(taxid = sub('^\\w__', '', NCBI_ID)) +mean(mpa_tip_data$taxid %in% new_data$taxid) +``` + + +```{r} +tip <- left_join( + mpa_tip_data, new_data, by = 'taxid', relationship = 'many-to-many' +) +tip |> + filter(!is.na(Attribute)) |> + dim() +``` + + +```{r} +table(tip$Evidence) +``` + + +```{r} +mpa_tree <- mpa() +m1 <- tip |> + select(tip_label, Attribute, Score) |> + filter(!is.na(Attribute)) |> + # complete(NCB_ID, Attribute, fill = list(Score = 0)) |> + pivot_wider( + names_from = Attribute, values_from = 'Score', values_fill = 0 + ) |> + tibble::column_to_rownames(var = 'tip_label') |> + as.matrix() + +no_annotated_tips <- tip |> + filter(!tip_label %in% rownames(m1)) |> + pull(tip_label) + +m2 <- matrix( + data = rep(rep(1/ncol(m1), ncol(m1)), length(no_annotated_tips)), + nrow = length((no_annotated_tips)), + byrow = TRUE, + dimnames = list(rownames = no_annotated_tips, colnames = colnames(m1)) +) + +m3 <- rbind(m1, m2) +m3 <- m3[mpa_tree$tip.label,] +dim(m3) +``` + +```{r} +system.time( + fit <- fitMk.parallel( + tree = mpa_tree, + x = m3, + model = 'ER', + pi = "fitzjohn", + logscale = TRUE, + lik.func = "pruning" + ) +) +## A few seconds or minutes with fitMK.parellel +``` + +```{r} +system.time( + # ace <- ancr(fit, type = 'marginal', tips = TRUE) + ace <- ancr(fit, type = 'marginal') # don't include tips +) +``` + + +```{r} +node_data <- as_tibble(modify(mpa('nodes'), as.character)) +res <- ace$ace +rownames(res) <- mpa_tree$node.label +res <- res |> + as.data.frame() |> + tibble::rownames_to_column(var = 'node_label') |> + left_join(node_data, by = 'node_label') |> + as_tibble() |> + filter(!is.na(taxid)) +dim(res) +``` + + +```{r} +## It seems that at least half of the nodes would have new annotations, +## which I think it's ok, since this should be the nodes of higher +## ranks in the taxonomic tree +mean(!res$taxid %in% new_data$taxid) +``` + + +```{r} +data_for_inh2 <- filter(res, !taxid %in% new_data$taxid) +dim(data_for_inh2) +``` + + +```{r} +input_for_inh2 <- data_for_inh2 |> + select(-ends_with('_taxid'), -node_label, -node, -n_labels) |> + relocate(NCBI_ID = taxid) |> + pivot_longer( + names_to = 'Attribute', values_to = 'Score', cols = 2:last_col() + ) |> + complete(Attribute, Score, fill = list(Score = 0)) |> + relocate(NCBI_ID) |> + arrange(NCBI_ID, Attribute) |> + mutate(Evidence = 'asr2') |> + {\(y) split(y, factor(y$NCBI_ID))}() +length(input_for_inh2) +``` + +Add more nodes + +```{r} +data_tree$Do(function(node) { + cond1 <- is.null(node$attribute_tbl) + taxid <- sub('^\\w__', '', node$name) + cond2 <- taxid %in% names(input_for_inh2) + if (cond1 && cond2) { + # message('Adding ', node$name, ' to the tree') + node$attribute_tbl <- input_for_inh2[[taxid]] + } +}) +data_tree$d__2$attribute_tbl +``` + +Now, let's proceed to inheritance (this time let's add a penalty) + +```{r} +inh2 <- function(node, adjF = 0.1) { + cond1 <- !node$isRoot # don't apply this to the root + cond2 <- is.null(node$attribute_tbl) # attribute_tbl must be empty + cond3 <- !is.null(node$parent$attribute_tbl) # parent must have data + if (cond1 && cond2 && cond3) { + # message('Adding ', node$name, '.') + tbl <- node$parent$attribute_tbl + n <- nrow(tbl) + res <- tbl |> + mutate( + target_scores = rep(1 / n, n), + score_diff = Score - target_scores, + Score = Score - adjF * score_diff, + NCBI_ID = node$name, + Evidence = 'inh2' + ) |> + select(-target_scores, -score_diff) |> + relocate(NCBI_ID) + node$attribute_tbl <- res + } +} +data_tree$Do(inh2, traversal = 'pre-order') +``` + + +```{r} +data_tree$d__2$p__1224$c__28211$o__356$f__335928$attribute_tbl +``` + +Get all the tables + +```{r} +output <- data_tree$Get( + attribute = 'attribute_tbl', simplify = FALSE, + filterFun = function(node) node$name != 'ArcBac' + ) +output <- bind_rows(output) +head(output) +``` + +## Session information + +```{r} +sessioninfo::session_info() +``` + diff --git a/vignettes/articles/phangorn.Rmd b/vignettes/articles/phangorn.Rmd new file mode 100644 index 0000000..80ba32c --- /dev/null +++ b/vignettes/articles/phangorn.Rmd @@ -0,0 +1,516 @@ +--- +title: "A propagation workflow, using phangorn for ASR of discrete data" +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + + +```{r setup, message=FALSE} +library(bugphyzz) +library(taxPPro) +library(data.tree) +library(ape) +library(phangorn) +library(dplyr) +library(purrr) +library(tidyr) +``` + +Import physiological data (e.g. aerophilicity): + +```{r import physiology, message=FALSE} +phys <- physiologies('aerophilicity') +``` + +Filter data: + +```{r filter data} +attr_values_tsv <- system.file('extdata', 'attributes.tsv', package = 'bugphyzz') +attr_values <- read.table( + file = attr_values_tsv, header = TRUE, sep = '\t' +) +valid_attr_values <- attr_values |> + filter(attribute_group == 'aerophilicity') |> + pull(attribute) |> + unique() +select_cols <- c( + ## Columns with information needed for propagation + 'NCBI_ID', 'Taxon_name', 'Attribute', 'Attribute_source', + 'Frequency', 'Score', 'Parent_NCBI_ID', 'Confidence_in_curation', + 'Rank', 'Evidence', + + ## Columns only used for controlling the behavior of the code + 'Attribute_type', 'Attribute_group' +) +valid_ranks <- c('genus', 'species', 'strain') +phys_data <- phys[[1]] |> + as_tibble() |> + filter(Attribute %in% valid_attr_values) |> + filter(Attribute_value == TRUE) |> # only for discrete traits with multiple states + filter(!is.na(Rank), Rank %in% valid_ranks) |> + filter( + ## Filter rows with NCBI_ID. + ## If NCBI_ID is missing, Parent_NCBI_ID must be present + !((is.na(NCBI_ID) | NCBI_ID == 'unknown') & is.na(Parent_NCBI_ID)) + ) |> + filter(!is.na('Attribute_source'), !is.na('Frequency')) |> + mutate( + Score = case_when( + Frequency == 'always' ~ 1, + Frequency == 'usually' ~ 0.9, + Frequency == 'sometimes' ~ 0.5, + Frequency == 'unknown' ~ 0.1 ## arbitrary value + ) + ) |> + select(all_of(select_cols)) |> + distinct() +n_dropped_rows <- nrow(phys[[1]]) - nrow(phys_data) +message(format(n_dropped_rows, big.mark = ','), ' rows were dropped.') +``` + +We need taxids for propagation and to hoomogenize taxonomic data. If +taxid is not available, then the Parent NCBI_ID will be used for ASR. +Taxa without taxids will not be used. + +```{r divide data in sets 1 and 2} +lgl_vct <- is.na(phys_data$NCBI_ID) | phys_data$NCBI_ID == 'unknown' +set_with_ids <- phys_data |> + filter(!lgl_vct) |> + group_by(NCBI_ID) |> + mutate(Taxon_name = paste0(sort(unique(Taxon_name)), collapse = '|')) |> + ungroup() |> + distinct() |> + group_by(NCBI_ID, Attribute) |> + slice_head(n = 1) |> + ungroup() |> + group_by(NCBI_ID) |> + mutate( + Total_score = sum(Score), + Score = Score / Total_score + ) |> + ungroup() |> + distinct() +dim(set_with_ids) +``` + +```{r set2 for asr} +set_without_ids <- phys_data |> + filter(lgl_vct) |> + select(-NCBI_ID, -Taxon_name, -Frequency) |> + relocate(NCBI_ID = Parent_NCBI_ID) |> + distinct() +if (nrow(set_with_ids) > 0) { + set_without_ids <- set_without_ids |> + filter(!NCBI_ID %in% unique(set_with_ids$NCBI_ID)) +} +dim(set_without_ids) +``` + + + +```{r} +## 55565 +set_without_ids <- set_without_ids |> + group_by(NCBI_ID, Attribute) |> + slice_head(n = 1) |> + ungroup() |> + group_by(NCBI_ID) |> + mutate( + Total_score = sum(Score), + Score = Score / Total_score + ) |> + ungroup() |> + distinct() + +phys_data_ready <- left_join(set_with_ids, set_without_ids) |> + select(-Total_score, -Parent_NCBI_ID) |> + mutate( + NCBI_ID = case_when( + Rank == 'strain' ~ paste0('t__', NCBI_ID), + Rank == 'species' ~ paste0('s__', NCBI_ID), + Rank == 'genus' ~ paste0('g__', NCBI_ID) + ) +) +phys_data_list <- split(phys_data_ready, factor(phys_data_ready$NCBI_ID)) +length(phys_data_list) +``` + +## First part of ASR/INH + +Map all nodes to the data_tree file + +```{r import data_tree} +data("tree_list") +data_tree <- as.Node(tree_list) +print(data_tree, limit = 10) +``` + +```{r} +get_all_nodes <- unname(data_tree$Get('name')) +mean(names(phys_data_list) %in% get_all_nodes) +``` + +```{r} +tip_data <- ltp(x = 'tips') |> + modify(as.character) |> + as_tibble() +mean(tip_data$taxid %in% sub('^\\w__', '', phys_data_ready$NCBI_ID)) +``` + +Add nodes to the tree + +```{r} +data_tree$Do(function(node) { + if (node$name %in% names(phys_data_list)) { + node$attribute_tbl <- phys_data_list[[node$name]] + } else { + NULL + } +}) +data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$s__562$attribute_tbl +``` + + +```{r} +data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl <- NULL +data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl +``` + +ASR - post-order + +```{r} +asr1 <- function(node) { + if (!node$isLeaf) { + children_names <- names(node$children) + attribute_tbls <- children_names |> + purrr::map(~ node[[.x]]$attribute_tbl) |> + purrr::discard(is.null) + not_all_children_tbls_are_null <- length(attribute_tbls) > 0 + node_attribute_tbl_is_null <- is.null(node$table) + node_is_gst <- grepl('^[gst]__', node$name) + conds <- node_attribute_tbl_is_null & + not_all_children_tbls_are_null & + node_is_gst + if (conds) { + res_tbl <- purrr::discard(attribute_tbls, is.null) |> + dplyr::bind_rows() |> + dplyr::select( + NCBI_ID, Attribute, Score, + Attribute_type, Attribute_group + ) |> + dplyr::mutate( + NCBI_ID = node$name, + Evidence = 'asr1' + ) |> + group_by(NCBI_ID) |> + mutate( + Total_score = sum(Score), + Score = Score / Total_score + ) |> + ungroup() |> + select(-Total_score) |> + group_by(NCBI_ID, Attribute) |> + mutate(Score = sum(Score)) |> + ungroup() |> + distinct() |> + mutate( + Frequency = case_when( + Score == 1 ~ 'always', + Score > 0.9 ~ 'usually', + Score >= 0.5 ~ 'sometimes', + Score > 0 & Score < 0.5 ~ 'rarely', + Score == 0 ~ 'never' + ) + ) |> + distinct() + node[['attribute_tbl']] <- res_tbl + } + } +} +data_tree$Do(asr1, traversal = 'post-order') +data_tree$d__2$p__1224$c__1236$o__91347$f__543$g__561$attribute_tbl +``` + +```{r} +inh1 <- function(node) { + if (node$isRoot) + return(NULL) + if (is.null(node$parent$attribute_tbl)) + return(NULL) + if (is.null(node$attribute_tbl) && grepl('^[st]__', node$name)) { + df <- node$parent$attribute_tbl + df <- df |> + dplyr::mutate( + NCBI_ID = node$name, + Evidence = 'inh' + ) + node$attribute_tbl <- df + } +} +data_tree$Do(inh1, traversal = 'pre-order') +``` + + +```{r} +new_nodes <- data_tree$Get('attribute_tbl', simplify = FALSE) +new_nodes_not_na <- map_lgl(new_nodes, ~ !all(is.na(.x))) +mean(new_nodes_not_na) +``` + +```{r} +new_data <- new_nodes |> + discard(~ all(is.na(.x))) |> + bind_rows() |> + arrange(NCBI_ID, Attribute) |> + mutate(taxid = sub('^\\w__', '', NCBI_ID)) +mean(tip_data$taxid %in% new_data$taxid) +``` + + +```{r} +tip <- left_join( + tip_data, new_data, by = 'taxid', relationship = 'many-to-many' +) +tip |> + filter(!is.na(Attribute)) |> + dim() +``` + + +```{r} +table(tip$Evidence) +``` + + +```{r} +tree <- ltp() +m1 <- tip |> + select(tip_label, Attribute, Score) |> + filter(!is.na(Attribute)) |> + # complete(NCB_ID, Attribute, fill = list(Score = 0)) |> + pivot_wider( + names_from = Attribute, values_from = 'Score', values_fill = 0 + ) |> + tibble::column_to_rownames(var = 'tip_label') |> + as.matrix() + +no_annotated_tips <- tip |> + filter(!tip_label %in% rownames(m1)) |> + pull(tip_label) + +m2 <- matrix( + data = rep(rep(1/ncol(m1), ncol(m1)), length(no_annotated_tips)), + nrow = length((no_annotated_tips)), + byrow = TRUE, + dimnames = list(rownames = no_annotated_tips, colnames = colnames(m1)) +) + +m3 <- rbind(m1, m2) +m3 <- m3[tree$tip.label,] +dim(m3) +``` + + +Let's better use a pruned version of the tree + +```{r} +pruned_tree <- keep.tip(phy = tree, tip = rownames(m1)) +pruned_tree$node.label <- NULL # Nodes might have been lost +m1 <- m1[pruned_tree$tip.label,] +dim(m1) +``` + + +```{r} +x <- m1 +x[x > 0] <- 1 +x <- x[,sort(colnames(x))] +# rownames(x) <- NULL +contrast <- as.matrix(unique(as.data.frame(x))) +contrast_row_names <- vector('character', nrow(contrast)) +for (i in seq_along(contrast_row_names)) { + pos <- which(contrast[i,] == 1) + attr_names <- colnames(contrast)[pos] + contrast_row_names[[i]] <- paste0(attr_names, collapse = ':') +} +rownames(contrast) <- contrast_row_names +contrast +``` + + +```{r} +v <- vector('character', nrow(x)) +for (i in seq_along(v)) { + pos <- which(x[i,] == 1) + attr_names <- colnames(x)[pos] + v[i] <- paste0(attr_names, collapse = ':') +} +traits <- matrix(data = v, ncol = 1) +colnames(traits) <- 'trait' +rownames(traits) <- rownames(x) +all(rownames(contrast) %in% unique(as.character(traits))) +``` + +Create a phyData + +```{r} +phyData <- phyDat( + traits, type = "USER", + levels = rownames(contrast), + contrast = contrast +) +phyData +``` + +```{r} +system.time({ + fit <- pml(pruned_tree, phyData) + asr_output <- ancestral.pml(fit, 'ml') +}) +``` + + + + + + + + + + + + +```{r} +system.time( + fit <- fitMk.parallel( + tree = mpa_tree, + x = m3, + model = 'ER', + pi = "fitzjohn", + logscale = TRUE, + lik.func = "pruning" + ) +) +## A few seconds or minutes with fitMK.parellel +``` + +```{r} +system.time( + # ace <- ancr(fit, type = 'marginal', tips = TRUE) + ace <- ancr(fit, type = 'marginal') # don't include tips +) +``` + + +```{r} +node_data <- as_tibble(modify(mpa('nodes'), as.character)) +res <- ace$ace +rownames(res) <- mpa_tree$node.label +res <- res |> + as.data.frame() |> + tibble::rownames_to_column(var = 'node_label') |> + left_join(node_data, by = 'node_label') |> + as_tibble() |> + filter(!is.na(taxid)) +dim(res) +``` + + +```{r} +## It seems that at least half of the nodes would have new annotations, +## which I think it's ok, since this should be the nodes of higher +## ranks in the taxonomic tree +mean(!res$taxid %in% new_data$taxid) +``` + + +```{r} +data_for_inh2 <- filter(res, !taxid %in% new_data$taxid) +dim(data_for_inh2) +``` + + +```{r} +input_for_inh2 <- data_for_inh2 |> + select(-ends_with('_taxid'), -node_label, -node, -n_labels) |> + relocate(NCBI_ID = taxid) |> + pivot_longer( + names_to = 'Attribute', values_to = 'Score', cols = 2:last_col() + ) |> + complete(Attribute, Score, fill = list(Score = 0)) |> + relocate(NCBI_ID) |> + arrange(NCBI_ID, Attribute) |> + mutate(Evidence = 'asr2') |> + {\(y) split(y, factor(y$NCBI_ID))}() +length(input_for_inh2) +``` + +Add more nodes + +```{r} +data_tree$Do(function(node) { + cond1 <- is.null(node$attribute_tbl) + taxid <- sub('^\\w__', '', node$name) + cond2 <- taxid %in% names(input_for_inh2) + if (cond1 && cond2) { + # message('Adding ', node$name, ' to the tree') + node$attribute_tbl <- input_for_inh2[[taxid]] + } +}) +data_tree$d__2$attribute_tbl +``` + +Now, let's proceed to inheritance (this time let's add a penalty) + +```{r} +inh2 <- function(node, adjF = 0.1) { + cond1 <- !node$isRoot # don't apply this to the root + cond2 <- is.null(node$attribute_tbl) # attribute_tbl must be empty + cond3 <- !is.null(node$parent$attribute_tbl) # parent must have data + if (cond1 && cond2 && cond3) { + # message('Adding ', node$name, '.') + tbl <- node$parent$attribute_tbl + n <- nrow(tbl) + res <- tbl |> + mutate( + target_scores = rep(1 / n, n), + score_diff = Score - target_scores, + Score = Score - adjF * score_diff, + NCBI_ID = node$name, + Evidence = 'inh2' + ) |> + select(-target_scores, -score_diff) |> + relocate(NCBI_ID) + node$attribute_tbl <- res + } +} +data_tree$Do(inh2, traversal = 'pre-order') +``` + + +```{r} +data_tree$d__2$p__1224$c__28211$o__356$f__335928$attribute_tbl +``` + +Get all the tables + +```{r} +output <- data_tree$Get( + attribute = 'attribute_tbl', simplify = FALSE, + filterFun = function(node) node$name != 'ArcBac' + ) +output <- bind_rows(output) +head(output) +``` + +## Session information + +```{r} +sessioninfo::session_info() +``` +