Skip to content

Commit

Permalink
update taxPPro repo
Browse files Browse the repository at this point in the history
  • Loading branch information
sdgamboa committed Apr 2, 2024
1 parent 8a5b496 commit 535590e
Show file tree
Hide file tree
Showing 12 changed files with 226 additions and 1,237 deletions.
193 changes: 4 additions & 189 deletions R/taxPPro.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ filterData <- function(tbl) {
#'
#' \code{filterDataNumeric} filters data that could be used for propagation
#' of numeric attributes. These are labeled as "range" when being imported
#' with the \code{physiologies function}.
#' with the \code{physiologies function}. It also creates a Score column
#' based on the Frequency column.
#'
#' @param tbl A data.frame imported with \code{bugphyzz::physiologies}
#'
Expand Down Expand Up @@ -76,7 +77,8 @@ filterDataNumeric <- function(tbl) {
#' Filter data with discrete attributes
#'
#' \code{filterDataDiscrete} filters data that could be used for propagation
#' of discrete attributes.
#' of discrete attributes. It also creates a Scores column based on
#' the Frequency column.
#'
#' @param tbl A data.frame imported with \code{bugphyzz::physiologies}
#'
Expand All @@ -91,7 +93,6 @@ filterDataDiscrete <- function(tbl) {
'Frequency', 'Score', 'Evidence',
'Attribute_type', 'Attribute_group'
)
# valid_ranks <- c('genus', 'species', 'strain')
attributes_fname <- system.file(
'extdata', 'attributes.tsv', package = 'bugphyzz'
)
Expand Down Expand Up @@ -368,7 +369,6 @@ getSetWithoutIDs <- function(tbl, set_with_ids = NULL) {
dplyr::mutate(
Attribute_value = mean(cbind(.data$Attribute_value_min, .data$Attribute_value_max))
) |>
# dplyr::mutate(Attribute_value = mean(Attribute_value)) |>
dplyr::slice_max(
.data$Confidence_in_curation, n = 1, with_ties = FALSE
) |>
Expand Down Expand Up @@ -449,188 +449,3 @@ getSetWithoutIDs <- function(tbl, set_with_ids = NULL) {
'taxid'
)
}

#' Perform taxonomic pooling
#'
#' \code{taxPool} performs taxonomic pooling at the strain, species, and
#' genus ranks. Only use in the data.tree NCBI tree.
#'
#' @param node A node in a data.tree object.
#' @param grp A character string. Attribute group.
#' @param typ A character string. Attribute_type.
#'
#' @return A table for the attribute of a node.
#' @export
#'
taxPool <- function(node, grp, typ) {
if (!node$isLeaf) {
children_names <- names(node$children)
attribute_tbls <- children_names |>
purrr::map(~ node[[.x]]$attribute_tbl) |>
purrr::discard(is.null)
not_all_children_tbls_are_null <- length(attribute_tbls) > 0
node_attribute_tbl_is_null <- is.null(node$table)
# node_is_gst <- grepl('^[gst]__', node$name)
node_is_gst <- grepl('^[st]__', node$name)
conds <- node_attribute_tbl_is_null &
not_all_children_tbls_are_null &
node_is_gst
if (conds) {
res_tbl <- attribute_tbls |>
purrr::discard(is.null) |>
dplyr::bind_rows() |>
dplyr::select(
.data$NCBI_ID, .data$Attribute, .data$Score
) |>
dplyr::mutate(
NCBI_ID = node$name,
taxid = node$taxid,
Taxon_name = node$Taxon_name,
Rank = node$Rank,
Evidence = 'tax',
Attribute_group = grp,
Attribute_type = typ
) |>
dplyr::group_by(.data$NCBI_ID) |>
dplyr::mutate(
Total_score = sum(.data$Score),
Score = .data$Score / .data$Total_score
) |>
dplyr::ungroup() |>
dplyr::select(-.data$Total_score) |>
dplyr::group_by(.data$NCBI_ID, .data$Attribute) |>
dplyr::mutate(Score = sum(.data$Score)) |>
dplyr::ungroup() |>
dplyr::distinct() |>
dplyr::mutate(
Frequency = dplyr::case_when(
.data$Score == 1 ~ 'always',
.data$Score > 0.9 ~ 'usually',
.data$Score >= 0.5 ~ 'sometimes',
.data$Score > 0 & .data$Score < 0.5 ~ 'rarely',
.data$Score == 0 ~ 'never'
)
) |>
dplyr::mutate(
Attribute_source = NA,
Confidence_in_curation = NA
) |>
dplyr::distinct()
node$attribute_tbl <- res_tbl
}
}
}

#' Inheritance first round
#'
#' \code{inh1} First round of inheritance. Only use with Do in a data.tree
#' object.
#'
#' @param node A node in a data.tree.
#' @param adjF Adjustment factor for penalty. Default is 1.0.
#' @param evidence_label A character string, either inh1 or inh2.
#'
#' @return A table for a node attribute.
#' @export
#'
inh1 <- function(node, adjF = 0.1, evidence_label = 'inh') {
if (node$isRoot)
return(NULL)
if (is.null(node$parent$attribute_tbl))
return(NULL)
if (is.null(node$attribute_tbl) && grepl('^[st]__', node$name)) {
df <- node$parent$attribute_tbl
n <- nrow(df)
df <- df |>
dplyr::mutate(
target_scores = rep(1 / n, n),
score_diff = .data$Score - .data$target_scores,
Score = .data$Score - adjF * .data$score_diff,
NCBI_ID = node$name,
Evidence = evidence_label,
Taxon_name = node$Taxon_name,
Rank = node$Rank,
taxid = node$taxid,
) |>
dplyr::select(-.data$target_scores, -.data$score_diff) |>
dplyr::relocate(.data$NCBI_ID) |>
dplyr::mutate(
Attribute_source = NA,
Confidence_in_curation = NA,
taxid = node$taxid,
Taxon_name = node$Taxon_name,
Rank = node$Rank,
Frequency = dplyr::case_when(
.data$Score == 1 ~ 'always',
.data$Score > 0.9 ~ 'usually',
.data$Score >= 0.5 ~ 'sometimes',
.data$Score > 0 & .data$Score < 0.5 ~ 'rarely',
.data$Score == 0 ~ 'never'
)
)
node$attribute_tbl <- df
}
}

#' Inheritance second round
#'
#' \code{inh2} Second round of inheritance. Only use with Do in a data.tree
#' object.
#'
#' @param node A node in a data.tree.
#' @param adjF Adjustment factor for penalty. Default is 1.0.
#'
#' @return A table for a node attribute.
#' @export
#'
inh2 <- function(node, adjF = 0.1) {
cond1 <- !node$isRoot
cond2 <- is.null(node$attribute_tbl)
cond3 <- !is.null(node$parent$attribute_tbl)
if (cond1 && cond2 && cond3) {
tbl <- node$parent$attribute_tbl
n <- nrow(tbl)
res <- tbl |>
dplyr::mutate(
target_scores = rep(1 / n, n),
score_diff = .data$Score - .data$target_scores,
Score = .data$Score - adjF * .data$score_diff,
NCBI_ID = node$name,
Evidence = 'inh2'
) |>
dplyr::select(-.data$target_scores, -.data$score_diff) |>
dplyr::relocate(.data$NCBI_ID) |>
dplyr::mutate(
Attribute_source = NA,
Confidence_in_curation = NA,
taxid = node$taxid,
Taxon_name = node$Taxon_name,
Rank = node$Rank,
Frequency = dplyr::case_when(
.data$Score == 1 ~ 'always',
.data$Score > 0.9 ~ 'usually',
.data$Score >= 0.5 ~ 'sometimes',
.data$Score > 0 & .data$Score < 0.5 ~ 'rarely',
.data$Score == 0 ~ 'never'
)
)
node$attribute_tbl <- res
}
}

#' Get Most Recent Common Ancestor
#'
#' \code{getMRCATaxPPro} gets the most recent common ancestor using phytools.
#'
#' @param tree A phylo tree object.
#' @param tips Tips in the phylo tree object.
#'
#' @return A data.frame.
#' @export
#'
getMRCATaxPPro <- function(tree, tips) {
res <- phytools::findMRCA(tree = tree, tips = tips)
if (is.null(res))
res <- NA
res
}
25 changes: 25 additions & 0 deletions R/trees.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,28 @@ ltp <- function(remove_gn_nodes = TRUE, node_names = TRUE) {
gn_tips = gn_tips
)
}

#' Get NSTI
#'
#' \code{getNsti} gets the nsti described in picrust2. Code is based
#' on the source code of picrust2 using the castor package. NSTI
#' values are described in picrust and picrust2.
#'
#' @param tree A phylo object
#' @param annotated_tip_labels A character vector with the names of the tips
#'
#' @return A data.frame with two columns: tip_label and nsti.
#' @export
#'
getNsti <- function(tree, annotated_tip_labels) {
unknown_tips_index <- which(!tree$tip.label %in% annotated_tip_labels)
known_tips_index <- which(tree$tip.label %in% annotated_tip_labels)
res <- castor::find_nearest_tips(
tree = tree, target_tips = known_tips_index
)
data.frame(
tip_label = tree$tip.label[unknown_tips_index],
nsti = res$nearest_distance_per_tip[unknown_tips_index]
)
}

45 changes: 0 additions & 45 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -119,28 +119,6 @@ classif2Table <- function(x, ranks) {
new_df
}


myFun <- function(x, adjF = 0.1) {
n <- length(x)
target_scores <- rep(1 / n, n)
score_diff <- x - target_scores
res <- x - adjF * score_diff
return(res)
}

#' Clean node
#'
#' \code{cleanNode} deletes the `attribute_tbl` slot in a data.tree node.
#'
#' @param node A node in a data.tree object.
#'
#' @return NULL (assigned to the node)
#' @export
#'
cleanNode <- function(node) {
node$attribute_tbl <- NULL
}

#' Round up to nearest decimal
#'
#' \code{roundDec} rounds up to next decimal. Meant to be used for
Expand All @@ -157,26 +135,3 @@ roundDec <- function(x) {
ceiling(((1 / n) + 1e-06) * 10) / 10
}

#' Get NSTI
#'
#' \code{getNsti} gets the nsti described in picrust2. Code is based
#' on the source code of picrust2 using the castor package. NSTI
#' values are described in picrust and picrust2.
#'
#' @param tree A phylo object
#' @param annotated_tip_labels A character vector with the names of the tips
#'
#' @return A data.frame with two columns: tip_label and nsti.
#' @export
#'
getNsti <- function(tree, annotated_tip_labels) {
unknown_tips_index <- which(!tree$tip.label %in% annotated_tip_labels)
known_tips_index <- which(tree$tip.label %in% annotated_tip_labels)
res <- castor::find_nearest_tips(
tree = tree, target_tips = known_tips_index
)
data.frame(
tip_label = tree$tip.label[unknown_tips_index],
nsti = res$nearest_distance_per_tip[unknown_tips_index]
)
}
Loading

0 comments on commit 535590e

Please sign in to comment.