update workflow for binaries;

waldronlab · Oct 4, 2023 · 816845e · 816845e
1 parent c2c444b
commit 816845e
Show file tree

Hide file tree

Showing 7 changed files with 144 additions and 58 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -18,6 +18,7 @@ export(cleanNode)
 export(conf2Fct)
 export(fillNAs)
 export(filterData)
+export(filterDataDiscrete)
 export(freq2Scores)
 export(getAgreements)
 export(getConflicts)

diff --git a/R/taxPPro.R b/R/taxPPro.R
@@ -8,14 +8,27 @@
 #' @export
 #'
 filterData <- function(tbl) {
+    types <- bugphyzz:::.DISCRETE_TYPES
     attr_type <- unique(tbl$Attribute_type)
-    if(attr_type == 'logical'){
-        output <- filterDataMulti(tbl)
+    if (attr_type %in% types){
+        output <- filterDataDiscrete(tbl)
+    } else {
+        output <- NULL
     }
     return(output)
 }
 
-filterDataMulti <- function(tbl) {
+#' Filter attributes with discrete type
+#'
+#' \code{filterDataDiscrete} filters discrete data.
+#'
+#' @param tbl A data.frame.
+#'
+#' @return A data.frame.
+#' @export
+#'
+filterDataDiscrete <- function(tbl) {
+    phys_name <- unique(tbl$Attribute_group)
     select_cols <- c(
         'NCBI_ID', 'Taxon_name', 'Parent_NCBI_ID',
         'Attribute','Attribute_source', 'Confidence_in_curation',
@@ -31,18 +44,36 @@ filterDataMulti <- function(tbl) {
         dplyr::filter(.data$attribute_group == phys_name) |>
         dplyr::pull(.data$attribute) |>
         unique()
+
+    n_rows <- nrow(tbl)
+
+    tbl <- tbl |>
+        dplyr::filter(!is.na(.data$Attribute)) |>
+        dplyr::filter(!is.na(.data$Attribute_value))
+
+    attr_type <- unique(tbl$Attribute_type)
+    if (attr_type == 'multistate-intersection') {
+        tbl <- tbl |>
+            dplyr::filter(.data$Attribute %in% valid_attributes) |>
+            dplyr::filter(.data$Attribute_value == TRUE)
+    } else if (attr_type %in% c('binary', 'multistate-union')) {
+        tbl <- tbl |>
+            dplyr::filter(.data$Attribute %in% valid_attributes) |>
+            dplyr::mutate(
+                Attribute = paste0(.data$Attribute, '--', .data$Attribute_value)
+            )
+    }
+
     phys_data <- tbl |>
         tibble::as_tibble() |>
-        dplyr::filter(.data$Attribute_value == TRUE) |>
-        dplyr::filter(.data$Attribute %in% valid_attributes) |>
         dplyr::filter(
             !((is.na(.data$NCBI_ID) | .data$NCBI_ID == 'unknown') & is.na(.data$Parent_NCBI_ID))
         ) |>
         dplyr::filter(!is.na(.data$Attribute_source), !is.na(.data$Frequency)) |>
         dplyr::mutate(Score = freq2Scores(.data$Frequency)) |>
         dplyr::select(tidyselect::all_of(select_cols)) |>
         dplyr::distinct()
-    n_dropped_rows <- nrow(tbl) - nrow(phys_data)
+    n_dropped_rows <- n_rows - nrow(phys_data)
     message(format(n_dropped_rows, big.mark = ','), ' rows were dropped.')
     return(phys_data)
 }
@@ -57,9 +88,23 @@ filterDataMulti <- function(tbl) {
 getDataReady <- function(tbl) {
     set_with_ids <- getSetWithIDs(tbl)
     set_without_ids <- getSetWithoutIDs(tbl, set_with_ids = set_with_ids)
-    dplyr::bind_rows(set_with_ids, set_without_ids) |>
-        tidyr::complete(NCBI_ID, Attribute, fill = list(Score = 0)) |>
-        dplyr::arrange(NCBI_ID, Attribute)
+    dataset <- dplyr::bind_rows(set_with_ids, set_without_ids)
+    attr_type <- unique(dataset$Attribute_type)
+    if (attr_type == 'binary') {
+        current_attr <- unique(dataset$Attribute)
+        if (length(current_attr) == 1 && grepl('--TRUE$', current_attr)) {
+            extra_level <- sub('--TRUE$', '--FALSE', current_attr)
+            dataset$Attribute <- factor(dataset$Attribute, levels = c(extra_level, current_attr))
+        }
+        output <- dataset |>
+            tidyr::complete(NCBI_ID, Attribute, fill = list(Score = 0)) |>
+            dplyr::arrange(NCBI_ID, Attribute)
+    } else if (attr_type == 'multistate-intersection') {
+        output <- dataset |>
+            tidyr::complete(NCBI_ID, Attribute, fill = list(Score = 0)) |>
+            dplyr::arrange(NCBI_ID, Attribute)
+    }
+    return(output)
 }
 
 #' Get set with IDs
@@ -108,7 +153,7 @@ getSetWithIDs <- function(tbl) {
         dplyr::filter(!is.na(.data$NCBI_ID)) |>
         dplyr::distinct() |>
         dplyr::arrange(.data$NCBI_ID, .data$Attribute) |>
-        dplyr::relocate(all_of(.orderedColumns()))
+        dplyr::relocate(tidyselect::all_of(.orderedColumns()))
 }
 
 #' Get set without IDs
@@ -130,6 +175,8 @@ getSetWithoutIDs <- function(tbl, set_with_ids = NULL) {
 
     valid_ranks <- c('genus', 'species', 'strain')
     lgl_vct <- is.na(tbl$NCBI_ID) | tbl$NCBI_ID == 'unknown'
+    if (!any(lgl_vct))
+        return(NULL)
     tbl |>
         dplyr::filter(lgl_vct) |>
         dplyr::select(

diff --git a/R/utils.R b/R/utils.R
@@ -60,15 +60,15 @@ conf2Fct <- function(x) {
 #'
 #' \code{addRankPrefix} adds a prefix (`'[kpcofgst]__'`) to an NCBI ID (taxid)
 #'
-#' @param id A charecter vector of NCBI IDs (taxids).
+#' @param id A character vector of NCBI IDs (taxids).
 #' @param rank A character vector of ranks
 #'
 #' @return A character vector of IDs with prefix.
 #' @export
 #'
 addRankPrefix <- function(id, rank) {
     highest_ranks <- c('kingdom', 'superkingdom', 'domain')
-    case_when(
+    dplyr::case_when(
         rank %in% highest_ranks ~ paste0('k__', id),
         rank == 'phylum' ~ paste0('p__', id),
         rank == 'class' ~ paste0('c__', id),

diff --git a/inst/scripts/binary.R b/inst/scripts/binary.R
@@ -0,0 +1,21 @@
+library(bugphyzz)
+library(taxPPro)
+library(tidyr)
+library(dplyr)
+library(purrr)
+
+p <- physiologies()
+
+filtered <- vector('list', length(p))
+for (i in seq_along(p)) {
+    names(filtered)[i] <- names(p)[i]
+    message(names(p)[i])
+    res <- filterData(p[[i]])
+    if (is.null(res))
+        next
+    filtered[[i]] <- res
+}
+
+ap <- p$`acetate producing`
+ap_ready <- getDataReady(filterData(ap))
+
diff --git a/inst/scripts/phytools2.R b/inst/scripts/phytools2.R
@@ -17,13 +17,13 @@ library(ggplot2)
 
 
 ## ----import physiology, message=FALSE------------------------------------------------------------------------------------
-phys_name <- 'aerophilicity'
+phys_name <- 'acetate producing'
 phys <- physiologies(phys_name)
 
 
 ## ----warning=FALSE-------------------------------------------------------------------------------------------------------
-phys_data_ready <- phys[[1]] |> 
-    filterData() |> 
+phys_data_ready <- phys[[1]] |>
+    filterData() |>
     getDataReady()
 phys_data_list <- split(phys_data_ready, factor(phys_data_ready$NCBI_ID))
 
@@ -56,7 +56,7 @@ Attribute_group_var <- Attribute_group_var[!is.na(Attribute_group_var)]
 Attribute_type_var <- unique(phys_data_ready$Attribute_type)
 Attribute_type_var <- Attribute_type_var[!is.na(Attribute_type_var)]
 ncbi_tree$Do(
-    function(node) taxPool(node = node, grp = Attribute_group_var, typ = Attribute_type_var), 
+    function(node) taxPool(node = node, grp = Attribute_group_var, typ = Attribute_type_var),
     traversal = 'post-order'
 )
 ncbi_tree$Do(inh1, traversal = 'pre-order')
@@ -65,17 +65,17 @@ ncbi_tree$Do(inh1, traversal = 'pre-order')
 ## ------------------------------------------------------------------------------------------------------------------------
 new_phys_data <- ncbi_tree$Get(
     'attribute_tbl', filterFun = function(node) grepl('^[gst]__', node$name)
-) |> 
-    discard(~ all(is.na(.x))) |> 
-    bind_rows() |> 
-    arrange(NCBI_ID, Attribute) |> 
-    filter(!NCBI_ID %in% phys_data_ready$NCBI_ID) |> 
-    mutate(taxid = sub('^\\w__', '', NCBI_ID)) |> 
+) |>
+    discard(~ all(is.na(.x))) |>
+    bind_rows() |>
+    arrange(NCBI_ID, Attribute) |>
+    filter(!NCBI_ID %in% phys_data_ready$NCBI_ID) |>
+    mutate(taxid = sub('^\\w__', '', NCBI_ID)) |>
     bind_rows(phys_data_ready)
 
 tip_data_annotated <- left_join(
-    tip_data, 
-    select(new_phys_data, taxid, Attribute, Score), 
+    tip_data,
+    select(new_phys_data, taxid, Attribute, Score),
     by = 'taxid'
     )
 
@@ -84,12 +84,12 @@ annotated_tips <- tip_data_annotated |>
     filter(!is.na(Attribute)) |>
     pivot_wider(
         names_from = 'Attribute', values_from = 'Score', values_fill = 0
-    ) |> 
+    ) |>
     tibble::column_to_rownames(var = 'tip_label') |>
     as.matrix()
 no_annotated_tips_chr <- tip_data_annotated |>
     filter(!tip_label %in% rownames(annotated_tips)) |>
-    pull(tip_label) |> 
+    pull(tip_label) |>
     unique()
 no_annotated_tips <- matrix(
     data = rep(rep(1/ncol(annotated_tips), ncol(annotated_tips)), length(no_annotated_tips_chr)),
@@ -106,7 +106,7 @@ input_matrix <- input_matrix[tree$tip.label,]
 
 ## ------------------------------------------------------------------------------------------------------------------------
 fit <- fitMk(
-    tree = tree, x = input_matrix, model = 'ER', pi = 'fitzjohn', 
+    tree = tree, x = input_matrix, model = 'ER', pi = 'fitzjohn',
     lik.func = 'pruning', logscale = TRUE
 )
 asr <- ancr(object = fit, tips = TRUE)
@@ -116,26 +116,26 @@ rownames(res)[node_rows] <- tree$node.label
 
 
 ## ------------------------------------------------------------------------------------------------------------------------
-new_taxa_from_tips <- res[rownames(no_annotated_tips),] |> 
-    as.data.frame() |> 
-    tibble::rownames_to_column(var = 'tip_label') |> 
-    left_join(tip_data, by = 'tip_label') |> 
+new_taxa_from_tips <- res[rownames(no_annotated_tips),] |>
+    as.data.frame() |>
+    tibble::rownames_to_column(var = 'tip_label') |>
+    left_join(tip_data, by = 'tip_label') |>
     mutate(
         Rank = taxizedb::taxid2rank(taxid, db = 'ncbi')
-    ) |> 
-    filter(Rank %in% c('genus', 'species', 'strain')) |> 
+    ) |>
+    filter(Rank %in% c('genus', 'species', 'strain')) |>
     mutate(
         NCBI_ID = case_when(
             Rank == 'genus' ~ paste0('g__', taxid),
             Rank == 'species' ~ paste0('s__', taxid),
             Rank == 'strain' ~ paste0('t__', taxid)
         )
-    ) |> 
-    select(-ends_with('_taxid'), -tip_label, -taxid, -accession) |> 
-    relocate(NCBI_ID, Taxon_name, Rank) |> 
+    ) |>
+    select(-ends_with('_taxid'), -tip_label, -taxid, -accession) |>
+    relocate(NCBI_ID, Taxon_name, Rank) |>
     pivot_longer(
         names_to = 'Attribute', values_to = 'Score', cols = 4:last_col()
-    ) |> 
+    ) |>
     mutate(
         Evidence = 'asr',
         Attribute_source = NA,
@@ -153,14 +153,14 @@ new_taxa_from_tips <- res[rownames(no_annotated_tips),] |>
     )
 
 nodes_annotated <- res[which(grepl('^\\d+(\\+\\d+)*', rownames(res))),]
-new_taxa_from_nodes <- nodes_annotated |> 
-    as.data.frame() |> 
-    tibble::rownames_to_column(var = 'NCBI_ID') |> 
-    filter(grepl('^\\d+(\\+\\d+)*', NCBI_ID)) |> 
-    mutate(NCBI_ID = strsplit(NCBI_ID, '\\+')) |> 
-    tidyr::unnest(NCBI_ID) |> 
-    mutate(Rank = taxizedb::taxid2rank(NCBI_ID)) |> 
-    mutate(Rank = ifelse(Rank == 'superkingdom', 'kingdom', Rank)) |> 
+new_taxa_from_nodes <- nodes_annotated |>
+    as.data.frame() |>
+    tibble::rownames_to_column(var = 'NCBI_ID') |>
+    filter(grepl('^\\d+(\\+\\d+)*', NCBI_ID)) |>
+    mutate(NCBI_ID = strsplit(NCBI_ID, '\\+')) |>
+    tidyr::unnest(NCBI_ID) |>
+    mutate(Rank = taxizedb::taxid2rank(NCBI_ID)) |>
+    mutate(Rank = ifelse(Rank == 'superkingdom', 'kingdom', Rank)) |>
     mutate(
         NCBI_ID = case_when(
             Rank == 'kingdom' ~ paste0('k__', NCBI_ID),
@@ -172,18 +172,18 @@ new_taxa_from_nodes <- nodes_annotated |>
             Rank == 'species' ~ paste0('s__', NCBI_ID),
             Rank == 'strain' ~ paste0('t__', NCBI_ID)
         )
-    ) |> 
+    ) |>
     filter(
         Rank %in% c(
             'kingdom', 'phylum', 'class', 'order', 'family', 'genus',
             'species', 'strain'
         )
-    ) |> 
-    mutate(Evidence = 'asr') |> 
-    relocate(NCBI_ID, Rank, Evidence) |> 
+    ) |>
+    mutate(Evidence = 'asr') |>
+    relocate(NCBI_ID, Rank, Evidence) |>
     pivot_longer(
         cols = 4:last_col(), names_to = 'Attribute', values_to = 'Score'
-    ) |> 
+    ) |>
     mutate(
         Attribute_source = NA,
         Confidence_in_curation = NA,
@@ -200,8 +200,8 @@ new_taxa_from_nodes <- nodes_annotated |>
         )
     )
 
-new_taxa_for_ncbi_tree <- new_taxa_from_tips |> 
-    relocate(NCBI_ID, Rank, Attribute, Score, Evidence) |> 
+new_taxa_for_ncbi_tree <- new_taxa_from_tips |>
+    relocate(NCBI_ID, Rank, Attribute, Score, Evidence) |>
     bind_rows(new_taxa_from_nodes)
 new_taxa_for_ncbi_tree_list <- split(
     new_taxa_for_ncbi_tree, factor(new_taxa_for_ncbi_tree$NCBI_ID)
@@ -221,7 +221,7 @@ ncbi_tree$Do(function(node) {
 
 ## ------------------------------------------------------------------------------------------------------------------------
 ncbi_tree$Do(
-    function(node) taxPool(node = node, grp = Attribute_group_var, typ = Attribute_type_var ), 
+    function(node) taxPool(node = node, grp = Attribute_group_var, typ = Attribute_type_var ),
     traversal = 'post-order'
 )
 ncbi_tree$Do(inh2, traversal = 'pre-order')
@@ -231,13 +231,13 @@ ncbi_tree$Do(inh2, traversal = 'pre-order')
 output <- ncbi_tree$Get(
     attribute = 'attribute_tbl', simplify = FALSE,
     filterFun = function(node) node$name != 'ArcBac'
-    ) |> 
+    ) |>
     bind_rows()
 min_thr <- 1 / length(unique(phys_data_ready$Attribute))
-add_taxa_1 <- phys_data_ready |> 
+add_taxa_1 <- phys_data_ready |>
     filter(!NCBI_ID %in% unique(output$NCBI_ID))
-add_taxa_2 <- new_taxa_for_ncbi_tree |> 
+add_taxa_2 <- new_taxa_for_ncbi_tree |>
     filter(!NCBI_ID %in% unique(output$NCBI_ID))
-final_output <- bind_rows(list(output, add_taxa_1, add_taxa_2)) |> 
+final_output <- bind_rows(list(output, add_taxa_1, add_taxa_2)) |>
     filter(Score > min_thr)
 
diff --git a/man/addRankPrefix.Rd b/man/addRankPrefix.Rd
diff --git a/man/filterDataDiscrete.Rd b/man/filterDataDiscrete.Rd