Skip to content

Commit

Permalink
update workflow for binaries;
Browse files Browse the repository at this point in the history
  • Loading branch information
sdgamboa committed Oct 4, 2023
1 parent c2c444b commit 816845e
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 58 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export(cleanNode)
export(conf2Fct)
export(fillNAs)
export(filterData)
export(filterDataDiscrete)
export(freq2Scores)
export(getAgreements)
export(getConflicts)
Expand Down
67 changes: 57 additions & 10 deletions R/taxPPro.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,27 @@
#' @export
#'
filterData <- function(tbl) {
types <- bugphyzz:::.DISCRETE_TYPES
attr_type <- unique(tbl$Attribute_type)
if(attr_type == 'logical'){
output <- filterDataMulti(tbl)
if (attr_type %in% types){
output <- filterDataDiscrete(tbl)
} else {
output <- NULL
}
return(output)
}

filterDataMulti <- function(tbl) {
#' Filter attributes with discrete type
#'
#' \code{filterDataDiscrete} filters discrete data.
#'
#' @param tbl A data.frame.
#'
#' @return A data.frame.
#' @export
#'
filterDataDiscrete <- function(tbl) {
phys_name <- unique(tbl$Attribute_group)
select_cols <- c(
'NCBI_ID', 'Taxon_name', 'Parent_NCBI_ID',
'Attribute','Attribute_source', 'Confidence_in_curation',
Expand All @@ -31,18 +44,36 @@ filterDataMulti <- function(tbl) {
dplyr::filter(.data$attribute_group == phys_name) |>
dplyr::pull(.data$attribute) |>
unique()

n_rows <- nrow(tbl)

tbl <- tbl |>
dplyr::filter(!is.na(.data$Attribute)) |>
dplyr::filter(!is.na(.data$Attribute_value))

attr_type <- unique(tbl$Attribute_type)
if (attr_type == 'multistate-intersection') {
tbl <- tbl |>
dplyr::filter(.data$Attribute %in% valid_attributes) |>
dplyr::filter(.data$Attribute_value == TRUE)
} else if (attr_type %in% c('binary', 'multistate-union')) {
tbl <- tbl |>
dplyr::filter(.data$Attribute %in% valid_attributes) |>
dplyr::mutate(
Attribute = paste0(.data$Attribute, '--', .data$Attribute_value)
)
}

phys_data <- tbl |>
tibble::as_tibble() |>
dplyr::filter(.data$Attribute_value == TRUE) |>
dplyr::filter(.data$Attribute %in% valid_attributes) |>
dplyr::filter(
!((is.na(.data$NCBI_ID) | .data$NCBI_ID == 'unknown') & is.na(.data$Parent_NCBI_ID))
) |>
dplyr::filter(!is.na(.data$Attribute_source), !is.na(.data$Frequency)) |>
dplyr::mutate(Score = freq2Scores(.data$Frequency)) |>
dplyr::select(tidyselect::all_of(select_cols)) |>
dplyr::distinct()
n_dropped_rows <- nrow(tbl) - nrow(phys_data)
n_dropped_rows <- n_rows - nrow(phys_data)
message(format(n_dropped_rows, big.mark = ','), ' rows were dropped.')
return(phys_data)
}
Expand All @@ -57,9 +88,23 @@ filterDataMulti <- function(tbl) {
getDataReady <- function(tbl) {
set_with_ids <- getSetWithIDs(tbl)
set_without_ids <- getSetWithoutIDs(tbl, set_with_ids = set_with_ids)
dplyr::bind_rows(set_with_ids, set_without_ids) |>
tidyr::complete(NCBI_ID, Attribute, fill = list(Score = 0)) |>
dplyr::arrange(NCBI_ID, Attribute)
dataset <- dplyr::bind_rows(set_with_ids, set_without_ids)
attr_type <- unique(dataset$Attribute_type)
if (attr_type == 'binary') {
current_attr <- unique(dataset$Attribute)
if (length(current_attr) == 1 && grepl('--TRUE$', current_attr)) {
extra_level <- sub('--TRUE$', '--FALSE', current_attr)
dataset$Attribute <- factor(dataset$Attribute, levels = c(extra_level, current_attr))
}
output <- dataset |>
tidyr::complete(NCBI_ID, Attribute, fill = list(Score = 0)) |>
dplyr::arrange(NCBI_ID, Attribute)
} else if (attr_type == 'multistate-intersection') {
output <- dataset |>
tidyr::complete(NCBI_ID, Attribute, fill = list(Score = 0)) |>
dplyr::arrange(NCBI_ID, Attribute)
}
return(output)
}

#' Get set with IDs
Expand Down Expand Up @@ -108,7 +153,7 @@ getSetWithIDs <- function(tbl) {
dplyr::filter(!is.na(.data$NCBI_ID)) |>
dplyr::distinct() |>
dplyr::arrange(.data$NCBI_ID, .data$Attribute) |>
dplyr::relocate(all_of(.orderedColumns()))
dplyr::relocate(tidyselect::all_of(.orderedColumns()))
}

#' Get set without IDs
Expand All @@ -130,6 +175,8 @@ getSetWithoutIDs <- function(tbl, set_with_ids = NULL) {

valid_ranks <- c('genus', 'species', 'strain')
lgl_vct <- is.na(tbl$NCBI_ID) | tbl$NCBI_ID == 'unknown'
if (!any(lgl_vct))
return(NULL)
tbl |>
dplyr::filter(lgl_vct) |>
dplyr::select(
Expand Down
4 changes: 2 additions & 2 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ conf2Fct <- function(x) {
#'
#' \code{addRankPrefix} adds a prefix (`'[kpcofgst]__'`) to an NCBI ID (taxid)
#'
#' @param id A charecter vector of NCBI IDs (taxids).
#' @param id A character vector of NCBI IDs (taxids).
#' @param rank A character vector of ranks
#'
#' @return A character vector of IDs with prefix.
#' @export
#'
addRankPrefix <- function(id, rank) {
highest_ranks <- c('kingdom', 'superkingdom', 'domain')
case_when(
dplyr::case_when(
rank %in% highest_ranks ~ paste0('k__', id),
rank == 'phylum' ~ paste0('p__', id),
rank == 'class' ~ paste0('c__', id),
Expand Down
21 changes: 21 additions & 0 deletions inst/scripts/binary.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
library(bugphyzz)
library(taxPPro)
library(tidyr)
library(dplyr)
library(purrr)

p <- physiologies()

filtered <- vector('list', length(p))
for (i in seq_along(p)) {
names(filtered)[i] <- names(p)[i]
message(names(p)[i])
res <- filterData(p[[i]])
if (is.null(res))
next
filtered[[i]] <- res
}

ap <- p$`acetate producing`
ap_ready <- getDataReady(filterData(ap))

90 changes: 45 additions & 45 deletions inst/scripts/phytools2.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ library(ggplot2)


## ----import physiology, message=FALSE------------------------------------------------------------------------------------
phys_name <- 'aerophilicity'
phys_name <- 'acetate producing'
phys <- physiologies(phys_name)


## ----warning=FALSE-------------------------------------------------------------------------------------------------------
phys_data_ready <- phys[[1]] |>
filterData() |>
phys_data_ready <- phys[[1]] |>
filterData() |>
getDataReady()
phys_data_list <- split(phys_data_ready, factor(phys_data_ready$NCBI_ID))

Expand Down Expand Up @@ -56,7 +56,7 @@ Attribute_group_var <- Attribute_group_var[!is.na(Attribute_group_var)]
Attribute_type_var <- unique(phys_data_ready$Attribute_type)
Attribute_type_var <- Attribute_type_var[!is.na(Attribute_type_var)]
ncbi_tree$Do(
function(node) taxPool(node = node, grp = Attribute_group_var, typ = Attribute_type_var),
function(node) taxPool(node = node, grp = Attribute_group_var, typ = Attribute_type_var),
traversal = 'post-order'
)
ncbi_tree$Do(inh1, traversal = 'pre-order')
Expand All @@ -65,17 +65,17 @@ ncbi_tree$Do(inh1, traversal = 'pre-order')
## ------------------------------------------------------------------------------------------------------------------------
new_phys_data <- ncbi_tree$Get(
'attribute_tbl', filterFun = function(node) grepl('^[gst]__', node$name)
) |>
discard(~ all(is.na(.x))) |>
bind_rows() |>
arrange(NCBI_ID, Attribute) |>
filter(!NCBI_ID %in% phys_data_ready$NCBI_ID) |>
mutate(taxid = sub('^\\w__', '', NCBI_ID)) |>
) |>
discard(~ all(is.na(.x))) |>
bind_rows() |>
arrange(NCBI_ID, Attribute) |>
filter(!NCBI_ID %in% phys_data_ready$NCBI_ID) |>
mutate(taxid = sub('^\\w__', '', NCBI_ID)) |>
bind_rows(phys_data_ready)

tip_data_annotated <- left_join(
tip_data,
select(new_phys_data, taxid, Attribute, Score),
tip_data,
select(new_phys_data, taxid, Attribute, Score),
by = 'taxid'
)

Expand All @@ -84,12 +84,12 @@ annotated_tips <- tip_data_annotated |>
filter(!is.na(Attribute)) |>
pivot_wider(
names_from = 'Attribute', values_from = 'Score', values_fill = 0
) |>
) |>
tibble::column_to_rownames(var = 'tip_label') |>
as.matrix()
no_annotated_tips_chr <- tip_data_annotated |>
filter(!tip_label %in% rownames(annotated_tips)) |>
pull(tip_label) |>
pull(tip_label) |>
unique()
no_annotated_tips <- matrix(
data = rep(rep(1/ncol(annotated_tips), ncol(annotated_tips)), length(no_annotated_tips_chr)),
Expand All @@ -106,7 +106,7 @@ input_matrix <- input_matrix[tree$tip.label,]

## ------------------------------------------------------------------------------------------------------------------------
fit <- fitMk(
tree = tree, x = input_matrix, model = 'ER', pi = 'fitzjohn',
tree = tree, x = input_matrix, model = 'ER', pi = 'fitzjohn',
lik.func = 'pruning', logscale = TRUE
)
asr <- ancr(object = fit, tips = TRUE)
Expand All @@ -116,26 +116,26 @@ rownames(res)[node_rows] <- tree$node.label


## ------------------------------------------------------------------------------------------------------------------------
new_taxa_from_tips <- res[rownames(no_annotated_tips),] |>
as.data.frame() |>
tibble::rownames_to_column(var = 'tip_label') |>
left_join(tip_data, by = 'tip_label') |>
new_taxa_from_tips <- res[rownames(no_annotated_tips),] |>
as.data.frame() |>
tibble::rownames_to_column(var = 'tip_label') |>
left_join(tip_data, by = 'tip_label') |>
mutate(
Rank = taxizedb::taxid2rank(taxid, db = 'ncbi')
) |>
filter(Rank %in% c('genus', 'species', 'strain')) |>
) |>
filter(Rank %in% c('genus', 'species', 'strain')) |>
mutate(
NCBI_ID = case_when(
Rank == 'genus' ~ paste0('g__', taxid),
Rank == 'species' ~ paste0('s__', taxid),
Rank == 'strain' ~ paste0('t__', taxid)
)
) |>
select(-ends_with('_taxid'), -tip_label, -taxid, -accession) |>
relocate(NCBI_ID, Taxon_name, Rank) |>
) |>
select(-ends_with('_taxid'), -tip_label, -taxid, -accession) |>
relocate(NCBI_ID, Taxon_name, Rank) |>
pivot_longer(
names_to = 'Attribute', values_to = 'Score', cols = 4:last_col()
) |>
) |>
mutate(
Evidence = 'asr',
Attribute_source = NA,
Expand All @@ -153,14 +153,14 @@ new_taxa_from_tips <- res[rownames(no_annotated_tips),] |>
)

nodes_annotated <- res[which(grepl('^\\d+(\\+\\d+)*', rownames(res))),]
new_taxa_from_nodes <- nodes_annotated |>
as.data.frame() |>
tibble::rownames_to_column(var = 'NCBI_ID') |>
filter(grepl('^\\d+(\\+\\d+)*', NCBI_ID)) |>
mutate(NCBI_ID = strsplit(NCBI_ID, '\\+')) |>
tidyr::unnest(NCBI_ID) |>
mutate(Rank = taxizedb::taxid2rank(NCBI_ID)) |>
mutate(Rank = ifelse(Rank == 'superkingdom', 'kingdom', Rank)) |>
new_taxa_from_nodes <- nodes_annotated |>
as.data.frame() |>
tibble::rownames_to_column(var = 'NCBI_ID') |>
filter(grepl('^\\d+(\\+\\d+)*', NCBI_ID)) |>
mutate(NCBI_ID = strsplit(NCBI_ID, '\\+')) |>
tidyr::unnest(NCBI_ID) |>
mutate(Rank = taxizedb::taxid2rank(NCBI_ID)) |>
mutate(Rank = ifelse(Rank == 'superkingdom', 'kingdom', Rank)) |>
mutate(
NCBI_ID = case_when(
Rank == 'kingdom' ~ paste0('k__', NCBI_ID),
Expand All @@ -172,18 +172,18 @@ new_taxa_from_nodes <- nodes_annotated |>
Rank == 'species' ~ paste0('s__', NCBI_ID),
Rank == 'strain' ~ paste0('t__', NCBI_ID)
)
) |>
) |>
filter(
Rank %in% c(
'kingdom', 'phylum', 'class', 'order', 'family', 'genus',
'species', 'strain'
)
) |>
mutate(Evidence = 'asr') |>
relocate(NCBI_ID, Rank, Evidence) |>
) |>
mutate(Evidence = 'asr') |>
relocate(NCBI_ID, Rank, Evidence) |>
pivot_longer(
cols = 4:last_col(), names_to = 'Attribute', values_to = 'Score'
) |>
) |>
mutate(
Attribute_source = NA,
Confidence_in_curation = NA,
Expand All @@ -200,8 +200,8 @@ new_taxa_from_nodes <- nodes_annotated |>
)
)

new_taxa_for_ncbi_tree <- new_taxa_from_tips |>
relocate(NCBI_ID, Rank, Attribute, Score, Evidence) |>
new_taxa_for_ncbi_tree <- new_taxa_from_tips |>
relocate(NCBI_ID, Rank, Attribute, Score, Evidence) |>
bind_rows(new_taxa_from_nodes)
new_taxa_for_ncbi_tree_list <- split(
new_taxa_for_ncbi_tree, factor(new_taxa_for_ncbi_tree$NCBI_ID)
Expand All @@ -221,7 +221,7 @@ ncbi_tree$Do(function(node) {

## ------------------------------------------------------------------------------------------------------------------------
ncbi_tree$Do(
function(node) taxPool(node = node, grp = Attribute_group_var, typ = Attribute_type_var ),
function(node) taxPool(node = node, grp = Attribute_group_var, typ = Attribute_type_var ),
traversal = 'post-order'
)
ncbi_tree$Do(inh2, traversal = 'pre-order')
Expand All @@ -231,13 +231,13 @@ ncbi_tree$Do(inh2, traversal = 'pre-order')
output <- ncbi_tree$Get(
attribute = 'attribute_tbl', simplify = FALSE,
filterFun = function(node) node$name != 'ArcBac'
) |>
) |>
bind_rows()
min_thr <- 1 / length(unique(phys_data_ready$Attribute))
add_taxa_1 <- phys_data_ready |>
add_taxa_1 <- phys_data_ready |>
filter(!NCBI_ID %in% unique(output$NCBI_ID))
add_taxa_2 <- new_taxa_for_ncbi_tree |>
add_taxa_2 <- new_taxa_for_ncbi_tree |>
filter(!NCBI_ID %in% unique(output$NCBI_ID))
final_output <- bind_rows(list(output, add_taxa_1, add_taxa_2)) |>
final_output <- bind_rows(list(output, add_taxa_1, add_taxa_2)) |>
filter(Score > min_thr)

2 changes: 1 addition & 1 deletion man/addRankPrefix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/filterDataDiscrete.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 816845e

Please sign in to comment.