diff --git a/DESCRIPTION b/DESCRIPTION index ee56f5d9..a9d6cc96 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: bugphyzz Title: A harmonized data resource and software for enrichment analysis of microbial physiologies -Version: 0.0.1.9 +Version: 0.0.1.10 Authors@R: c( person( diff --git a/R/bugphyzz.R b/R/bugphyzz.R index da473842..7723b34b 100644 --- a/R/bugphyzz.R +++ b/R/bugphyzz.R @@ -1,17 +1,43 @@ #' Import bugphyzz #' #' \code{importBugphyzz} imports bugphyzz annotations as a list of -#' data.frames (1 per physiology/Attribute) +#' tidy data.frames. To learn more about the structure of the data.frames +#' please check the bugphyzz vignette with `browseVignettes("bugphyzz")`. #' -#' @param version Character string. Default is 'devel' -#' (current file on the GitHub repo waldronlab/bugphyzzExports). +#' @param version Character string indicating the version. +#' Options: devel, doi, GitHub hash. #' @param force_download Logical value. Force a fresh download of the data or #' use the one stored in the cache (if available). Default is FALSE. -#' @param v Validation value. Default 0.5. +#' @param v Validation value. Default 0.5 (see details). #' @param exclude_rarely Default is TRUE. Exclude values with -#' Frequency == FALSE. +#' Frequency == "rarely" (see details). #' -#' @return A list of data frames. +#' @details +#' +#' ## Data structure +#' The data structure of the data.frames imported with `importBugphyzz` is +#' detailed in the main vignette. Please run `browseVignettes("bugphyzz")`. +#' +#' ## Validation (`v` argument) +#' Data imported with `importBugphyzz` includes annotations imputed through +#' ancestral state reconstruction (ASR) methods. A 10-fold cross-validation +#' approach was implemented to assess the reliability of the data imputed. +#' Matthews correlation coefficient (MCC) and R-squared (R2) were used for the +#' validation of discrete and numeric attributes. 
+#' Details can be found at: https://github.com/waldronlab/taxPProValidation. +#' By default, imputed annotations with an MCC or R2 value greater than 0.5 are +#' imported. The minimum value can be adjusted with the `v` argument (only +#' values between 0 and 1). +#' +#' ## Frequency (exclude_rarely argument) +#' One of the variables in the bugphyzz data.frames is "Frequency", which +#' can adopt values of +#' "always", "usually", "sometimes", "rarely", or "never". By default +#' "never" and "rarely" are excluded. "rarely" could be included with +#' `exclude_rarely = FALSE`. To learn more about these frequency keywords +#' please check the bugphyzz vignette with `browseVignettes("bugphyzz")`. +#' +#' @return A list of tidy data frames. #' @export #' #' @examples @@ -97,31 +123,34 @@ importBugphyzz <- function( #' Make signatures #' -#' \code{makeSignatures} Creates signatures for a list of bugphyzz -#' data.frames imported with \code{importBugphyzz} +#' \code{makeSignatures} creates a list of bug signatures from +#' a tidy data.frame imported through the `importBugphyzz` function. Please +#' run `browseVignettes("bugphyzz")` for detailed examples. #' #' @param dat A data.frame. #' @param tax_id_type A character string. Valid options: NCBI_ID, Taxon_name. #' @param tax_level A character vector. Taxonomic rank. Valid options: -#' kingdom, phylum, class, order, family, genus, species, strain. +#' superkingdom, kingdom, phylum, class, order, family, genus, species, strain. #' They can be combined. "mixed" is equivalent to select all valid ranks. #' @param evidence A character vector. Valid options: exp, igc, nas, tas, tax, asr. #' They can be combined. Default is all. #' @param frequency A character vector. Valid options: always, usually, -#' sometimes, rarely, unknown. They can be combiend. Default value is all but -#' rarely. -#' @param min_size Minimun number of bugs in a signature. Default is 10. -#' @param min Minimum value inclusive. 
Only for numeric attributes. Default is NULL. -#' @param max Maximum value inclusive. Only for numeric attributes. Default is NULL. +#' sometimes, rarely, unknown. They can be combined. By default, "rarely" is +#' excluded. +#' @param min_size Minimum number of bugs in a signature. Default is 10. +#' @param min Minimum value (inclusive). Only for numeric attributes. +#' Default is NULL. +#' @param max Maximum value (inclusive). Only for numeric attributes. +#' Default is NULL. #' -#' @return A list of character vector with the IDs of the bugs. +#' @return A list of character vectors with scientific names or taxids. #' @export #' #' @examples #' #' bp <- importBugphyzz() -#' sigs <- lapply(bp, makeSignatures) -#' sigs <- purrr::list_flatten(sigs) +#' sigs <- purrr::map(bp, makeSignatures) +#' sigs <- purrr::list_flatten(sigs, name_spec = "{inner}") #' makeSignatures <- function( dat, tax_id_type = "NCBI_ID", @@ -167,11 +196,13 @@ makeSignatures <- function( #' Get Taxon Signatures #' -#' \code{getTaxonSignatures} get the names of all of the signatures for a taxon. +#' \code{getTaxonSignatures} returns the names of all of the signatures associated +#' with a particular taxon. More details can be found in the main +#' bugphyzz vignette; please run `browseVignettes("bugphyzz")`. #' #' @param tax A valid NCBI ID or taxon name. If taxon name is used, the -#' tax_id_type = "Taxon_name" must also be used. -#' @param bp Import from \code{importBugphyzz}. +#' argument tax_id_type = "Taxon_name" must also be used. +#' @param bp List of data.frames imported with \code{importBugphyzz}. #' @param ... Arguments passed to \code{makeSignatures}. #' #' @return A character vector with the names of the signatures for a taxon. 
@@ -179,9 +210,10 @@ makeSignatures <- function( #' #' @examples #' taxid <- "562" +#' taxonName <- "Escherichia coli" #' bp <- importBugphyzz() -#' sig_names_1 <- getTaxonSignatures("562", bp) -#' sig_names_2 <- getTaxonSignatures("Escherichia coli", bp, tax_id_type = "Taxon_name") +#' sig_names_1 <- getTaxonSignatures(taxid, bp) +#' sig_names_2 <- getTaxonSignatures(taxonName, bp, tax_id_type = "Taxon_name") #' getTaxonSignatures <- function(tax, bp, ...) { sigs <- purrr::map(bp, makeSignatures, ...) diff --git a/R/physiologies.R b/R/physiologies.R index 4c90d793..7084bb58 100644 --- a/R/physiologies.R +++ b/R/physiologies.R @@ -1,12 +1,9 @@ -#' Import phsiologies +#' Import physiologies (for devs) #' -#' \code{physiologies} imports data from the -#' Google spreadsheets at https://drive.google.com/drive/folders/1i2UAolVWAYa7UnETNnCs0BDWjKPp3ev5. -#' This function (and its internal functions) do minimal changes to the -#' imported data. These changes are only meant to match data coming from -#' different sources, and attaching information needed for further processing, -#' such as source and attribute type. +#' \code{physiologies} imports a list of data.frames. This data is in "raw" +#' state before cleaning and going through the data imputation steps. It +#' should be used by developers/curators of the package. #' #' @param keyword Character vector with one or more valid keywords. #' Valid keyboards can be checked with \code{showPhys}. If 'all', all @@ -89,10 +86,11 @@ physiologies <- function(keyword = 'all', full_source = FALSE) { return(physiologies) } -#' Show list of available physiologies +#' Show list of available physiologies (for devs) #' #' \code{showPhys} prints the names of the available physiologies that can be -#' imported with the \code{\link{physiologies}} function. +#' imported with the \code{\link{physiologies}} function. This function +#' should be used by developers/curators. #' #' @param which_names A character string. 
Options: 'all' (default), #' 'spreadsheets', 'bacdive'. diff --git a/man/getTaxonSignatures.Rd b/man/getTaxonSignatures.Rd index 89bc22aa..7957c6a9 100644 --- a/man/getTaxonSignatures.Rd +++ b/man/getTaxonSignatures.Rd @@ -8,9 +8,9 @@ getTaxonSignatures(tax, bp, ...) } \arguments{ \item{tax}{A valid NCBI ID or taxon name. If taxon name is used, the -tax_id_type = "Taxon_name" must also be used.} +argument tax_id_type = "Taxon_name" must also be used.} -\item{bp}{Import from \code{importBugphyzz}.} +\item{bp}{List of data.frames imported with \code{importBugphyzz}.} \item{...}{Arguments passed to \code{makeSignatures}.} } @@ -18,12 +18,15 @@ tax_id_type = "Taxon_name" must also be used.} A character vector with the names of the signatures for a taxon. } \description{ -\code{getTaxonSignatures} get the names of all of the signatures for a taxon. +\code{getTaxonSignatures} returns the names of all of the signatures associated +with a particular taxon. More details can be found in the main +bugphyzz vignette; please run \code{browseVignettes("bugphyzz")}. } \examples{ taxid <- "562" +taxonName <- "Escherichia coli" bp <- importBugphyzz() -sig_names_1 <- getTaxonSignatures("562", bp) -sig_names_2 <- getTaxonSignatures("Escherichia coli", bp, tax_id_type = "Taxon_name") +sig_names_1 <- getTaxonSignatures(taxid, bp) +sig_names_2 <- getTaxonSignatures(taxonName, bp, tax_id_type = "Taxon_name") } diff --git a/man/importBugphyzz.Rd b/man/importBugphyzz.Rd index 4a07b3ac..d6bb95c0 100644 --- a/man/importBugphyzz.Rd +++ b/man/importBugphyzz.Rd @@ -12,23 +12,54 @@ importBugphyzz( ) } \arguments{ -\item{version}{Character string. Default is 'devel' -(current file on the GitHub repo waldronlab/bugphyzzExports).} +\item{version}{Character string indicating the version. +Options: devel, doi, GitHub hash.} \item{force_download}{Logical value. Force a fresh download of the data or use the one stored in the cache (if available). Default is FALSE.} -\item{v}{Validation value. 
Default 0.5.} +\item{v}{Validation value. Default 0.5 (see details).} \item{exclude_rarely}{Default is TRUE. Exclude values with -Frequency == FALSE.} +Frequency == "rarely" (see details).} } \value{ -A list of data frames. +A list of tidy data frames. } \description{ \code{importBugphyzz} imports bugphyzz annotations as a list of -data.frames (1 per physiology/Attribute) +tidy data.frames. To learn more about the structure of the data.frames +please check the bugphyzz vignette with \code{browseVignettes("bugphyzz")}. +} +\details{ +\subsection{Data structure}{ + +The data structure of the data.frames imported with \code{importBugphyzz} is +detailed in the main vignette. Please run \code{browseVignettes("bugphyzz")}. +} + +\subsection{Validation (\code{v} argument)}{ + +Data imported with \code{importBugphyzz} includes annotations imputed through +ancestral state reconstruction (ASR) methods. A 10-fold cross-validation +approach was implemented to assess the reliability of the data imputed. +Matthews correlation coefficient (MCC) and R-squared (R2) were used for the +validation of discrete and numeric attributes. +Details can be found at: https://github.com/waldronlab/taxPProValidation. +By default, imputed annotations with an MCC or R2 value greater than 0.5 are +imported. The minimum value can be adjusted with the \code{v} argument (only +values between 0 and 1). +} + +\subsection{Frequency (exclude_rarely argument)}{ + +One of the variables in the bugphyzz data.frames is "Frequency", which +can adopt values of +"always", "usually", "sometimes", "rarely", or "never". By default +"never" and "rarely" are excluded. "rarely" could be included with +\code{exclude_rarely = FALSE}. To learn more about these frequency keywords +please check the bugphyzz vignette with \code{browseVignettes("bugphyzz")}. 
+} } \examples{ diff --git a/man/makeSignatures.Rd b/man/makeSignatures.Rd index 9a6f0c1d..b6ae2c14 100644 --- a/man/makeSignatures.Rd +++ b/man/makeSignatures.Rd @@ -21,33 +21,36 @@ makeSignatures( \item{tax_id_type}{A character string. Valid options: NCBI_ID, Taxon_name.} \item{tax_level}{A character vector. Taxonomic rank. Valid options: -kingdom, phylum, class, order, family, genus, species, strain. +superkingdom, kingdom, phylum, class, order, family, genus, species, strain. They can be combined. "mixed" is equivalent to select all valid ranks.} \item{evidence}{A character vector. Valid options: exp, igc, nas, tas, tax, asr. They can be combined. Default is all.} \item{frequency}{A character vector. Valid options: always, usually, -sometimes, rarely, unknown. They can be combiend. Default value is all but -rarely.} +sometimes, rarely, unknown. They can be combined. By default, "rarely" is +excluded.} -\item{min_size}{Minimun number of bugs in a signature. Default is 10.} +\item{min_size}{Minimum number of bugs in a signature. Default is 10.} -\item{min}{Minimum value inclusive. Only for numeric attributes. Default is NULL.} +\item{min}{Minimum value (inclusive). Only for numeric attributes. +Default is NULL.} -\item{max}{Maximum value inclusive. Only for numeric attributes. Default is NULL.} +\item{max}{Maximum value (inclusive). Only for numeric attributes. +Default is NULL.} } \value{ -A list of character vector with the IDs of the bugs. +A list of character vectors with scientific names or taxids. } \description{ -\code{makeSignatures} Creates signatures for a list of bugphyzz -data.frames imported with \code{importBugphyzz} +\code{makeSignatures} creates a list of bug signatures from +a tidy data.frame imported through the \code{importBugphyzz} function. Please +run \code{browseVignettes("bugphyzz")} for detailed examples. 
} \examples{ bp <- importBugphyzz() -sigs <- lapply(bp, makeSignatures) -sigs <- purrr::list_flatten(sigs) +sigs <- purrr::map(bp, makeSignatures) +sigs <- purrr::list_flatten(sigs, name_spec = "{inner}") } diff --git a/man/physiologies.Rd b/man/physiologies.Rd index 96e7068a..bbe8230b 100644 --- a/man/physiologies.Rd +++ b/man/physiologies.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/physiologies.R \name{physiologies} \alias{physiologies} -\title{Import phsiologies} +\title{Import physiologies (for devs)} \usage{ physiologies(keyword = "all", full_source = FALSE) } @@ -19,12 +19,9 @@ contain shortened versions of the sources. Default is \code{FALSE}.} A list of data.frames in tidy format. } \description{ -\code{physiologies} imports data from the -Google spreadsheets at https://drive.google.com/drive/folders/1i2UAolVWAYa7UnETNnCs0BDWjKPp3ev5. -This function (and its internal functions) do minimal changes to the -imported data. These changes are only meant to match data coming from -different sources, and attaching information needed for further processing, -such as source and attribute type. +\code{physiologies} imports a list of data.frames. This data is in "raw" +state before cleaning and going through the data imputation steps. It +should be used by developers/curators of the package. } \examples{ l <- physiologies('all') diff --git a/man/showPhys.Rd b/man/showPhys.Rd index 64750c81..4e40c1f9 100644 --- a/man/showPhys.Rd +++ b/man/showPhys.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/physiologies.R \name{showPhys} \alias{showPhys} -\title{Show list of available physiologies} +\title{Show list of available physiologies (for devs)} \usage{ showPhys(which_names = "all") } @@ -15,7 +15,8 @@ A character vector with the names of the physiologies. } \description{ \code{showPhys} prints the names of the available physiologies that can be -imported with the \code{\link{physiologies}} function. +imported with the \code{\link{physiologies}} function. 
This function +should be used by developers/curators. } \examples{ showPhys()