Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP Set up import from Zenodo, GitHub hash; remove .downloadZ #240

Merged
merged 24 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
8ec8443
Set up import from Zenodo, GitHub hash; remove .downloadZ
jwokaty Mar 29, 2024
54949ee
Add function for downloading specific hash from GH and devel, and fro…
sdgamboa Apr 1, 2024
ba67696
add a few more tests for imrpotBugphyzz
sdgamboa Apr 1, 2024
47973b5
update test with checkNAs
sdgamboa Apr 1, 2024
cff6102
combine testing for devel and hash in a single if statement
sdgamboa Apr 1, 2024
114eabd
remove default version in unexported functions used by importBugphyzz
sdgamboa Apr 1, 2024
1451df3
update attributes talbe according to the tests
sdgamboa Apr 1, 2024
57b2b2f
update curation tests
sdgamboa Apr 1, 2024
2c69d8f
update test physiologies
sdgamboa Apr 1, 2024
79b8ed4
update README with TODOs, for other branches
sdgamboa Apr 1, 2024
9c4325a
fix quotation in attributes.tsv
sdgamboa Apr 1, 2024
25c5fce
remove unnecessary LICENSE file
sdgamboa Apr 1, 2024
e08266a
Validation data was hosted on github. The data was downloaded to extd…
sdgamboa Apr 1, 2024
30f8a92
Remove line of reference to GitHub when importing validation data
sdgamboa Apr 1, 2024
43872f6
Add check-bioc
jwokaty Apr 2, 2024
606cedf
add description for the files in extdata
sdgamboa Apr 2, 2024
4477e42
update hash
sdgamboa Apr 2, 2024
d7d785b
Remove separate pkgdown workflow
jwokaty Apr 2, 2024
4040b28
Update hash in test
sdgamboa Apr 5, 2024
338836a
fix indents and length of lines
sdgamboa Apr 8, 2024
701073c
update hash of github resource
sdgamboa Apr 9, 2024
2f3439d
update PICRUst2 reference for NSTI definition
sdgamboa Apr 9, 2024
8e189a9
update tests
sdgamboa Apr 10, 2024
6c63337
update importBugphyzz and tests with Zenodo DOI
sdgamboa Apr 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 0 additions & 197 deletions LICENSE

This file was deleted.

74 changes: 43 additions & 31 deletions R/bugphyzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ utils::globalVariables(c(
#' tidy data.frames. To learn more about the structure of the data.frames
#' please check the bugphyzz vignette with `browseVignettes("bugphyzz")`.
#'
#' @param version Character string indicating the version.
#' Options: devel or a zenodo record.
#' @param version Character string indicating the version. Default is the
#' latest release on Zenodo. Options: Zenodo DOI, GitHub commit hash, or devel.
#' @param force_download Logical value. Force a fresh download of the data or
#' use the one stored in the cache (if available). Default is FALSE.
#' @param v Validation value. Default 0.5 (see details).
Expand Down Expand Up @@ -54,12 +54,14 @@ utils::globalVariables(c(
#' names(bp)
#'
importBugphyzz <- function(
version = 'devel', force_download = FALSE, v = 0.5, exclude_rarely = TRUE
version = "d3fd894", force_download = FALSE, v = 0.5, exclude_rarely = TRUE

) {
if (version == 'devel') {
output <- .downloadDevel(force_download)
}

## output is a list of three data.frames
## one of each: binary, multistate, numeric
output <- .downloadResource(version, force_download)

## TODO add release version
output <- lapply(output, function(x) split(x, x$Attribute))
output <- purrr::list_flatten(output)
Expand Down Expand Up @@ -285,9 +287,9 @@ getTaxonSignatures <- function(tax, bp, ...) {
}

.validationData <- function() {
url <- "https://raw.githubusercontent.com/waldronlab/taxPProValidation/main/validation_summary.tsv"
fname <- system.file("extdata", "validation_summary.tsv", package = "bugphyzz")
utils::read.table(
file = url, header = TRUE, sep = "\t", row.names = NULL
file = fname, header = TRUE, sep = "\t", row.names = NULL
) |>
dplyr::mutate(
value = dplyr::case_when(
Expand All @@ -297,30 +299,20 @@ getTaxonSignatures <- function(tax, bp, ...) {
)
}

## Import the devel version of bugphyzz
.downloadDevel <- function(force_download) {
types <- c("multistate", "binary", "numeric")
urls <- paste0(
"https://github.com/waldronlab/bugphyzzExports/raw/main/bugphyzz_",
types,
".csv"
)
names(urls) <- types
output <- vector("list", length(urls))
for (i in seq_along(output)) {
message("Importing ", names(urls)[i], " data...")
names(output)[i] <- names(urls)[i]
rpath <- .getResource(
rname = paste0("bugphyzz_", names(urls)[i], ".tsv"),
url = urls[i], verbose = TRUE, force = force_download
)
output[[i]] <- utils::read.csv(rpath, header = TRUE, skip = 1) |>
dplyr::mutate(Attribute = tolower(Attribute))
## Import a version of bugphyzz
##
## Chooses the download backend from the format of `version`:
##   * a Zenodo DOI of the form "10.5281/zenodo.<digits>" is passed (as the
##     trailing record number) to .downloadZ();
##   * "devel" or a 7-character alphanumeric string (short commit hash) is
##     passed unchanged to .downloadGH().
## Any other value is an error.
##
## version: single character string (Zenodo DOI, commit hash, or "devel").
## force_download: forwarded as-is to the chosen backend.
## Returns whatever the chosen backend returns.
.downloadResource <- function(version, force_download) {
    ## Reject NA / non-scalar / non-character input up front; otherwise the
    ## `if` conditions below fail with an unrelated "missing value" or
    ## "condition has length > 1" error.
    if (!is.character(version) || length(version) != 1L || is.na(version)) {
        stop("Version must be a Zenodo DOI, GitHub commit hash, or 'devel'.")
    }

    ## Dots are escaped so only a literal "10.5281/zenodo." prefix matches.
    ## The same pattern is used for detection and stripping, so the two
    ## steps cannot disagree about what counts as the prefix.
    doi_prefix <- "^10\\.5281/zenodo\\."
    is_doi <- stringr::str_detect(version, paste0(doi_prefix, "[0-9]+$"))
    is_hash <- stringr::str_detect(version, stringr::regex("^[:alnum:]{7}$"))

    if (is_doi) {
        suffix <- sub(doi_prefix, "", version)
        output <- .downloadZ(suffix, force_download)
    } else if (version == "devel" || is_hash) {
        output <- .downloadGH(version, force_download)
    } else {
        stop("Version must be a Zenodo DOI, GitHub commit hash, or 'devel'.")
    }
    return(output)
}

## TODO update this function when release is ready
## Function for downloading data on Zenodo
.downloadZ <- function(record, force_download) {
base_url <- paste0("https://zenodo.org/api/records/", record)
req <- httr2::request(base_url)
Expand All @@ -340,9 +332,29 @@ getTaxonSignatures <- function(tax, bp, ...) {

output <- vector("list", length(files))
for (i in seq_along(output)) {
output[[i]] <- utils::read.csv(files[i], header = TRUE)
# output[[i]] <- utils::read.csv(files, header = TRUE, skip = 1)
# dplyr::mutate(Attribute = tolower(Attribute))
output[[i]] <- utils::read.csv(files, header = TRUE, skip = 1) |>
dplyr::mutate(Attribute = tolower(Attribute))
}
return(output)
}

## Download the bugphyzz export files from GitHub
##
## Builds one URL per data type (binary, multistate, numeric) under
## waldronlab/bugphyzzExports at the ref given by `version`, obtains a local
## path for each through .getResource(), and reads it as CSV.
##
## version: string placed in the URL after "raw/" (e.g. a commit hash).
## force_download: passed to .getResource() as `force`.
## Returns a list of three data.frames named binary, multistate and numeric,
## each with its Attribute column converted to lower case.
.downloadGH <- function(version, force_download) {
    data_types <- c("binary", "multistate", "numeric")
    remote_files <- paste0(
        "https://github.com/waldronlab/bugphyzzExports/raw/",
        version, "/bugphyzz_", data_types, ".csv"
    )
    names(remote_files) <- data_types

    imported <- lapply(seq_along(remote_files), function(idx) {
        type_name <- names(remote_files)[idx]
        message("Importing ", type_name, " data...")
        local_path <- .getResource(
            rname = paste0("bugphyzz_", type_name, ".csv"),
            url = remote_files[idx], verbose = TRUE, force = force_download
        )
        ## skip = 1 drops the first line of the file, so the header is
        ## taken from the second line.
        tbl <- utils::read.csv(local_path, header = TRUE, skip = 1)
        dplyr::mutate(tbl, Attribute = tolower(Attribute))
    })
    names(imported) <- names(remote_files)
    return(imported)
}
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,7 @@ After installation, check vignettes with:
```r
browseVignettes("bugphyzz")
```

TODO:

- Add scores to signatures
32 changes: 32 additions & 0 deletions inst/extdata/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,38 @@ Columns:
| unit | units used for numeric data. |


## [validation_summary.tsv](./validation_summary.tsv)
sdgamboa marked this conversation as resolved.
Show resolved Hide resolved

Results of the validation of the ASR method using the waldronlab/taxPPro package

Hash: e736097

| Column name | Description |
| ----------- | ----------- |
| method | ASR method (phytools-ltp or castor-ltp) |
| rank | Taxonomic rank (all, genus, species, strain). all was used for the final results.|
| physiology | The name of the attribute group. |
| attribute | The name of the attribute or attribute value.|
| mcc_mean | Mean of Matthew's correlation coefficient for discrete only. |
| mcc_sd | standard deviation of Matthew's correlation coefficient for discrete only.|
| r2_mean | R-squared for numeric only. |
| r2_sd | Standard deviation of R-squared for numeric only. |
| ltp_bp | Intersection between taxa in bugphyzz (per attribute/attribute value) and the ltp tree. |
| bp | Taxa in bugphyzz (per attribute/attribute value). |
| ltp_bp_phys | Intersection between taxa in bugphyzz (per attribute group) and the ltp tree. |
| bp_phys | Total taxa in the bugphyzz (per attribute group).|
| ltp | Number of taxa in the LTP tree. |
| nsti_mean | Mean NSTI value. This was calculated for all physiologies, but is only relevant for numeric ones.|
| nsti_sd | Standard deviation of the NSTI value. This was calculated for all physiologies, but is only relevant for numeric ones. |
| ltp_bp_per | Intersection of LTP tree and bugphyzz (per attribute/attribute value) in percentage. |
| ltp_bp_phys_per | Intersection of LTP tree and bugphyzz (per physiology/attribute group) in percentage. |










Expand Down
8 changes: 4 additions & 4 deletions inst/extdata/attributes.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@
"extreme environment" NA "logical" NA NA
"COGEM pathogenicity rating" "COGEM pathogenicity rating" "integer" NA NA
"antimicrobial sensitivity" "antimicrobial sensitivity" "logical" NA NA
"biofilm formation" "biofilm forming" "logical" "APO:0000159" "The ability to form a layer on solid surfaces with or without other microorganisms."
"Butyrate-Producing Bacteria" "butyrate producing" "numeric" "NCIT:C126417" "Any anaerobic bacteria that metabolizes complex carbohydrates to produce butyrate. These microorganisms may form colonies in the intestines of mammals; their presence may aid the host by catabolizing undigested fiber and starch. Additionally, butyrate is metabolized by colonic epithelial cells and may protect the host against ulcerative colitis and cancer."
"biofilm formation" "biofilm formation" "logical" "APO:0000159" "The ability to form a layer on solid surfaces with or without other microorganisms."
"Butyrate-Producing Bacteria" "Butyrate-Producing Bacteria" "numeric" "NCIT:C126417" "Any anaerobic bacteria that metabolizes complex carbohydrates to produce butyrate. These microorganisms may form colonies in the intestines of mammals; their presence may aid the host by catabolizing undigested fiber and starch. Additionally, butyrate is metabolized by colonic epithelial cells and may protect the host against ulcerative colitis and cancer."
"acetate producing" "acetate producing" "numeric" NA NA
"lactate producing" "lactate producing" "numeric" NA NA
"mutation rate per site per year" "mutation rates per site per year" "numeric" NA NA
Expand Down Expand Up @@ -407,7 +407,7 @@
"terrestrial biome" "habitat" "logical" NA NA
"not host-associated" "habitat" "logical" NA NA
"rumen" "habitat;isolation site" "logical" NA NA
"Waste and sludge water" "habitat" "logical" NA NA
"waste and sludge water" "habitat" "logical" NA NA
"blood" "habitat;isolation site" "logical" NA NA
"bovine" "isolation site" "logical" NA NA
"cerebral spinal fluid" "habitat" "logical" NA NA
Expand Down Expand Up @@ -453,7 +453,7 @@
"gamma" "hemolysis" "logical" NA NA
"beta" "hemolysis" "logical" NA NA
"alpha" "hemolysis" "logical" NA NA
"CAMP test" "hemolysis" "logical" NA NA
"camp test" "hemolysis" "logical" NA NA
"biosafety level 1" "biosafety level" "logical" NA NA
"biosafety level 2" "biosafety level" "logical" NA NA
"biosafety level 3" "biosafety level" "logical" NA NA
Expand Down
Loading
Loading