From aeaff36ef85887de404233e5bdea7e09a491524b Mon Sep 17 00:00:00 2001 From: Tamas Stirling Date: Mon, 27 Nov 2023 20:43:01 +0100 Subject: [PATCH] Update function names, tests, documentation --- NAMESPACE | 9 +-- R/get_meta.R | 29 ---------- R/{ncbi_meta.R => ncbi_parse.R} | 14 +++-- ...biosample.R => ncbi_parse_biosample_xml.R} | 6 +- man/get_meta.Rd | 35 ------------ man/ncbi_meta_biosample.Rd | 28 ---------- man/ncbi_parse.Rd | 55 +++++++++++++++++++ man/ncbi_parse_biosample_xml.Rd | 19 +++++++ tests/testthat/test-get_uid.R | 2 +- tests/testthat/test-ncbi_meta.R | 13 ----- tests/testthat/test-ncbi_parse.R | 13 +++++ 11 files changed, 100 insertions(+), 123 deletions(-) delete mode 100644 R/get_meta.R rename R/{ncbi_meta.R => ncbi_parse.R} (74%) rename R/{ncbi_meta_biosample.R => ncbi_parse_biosample_xml.R} (97%) delete mode 100644 man/get_meta.Rd delete mode 100644 man/ncbi_meta_biosample.Rd create mode 100644 man/ncbi_parse.Rd create mode 100644 man/ncbi_parse_biosample_xml.Rd delete mode 100644 tests/testthat/test-ncbi_meta.R create mode 100644 tests/testthat/test-ncbi_parse.R diff --git a/NAMESPACE b/NAMESPACE index 9923893..e160fd4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,27 +3,20 @@ export(ena_query) export(extract_accn) export(flag_files) -export(get_meta) export(get_uid) export(link_uid) export(mgnify_endpoints) export(mgnify_instance) export(mgnify_list) export(ncbi_download_genome) +export(ncbi_parse) export(ncbi_parse_assembly_xml) export(ncbi_parse_biosample_txt) export(parse_gb_header) export(parse_report) -importFrom(XML,xmlParse) -importFrom(XML,xmlRoot) -importFrom(XML,xmlToList) importFrom(curl,curl_download) importFrom(curl,handle_setopt) importFrom(curl,new_handle) -importFrom(dplyr,bind_cols) -importFrom(dplyr,bind_rows) importFrom(httr,RETRY) importFrom(httr,content) -importFrom(rentrez,entrez_fetch) importFrom(stringr,str_locate) -importFrom(tibble,as_tibble) diff --git a/R/get_meta.R b/R/get_meta.R deleted file mode 100644 index 4e57667..0000000 --- a/R/get_meta.R +++ /dev/null @@ -1,29 +0,0 @@ -#' Get metadata for biological sequences -#' -#' This function accesses metadata that usually accompanies the biological -#' sequences. Depending on the database queried this metadata can contain a -#' number of identifiers of information about the sample e.g. when and where it -#' was collected. Currently the function works with a few NCBI databases. -#' @param uid character; the internal UID of an entry inside a database. -#' @param db character; the database to search in. Can be either -#' \code{"assembly"}, \code{"biosample"} or \code{"sra"}. -#' @return a data frame -#' @examples -#' \dontrun{ -#' # search for the species in the NCBI Assembly database -#' assembly_uid <- get_uid("Dechloromonas phosphoritropha", db = "assembly") -#' # get assembly metadata for the first two hits -#' assembly_meta <- get_meta(assembly_uid$uid[1:2], db = "assembly") -#' # convert UIDs in the Assembly database to UIDs in the Biosample database -#' biosample_uid <- link_uid(assembly_uid$uid, from = "assembly", to = "biosample") -#' # get biosample metadata for the first two hits -#' biosample_meta <- get_meta(biosample_uid$result[1:2], db = "biosample") -#' } -#' @export -get_meta <- function(uid, db = "assembly") { - if (db == "assembly") { - ncbi_meta_assembly(uid) - } else if (db == "biosample") { - ncbi_meta_biosample(uid) - } -} \ No newline at end of file diff --git a/R/ncbi_meta.R b/R/ncbi_parse.R similarity index 74% rename from R/ncbi_meta.R rename to R/ncbi_parse.R index 5df6a15..7b1adc8 100644 --- a/R/ncbi_meta.R +++ b/R/ncbi_parse.R @@ -1,8 +1,10 @@ #' Parse NCBI sequence metadata #' #' This function can be used to parse various non-sequence data sets from NCBI -#' into a tibble. The function currently supports parsing NCBI BioSample data -#' from XML format. +#' into a tibble. These data sets usually accompany the biological sequences and +#' contain additional information e.g. identifiers, information about the +#' sample, the sequencing platform, etc. The function currently supports parsing +#' NCBI BioSample data from XML format. #' @param meta character; either a character vector containing a data set that #' was retrieved through \code{rentrez::entrez_fetch()} or a path to an file #' that was downloaded from NCBI. @@ -27,7 +29,7 @@ #' retmode = "xml" #' ) #' # Parse XML -#' ncbi_meta(meta = meta_xml, db = "biosample", format = "xml") +#' ncbi_parse(meta = meta_xml, db = "biosample", format = "xml") #' #' # NCBI BioSample, download XML file from NCBI and parse #' @@ -35,16 +37,16 @@ #' # https://www.ncbi.nlm.nih.gov/biosample/?term=SAMN02714232 #' # upper right corner -> send to -> file -> format = full (xml) -> create file #' # Parse XML -#' ncbi_meta(meta = "biosample_result.xml", db = "biosample", format = "xml") +#' ncbi_parse(meta = "biosample_result.xml", db = "biosample", format = "xml") #' } #' @export -ncbi_meta <- function( +ncbi_parse <- function( meta, db, format = "xml", verbose = getOption("verbose") ) { - f <- get(paste("ncbi_meta", db, format, sep = "_")) + f <- get(paste("ncbi_parse", db, format, sep = "_")) if (db == "biosample" & format == "xml") { out <- f(meta, verbose) } else { diff --git a/R/ncbi_meta_biosample.R b/R/ncbi_parse_biosample_xml.R similarity index 97% rename from R/ncbi_meta_biosample.R rename to R/ncbi_parse_biosample_xml.R index 8d704ac..ca3081f 100644 --- a/R/ncbi_meta_biosample.R +++ b/R/ncbi_parse_biosample_xml.R @@ -6,13 +6,13 @@ #' that was retrieved through \code{rentrez::entrez_fetch()} or a path to an xml #' file that was downloaded from NCBI BioSample. #' @param verbose logical; Should verbose messages be printed to console? -ncbi_meta_biosample_xml <- function( +ncbi_parse_biosample_xml <- function( biosample_xml, verbose = getOption("verbose") ) { parsed_xml <- xml2::as_list(xml2::read_xml(biosample_xml))[[1]] names(parsed_xml) <- sapply(parsed_xml, function(x) attributes(x)$accession) - out <- lapply(parsed_xml, ncbi_meta_biosample_xml_entry) + out <- lapply(parsed_xml, ncbi_parse_biosample_xml_entry) out <- dplyr::bind_rows(out) out <- out[, c( "biosample_uid", @@ -23,7 +23,7 @@ ncbi_meta_biosample_xml <- function( return(out) } -ncbi_meta_biosample_xml_entry <- function(x, verbose = getOption("verbose")) { +ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) { # attributes(x)$names contains all fields! use for validation main_attrs <- attributes(x) expected_names <- c( diff --git a/man/get_meta.Rd b/man/get_meta.Rd deleted file mode 100644 index fa3330b..0000000 --- a/man/get_meta.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_meta.R -\name{get_meta} -\alias{get_meta} -\title{Get metadata for biological sequences} -\usage{ -get_meta(uid, db = "assembly") -} -\arguments{ -\item{uid}{character; the internal UID of an entry inside a database.} - -\item{db}{character; the database to search in. Can be either -\code{"assembly"}, \code{"biosample"} or \code{"sra"}.} -} -\value{ -a data frame -} -\description{ -This function accesses metadata that usually accompanies the biological -sequences. Depending on the database queried this metadata can contain a -number of identifiers of information about the sample e.g. when and where it -was collected. Currently the function works with a few NCBI databases. -} -\examples{ -\dontrun{ -# search for the species in the NCBI Assembly database -assembly_uid <- get_uid("Dechloromonas phosphoritropha", db = "assembly") -# get assembly metadata for the first two hits -assembly_meta <- get_meta(assembly_uid$uid[1:2], db = "assembly") -# convert UIDs in the Assembly database to UIDs in the Biosample database -biosample_uid <- link_uid(assembly_uid$uid, from = "assembly", to = "biosample") -# get biosample metadata for the first two hits -biosample_meta <- get_meta(biosample_uid$result[1:2], db = "biosample") -} -} diff --git a/man/ncbi_meta_biosample.Rd b/man/ncbi_meta_biosample.Rd deleted file mode 100644 index e2291e6..0000000 --- a/man/ncbi_meta_biosample.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ncbi_meta_biosample.R -\name{ncbi_meta_biosample} -\alias{ncbi_meta_biosample} -\title{Collect NCBI BioSample metadata} -\usage{ -ncbi_meta_biosample(biosample_uids, verbose = getOption("verbose")) -} -\arguments{ -\item{biosample_uids}{character; a character vector of BioSample UID-s.} - -\item{verbose}{logical; Should verbose messages be printed to console?} -} -\description{ -This function collects metadata for entries in NCBI BioSample database. -} -\examples{ -\dontrun{ -# Sample metadata from BioSample ID -biosample_uid <- get_uid("SAMN02714232", db = "biosample") -ncbi_meta(biosample_uid) - -# Sample metadata from assembly accession -assembly_uid <- get_uid("GCF_000695855.3") -biosample_uid <- ncbi_link_uids(assembly_uid, from = "assembly", to = "biosample") -ncbi_meta_biosample(biosample_uid) -} -} diff --git a/man/ncbi_parse.Rd b/man/ncbi_parse.Rd new file mode 100644 index 0000000..8833b26 --- /dev/null +++ b/man/ncbi_parse.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ncbi_parse.R +\name{ncbi_parse} +\alias{ncbi_parse} +\title{Parse NCBI sequence metadata} +\usage{ +ncbi_parse(meta, db, format = "xml", verbose = getOption("verbose")) +} +\arguments{ +\item{meta}{character; either a character vector containing a data set that +was retrieved through \code{rentrez::entrez_fetch()} or a path to an file +that was downloaded from NCBI.} + +\item{db}{character; the NCBI database from which the data was retrieved. +Currently only \code{"biosample"} is supported.} + +\item{format}{character; the format of the data set. Currently only +\code{"xml"} is supported.} + +\item{verbose}{logical; Should verbose messages be printed to console?} +} +\description{ +This function can be used to parse various non-sequence data sets from NCBI +into a tibble. These data sets usually accompany the biological sequences and +contain additional information e.g. identifiers, information about the +sample, the sequencing platform, etc. The function currently supports parsing +NCBI BioSample data from XML format. +} +\examples{ +\dontrun{ +data(examples) + +# NCBI BioSample, fully programmatic access + +# Get internal BioSample UID for BioSample ID +biosample_uid <- get_uid(examples$biosample, db = "biosample") +# Get metadata in XML format +meta_xml <- rentrez::entrez_fetch( + db = "biosample", + id = biosample_uid$uid, + rettype = "full", + retmode = "xml" +) +# Parse XML +ncbi_parse(meta = meta_xml, db = "biosample", format = "xml") + +# NCBI BioSample, download XML file from NCBI and parse + +# Manually download the XML file +# https://www.ncbi.nlm.nih.gov/biosample/?term=SAMN02714232 +# upper right corner -> send to -> file -> format = full (xml) -> create file +# Parse XML +ncbi_parse(meta = "biosample_result.xml", db = "biosample", format = "xml") +} +} diff --git a/man/ncbi_parse_biosample_xml.Rd b/man/ncbi_parse_biosample_xml.Rd new file mode 100644 index 0000000..ce20e51 --- /dev/null +++ b/man/ncbi_parse_biosample_xml.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ncbi_parse_biosample_xml.R +\name{ncbi_parse_biosample_xml} +\alias{ncbi_parse_biosample_xml} +\title{Parse NCBI BioSample metadata} +\usage{ +ncbi_parse_biosample_xml(biosample_xml, verbose = getOption("verbose")) +} +\arguments{ +\item{biosample_xml}{character; either a character vector containing an xml +that was retrieved through \code{rentrez::entrez_fetch()} or a path to an xml +file that was downloaded from NCBI BioSample.} + +\item{verbose}{logical; Should verbose messages be printed to console?} +} +\description{ +BioSample metadata from NCBI can be retrieved in multiple file formats. This +function parses metadata retrieved in XML format. +} diff --git a/tests/testthat/test-get_uid.R b/tests/testthat/test-get_uid.R index d5fdb9b..2977312 100644 --- a/tests/testthat/test-get_uid.R +++ b/tests/testthat/test-get_uid.R @@ -10,4 +10,4 @@ test_that("get_uid works with a complex term", { expect_s3_class(res, c("tbl_df", "tbl", "data.frame")) expect_true(nrow(res) > 3000) -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-ncbi_meta.R b/tests/testthat/test-ncbi_meta.R deleted file mode 100644 index ff81d11..0000000 --- a/tests/testthat/test-ncbi_meta.R +++ /dev/null @@ -1,13 +0,0 @@ -test_that("get_meta() works with Assembly UID-s", { - assembly_uid <- get_uid("GCF_000695855.3", db = "assembly") - res <- get_meta(assembly_uid$uid, db = "assembly") - - expect_s3_class(res, c("tbl_df", "tbl", "data.frame")) -}) - -test_that("get_meta() works with BioSample UID-s", { - biosample_uid <- get_uid("SAMN02714232", db = "biosample") - res <- get_meta(biosample_uid$uid, db = "biosample") - - expect_s3_class(res, c("tbl_df", "tbl", "data.frame")) -}) diff --git a/tests/testthat/test-ncbi_parse.R b/tests/testthat/test-ncbi_parse.R new file mode 100644 index 0000000..f3d5cba --- /dev/null +++ b/tests/testthat/test-ncbi_parse.R @@ -0,0 +1,13 @@ +test_that("ncbi_parse() works with BioSamples", { + data(examples) + biosample_uid <- get_uid(examples$biosample, db = "biosample") + meta_xml <- rentrez::entrez_fetch( + db = "biosample", + id = biosample_uid$uid, + rettype = "full", + retmode = "xml" + ) + res <- ncbi_parse(meta = meta_xml, db = "biosample", format = "xml") + expect_s3_class(res, c("tbl_df", "tbl", "data.frame")) + expect_equal(res$biosample, c("SAMN02714232", "SAMD00057211", "SAMN32745369")) +})