From aeaff36ef85887de404233e5bdea7e09a491524b Mon Sep 17 00:00:00 2001
From: Tamas Stirling <stirling.tamas@gmail.com>
Date: Mon, 27 Nov 2023 20:43:01 +0100
Subject: [PATCH] Update function names, tests, documentation

---
 NAMESPACE                                     |  9 +--
 R/get_meta.R                                  | 29 ----------
 R/{ncbi_meta.R => ncbi_parse.R}               | 14 +++--
 ...biosample.R => ncbi_parse_biosample_xml.R} |  6 +-
 man/get_meta.Rd                               | 35 ------------
 man/ncbi_meta_biosample.Rd                    | 28 ----------
 man/ncbi_parse.Rd                             | 55 +++++++++++++++++++
 man/ncbi_parse_biosample_xml.Rd               | 19 +++++++
 tests/testthat/test-get_uid.R                 |  2 +-
 tests/testthat/test-ncbi_meta.R               | 13 -----
 tests/testthat/test-ncbi_parse.R              | 13 +++++
 11 files changed, 100 insertions(+), 123 deletions(-)
 delete mode 100644 R/get_meta.R
 rename R/{ncbi_meta.R => ncbi_parse.R} (74%)
 rename R/{ncbi_meta_biosample.R => ncbi_parse_biosample_xml.R} (97%)
 delete mode 100644 man/get_meta.Rd
 delete mode 100644 man/ncbi_meta_biosample.Rd
 create mode 100644 man/ncbi_parse.Rd
 create mode 100644 man/ncbi_parse_biosample_xml.Rd
 delete mode 100644 tests/testthat/test-ncbi_meta.R
 create mode 100644 tests/testthat/test-ncbi_parse.R

diff --git a/NAMESPACE b/NAMESPACE
index 9923893..e160fd4 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,27 +3,20 @@
 export(ena_query)
 export(extract_accn)
 export(flag_files)
-export(get_meta)
 export(get_uid)
 export(link_uid)
 export(mgnify_endpoints)
 export(mgnify_instance)
 export(mgnify_list)
 export(ncbi_download_genome)
+export(ncbi_parse)
 export(ncbi_parse_assembly_xml)
 export(ncbi_parse_biosample_txt)
 export(parse_gb_header)
 export(parse_report)
-importFrom(XML,xmlParse)
-importFrom(XML,xmlRoot)
-importFrom(XML,xmlToList)
 importFrom(curl,curl_download)
 importFrom(curl,handle_setopt)
 importFrom(curl,new_handle)
-importFrom(dplyr,bind_cols)
-importFrom(dplyr,bind_rows)
 importFrom(httr,RETRY)
 importFrom(httr,content)
-importFrom(rentrez,entrez_fetch)
 importFrom(stringr,str_locate)
-importFrom(tibble,as_tibble)
diff --git a/R/get_meta.R b/R/get_meta.R
deleted file mode 100644
index 4e57667..0000000
--- a/R/get_meta.R
+++ /dev/null
@@ -1,29 +0,0 @@
-#' Get metadata for biological sequences
-#' 
-#' This function accesses metadata that usually accompanies the biological
-#' sequences. Depending on the database queried this metadata can contain a
-#' number of identifiers of information about the sample e.g. when and where it
-#' was collected. Currently the function works with a few NCBI databases.
-#' @param uid character; the internal UID of an entry inside a database.
-#' @param db character; the database to search in. Can be either 
-#' \code{"assembly"}, \code{"biosample"} or \code{"sra"}.
-#' @return a data frame
-#' @examples
-#' \dontrun{
-#' # search for the species in the NCBI Assembly database
-#' assembly_uid <- get_uid("Dechloromonas phosphoritropha", db = "assembly")
-#' # get assembly metadata for the first two hits
-#' assembly_meta <- get_meta(assembly_uid$uid[1:2], db = "assembly")
-#' # convert UIDs in the Assembly database to UIDs in the Biosample database
-#' biosample_uid <- link_uid(assembly_uid$uid, from = "assembly", to = "biosample")
-#' # get biosample metadata for the first two hits
-#' biosample_meta <- get_meta(biosample_uid$result[1:2], db = "biosample")
-#' }
-#' @export
-get_meta <- function(uid, db = "assembly") {
-  if (db == "assembly") {
-    ncbi_meta_assembly(uid)
-  } else if (db == "biosample") {
-    ncbi_meta_biosample(uid)
-  }
-}
\ No newline at end of file
diff --git a/R/ncbi_meta.R b/R/ncbi_parse.R
similarity index 74%
rename from R/ncbi_meta.R
rename to R/ncbi_parse.R
index 5df6a15..7b1adc8 100644
--- a/R/ncbi_meta.R
+++ b/R/ncbi_parse.R
@@ -1,8 +1,10 @@
 #' Parse NCBI sequence metadata
 #' 
 #' This function can be used to parse various non-sequence data sets from NCBI
-#' into a tibble. The function currently supports parsing NCBI BioSample data
-#' from XML format.
+#' into a tibble. These data sets usually accompany biological sequences and
+#' contain additional information such as identifiers, details about the
+#' sample, or the sequencing platform. The function currently supports parsing
+#' NCBI BioSample data from XML format.
 #' @param meta character; either a character vector containing a data set that
 #' was retrieved through \code{rentrez::entrez_fetch()} or a path to an file
 #' that was downloaded from NCBI.
@@ -27,7 +29,7 @@
 #'   retmode = "xml"
 #' )
 #' # Parse XML
-#' ncbi_meta(meta = meta_xml, db = "biosample", format = "xml")
+#' ncbi_parse(meta = meta_xml, db = "biosample", format = "xml")
 #' 
 #' # NCBI BioSample, download XML file from NCBI and parse
 #' 
@@ -35,16 +37,16 @@
 #' # https://www.ncbi.nlm.nih.gov/biosample/?term=SAMN02714232
 #' # upper right corner -> send to -> file -> format = full (xml) -> create file
 #' # Parse XML
-#' ncbi_meta(meta = "biosample_result.xml", db = "biosample", format = "xml")
+#' ncbi_parse(meta = "biosample_result.xml", db = "biosample", format = "xml")
 #' }
 #' @export
-ncbi_meta <- function(
+ncbi_parse <- function(
   meta,
   db,
   format = "xml",
   verbose = getOption("verbose")
 ) {
-  f <- get(paste("ncbi_meta", db, format, sep = "_"))
+  f <- get(paste("ncbi_parse", db, format, sep = "_"))
   if (db == "biosample" & format == "xml") {
     out <- f(meta, verbose)
   } else {
diff --git a/R/ncbi_meta_biosample.R b/R/ncbi_parse_biosample_xml.R
similarity index 97%
rename from R/ncbi_meta_biosample.R
rename to R/ncbi_parse_biosample_xml.R
index 8d704ac..ca3081f 100644
--- a/R/ncbi_meta_biosample.R
+++ b/R/ncbi_parse_biosample_xml.R
@@ -6,13 +6,13 @@
 #' that was retrieved through \code{rentrez::entrez_fetch()} or a path to an xml
 #' file that was downloaded from NCBI BioSample.
 #' @param verbose logical; Should verbose messages be printed to console?
-ncbi_meta_biosample_xml <- function(
+ncbi_parse_biosample_xml <- function(
     biosample_xml,
     verbose = getOption("verbose")
     ) {
   parsed_xml <- xml2::as_list(xml2::read_xml(biosample_xml))[[1]]
   names(parsed_xml) <- sapply(parsed_xml, function(x) attributes(x)$accession)
-  out <- lapply(parsed_xml, ncbi_meta_biosample_xml_entry)
+  out <- lapply(parsed_xml, ncbi_parse_biosample_xml_entry)
   out <- dplyr::bind_rows(out)
   out <- out[, c(
     "biosample_uid",
@@ -23,7 +23,7 @@ ncbi_meta_biosample_xml <- function(
   return(out)
 }
 
-ncbi_meta_biosample_xml_entry <- function(x, verbose = getOption("verbose")) {
+ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) {
   # attributes(x)$names contains all fields! use for validation
   main_attrs <- attributes(x)
   expected_names <- c(
diff --git a/man/get_meta.Rd b/man/get_meta.Rd
deleted file mode 100644
index fa3330b..0000000
--- a/man/get_meta.Rd
+++ /dev/null
@@ -1,35 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/get_meta.R
-\name{get_meta}
-\alias{get_meta}
-\title{Get metadata for biological sequences}
-\usage{
-get_meta(uid, db = "assembly")
-}
-\arguments{
-\item{uid}{character; the internal UID of an entry inside a database.}
-
-\item{db}{character; the database to search in. Can be either 
-\code{"assembly"}, \code{"biosample"} or \code{"sra"}.}
-}
-\value{
-a data frame
-}
-\description{
-This function accesses metadata that usually accompanies the biological
-sequences. Depending on the database queried this metadata can contain a
-number of identifiers of information about the sample e.g. when and where it
-was collected. Currently the function works with a few NCBI databases.
-}
-\examples{
-\dontrun{
-# search for the species in the NCBI Assembly database
-assembly_uid <- get_uid("Dechloromonas phosphoritropha", db = "assembly")
-# get assembly metadata for the first two hits
-assembly_meta <- get_meta(assembly_uid$uid[1:2], db = "assembly")
-# convert UIDs in the Assembly database to UIDs in the Biosample database
-biosample_uid <- link_uid(assembly_uid$uid, from = "assembly", to = "biosample")
-# get biosample metadata for the first two hits
-biosample_meta <- get_meta(biosample_uid$result[1:2], db = "biosample")
-}
-}
diff --git a/man/ncbi_meta_biosample.Rd b/man/ncbi_meta_biosample.Rd
deleted file mode 100644
index e2291e6..0000000
--- a/man/ncbi_meta_biosample.Rd
+++ /dev/null
@@ -1,28 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/ncbi_meta_biosample.R
-\name{ncbi_meta_biosample}
-\alias{ncbi_meta_biosample}
-\title{Collect NCBI BioSample metadata}
-\usage{
-ncbi_meta_biosample(biosample_uids, verbose = getOption("verbose"))
-}
-\arguments{
-\item{biosample_uids}{character; a character vector of BioSample UID-s.}
-
-\item{verbose}{logical; Should verbose messages be printed to console?}
-}
-\description{
-This function collects metadata for entries in NCBI BioSample database.
-}
-\examples{
-\dontrun{
-# Sample metadata from BioSample ID
-biosample_uid <- get_uid("SAMN02714232", db = "biosample")
-ncbi_meta(biosample_uid)
-
-# Sample metadata from assembly accession 
-assembly_uid <- get_uid("GCF_000695855.3")
-biosample_uid <- ncbi_link_uids(assembly_uid, from = "assembly", to = "biosample")
-ncbi_meta_biosample(biosample_uid)
-}
-}
diff --git a/man/ncbi_parse.Rd b/man/ncbi_parse.Rd
new file mode 100644
index 0000000..8833b26
--- /dev/null
+++ b/man/ncbi_parse.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ncbi_parse.R
+\name{ncbi_parse}
+\alias{ncbi_parse}
+\title{Parse NCBI sequence metadata}
+\usage{
+ncbi_parse(meta, db, format = "xml", verbose = getOption("verbose"))
+}
+\arguments{
+\item{meta}{character; either a character vector containing a data set that
+was retrieved through \code{rentrez::entrez_fetch()} or a path to an file
+that was downloaded from NCBI.}
+
+\item{db}{character; the NCBI database from which the data was retrieved.
+Currently only \code{"biosample"} is supported.}
+
+\item{format}{character; the format of the data set. Currently only
+\code{"xml"} is supported.}
+
+\item{verbose}{logical; Should verbose messages be printed to console?}
+}
+\description{
+This function can be used to parse various non-sequence data sets from NCBI
+into a tibble. These data sets usually accompany biological sequences and
+contain additional information such as identifiers, details about the
+sample, or the sequencing platform. The function currently supports parsing
+NCBI BioSample data from XML format.
+}
+\examples{
+\dontrun{
+data(examples)
+
+# NCBI BioSample, fully programmatic access
+
+# Get internal BioSample UID for BioSample ID
+biosample_uid <- get_uid(examples$biosample, db = "biosample")
+# Get metadata in XML format
+meta_xml <- rentrez::entrez_fetch(
+  db = "biosample",
+  id = biosample_uid$uid,
+  rettype = "full",
+  retmode = "xml"
+)
+# Parse XML
+ncbi_parse(meta = meta_xml, db = "biosample", format = "xml")
+
+# NCBI BioSample, download XML file from NCBI and parse
+
+# Manually download the XML file
+# https://www.ncbi.nlm.nih.gov/biosample/?term=SAMN02714232
+# upper right corner -> send to -> file -> format = full (xml) -> create file
+# Parse XML
+ncbi_parse(meta = "biosample_result.xml", db = "biosample", format = "xml")
+}
+}
diff --git a/man/ncbi_parse_biosample_xml.Rd b/man/ncbi_parse_biosample_xml.Rd
new file mode 100644
index 0000000..ce20e51
--- /dev/null
+++ b/man/ncbi_parse_biosample_xml.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ncbi_parse_biosample_xml.R
+\name{ncbi_parse_biosample_xml}
+\alias{ncbi_parse_biosample_xml}
+\title{Parse NCBI BioSample metadata}
+\usage{
+ncbi_parse_biosample_xml(biosample_xml, verbose = getOption("verbose"))
+}
+\arguments{
+\item{biosample_xml}{character; either a character vector containing an xml
+that was retrieved through \code{rentrez::entrez_fetch()} or a path to an xml
+file that was downloaded from NCBI BioSample.}
+
+\item{verbose}{logical; Should verbose messages be printed to console?}
+}
+\description{
+BioSample metadata from NCBI can be retrieved in multiple file formats. This
+function parses metadata retrieved in XML format.
+}
diff --git a/tests/testthat/test-get_uid.R b/tests/testthat/test-get_uid.R
index d5fdb9b..2977312 100644
--- a/tests/testthat/test-get_uid.R
+++ b/tests/testthat/test-get_uid.R
@@ -10,4 +10,4 @@ test_that("get_uid works with a complex term", {
   
   expect_s3_class(res, c("tbl_df", "tbl", "data.frame"))
   expect_true(nrow(res) > 3000)
-})
\ No newline at end of file
+})
diff --git a/tests/testthat/test-ncbi_meta.R b/tests/testthat/test-ncbi_meta.R
deleted file mode 100644
index ff81d11..0000000
--- a/tests/testthat/test-ncbi_meta.R
+++ /dev/null
@@ -1,13 +0,0 @@
-test_that("get_meta() works with Assembly UID-s", {
-  assembly_uid <- get_uid("GCF_000695855.3", db = "assembly")
-  res <- get_meta(assembly_uid$uid, db = "assembly")
-  
-  expect_s3_class(res, c("tbl_df", "tbl", "data.frame"))
-})
-
-test_that("get_meta() works with BioSample UID-s", {
-  biosample_uid <- get_uid("SAMN02714232", db = "biosample")
-  res <- get_meta(biosample_uid$uid, db = "biosample")
-  
-  expect_s3_class(res, c("tbl_df", "tbl", "data.frame"))
-})
diff --git a/tests/testthat/test-ncbi_parse.R b/tests/testthat/test-ncbi_parse.R
new file mode 100644
index 0000000..f3d5cba
--- /dev/null
+++ b/tests/testthat/test-ncbi_parse.R
@@ -0,0 +1,13 @@
+test_that("ncbi_parse() works with BioSamples", {
+  data(examples)
+  biosample_uid <- get_uid(examples$biosample, db = "biosample")
+  meta_xml <- rentrez::entrez_fetch(
+    db = "biosample",
+    id = biosample_uid$uid,
+    rettype = "full",
+    retmode = "xml"
+  )
+  res <- ncbi_parse(meta = meta_xml, db = "biosample", format = "xml")
+  expect_s3_class(res, c("tbl_df", "tbl", "data.frame"))
+  expect_equal(res$biosample, c("SAMN02714232", "SAMD00057211", "SAMN32745369"))
+})