From d5d8c19799ff38e689a70079c89cad5dcdfe5390 Mon Sep 17 00:00:00 2001
From: Tamas Stirling
Date: Fri, 28 Jun 2024 20:49:52 +0200
Subject: [PATCH] Find BioSample ID regardless of GCA or GCF

---
 DESCRIPTION                     |  2 +-
 NAMESPACE                       |  1 +
 R/ncbi_link.R                   | 43 +++++++++------------
 man/ncbi_link.Rd                |  2 --
 tests/testthat/test-ncbi_link.R | 43 +++++++++++++++++++++++++++++++++
 5 files changed, 64 insertions(+), 27 deletions(-)
 create mode 100644 tests/testthat/test-ncbi_link.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 95f513a..c963f08 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -30,6 +30,6 @@ Suggests:
     knitr,
     rmarkdown,
     testthat
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 VignetteBuilder: knitr
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
index af932ba..2ea93ba 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -13,6 +13,7 @@ export(ncbi_download_genome)
 export(ncbi_get_meta)
 export(ncbi_get_summary)
 export(ncbi_get_uid)
+export(ncbi_link)
 export(ncbi_link_uid)
 export(ncbi_parse)
 export(ncbi_parse_assembly_xml)
diff --git a/R/ncbi_link.R b/R/ncbi_link.R
index 010f1cd..56c482e 100644
--- a/R/ncbi_link.R
+++ b/R/ncbi_link.R
@@ -20,8 +20,7 @@
 #' ncbi_link("GCF_000002435.2", from = "assembly", to = "biosample")
 #' ncbi_link("SAMN02714232", from = "biosample", to = "assembly")
 #' }
-#'
-#'
+#' @export
 ncbi_link <- function(
     query,
     from,
@@ -61,34 +60,30 @@ ncbi_link_assembly_biosample <- function(
     batch_size = batch_size,
     verbose = verbose
   )
-  res <- list()
-  for (i in 1:nrow(from_uid$web_history)) {
-    WH <- list(
-      "WebEnv" = from_uid$web_history$WebEnv[i],
-      "QueryKey" = from_uid$web_history$QueryKey[i]
-    )
-    class(WH) <- c("web_history", "list")
-    hit <- wrap(
-      "entrez_summary",
-      package = "rentrez",
-      verbose = verbose,
-      db = "assembly",
-      web_history = WH
-    )
-    if ("esummary" %in% class(hit)) {
-      hit <- list(hit)
-    }
-    res[[i]] <- hit
-  }
-  res <- unlist(res, recursive = FALSE)
+  res <- ncbi_get_summary(query = from_uid, verbose = verbose)
   ids <- data.frame(
-    assembly = unname(sapply(res, function(x) x$assemblyaccession)),
+    assembly_gbk = unname(sapply(res, function(x) x$synonym$genbank)),
+    assembly_rsq = unname(sapply(res, function(x) x$synonym$refseq)),
     biosample = unname(sapply(res, function(x) x$biosampleaccn))
   )
+  ids$assembly_rsq <- ifelse(ids$assembly_rsq == "", NA, ids$assembly_rsq)
+  index_gbk <- which(ids$assembly_gbk %in% assembly[which(!is.na(assembly))])
+  index_rsq <- which(ids$assembly_rsq %in% assembly[which(!is.na(assembly))])
   out <- data.frame(
     assembly = assembly
   )
-  out <- dplyr::left_join(out, ids, by = "assembly")
+  out_gbk <- dplyr::right_join(
+    out,
+    ids[index_gbk, c("assembly_gbk", "biosample")],
+    by = c("assembly" = "assembly_gbk")
+  )
+  out_rsq <- dplyr::right_join(
+    out,
+    ids[index_rsq, c("assembly_rsq", "biosample")],
+    by = c("assembly" = "assembly_rsq")
+  )
+  out_both <- dplyr::bind_rows(out_gbk, out_rsq)
+  out <- dplyr::left_join(out, out_both, by = "assembly")
   out <- tibble::as_tibble(out)
   return(out)
 }
diff --git a/man/ncbi_link.Rd b/man/ncbi_link.Rd
index 3ce60fa..5cc30e4 100644
--- a/man/ncbi_link.Rd
+++ b/man/ncbi_link.Rd
@@ -36,6 +36,4 @@ attempts to link ID-s from one database to another.
 ncbi_link("GCF_000002435.2", from = "assembly", to = "biosample")
 ncbi_link("SAMN02714232", from = "biosample", to = "assembly")
 }
-
-
 }
diff --git a/tests/testthat/test-ncbi_link.R b/tests/testthat/test-ncbi_link.R
new file mode 100644
index 0000000..1ece05b
--- /dev/null
+++ b/tests/testthat/test-ncbi_link.R
@@ -0,0 +1,43 @@
+data(examples)
+
+test_that("ncbi_link() assembly to biosample from GCA",{
+  res <- ncbi_link("GCA_001698945.1", from = "assembly", to = "biosample")
+
+  expect_equal(class(res), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(dim(res), c(1,2))
+  expect_equal(res$assembly, "GCA_001698945.1")
+  expect_equal(res$biosample, "SAMN03175161")
+})
+
+test_that("ncbi_link() assembly to biosample from GCF",{
+  res <- ncbi_link("GCF_001698945.1", from = "assembly", to = "biosample")
+
+  expect_equal(class(res), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(dim(res), c(1,2))
+  expect_equal(res$assembly, "GCF_001698945.1")
+  expect_equal(res$biosample, "SAMN03175161")
+})
+
+test_that("ncbi_link() assembly to biosample invalid queries", {
+  query <- c(examples$assembly, "noname", NA)
+  res <- ncbi_link(query, from = "assembly", to = "biosample")
+
+  expect_equal(res$assembly, query)
+  expect_equal(res$assembly[4:5], c("noname", NA_character_))
+  expect_equal(res$biosample[4:5], c(NA_character_, NA_character_))
+})
+
+test_that("ncbi_link() biosample to assembly", {
+  query <- c(examples$biosample)
+  res <- ncbi_link(query, from = "biosample", to = "assembly")
+
+  expect_equal(res$biosample, query)
+  expect_equal(
+    res$assembly[which(res$biosample == "SAMN02714232")],
+    "GCF_000695855.3"
+  )
+  expect_equal(
+    res$assembly[which(res$biosample == "SAMN36356470")],
+    NA_character_
+  )
+})