Skip to content

Commit

Permalink
Merge pull request #250 from waldronlab/sdgamboa-bioc-review
Browse files Browse the repository at this point in the history
Changes requested for inclusion in Bioconductor
  • Loading branch information
sdgamboa authored Jun 27, 2024
2 parents 496e8d6 + 5c29708 commit 8fc1c6d
Show file tree
Hide file tree
Showing 24 changed files with 514 additions and 417 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,5 @@ jupyter_notebooks/updated_spore_shape_v2.xlsx
protratis_data/

misc

*.Rproj
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ Imports:
tidyselect,
BiocFileCache,
httr2,
tools
tools,
S4Vectors
Suggests:
DT,
forcats,
Expand Down
120 changes: 61 additions & 59 deletions R/bacdive.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@

## Main function for importing BacDive
##
## Reads the BacDive table with .importBacDiveExcel(), rewrites its column
## names with .changeBDColNames(), and passes the result to .getTidyBD(),
## whose return value is returned unchanged.
##
## verbose: forwarded as-is to .importBacDiveExcel(); defaults to FALSE.
##
## The import and renaming steps run exactly once; a single camelCase local
## (bacdiveData) is used, matching the naming in the rest of this file.
.getBacDive <- function(verbose = FALSE) {
    bacdiveData <- .importBacDiveExcel(verbose = verbose)
    colnames(bacdiveData) <- .changeBDColNames(colnames(bacdiveData))
    .getTidyBD(bacdiveData)
}

## Helper function for .getBacDive
Expand Down Expand Up @@ -34,8 +34,8 @@
}

## Helper function for .getBacDive
.getTidyBD <- function(bacdive_data) {
bacdive_data |>
.getTidyBD <- function(bacdiveData) {
bacdiveData |>
tidyr::pivot_longer(
# Attributes start in the gram_stain column
cols = gram_stain:tidyr::last_col(),
Expand All @@ -60,10 +60,10 @@
.reshapeBacDive <- function(df) {

df[['Attribute_source']] <- 'BacDive'
split_df <- split(df, factor(df[['Attribute']]))
splitDf <- split(df, factor(df[['Attribute']]))

## Attributes that must be changed from character to logical (simplest fix)
attr_names <- c(
attrNames <- c(
'aerophilicity',
'shape',
'country',
Expand All @@ -72,20 +72,22 @@
'isolation site'
## colony color (delete)
)

for (i in seq_along(attr_names)) {
split_df[[attr_names[i]]] <- .catToLog(split_df[[attr_names[i]]])
if (attr_names[i] %in% c('aerophilicity', 'shape')) {
split_df[[attr_names[i]]]$Attribute_type <-

## This for loop is kept on purpose: it modifies elements of the already
## existing splitDf list in place rather than creating a new list
for (i in seq_along(attrNames)) {
splitDf[[attrNames[i]]] <- .catToLog(splitDf[[attrNames[i]]])
if (attrNames[i] %in% c('aerophilicity', 'shape')) {
splitDf[[attrNames[i]]]$Attribute_type <-
'multistate-intersection'
} else {
split_df[[attr_names[i]]]$Attribute_type <- 'multistate-union'
splitDf[[attrNames[i]]]$Attribute_type <- 'multistate-union'
}
}

## aerophilicity ####
## This is only to match the data in the bugphyzz spreadsheet
aer <- split_df[['aerophilicity']]
aer <- splitDf[['aerophilicity']]
aer$Attribute <- dplyr::case_when(
aer$Attribute == 'aerobe' ~ 'aerobic',
aer$Attribute == 'anaerobe' ~ 'anaerobic',
Expand All @@ -95,51 +97,51 @@
aer$Attribute == 'obligate aerobe' ~ 'obligately aerobic',
TRUE ~ aer$Attribute
)
split_df[['aerophilicity']] <- aer
splitDf[['aerophilicity']] <- aer

## animal pathogen ####
pos <- names(split_df) == 'animal pathongen'
names(split_df)[pos] <- 'animal pathogen'
x_ <- split_df[['animal pathogen']][['Attribute_value']]
pos <- names(splitDf) == 'animal pathongen'
names(splitDf)[pos] <- 'animal pathogen'
x_ <- splitDf[['animal pathogen']][['Attribute_value']]
x_ <- ifelse(x_ == "yes, in single cases", "yes", x_)
x_ <- dplyr::case_when(x_ == 'yes' ~ TRUE, x_ == 'no' ~ FALSE)
split_df[['animal pathogen']][['Attribute_value']] <- x_
split_df[['animal pathogen']][['Attribute_group']] <- 'animal pathogen'
split_df[['animal pathogen']][['Attribute']] <- 'animal pathogen'
split_df[['animal pathogen']][['Attribute_type']] <- 'binary'
splitDf[['animal pathogen']][['Attribute_value']] <- x_
splitDf[['animal pathogen']][['Attribute_group']] <- 'animal pathogen'
splitDf[['animal pathogen']][['Attribute']] <- 'animal pathogen'
splitDf[['animal pathogen']][['Attribute_type']] <- 'binary'

## biosafety level ####
y <- split_df[['biosafety level comment']][
y <- splitDf[['biosafety level comment']][
, c('BacDive_ID', 'Attribute_value')
]
colnames(y)[2] <- 'Note'
x <- dplyr::left_join(split_df[['biosafety level']], y, by = 'BacDive_ID')
x <- dplyr::left_join(splitDf[['biosafety level']], y, by = 'BacDive_ID')
x[['Attribute_value']] <- paste0('biosafety level ', x[['Attribute_value']])
x[['Attribute']] <- x[['Attribute_value']]
x[['Attribute_value']] <- TRUE
x[['Attribute_group']] <- 'biosafety level'
x[['Attribute_type']] <- 'multistate-intersection'
split_df[['biosafety level']] <- x
split_df[['biosafety level comment']] <- NULL
splitDf[['biosafety level']] <- x
splitDf[['biosafety level comment']] <- NULL

## colony color ####
## This one must be removed
split_df[['colony color']] <- NULL
splitDf[['colony color']] <- NULL

## cultivation medium used - growth medium ####
pos <- names(split_df) == 'cultivation medium used'
names(split_df)[pos] <- 'growth medium'
split_df[['growth medium']][['Attribute_group']] <- 'growth medium'
pos <- names(splitDf) == 'cultivation medium used'
names(splitDf)[pos] <- 'growth medium'
splitDf[['growth medium']][['Attribute_group']] <- 'growth medium'

## growth temperature ####
## culture temperature
## culture temperature growth
## culture temperature range (ignore)
## culture temperature type (ignore)
split_df[['culture temperature range']] <- NULL
split_df[['culture temperature type']] <- NULL
a <- split_df[['culture temperature']]
b <- split_df[['culture temperature growth']]
splitDf[['culture temperature range']] <- NULL
splitDf[['culture temperature type']] <- NULL
a <- splitDf[['culture temperature']]
b <- splitDf[['culture temperature growth']]
b_ <- b[,c('BacDive_ID', 'Attribute_value')]
colnames(b_)[2] <- 'growth'
ab <- dplyr::left_join(a, b_, by = 'BacDive_ID')
Expand All @@ -148,25 +150,25 @@
ab[['Attribute_group']] <- 'growth temperature'
ab[['Attribute_type']] <- 'range'
ab[['Attribute']] <- 'growth temperature'
split_df[['growth temperature']] <- ab
split_df[['culture temperature']] <- NULL
split_df[['culture temperature growth']] <- NULL
splitDf[['growth temperature']] <- ab
splitDf[['culture temperature']] <- NULL
splitDf[['culture temperature growth']] <- NULL

## gram stain ####
gs <- split_df[['gram stain']]
gs <- splitDf[['gram stain']]
gs[['Attribute']] <- paste(gs[['Attribute']], gs[['Attribute_value']])
gs[['Attribute_value']] <- TRUE
gs[['Attribute_group']] <- 'gram stain'
gs[['Attribute_type']] <- 'multistate-intersection'
split_df[['gram stain']] <- gs
splitDf[['gram stain']] <- gs

## halophily ####
valid_terms <- c(
validTerms <- c(
'NaCl', 'KCl', 'MgCl2', 'MgCl2x6H2O', 'Na\\+', 'MgSO4x7H2O', 'Na2SO4',
'Sea salts', 'Chromium \\(Cr6\\+\\)'
)
regex <- paste0('(', paste0(valid_terms, collapse = '|'), ')')
split_df[['halophily']] <- split_df[['halophily']] |>
regex <- paste0('(', paste0(validTerms, collapse = '|'), ')')
splitDf[['halophily']] <- splitDf[['halophily']] |>
dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |>
tidyr::unnest(cols = 'Attribute_value') |>
dplyr::filter(!grepl('no growth', Attribute_value)) |>
Expand Down Expand Up @@ -194,7 +196,7 @@
dplyr::distinct()

## hemolysis ####
split_df[['hemolysis']] <- split_df[['hemolysis']] |>
splitDf[['hemolysis']] <- splitDf[['hemolysis']] |>
dplyr::mutate(
Attribute_value = strsplit(Attribute_value, ';|/')
) |>
Expand All @@ -210,21 +212,21 @@

## incubation period
## This one must be removed
split_df[['incubation period']] <- NULL
splitDf[['incubation period']] <- NULL

## motility ####
split_df[['motility']] <- split_df[['motility']] |>
splitDf[['motility']] <- splitDf[['motility']] |>
dplyr::mutate(
Attribute_value = dplyr::case_when(
Attribute_value == 'yes' ~ TRUE,
Attribute_value == 'no' ~ FALSE
)
)
split_df[['motility']][['Attribute_group']] <- 'motility'
split_df[['motility']][['Attribute_type']] <- 'binary'
splitDf[['motility']][['Attribute_group']] <- 'motility'
splitDf[['motility']][['Attribute_type']] <- 'binary'

## pathogenicity human ####
pat <- split_df[['pathogenicity human']]
pat <- splitDf[['pathogenicity human']]
pat[['Note']] <- stringr::str_extract(
pat[['Attribute_value']], 'in single cases'
)
Expand All @@ -235,10 +237,10 @@
pat <- pat[!is.na(pat[['Attribute_value']]),]
pat[['Attribute_group']] <- 'pathogenicity human'
pat[['Attribute_type']] <- 'binary'
split_df[['pathogenicity human']] <- pat
splitDf[['pathogenicity human']] <- pat

## metabolite production ####
mp <- split_df[['metabolite production']]
mp <- splitDf[['metabolite production']]
mp <- mp |>
dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |>
tidyr::unnest(Attribute_value)
Expand All @@ -250,12 +252,12 @@
mp[['Attribute']] <- sub(' (yes|no)$', '', mp[['Attribute']])
mp[['Attribute_group']] <- 'metabolite utilization'
mp[['Attribute_type']] <- 'multistate-intersection'
split_df[['metabolite production']] <- mp
splitDf[['metabolite production']] <- mp

## metabolite utilization ####
pos <- names(split_df) == 'metabolite utiilization'
names(split_df)[pos] <- 'metabolite utilization'
mu <- split_df[['metabolite utilization']]
pos <- names(splitDf) == 'metabolite utiilization'
names(splitDf)[pos] <- 'metabolite utilization'
mu <- splitDf[['metabolite utilization']]
mu <- mu |>
dplyr::mutate(Attribute_value = strsplit(Attribute_value, ';')) |>
tidyr::unnest(Attribute_value) |>
Expand All @@ -280,10 +282,10 @@
dplyr::mutate(Attribute_value = as.logical(Attribute_value))
mu[['Attribute_group']] <- 'metabolite utilization'
mu[['Attribute_type']] <- 'multistate-intersection'
split_df[['metabolite utilization']] <- mu
splitDf[['metabolite utilization']] <- mu

## spore formation ####
sf <- split_df[['spore formation']]
sf <- splitDf[['spore formation']]
sf <- sf |>
dplyr::mutate(
Attribute_value = dplyr::case_when(
Expand All @@ -294,9 +296,9 @@
Attribute_type = 'binary'
) |>
dplyr::filter(!is.na(Attribute_value))
split_df[['spore formation']] <- sf
splitDf[['spore formation']] <- sf

split_df <- lapply(split_df, function(x) {
splitDf <- lapply(splitDf, function(x) {
x <- as.data.frame(x)
x[['NCBI_ID']] <- as.character(x[['NCBI_ID']])
x[['Parent_NCBI_ID']] <- as.character(x[['Parent_NCBI_ID']])
Expand All @@ -312,7 +314,7 @@
as.data.frame(x)
})

return(split_df)
return(splitDf)
}

## Helper function for .reshapeBacDive
Expand Down
Loading

0 comments on commit 8fc1c6d

Please sign in to comment.