Skip to content

Commit

Permalink
Allow integer for cohortId (#166)
Browse files Browse the repository at this point in the history
* Relax cohortId data type rule; remove csv to define data.frame for cds and nco
* Update to add checks to prevent non-integer values for cohort id/concept id
  • Loading branch information
anthonysena authored Jun 17, 2024
1 parent cad2c9d commit b088443
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 99 deletions.
1 change: 1 addition & 0 deletions R/CohortConstruction.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ generateCohortSet <- function(connectionDetails = NULL,
"sql"
)
)
assertLargeInteger(cohortDefinitionSet$cohortId)
# Verify that cohort IDs are not repeated in the cohort definition
# set before generating
if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) {
Expand Down
133 changes: 72 additions & 61 deletions R/CohortDefinitionSet.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,41 @@
#' @export
createEmptyCohortDefinitionSet <- function(verbose = FALSE) {
checkmate::assert_logical(verbose)
cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()
df <- data.frame(
cohortId = numeric(),
cohortName = character(),
sql = character(),
json = character()
)
if (verbose) {
print(cohortDefinitionSetSpec)
print(df)
}
# Build the data.frame dynamically from the cohort definition set spec
df <- .createEmptyDataFrameFromSpecification(cohortDefinitionSetSpec)
invisible(df)
}

.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) {
checkmate::assert_data_frame(x)
df <- createEmptyCohortDefinitionSet(verbose = FALSE)

# Compare the column names from the input x to an empty cohort
# definition set to ensure the required columns are present
cohortDefinitionSetColumns <- colnames(df)
matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns)
columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns)

if (!columnNamesMatch && emitWarning) {
columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x))
warningMessage <- paste0(
"The following columns were missing in your cohortDefinitionSet: ",
paste(columnsMissing, collapse = ","),
". A cohortDefinitionSet requires the following columns: ",
paste(cohortDefinitionSetColumns, collapse = ",")
)
warning(warningMessage)
}
invisible(columnNamesMatch)
}

#' Is the data.frame a cohort definition set?
#'
#' @description
Expand Down Expand Up @@ -99,15 +125,15 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
checkmate::assert_data_frame(x)
df <- createEmptyCohortDefinitionSet(verbose = FALSE)
cohortDefinitionSetColumns <- colnames(df)
cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()

columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = emitWarning)
if (!columnNamesMatch) {
stop("Cannot check and fix cohortDefinitionSet since it is missing required columns.")
}

# Compare the data types from the input x to an empty cohort
# definition set to ensure the same data types are present
# definition set to ensure the same data types (or close enough)
# are present
dataTypesMatch <- FALSE
# Subset x to the required columns
xSubset <- x[, cohortDefinitionSetColumns]
Expand All @@ -116,7 +142,14 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
# Get the reference data types
cohortDefinitionSetDataTypes <- sapply(df, typeof)
# Check if the data types match
dataTypesMatch <- identical(x = xDataTypes, y = cohortDefinitionSetDataTypes)
# NOTE: createEmptyCohortDefinitionSet() is the reference for the data
# types. cohortId is declared as a numeric but an integer is also fine
dataTypesMatch <- (xDataTypes[1] %in% c('integer', 'double') && all(xDataTypes[2:4] == "character"))
# Create the cohortDefinitionSetSpec from the names/data types for reference
cohortDefinitionSetSpec <- data.frame(
columnName = names(xDataTypes),
dataType = xDataTypes
)
if (!dataTypesMatch && emitWarning) {
dataTypesMismatch <- setdiff(x = cohortDefinitionSetDataTypes, y = xDataTypes)
# Create a column for the warning message
Expand Down Expand Up @@ -145,50 +178,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
))
}

.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) {
checkmate::assert_data_frame(x)
df <- createEmptyCohortDefinitionSet(verbose = FALSE)
cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()

# Compare the column names from the input x to an empty cohort
# definition set to ensure the required columns are present
cohortDefinitionSetColumns <- colnames(df)
matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns)
columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns)

if (!columnNamesMatch && emitWarning) {
columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x))
warningMessage <- paste0(
"The following columns were missing in your cohortDefinitionSet: ",
paste(columnsMissing, collapse = ","),
". A cohortDefinitionSet requires the following columns: ",
paste(cohortDefinitionSetColumns, collapse = ",")
)
warning(warningMessage)
}
invisible(columnNamesMatch)
}

#' Helper function to return the specification description of a
#' cohortDefinitionSet
#'
#' @description
#' This function reads from the cohortDefinitionSetSpecificationDescription.csv
#' to return a data.frame that describes the required columns in a
#' cohortDefinitionSet
#'
#' @return
#' Returns a data.frame that defines a cohortDefinitionSet
#'
#' @noRd
#' @keywords internal
.getCohortDefinitionSetSpecification <- function() {
return(readCsv(system.file("csv", "cohortDefinitionSetSpecificationDescription.csv",
package = "CohortGenerator",
mustWork = TRUE
)))
}

#' Get a cohort definition set
#'
#' @description
Expand Down Expand Up @@ -262,7 +251,7 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv",
rlang::inform("Loading cohortDefinitionSet")
settings <- readCsv(file = getPath(fileName = settingsFileName), warnOnCaseMismatch = FALSE)

assert_settings_columns(names(settings), getPath(fileName = settingsFileName))
assertSettingsColumns(names(settings), getPath(fileName = settingsFileName))
checkmate::assert_true(all(cohortFileNameValue %in% names(settings)))
checkmate::assert_true((!all(.getFileDataColumns() %in% names(settings))))

Expand Down Expand Up @@ -382,7 +371,7 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet,
checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
checkmate::assert_vector(cohortFileNameValue)
checkmate::assert_true(length(cohortFileNameValue) > 0)
assert_settings_columns(names(cohortDefinitionSet))
assertSettingsColumns(names(cohortDefinitionSet))
checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet)))
settingsFolder <- dirname(settingsFileName)
if (!dir.exists(settingsFolder)) {
Expand Down Expand Up @@ -507,15 +496,37 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
}
}

.createEmptyDataFrameFromSpecification <- function(specifications) {
# Build the data.frame dynamically from the cohort definition set spec
df <- data.frame()
for (i in 1:nrow(specifications)) {
colName <- specifications$columnName[i]
dataType <- specifications$dataType[i]
df <- df %>% dplyr::mutate(!!colName := do.call(what = dataType, args = list()))
#' Custom checkmate assertion for ensuring a vector contains only integer numbers,
#' including large ones
#'
#' @description
#' This function is used to provide a more informative message to inform
#' a user that their number must be an integer. Since the
#' cohort definition set allows for storing `numeric` data types, we need
#' to make sure that there are no digits in the mantissa of the cohort ID.
#' NOTE: This function is necessary since checkmate::assert_integerish
#' will still throw an error even in the case where you have a large
#' integer which was not desirable.
#'
#' @param x The vector containing integer/numeric values
#'
#' @param columnName The name of the column where this vector came from. This
#' is used when displaying the error message.
#' @return
#' Returns TRUE if all the values in x are integers
#' @noRd
#' @keywords internal
checkLargeInteger <- function(x, columnName = "cohortId") {
# NOTE: suppressWarnings used to mask
# warning from R which may happen for
# large values in X.
res <- all(suppressWarnings(x%%1) == 0)
if (!isTRUE(res)) {
errorMessage <- paste0("The column ", columnName, " included non-integer values. Please update and re-try")
return(errorMessage)
} else {
return(TRUE)
}
invisible(df)
}

.copySubsetDefinitions <- function(copyToCds, copyFromCds) {
Expand Down
3 changes: 2 additions & 1 deletion R/CohortGenerator.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@
NULL

# Add custom assertions
assert_settings_columns <- checkmate::makeAssertionFunction(checkSettingsColumns)
assertSettingsColumns <- checkmate::makeAssertionFunction(checkSettingsColumns)
assertLargeInteger <- checkmate::makeAssertionFunction(checkLargeInteger)
37 changes: 10 additions & 27 deletions R/NegativeControlCohorts.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,36 +29,17 @@
#' @export
createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) {
checkmate::assert_logical(verbose)
negativeControlOutcomeCohortSetSpecification <- .getNegativeControlOutcomeCohortSetSpecification()
df <- data.frame(
cohortId = numeric(),
cohortName = character(),
outcomeConceptId = numeric()
)
if (verbose) {
print(negativeControlOutcomeCohortSetSpecification)
print(df)
}
# Build the data.frame dynamically
df <- .createEmptyDataFrameFromSpecification(negativeControlOutcomeCohortSetSpecification)
invisible(df)
}

#' Helper function to return the specification description of a
#' negativeControlOutcomeCohortSet
#'
#' @description
#' This function reads from the negativeControlOutcomeCohortSetSpecificationDescription.csv
#' to return a data.frame that describes the required columns in a
#' negativeControlOutcomeCohortSet
#'
#' @return
#' Returns a data.frame that defines a negativeControlOutcomeCohortSet
#'
#' @noRd
#' @keywords internal
.getNegativeControlOutcomeCohortSetSpecification <- function() {
return(readCsv(system.file("csv", "negativeControlOutcomeCohortSetSpecificationDescription.csv",
package = "CohortGenerator",
mustWork = TRUE
)))
}


#' Generate a set of negative control outcome cohorts
#'
#' @description
Expand Down Expand Up @@ -111,13 +92,15 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first"))
checkmate::assert_logical(detectOnDescendants)
checkmate::assertNames(colnames(negativeControlOutcomeCohortSet),
must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName
must.include = names(createEmptyNegativeControlOutcomeCohortSet())
)
checkmate::assert_data_frame(
x = negativeControlOutcomeCohortSet,
min.rows = 1
)

assertLargeInteger(negativeControlOutcomeCohortSet$cohortId)
assertLargeInteger(negativeControlOutcomeCohortSet$outcomeConceptId, columnName = "outcomeConceptId")

# Verify that cohort IDs are not repeated in the negative control
# cohort definition set before generating
if (length(unique(negativeControlOutcomeCohortSet$cohortId)) != length(negativeControlOutcomeCohortSet$cohortId)) {
Expand Down
5 changes: 0 additions & 5 deletions inst/csv/cohortDefinitionSetSpecificationDescription.csv

This file was deleted.

This file was deleted.

19 changes: 18 additions & 1 deletion tests/testthat/test-CohortConstructionAndStats.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,23 @@ test_that("Call instatiateCohortSet with malformed cohortDefinitionSet parameter
)
})

test_that("Call instatiateCohortSet with cohortDefinitionSet with non-integer data type", {
cohortDefinitionSet <- createEmptyCohortDefinitionSet()
cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(
cohortId = 1.2,
cohortName = "Test",
sql = "sql",
foo = "foo"
))
expect_error(
generateCohortSet(
connectionDetails = connectionDetails,
cohortDefinitionSet = cohortDefinitionSet
),
message = "(included non-integer)"
)
})

test_that("Call instatiateCohortSet with cohortDefinitionSet with extra columns", {
cohortDefinitionSet <- createEmptyCohortDefinitionSet()
cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(
Expand All @@ -66,7 +83,7 @@ test_that("Call instatiateCohortSet with cohortDefinitionSet with extra columns"
expect_error(
generateCohortSet(
connectionDetails = connectionDetails,
cohortDefinitionSet = data.frame()
cohortDefinitionSet = cohortDefinitionSet
),
message = "(must contain the following columns)"
)
Expand Down
6 changes: 6 additions & 0 deletions tests/testthat/test-CohortDefinitionSet.R
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,12 @@ test_that("Call isCohortDefinitionSet with incorrect cohort definition set and e
expect_warning(expect_false(isCohortDefinitionSet(cohortDefinitionSetError)))
})

test_that("Call isCohortDefinitionSet with cohort definition set with integer data type for cohort ID and expect TRUE", {
cohortDefinitionSet <- createEmptyCohortDefinitionSet()
cohortDefinitionSet$cohortId <- as.integer(cohortDefinitionSet$cohortId)
expect_true(isCohortDefinitionSet(cohortDefinitionSet))
})

test_that("Call isCohortDefinitionSet with cohort definition set with incorrect data type and expect FALSE", {
cohortDefinitionSet <- createEmptyCohortDefinitionSet()
cohortDefinitionSet$cohortName <- as.integer(cohortDefinitionSet$cohortName)
Expand Down
32 changes: 32 additions & 0 deletions tests/testthat/test-NegativeControlCohorts.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,38 @@ test_that("Call generateNegativeControlOutcomeCohorts without connection or conn
expect_error(generateNegativeControlOutcomeCohorts())
})

test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer cohort ID", {
negativeControlOutcomeCohortSet <- data.frame(
cohortId = 1.2,
cohortName = "invalid cohort id",
outcomeConceptId = 1
)
expect_error(
generateNegativeControlOutcomeCohorts(
connectionDetails = connectionDetails,
cdmDatabaseSchema = "main",
negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
),
message = "(non-integer values)"
)
})

test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer outcome concept ID", {
negativeControlOutcomeCohortSet <- data.frame(
cohortId = 1,
cohortName = "invalid outcome concept id",
outcomeConceptId = 1.2
)
expect_error(
generateNegativeControlOutcomeCohorts(
connectionDetails = connectionDetails,
cdmDatabaseSchema = "main",
negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
),
message = "(non-integer values)"
)
})

test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing duplicate IDs", {
negativeControlOutcomeCohortSet <- data.frame(
cohortId = 1,
Expand Down

0 comments on commit b088443

Please sign in to comment.