Allow integer for cohortId (#166)

* Relax cohortId data type rule; remove csv to define data.frame for cds and nco * Update to add checks to prevent non-integer values for cohort id/concept id
OHDSI · Jun 17, 2024 · b088443 · b088443
1 parent cad2c9d
commit b088443
Show file tree

Hide file tree

Showing 9 changed files with 141 additions and 99 deletions.
diff --git a/R/CohortConstruction.R b/R/CohortConstruction.R
@@ -76,6 +76,7 @@ generateCohortSet <- function(connectionDetails = NULL,
       "sql"
     )
   )
+  assertLargeInteger(cohortDefinitionSet$cohortId)
   # Verify that cohort IDs are not repeated in the cohort definition
   # set before generating
   if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) {

diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R
@@ -29,15 +29,41 @@
 #' @export
 createEmptyCohortDefinitionSet <- function(verbose = FALSE) {
   checkmate::assert_logical(verbose)
-  cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()
+  df <- data.frame(
+    cohortId = numeric(),
+    cohortName = character(),
+    sql = character(),
+    json = character()
+  )
   if (verbose) {
-    print(cohortDefinitionSetSpec)
+    print(df)
   }
-  # Build the data.frame dynamically from the cohort definition set spec
-  df <- .createEmptyDataFrameFromSpecification(cohortDefinitionSetSpec)
   invisible(df)
 }
 
+.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) {
+  checkmate::assert_data_frame(x)
+  df <- createEmptyCohortDefinitionSet(verbose = FALSE)
+
+  # Compare the column names from the input x to an empty cohort
+  # definition set to ensure the required columns are present
+  cohortDefinitionSetColumns <- colnames(df)
+  matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns)
+  columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns)
+
+  if (!columnNamesMatch && emitWarning) {
+    columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x))
+    warningMessage <- paste0(
+      "The following columns were missing in your cohortDefinitionSet: ",
+      paste(columnsMissing, collapse = ","),
+      ". A cohortDefinitionSet requires the following columns: ",
+      paste(cohortDefinitionSetColumns, collapse = ",")
+    )
+    warning(warningMessage)
+  }
+  invisible(columnNamesMatch)
+}
+
 #' Is the data.frame a cohort definition set?
 #'
 #' @description
@@ -99,15 +125,15 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
   checkmate::assert_data_frame(x)
   df <- createEmptyCohortDefinitionSet(verbose = FALSE)
   cohortDefinitionSetColumns <- colnames(df)
-  cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()
 
   columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = emitWarning)
   if (!columnNamesMatch) {
     stop("Cannot check and fix cohortDefinitionSet since it is missing required columns.")
   }
 
   # Compare the data types from the input x to an empty cohort
-  # definition set to ensure the same data types are present
+  # definition set to ensure the same data types (or close enough)
+  # are present
   dataTypesMatch <- FALSE
   # Subset x to the required columns
   xSubset <- x[, cohortDefinitionSetColumns]
@@ -116,7 +142,14 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
   # Get the reference data types
   cohortDefinitionSetDataTypes <- sapply(df, typeof)
   # Check if the data types match
-  dataTypesMatch <- identical(x = xDataTypes, y = cohortDefinitionSetDataTypes)
+  # NOTE: createEmptyCohortDefinitionSet() is the reference for the data
+  # types. cohortId is declared as a numeric but an integer is also fine
+  dataTypesMatch <- (xDataTypes[1] %in% c('integer', 'double') && all(xDataTypes[2:4] == "character"))
+  # Create the cohortDefinitionSetSpec from the names/data types for reference
+  cohortDefinitionSetSpec <- data.frame(
+    columnName = names(xDataTypes),
+    dataType = xDataTypes
+  )
   if (!dataTypesMatch && emitWarning) {
     dataTypesMismatch <- setdiff(x = cohortDefinitionSetDataTypes, y = xDataTypes)
     # Create a column for the warning message
@@ -145,50 +178,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
   ))
 }
 
-.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) {
-  checkmate::assert_data_frame(x)
-  df <- createEmptyCohortDefinitionSet(verbose = FALSE)
-  cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()
-
-  # Compare the column names from the input x to an empty cohort
-  # definition set to ensure the required columns are present
-  cohortDefinitionSetColumns <- colnames(df)
-  matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns)
-  columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns)
-
-  if (!columnNamesMatch && emitWarning) {
-    columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x))
-    warningMessage <- paste0(
-      "The following columns were missing in your cohortDefinitionSet: ",
-      paste(columnsMissing, collapse = ","),
-      ". A cohortDefinitionSet requires the following columns: ",
-      paste(cohortDefinitionSetColumns, collapse = ",")
-    )
-    warning(warningMessage)
-  }
-  invisible(columnNamesMatch)
-}
-
-#' Helper function to return the specification description of a
-#' cohortDefinitionSet
-#'
-#' @description
-#' This function reads from the cohortDefinitionSetSpecificationDescription.csv
-#' to return a data.frame that describes the required columns in a
-#' cohortDefinitionSet
-#'
-#' @return
-#' Returns a data.frame that defines a cohortDefinitionSet
-#'
-#' @noRd
-#' @keywords internal
-.getCohortDefinitionSetSpecification <- function() {
-  return(readCsv(system.file("csv", "cohortDefinitionSetSpecificationDescription.csv",
-    package = "CohortGenerator",
-    mustWork = TRUE
-  )))
-}
-
 #' Get a cohort definition set
 #'
 #' @description
@@ -262,7 +251,7 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv",
   rlang::inform("Loading cohortDefinitionSet")
   settings <- readCsv(file = getPath(fileName = settingsFileName), warnOnCaseMismatch = FALSE)
 
-  assert_settings_columns(names(settings), getPath(fileName = settingsFileName))
+  assertSettingsColumns(names(settings), getPath(fileName = settingsFileName))
   checkmate::assert_true(all(cohortFileNameValue %in% names(settings)))
   checkmate::assert_true((!all(.getFileDataColumns() %in% names(settings))))
 
@@ -382,7 +371,7 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet,
   checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
   checkmate::assert_vector(cohortFileNameValue)
   checkmate::assert_true(length(cohortFileNameValue) > 0)
-  assert_settings_columns(names(cohortDefinitionSet))
+  assertSettingsColumns(names(cohortDefinitionSet))
   checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet)))
   settingsFolder <- dirname(settingsFileName)
   if (!dir.exists(settingsFolder)) {
@@ -507,15 +496,37 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
   }
 }
 
-.createEmptyDataFrameFromSpecification <- function(specifications) {
-  # Build the data.frame dynamically from the cohort definition set spec
-  df <- data.frame()
-  for (i in 1:nrow(specifications)) {
-    colName <- specifications$columnName[i]
-    dataType <- specifications$dataType[i]
-    df <- df %>% dplyr::mutate(!!colName := do.call(what = dataType, args = list()))
+#' Custom checkmate assertion for ensuring a vector contains only integer numbers,
+#' including large ones
+#'
+#' @description
+#' This function is used to provide a more informative message to inform
+#' a user that their number must be an integer. Since the
+#' cohort definition set allows for storing `numeric` data types, we need
+#' to make sure that there are no digits in the mantissa of the cohort ID.
+#' NOTE: This function is necessary since checkmate::assert_integerish
+#' will still throw an error even in the case where you have a large
+#' integer which was not desirable.
+#'
+#' @param x The vector containing integer/numeric values
+#'
+#' @param columnName The name of the column where this vector came from. This
+#'                   is used when displaying the error message.
+#' @return
+#' Returns TRUE if all the values in x are integers
+#' @noRd
+#' @keywords internal
+checkLargeInteger <- function(x, columnName = "cohortId") {
+  # NOTE: suppressWarnings used to mask
+  # warning from R which may happen for
+  # large values in X.
+  res <- all(suppressWarnings(x%%1) == 0)
+  if (!isTRUE(res)) {
+    errorMessage <- paste0("The column ", columnName, " included non-integer values. Please update and re-try")
+    return(errorMessage)
+  } else {
+    return(TRUE)
   }
-  invisible(df)
 }
 
 .copySubsetDefinitions <- function(copyToCds, copyFromCds) {

diff --git a/R/CohortGenerator.R b/R/CohortGenerator.R
@@ -27,4 +27,5 @@
 NULL
 
 # Add custom assertions
-assert_settings_columns <- checkmate::makeAssertionFunction(checkSettingsColumns)
+assertSettingsColumns <- checkmate::makeAssertionFunction(checkSettingsColumns)
+assertLargeInteger <- checkmate::makeAssertionFunction(checkLargeInteger)
diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R
@@ -29,36 +29,17 @@
 #' @export
 createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) {
   checkmate::assert_logical(verbose)
-  negativeControlOutcomeCohortSetSpecification <- .getNegativeControlOutcomeCohortSetSpecification()
+  df <- data.frame(
+    cohortId = numeric(),
+    cohortName = character(),
+    outcomeConceptId = numeric()
+  )
   if (verbose) {
-    print(negativeControlOutcomeCohortSetSpecification)
+    print(df)
   }
-  # Build the data.frame dynamically
-  df <- .createEmptyDataFrameFromSpecification(negativeControlOutcomeCohortSetSpecification)
   invisible(df)
 }
 
-#' Helper function to return the specification description of a
-#' negativeControlOutcomeCohortSet
-#'
-#' @description
-#' This function reads from the negativeControlOutcomeCohortSetSpecificationDescription.csv
-#' to return a data.frame that describes the required columns in a
-#' negativeControlOutcomeCohortSet
-#'
-#' @return
-#' Returns a data.frame that defines a negativeControlOutcomeCohortSet
-#'
-#' @noRd
-#' @keywords internal
-.getNegativeControlOutcomeCohortSetSpecification <- function() {
-  return(readCsv(system.file("csv", "negativeControlOutcomeCohortSetSpecificationDescription.csv",
-    package = "CohortGenerator",
-    mustWork = TRUE
-  )))
-}
-
-
 #' Generate a set of negative control outcome cohorts
 #'
 #' @description
@@ -111,13 +92,15 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
   checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first"))
   checkmate::assert_logical(detectOnDescendants)
   checkmate::assertNames(colnames(negativeControlOutcomeCohortSet),
-    must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName
+    must.include = names(createEmptyNegativeControlOutcomeCohortSet())
   )
   checkmate::assert_data_frame(
     x = negativeControlOutcomeCohortSet,
     min.rows = 1
   )
-
+  assertLargeInteger(negativeControlOutcomeCohortSet$cohortId)
+  assertLargeInteger(negativeControlOutcomeCohortSet$outcomeConceptId, columnName = "outcomeConceptId")
+
   # Verify that cohort IDs are not repeated in the negative control
   # cohort definition set before generating
   if (length(unique(negativeControlOutcomeCohortSet$cohortId)) != length(negativeControlOutcomeCohortSet$cohortId)) {

diff --git a/inst/csv/cohortDefinitionSetSpecificationDescription.csv b/inst/csv/cohortDefinitionSetSpecificationDescription.csv
diff --git a/inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv b/inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv
diff --git a/tests/testthat/test-CohortConstructionAndStats.R b/tests/testthat/test-CohortConstructionAndStats.R
@@ -55,6 +55,23 @@ test_that("Call instatiateCohortSet with malformed cohortDefinitionSet parameter
   )
 })
 
+test_that("Call instatiateCohortSet with cohortDefinitionSet with non-integer data type", {
+  cohortDefinitionSet <- createEmptyCohortDefinitionSet()
+  cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(
+    cohortId = 1.2,
+    cohortName = "Test",
+    sql = "sql",
+    foo = "foo"
+  ))
+  expect_error(
+    generateCohortSet(
+      connectionDetails = connectionDetails,
+      cohortDefinitionSet = cohortDefinitionSet
+    ),
+    message = "(included non-integer)"
+  )
+})
+
 test_that("Call instatiateCohortSet with cohortDefinitionSet with extra columns", {
   cohortDefinitionSet <- createEmptyCohortDefinitionSet()
   cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(
@@ -66,7 +83,7 @@ test_that("Call instatiateCohortSet with cohortDefinitionSet with extra columns"
   expect_error(
     generateCohortSet(
       connectionDetails = connectionDetails,
-      cohortDefinitionSet = data.frame()
+      cohortDefinitionSet = cohortDefinitionSet
     ),
     message = "(must contain the following columns)"
   )

diff --git a/tests/testthat/test-CohortDefinitionSet.R b/tests/testthat/test-CohortDefinitionSet.R
@@ -230,6 +230,12 @@ test_that("Call isCohortDefinitionSet with incorrect cohort definition set and e
   expect_warning(expect_false(isCohortDefinitionSet(cohortDefinitionSetError)))
 })
 
+test_that("Call isCohortDefinitionSet with cohort definition set with integer data type for cohort ID and expect TRUE", {
+  cohortDefinitionSet <- createEmptyCohortDefinitionSet()
+  cohortDefinitionSet$cohortId <- as.integer(cohortDefinitionSet$cohortId)
+  expect_true(isCohortDefinitionSet(cohortDefinitionSet))
+})
+
 test_that("Call isCohortDefinitionSet with cohort definition set with incorrect data type and expect FALSE", {
   cohortDefinitionSet <- createEmptyCohortDefinitionSet()
   cohortDefinitionSet$cohortName <- as.integer(cohortDefinitionSet$cohortName)

diff --git a/tests/testthat/test-NegativeControlCohorts.R b/tests/testthat/test-NegativeControlCohorts.R
@@ -12,6 +12,38 @@ test_that("Call generateNegativeControlOutcomeCohorts without connection or conn
   expect_error(generateNegativeControlOutcomeCohorts())
 })
 
+test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer cohort ID", {
+  negativeControlOutcomeCohortSet <- data.frame(
+    cohortId = 1.2,
+    cohortName = "invalid cohort id",
+    outcomeConceptId = 1
+  )
+  expect_error(
+    generateNegativeControlOutcomeCohorts(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
+    ),
+    message = "(non-integer values)"
+  )
+})
+
+test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer outcome concept ID", {
+  negativeControlOutcomeCohortSet <- data.frame(
+    cohortId = 1,
+    cohortName = "invalid outcome concept id",
+    outcomeConceptId = 1.2
+  )
+  expect_error(
+    generateNegativeControlOutcomeCohorts(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
+    ),
+    message = "(non-integer values)"
+  )
+})
+
 test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing duplicate IDs", {
   negativeControlOutcomeCohortSet <- data.frame(
     cohortId = 1,