From 884217b1f077a2c4c8ef13d6e40759b6dfb86a77 Mon Sep 17 00:00:00 2001
From: Anthony Sena <asena5@its.jnj.com>
Date: Mon, 17 Jun 2024 10:45:09 -0400
Subject: [PATCH 1/3] Relax cohortId data type rule; remove csv to define
 data.frame for cds and nco

---
 R/CohortDefinitionSet.R                       | 102 +++++++-----------
 R/NegativeControlCohorts.R                    |  31 ++----
 ...tDefinitionSetSpecificationDescription.csv |   5 -
 ...tcomeCohortSetSpecificationDescription.csv |   4 -
 tests/testthat/test-CohortDefinitionSet.R     |   6 ++
 5 files changed, 52 insertions(+), 96 deletions(-)
 delete mode 100644 inst/csv/cohortDefinitionSetSpecificationDescription.csv
 delete mode 100644 inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv

diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R
index f0ff3c7..50fa42f 100644
--- a/R/CohortDefinitionSet.R
+++ b/R/CohortDefinitionSet.R
@@ -29,15 +29,41 @@
 #' @export
 createEmptyCohortDefinitionSet <- function(verbose = FALSE) {
   checkmate::assert_logical(verbose)
-  cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()
+  df <- data.frame(
+    cohortId = numeric(),
+    cohortName = character(),
+    sql = character(),
+    json = character()
+  )
   if (verbose) {
-    print(cohortDefinitionSetSpec)
+    print(df)
   }
-  # Build the data.frame dynamically from the cohort definition set spec
-  df <- .createEmptyDataFrameFromSpecification(cohortDefinitionSetSpec)
   invisible(df)
 }
 
+.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) {
+  checkmate::assert_data_frame(x)
+  df <- createEmptyCohortDefinitionSet(verbose = FALSE)
+
+  # Compare the column names from the input x to an empty cohort
+  # definition set to ensure the required columns are present
+  cohortDefinitionSetColumns <- colnames(df)
+  matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns)
+  columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns)
+  
+  if (!columnNamesMatch && emitWarning) {
+    columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x))
+    warningMessage <- paste0(
+      "The following columns were missing in your cohortDefinitionSet: ",
+      paste(columnsMissing, collapse = ","),
+      ". A cohortDefinitionSet requires the following columns: ",
+      paste(cohortDefinitionSetColumns, collapse = ",")
+    )
+    warning(warningMessage)
+  }
+  invisible(columnNamesMatch)
+}
+
 #' Is the data.frame a cohort definition set?
 #'
 #' @description
@@ -99,7 +125,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
   checkmate::assert_data_frame(x)
   df <- createEmptyCohortDefinitionSet(verbose = FALSE)
   cohortDefinitionSetColumns <- colnames(df)
-  cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()
 
   columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = emitWarning)
   if (!columnNamesMatch) {
@@ -107,7 +132,8 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
   }
 
   # Compare the data types from the input x to an empty cohort
-  # definition set to ensure the same data types are present
+  # definition set to ensure the same data types (or close enough)
+  # are present
   dataTypesMatch <- FALSE
   # Subset x to the required columns
   xSubset <- x[, cohortDefinitionSetColumns]
@@ -116,7 +142,14 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
   # Get the reference data types
   cohortDefinitionSetDataTypes <- sapply(df, typeof)
   # Check if the data types match
-  dataTypesMatch <- identical(x = xDataTypes, y = cohortDefinitionSetDataTypes)
+  # NOTE: createEmptyCohortDefinitionSet() is the reference for the data
+  # types. cohortId is declared as a numeric but an integer is also fine
+  dataTypesMatch <- (xDataTypes[1] %in% c('integer', 'double') && all(xDataTypes[2:4] == "character"))
+  # Create the cohortDefinitionSetSpec from the names/data types for reference
+  cohortDefinitionSetSpec <- data.frame(
+    columnName = names(xDataTypes),
+    dataType = xDataTypes
+  )
   if (!dataTypesMatch && emitWarning) {
     dataTypesMismatch <- setdiff(x = cohortDefinitionSetDataTypes, y = xDataTypes)
     # Create a column for the warning message
@@ -145,50 +178,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
   ))
 }
 
-.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) {
-  checkmate::assert_data_frame(x)
-  df <- createEmptyCohortDefinitionSet(verbose = FALSE)
-  cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()
-
-  # Compare the column names from the input x to an empty cohort
-  # definition set to ensure the required columns are present
-  cohortDefinitionSetColumns <- colnames(df)
-  matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns)
-  columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns)
-
-  if (!columnNamesMatch && emitWarning) {
-    columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x))
-    warningMessage <- paste0(
-      "The following columns were missing in your cohortDefinitionSet: ",
-      paste(columnsMissing, collapse = ","),
-      ". A cohortDefinitionSet requires the following columns: ",
-      paste(cohortDefinitionSetColumns, collapse = ",")
-    )
-    warning(warningMessage)
-  }
-  invisible(columnNamesMatch)
-}
-
-#' Helper function to return the specification description of a
-#' cohortDefinitionSet
-#'
-#' @description
-#' This function reads from the cohortDefinitionSetSpecificationDescription.csv
-#' to return a data.frame that describes the required columns in a
-#' cohortDefinitionSet
-#'
-#' @return
-#' Returns a data.frame that defines a cohortDefinitionSet
-#'
-#' @noRd
-#' @keywords internal
-.getCohortDefinitionSetSpecification <- function() {
-  return(readCsv(system.file("csv", "cohortDefinitionSetSpecificationDescription.csv",
-    package = "CohortGenerator",
-    mustWork = TRUE
-  )))
-}
-
 #' Get a cohort definition set
 #'
 #' @description
@@ -507,17 +496,6 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
   }
 }
 
-.createEmptyDataFrameFromSpecification <- function(specifications) {
-  # Build the data.frame dynamically from the cohort definition set spec
-  df <- data.frame()
-  for (i in 1:nrow(specifications)) {
-    colName <- specifications$columnName[i]
-    dataType <- specifications$dataType[i]
-    df <- df %>% dplyr::mutate(!!colName := do.call(what = dataType, args = list()))
-  }
-  invisible(df)
-}
-
 .copySubsetDefinitions <- function(copyToCds, copyFromCds) {
   # deep clone any subset definitions
   if (hasSubsetDefinitions(copyFromCds)) {
diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R
index 3808cad..549976f 100644
--- a/R/NegativeControlCohorts.R
+++ b/R/NegativeControlCohorts.R
@@ -29,36 +29,17 @@
 #' @export
 createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) {
   checkmate::assert_logical(verbose)
-  negativeControlOutcomeCohortSetSpecification <- .getNegativeControlOutcomeCohortSetSpecification()
+  df <- data.frame(
+    cohortId = numeric(),
+    cohortName = character(),
+    outcomeConceptId = numeric()
+  )
   if (verbose) {
-    print(negativeControlOutcomeCohortSetSpecification)
+    print(df)
   }
-  # Build the data.frame dynamically
-  df <- .createEmptyDataFrameFromSpecification(negativeControlOutcomeCohortSetSpecification)
   invisible(df)
 }
 
-#' Helper function to return the specification description of a
-#' negativeControlOutcomeCohortSet
-#'
-#' @description
-#' This function reads from the negativeControlOutcomeCohortSetSpecificationDescription.csv
-#' to return a data.frame that describes the required columns in a
-#' negativeControlOutcomeCohortSet
-#'
-#' @return
-#' Returns a data.frame that defines a negativeControlOutcomeCohortSet
-#'
-#' @noRd
-#' @keywords internal
-.getNegativeControlOutcomeCohortSetSpecification <- function() {
-  return(readCsv(system.file("csv", "negativeControlOutcomeCohortSetSpecificationDescription.csv",
-    package = "CohortGenerator",
-    mustWork = TRUE
-  )))
-}
-
-
 #' Generate a set of negative control outcome cohorts
 #'
 #' @description
diff --git a/inst/csv/cohortDefinitionSetSpecificationDescription.csv b/inst/csv/cohortDefinitionSetSpecificationDescription.csv
deleted file mode 100644
index 5337d98..0000000
--- a/inst/csv/cohortDefinitionSetSpecificationDescription.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-column_name,description,data_type
-cohortId,The identifier for the cohort in the cohort definition set.,numeric
-cohortName,The name of the cohort in the cohort definition set.,character
-sql,The SQL code used to construct the cohort,character
-json,The optional Circe compliant JSON representation of the cohort definition.,character
diff --git a/inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv b/inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv
deleted file mode 100644
index e7a8b15..0000000
--- a/inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-column_name,description,data_type
-cohortId,The identifier for the cohort in the negative control outcome cohort set.,numeric
-cohortName,The name of the cohort in the negative control outcome cohort set.,character
-outcomeConceptId,The concept_id used to construct the negative control cohort. This concept_id must be in the condition domain,numeric
diff --git a/tests/testthat/test-CohortDefinitionSet.R b/tests/testthat/test-CohortDefinitionSet.R
index 3a41ebb..6329d70 100644
--- a/tests/testthat/test-CohortDefinitionSet.R
+++ b/tests/testthat/test-CohortDefinitionSet.R
@@ -230,6 +230,12 @@ test_that("Call isCohortDefinitionSet with incorrect cohort definition set and e
   expect_warning(expect_false(isCohortDefinitionSet(cohortDefinitionSetError)))
 })
 
+test_that("Call isCohortDefinitionSet with cohort definition set with integer data type for cohort ID and expect TRUE", {
+  cohortDefinitionSet <- createEmptyCohortDefinitionSet()
+  cohortDefinitionSet$cohortId <- as.integer(cohortDefinitionSet$cohortId)
+  expect_true(isCohortDefinitionSet(cohortDefinitionSet))
+})
+
 test_that("Call isCohortDefinitionSet with cohort definition set with incorrect data type and expect FALSE", {
   cohortDefinitionSet <- createEmptyCohortDefinitionSet()
   cohortDefinitionSet$cohortName <- as.integer(cohortDefinitionSet$cohortName)

From d93b2edd0239685babf5fa7f0e7b69f8224abb91 Mon Sep 17 00:00:00 2001
From: Anthony Sena <asena5@its.jnj.com>
Date: Mon, 17 Jun 2024 11:14:40 -0400
Subject: [PATCH 2/3] Fix missing function call

---
 R/NegativeControlCohorts.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R
index 549976f..f0d284a 100644
--- a/R/NegativeControlCohorts.R
+++ b/R/NegativeControlCohorts.R
@@ -92,7 +92,7 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
   checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first"))
   checkmate::assert_logical(detectOnDescendants)
   checkmate::assertNames(colnames(negativeControlOutcomeCohortSet),
-    must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName
+    must.include = names(createEmptyNegativeControlOutcomeCohortSet())
   )
   checkmate::assert_data_frame(
     x = negativeControlOutcomeCohortSet,

From 01163f534b1be7e7a7141c595238b89815b47941 Mon Sep 17 00:00:00 2001
From: Anthony Sena <asena5@its.jnj.com>
Date: Mon, 17 Jun 2024 12:01:06 -0400
Subject: [PATCH 3/3] Update to add checks to prevent non-integer values for
 cohort id/concept id

---
 R/CohortConstruction.R                        |  1 +
 R/CohortDefinitionSet.R                       | 37 ++++++++++++++++++-
 R/CohortGenerator.R                           |  3 +-
 R/NegativeControlCohorts.R                    |  4 +-
 .../test-CohortConstructionAndStats.R         | 19 +++++++++-
 tests/testthat/test-NegativeControlCohorts.R  | 32 ++++++++++++++++
 6 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/R/CohortConstruction.R b/R/CohortConstruction.R
index 5baecad..e27d222 100644
--- a/R/CohortConstruction.R
+++ b/R/CohortConstruction.R
@@ -76,6 +76,7 @@ generateCohortSet <- function(connectionDetails = NULL,
       "sql"
     )
   )
+  assertLargeInteger(cohortDefinitionSet$cohortId)
   # Verify that cohort IDs are not repeated in the cohort definition
   # set before generating
   if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) {
diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R
index 50fa42f..316f9b8 100644
--- a/R/CohortDefinitionSet.R
+++ b/R/CohortDefinitionSet.R
@@ -251,7 +251,7 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv",
   rlang::inform("Loading cohortDefinitionSet")
   settings <- readCsv(file = getPath(fileName = settingsFileName), warnOnCaseMismatch = FALSE)
 
-  assert_settings_columns(names(settings), getPath(fileName = settingsFileName))
+  assertSettingsColumns(names(settings), getPath(fileName = settingsFileName))
   checkmate::assert_true(all(cohortFileNameValue %in% names(settings)))
   checkmate::assert_true((!all(.getFileDataColumns() %in% names(settings))))
 
@@ -371,7 +371,7 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet,
   checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
   checkmate::assert_vector(cohortFileNameValue)
   checkmate::assert_true(length(cohortFileNameValue) > 0)
-  assert_settings_columns(names(cohortDefinitionSet))
+  assertSettingsColumns(names(cohortDefinitionSet))
   checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet)))
   settingsFolder <- dirname(settingsFileName)
   if (!dir.exists(settingsFolder)) {
@@ -496,6 +496,39 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
   }
 }
 
+#' Custom checkmate assertion for ensuring a vector contains only integer numbers,
+#' including large ones
+#'
+#' @description
+#' This function is used to provide a more informative message to inform
+#' a user that their number must be an integer. Since the
+#' cohort definition set allows for storing `numeric` data types, we need
+#' to make sure that there are no digits in the mantissa of the cohort ID.
+#' NOTE: This function is necessary since checkmate::assert_integerish
+#' will still throw an error even in the case where you have a large
+#' integer which was not desirable.
+#'
+#' @param x The vector containing integer/numeric values
+#'
+#' @param columnName The name of the column where this vector came from. This
+#'                   is used when displaying the error message.
+#' @return
+#' Returns TRUE if all the values in x are integers
+#' @noRd
+#' @keywords internal
+checkLargeInteger <- function(x, columnName = "cohortId") {
+  # NOTE: suppressWarnings used to mask
+  # warning from R which may happen for
+  # large values in X.
+  res <- all(suppressWarnings(x%%1) == 0)
+  if (!isTRUE(res)) {
+    errorMessage <- paste0("The column ", columnName, " included non-integer values. Please update and re-try")
+    return(errorMessage)
+  } else {
+    return(TRUE)
+  }
+}
+
 .copySubsetDefinitions <- function(copyToCds, copyFromCds) {
   # deep clone any subset definitions
   if (hasSubsetDefinitions(copyFromCds)) {
diff --git a/R/CohortGenerator.R b/R/CohortGenerator.R
index b82cdd1..8c439b9 100644
--- a/R/CohortGenerator.R
+++ b/R/CohortGenerator.R
@@ -27,4 +27,5 @@
 NULL
 
 # Add custom assertions
-assert_settings_columns <- checkmate::makeAssertionFunction(checkSettingsColumns)
+assertSettingsColumns <- checkmate::makeAssertionFunction(checkSettingsColumns)
+assertLargeInteger <- checkmate::makeAssertionFunction(checkLargeInteger)
diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R
index f0d284a..e4c0c14 100644
--- a/R/NegativeControlCohorts.R
+++ b/R/NegativeControlCohorts.R
@@ -98,7 +98,9 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
     x = negativeControlOutcomeCohortSet,
     min.rows = 1
   )
-
+  assertLargeInteger(negativeControlOutcomeCohortSet$cohortId)
+  assertLargeInteger(negativeControlOutcomeCohortSet$outcomeConceptId, columnName = "outcomeConceptId")
+  
   # Verify that cohort IDs are not repeated in the negative control
   # cohort definition set before generating
   if (length(unique(negativeControlOutcomeCohortSet$cohortId)) != length(negativeControlOutcomeCohortSet$cohortId)) {
diff --git a/tests/testthat/test-CohortConstructionAndStats.R b/tests/testthat/test-CohortConstructionAndStats.R
index a01a19b..d5e1847 100644
--- a/tests/testthat/test-CohortConstructionAndStats.R
+++ b/tests/testthat/test-CohortConstructionAndStats.R
@@ -55,6 +55,23 @@ test_that("Call instatiateCohortSet with malformed cohortDefinitionSet parameter
   )
 })
 
+test_that("Call instatiateCohortSet with cohortDefinitionSet with non-integer data type", {
+  cohortDefinitionSet <- createEmptyCohortDefinitionSet()
+  cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(
+    cohortId = 1.2,
+    cohortName = "Test",
+    sql = "sql",
+    foo = "foo"
+  ))
+  expect_error(
+    generateCohortSet(
+      connectionDetails = connectionDetails,
+      cohortDefinitionSet = cohortDefinitionSet
+    ),
+    message = "(included non-integer)"
+  )
+})
+
 test_that("Call instatiateCohortSet with cohortDefinitionSet with extra columns", {
   cohortDefinitionSet <- createEmptyCohortDefinitionSet()
   cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(
@@ -66,7 +83,7 @@ test_that("Call instatiateCohortSet with cohortDefinitionSet with extra columns"
   expect_error(
     generateCohortSet(
       connectionDetails = connectionDetails,
-      cohortDefinitionSet = data.frame()
+      cohortDefinitionSet = cohortDefinitionSet
     ),
     message = "(must contain the following columns)"
   )
diff --git a/tests/testthat/test-NegativeControlCohorts.R b/tests/testthat/test-NegativeControlCohorts.R
index f2444e6..46a5c9e 100644
--- a/tests/testthat/test-NegativeControlCohorts.R
+++ b/tests/testthat/test-NegativeControlCohorts.R
@@ -12,6 +12,38 @@ test_that("Call generateNegativeControlOutcomeCohorts without connection or conn
   expect_error(generateNegativeControlOutcomeCohorts())
 })
 
+test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer cohort ID", {
+  negativeControlOutcomeCohortSet <- data.frame(
+    cohortId = 1.2,
+    cohortName = "invalid cohort id",
+    outcomeConceptId = 1
+  )
+  expect_error(
+    generateNegativeControlOutcomeCohorts(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
+    ),
+    message = "(non-integer values)"
+  )
+})
+
+test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer outcome concept ID", {
+  negativeControlOutcomeCohortSet <- data.frame(
+    cohortId = 1,
+    cohortName = "invalid outcome concept id",
+    outcomeConceptId = 1.2
+  )
+  expect_error(
+    generateNegativeControlOutcomeCohorts(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
+    ),
+    message = "(non-integer values)"
+  )
+})
+
 test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing duplicate IDs", {
   negativeControlOutcomeCohortSet <- data.frame(
     cohortId = 1,