From 884217b1f077a2c4c8ef13d6e40759b6dfb86a77 Mon Sep 17 00:00:00 2001 From: Anthony Sena Date: Mon, 17 Jun 2024 10:45:09 -0400 Subject: [PATCH 1/3] Relax cohortId data type rule; remove csv to define data.frame for cds and nco --- R/CohortDefinitionSet.R | 102 +++++++----------- R/NegativeControlCohorts.R | 31 ++---- ...tDefinitionSetSpecificationDescription.csv | 5 - ...tcomeCohortSetSpecificationDescription.csv | 4 - tests/testthat/test-CohortDefinitionSet.R | 6 ++ 5 files changed, 52 insertions(+), 96 deletions(-) delete mode 100644 inst/csv/cohortDefinitionSetSpecificationDescription.csv delete mode 100644 inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R index f0ff3c7..50fa42f 100644 --- a/R/CohortDefinitionSet.R +++ b/R/CohortDefinitionSet.R @@ -29,15 +29,41 @@ #' @export createEmptyCohortDefinitionSet <- function(verbose = FALSE) { checkmate::assert_logical(verbose) - cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification() + df <- data.frame( + cohortId = numeric(), + cohortName = character(), + sql = character(), + json = character() + ) if (verbose) { - print(cohortDefinitionSetSpec) + print(df) } - # Build the data.frame dynamically from the cohort definition set spec - df <- .createEmptyDataFrameFromSpecification(cohortDefinitionSetSpec) invisible(df) } +.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) { + checkmate::assert_data_frame(x) + df <- createEmptyCohortDefinitionSet(verbose = FALSE) + + # Compare the column names from the input x to an empty cohort + # definition set to ensure the required columns are present + cohortDefinitionSetColumns <- colnames(df) + matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns) + columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns) + + if (!columnNamesMatch && emitWarning) { + columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x)) + warningMessage <- paste0( + "The following columns were missing in your cohortDefinitionSet: ", + paste(columnsMissing, collapse = ","), + ". A cohortDefinitionSet requires the following columns: ", + paste(cohortDefinitionSetColumns, collapse = ",") + ) + warning(warningMessage) + } + invisible(columnNamesMatch) +} + #' Is the data.frame a cohort definition set? #' #' @description @@ -99,7 +125,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit checkmate::assert_data_frame(x) df <- createEmptyCohortDefinitionSet(verbose = FALSE) cohortDefinitionSetColumns <- colnames(df) - cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification() columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = emitWarning) if (!columnNamesMatch) { @@ -107,7 +132,8 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit } # Compare the data types from the input x to an empty cohort - # definition set to ensure the same data types are present + # definition set to ensure the same data types (or close enough) + # are present dataTypesMatch <- FALSE # Subset x to the required columns xSubset <- x[, cohortDefinitionSetColumns] @@ -116,7 +142,14 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit # Get the reference data types cohortDefinitionSetDataTypes <- sapply(df, typeof) # Check if the data types match - dataTypesMatch <- identical(x = xDataTypes, y = cohortDefinitionSetDataTypes) + # NOTE: createEmptyCohortDefinitionSet() is the reference for the data + # types. cohortId is declared as a numeric but an integer is also fine + dataTypesMatch <- (xDataTypes[1] %in% c('integer', 'double') && all(xDataTypes[2:4] == "character")) + # Create the cohortDefinitionSetSpec from the names/data types for reference + cohortDefinitionSetSpec <- data.frame( + columnName = names(xDataTypes), + dataType = xDataTypes + ) if (!dataTypesMatch && emitWarning) { dataTypesMismatch <- setdiff(x = cohortDefinitionSetDataTypes, y = xDataTypes) # Create a column for the warning message @@ -145,50 +178,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit )) } -.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) { - checkmate::assert_data_frame(x) - df <- createEmptyCohortDefinitionSet(verbose = FALSE) - cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification() - - # Compare the column names from the input x to an empty cohort - # definition set to ensure the required columns are present - cohortDefinitionSetColumns <- colnames(df) - matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns) - columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns) - - if (!columnNamesMatch && emitWarning) { - columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x)) - warningMessage <- paste0( - "The following columns were missing in your cohortDefinitionSet: ", - paste(columnsMissing, collapse = ","), - ". A cohortDefinitionSet requires the following columns: ", - paste(cohortDefinitionSetColumns, collapse = ",") - ) - warning(warningMessage) - } - invisible(columnNamesMatch) -} - -#' Helper function to return the specification description of a -#' cohortDefinitionSet -#' -#' @description -#' This function reads from the cohortDefinitionSetSpecificationDescription.csv -#' to return a data.frame that describes the required columns in a -#' cohortDefinitionSet -#' -#' @return -#' Returns a data.frame that defines a cohortDefinitionSet -#' -#' @noRd -#' @keywords internal -.getCohortDefinitionSetSpecification <- function() { - return(readCsv(system.file("csv", "cohortDefinitionSetSpecificationDescription.csv", - package = "CohortGenerator", - mustWork = TRUE - ))) -} - #' Get a cohort definition set #' #' @description @@ -507,17 +496,6 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) { } } -.createEmptyDataFrameFromSpecification <- function(specifications) { - # Build the data.frame dynamically from the cohort definition set spec - df <- data.frame() - for (i in 1:nrow(specifications)) { - colName <- specifications$columnName[i] - dataType <- specifications$dataType[i] - df <- df %>% dplyr::mutate(!!colName := do.call(what = dataType, args = list())) - } - invisible(df) -} - .copySubsetDefinitions <- function(copyToCds, copyFromCds) { # deep clone any subset definitions if (hasSubsetDefinitions(copyFromCds)) { diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R index 3808cad..549976f 100644 --- a/R/NegativeControlCohorts.R +++ b/R/NegativeControlCohorts.R @@ -29,36 +29,17 @@ #' @export createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) { checkmate::assert_logical(verbose) - negativeControlOutcomeCohortSetSpecification <- .getNegativeControlOutcomeCohortSetSpecification() + df <- data.frame( + cohortId = numeric(), + cohortName = character(), + outcomeConceptId = numeric() + ) if (verbose) { - print(negativeControlOutcomeCohortSetSpecification) + print(df) } - # Build the data.frame dynamically - df <- .createEmptyDataFrameFromSpecification(negativeControlOutcomeCohortSetSpecification) invisible(df) } -#' Helper function to return the specification description of a -#' negativeControlOutcomeCohortSet -#' -#' @description -#' This function reads from the negativeControlOutcomeCohortSetSpecificationDescription.csv -#' to return a data.frame that describes the required columns in a -#' negativeControlOutcomeCohortSet -#' -#' @return -#' Returns a data.frame that defines a negativeControlOutcomeCohortSet -#' -#' @noRd -#' @keywords internal -.getNegativeControlOutcomeCohortSetSpecification <- function() { - return(readCsv(system.file("csv", "negativeControlOutcomeCohortSetSpecificationDescription.csv", - package = "CohortGenerator", - mustWork = TRUE - ))) -} - - #' Generate a set of negative control outcome cohorts #' #' @description diff --git a/inst/csv/cohortDefinitionSetSpecificationDescription.csv b/inst/csv/cohortDefinitionSetSpecificationDescription.csv deleted file mode 100644 index 5337d98..0000000 --- a/inst/csv/cohortDefinitionSetSpecificationDescription.csv +++ /dev/null @@ -1,5 +0,0 @@ -column_name,description,data_type -cohortId,The identifier for the cohort in the cohort definition set.,numeric -cohortName,The name of the cohort in the cohort definition set.,character -sql,The SQL code used to construct the cohort,character -json,The optional Circe compliant JSON representation of the cohort definition.,character diff --git a/inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv b/inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv deleted file mode 100644 index e7a8b15..0000000 --- a/inst/csv/negativeControlOutcomeCohortSetSpecificationDescription.csv +++ /dev/null @@ -1,4 +0,0 @@ -column_name,description,data_type -cohortId,The identifier for the cohort in the negative control outcome cohort set.,numeric -cohortName,The name of the cohort in the negative control outcome cohort set.,character -outcomeConceptId,The concept_id used to construct the negative control cohort. This concept_id must be in the condition domain,numeric diff --git a/tests/testthat/test-CohortDefinitionSet.R b/tests/testthat/test-CohortDefinitionSet.R index 3a41ebb..6329d70 100644 --- a/tests/testthat/test-CohortDefinitionSet.R +++ b/tests/testthat/test-CohortDefinitionSet.R @@ -230,6 +230,12 @@ test_that("Call isCohortDefinitionSet with incorrect cohort definition set and e expect_warning(expect_false(isCohortDefinitionSet(cohortDefinitionSetError))) }) +test_that("Call isCohortDefinitionSet with cohort definition set with integer data type for cohort ID and expect TRUE", { + cohortDefinitionSet <- createEmptyCohortDefinitionSet() + cohortDefinitionSet$cohortId <- as.integer(cohortDefinitionSet$cohortId) + expect_true(isCohortDefinitionSet(cohortDefinitionSet)) +}) + test_that("Call isCohortDefinitionSet with cohort definition set with incorrect data type and expect FALSE", { cohortDefinitionSet <- createEmptyCohortDefinitionSet() cohortDefinitionSet$cohortName <- as.integer(cohortDefinitionSet$cohortName) From d93b2edd0239685babf5fa7f0e7b69f8224abb91 Mon Sep 17 00:00:00 2001 From: Anthony Sena Date: Mon, 17 Jun 2024 11:14:40 -0400 Subject: [PATCH 2/3] Fix missing function call --- R/NegativeControlCohorts.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R index 549976f..f0d284a 100644 --- a/R/NegativeControlCohorts.R +++ b/R/NegativeControlCohorts.R @@ -92,7 +92,7 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL, checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first")) checkmate::assert_logical(detectOnDescendants) checkmate::assertNames(colnames(negativeControlOutcomeCohortSet), - must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName + must.include = names(createEmptyNegativeControlOutcomeCohortSet()) ) checkmate::assert_data_frame( x = negativeControlOutcomeCohortSet, From 01163f534b1be7e7a7141c595238b89815b47941 Mon Sep 17 00:00:00 2001 From: Anthony Sena Date: Mon, 17 Jun 2024 12:01:06 -0400 Subject: [PATCH 3/3] Update to add checks to prevent non-integer values for cohort id/concept id --- R/CohortConstruction.R | 1 + R/CohortDefinitionSet.R | 37 ++++++++++++++++++- R/CohortGenerator.R | 3 +- R/NegativeControlCohorts.R | 4 +- .../test-CohortConstructionAndStats.R | 19 +++++++++- tests/testthat/test-NegativeControlCohorts.R | 32 ++++++++++++++++ 6 files changed, 91 insertions(+), 5 deletions(-) diff --git a/R/CohortConstruction.R b/R/CohortConstruction.R index 5baecad..e27d222 100644 --- a/R/CohortConstruction.R +++ b/R/CohortConstruction.R @@ -76,6 +76,7 @@ generateCohortSet <- function(connectionDetails = NULL, "sql" ) ) + assertLargeInteger(cohortDefinitionSet$cohortId) # Verify that cohort IDs are not repeated in the cohort definition # set before generating if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) { diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R index 50fa42f..316f9b8 100644 --- a/R/CohortDefinitionSet.R +++ b/R/CohortDefinitionSet.R @@ -251,7 +251,7 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv", rlang::inform("Loading cohortDefinitionSet") settings <- readCsv(file = getPath(fileName = settingsFileName), warnOnCaseMismatch = FALSE) - assert_settings_columns(names(settings), getPath(fileName = settingsFileName)) + assertSettingsColumns(names(settings), getPath(fileName = settingsFileName)) checkmate::assert_true(all(cohortFileNameValue %in% names(settings))) checkmate::assert_true((!all(.getFileDataColumns() %in% names(settings)))) @@ -371,7 +371,7 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet, checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named") checkmate::assert_vector(cohortFileNameValue) checkmate::assert_true(length(cohortFileNameValue) > 0) - assert_settings_columns(names(cohortDefinitionSet)) + assertSettingsColumns(names(cohortDefinitionSet)) checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet))) settingsFolder <- dirname(settingsFileName) if (!dir.exists(settingsFolder)) { @@ -496,6 +496,39 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) { } } +#' Custom checkmate assertion for ensuring a vector contains only integer numbers, +#' including large ones +#' +#' @description +#' This function is used to provide a more informative message to inform +#' a user that their number must be an integer. Since the +#' cohort definition set allows for storing `numeric` data types, we need +#' to make sure that there are no digits in the mantissa of the cohort ID. +#' NOTE: This function is necessary since checkmate::assert_integerish +#' will still throw an error even in the case where you have a large +#' integer which was not desirable. +#' +#' @param x The vector containing integer/numeric values +#' +#' @param columnName The name of the column where this vector came from. This +#' is used when displaying the error message. +#' @return +#' Returns TRUE if all the values in x are integers +#' @noRd +#' @keywords internal +checkLargeInteger <- function(x, columnName = "cohortId") { + # NOTE: suppressWarnings used to mask + # warning from R which may happen for + # large values in X. + res <- all(suppressWarnings(x%%1) == 0) + if (!isTRUE(res)) { + errorMessage <- paste0("The column ", columnName, " included non-integer values. Please update and re-try") + return(errorMessage) + } else { + return(TRUE) + } +} + .copySubsetDefinitions <- function(copyToCds, copyFromCds) { # deep clone any subset definitions if (hasSubsetDefinitions(copyFromCds)) { diff --git a/R/CohortGenerator.R b/R/CohortGenerator.R index b82cdd1..8c439b9 100644 --- a/R/CohortGenerator.R +++ b/R/CohortGenerator.R @@ -27,4 +27,5 @@ NULL # Add custom assertions -assert_settings_columns <- checkmate::makeAssertionFunction(checkSettingsColumns) +assertSettingsColumns <- checkmate::makeAssertionFunction(checkSettingsColumns) +assertLargeInteger <- checkmate::makeAssertionFunction(checkLargeInteger) diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R index f0d284a..e4c0c14 100644 --- a/R/NegativeControlCohorts.R +++ b/R/NegativeControlCohorts.R @@ -98,7 +98,9 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL, x = negativeControlOutcomeCohortSet, min.rows = 1 ) - + assertLargeInteger(negativeControlOutcomeCohortSet$cohortId) + assertLargeInteger(negativeControlOutcomeCohortSet$outcomeConceptId, columnName = "outcomeConceptId") + # Verify that cohort IDs are not repeated in the negative control # cohort definition set before generating if (length(unique(negativeControlOutcomeCohortSet$cohortId)) != length(negativeControlOutcomeCohortSet$cohortId)) { diff --git a/tests/testthat/test-CohortConstructionAndStats.R b/tests/testthat/test-CohortConstructionAndStats.R index a01a19b..d5e1847 100644 --- a/tests/testthat/test-CohortConstructionAndStats.R +++ b/tests/testthat/test-CohortConstructionAndStats.R @@ -55,6 +55,23 @@ test_that("Call instatiateCohortSet with malformed cohortDefinitionSet parameter ) }) +test_that("Call instatiateCohortSet with cohortDefinitionSet with non-integer data type", { + cohortDefinitionSet <- createEmptyCohortDefinitionSet() + cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame( + cohortId = 1.2, + cohortName = "Test", + sql = "sql", + foo = "foo" + )) + expect_error( + generateCohortSet( + connectionDetails = connectionDetails, + cohortDefinitionSet = cohortDefinitionSet + ), + message = "(included non-integer)" + ) +}) + test_that("Call instatiateCohortSet with cohortDefinitionSet with extra columns", { cohortDefinitionSet <- createEmptyCohortDefinitionSet() cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame( @@ -66,7 +83,7 @@ test_that("Call instatiateCohortSet with cohortDefinitionSet with extra columns" expect_error( generateCohortSet( connectionDetails = connectionDetails, - cohortDefinitionSet = data.frame() + cohortDefinitionSet = cohortDefinitionSet ), message = "(must contain the following columns)" ) diff --git a/tests/testthat/test-NegativeControlCohorts.R b/tests/testthat/test-NegativeControlCohorts.R index f2444e6..46a5c9e 100644 --- a/tests/testthat/test-NegativeControlCohorts.R +++ b/tests/testthat/test-NegativeControlCohorts.R @@ -12,6 +12,38 @@ test_that("Call generateNegativeControlOutcomeCohorts without connection or conn expect_error(generateNegativeControlOutcomeCohorts()) }) +test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer cohort ID", { + negativeControlOutcomeCohortSet <- data.frame( + cohortId = 1.2, + cohortName = "invalid cohort id", + outcomeConceptId = 1 + ) + expect_error( + generateNegativeControlOutcomeCohorts( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet + ), + message = "(non-integer values)" + ) +}) + +test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer outcome concept ID", { + negativeControlOutcomeCohortSet <- data.frame( + cohortId = 1, + cohortName = "invalid outcome concept id", + outcomeConceptId = 1.2 + ) + expect_error( + generateNegativeControlOutcomeCohorts( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet + ), + message = "(non-integer values)" + ) +}) + test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing duplicate IDs", { negativeControlOutcomeCohortSet <- data.frame( cohortId = 1,