Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow integer for cohortId #166

Merged
merged 3 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions R/CohortConstruction.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ generateCohortSet <- function(connectionDetails = NULL,
"sql"
)
)
assertLargeInteger(cohortDefinitionSet$cohortId)
# Verify that cohort IDs are not repeated in the cohort definition
# set before generating
if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) {
Expand Down
133 changes: 72 additions & 61 deletions R/CohortDefinitionSet.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,41 @@
#' @export
createEmptyCohortDefinitionSet <- function(verbose = FALSE) {
checkmate::assert_logical(verbose)
cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()
df <- data.frame(
cohortId = numeric(),
cohortName = character(),
sql = character(),
json = character()
)
if (verbose) {
print(cohortDefinitionSetSpec)
print(df)
}
# Build the data.frame dynamically from the cohort definition set spec
df <- .createEmptyDataFrameFromSpecification(cohortDefinitionSetSpec)
invisible(df)
}

.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) {
checkmate::assert_data_frame(x)
df <- createEmptyCohortDefinitionSet(verbose = FALSE)

# Compare the column names from the input x to an empty cohort
# definition set to ensure the required columns are present
cohortDefinitionSetColumns <- colnames(df)
matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns)
columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns)

if (!columnNamesMatch && emitWarning) {
columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x))
warningMessage <- paste0(
"The following columns were missing in your cohortDefinitionSet: ",
paste(columnsMissing, collapse = ","),
". A cohortDefinitionSet requires the following columns: ",
paste(cohortDefinitionSetColumns, collapse = ",")
)
warning(warningMessage)
}
invisible(columnNamesMatch)
}

#' Is the data.frame a cohort definition set?
#'
#' @description
Expand Down Expand Up @@ -99,15 +125,15 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
checkmate::assert_data_frame(x)
df <- createEmptyCohortDefinitionSet(verbose = FALSE)
cohortDefinitionSetColumns <- colnames(df)
cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()

columnNamesMatch <- .cohortDefinitionSetHasRequiredColumns(x = x, emitWarning = emitWarning)
if (!columnNamesMatch) {
stop("Cannot check and fix cohortDefinitionSet since it is missing required columns.")
}

# Compare the data types from the input x to an empty cohort
# definition set to ensure the same data types are present
# definition set to ensure the same data types (or close enough)
# are present
dataTypesMatch <- FALSE
# Subset x to the required columns
xSubset <- x[, cohortDefinitionSetColumns]
Expand All @@ -116,7 +142,14 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
# Get the reference data types
cohortDefinitionSetDataTypes <- sapply(df, typeof)
# Check if the data types match
dataTypesMatch <- identical(x = xDataTypes, y = cohortDefinitionSetDataTypes)
# NOTE: createEmptyCohortDefinitionSet() is the reference for the data
# types. cohortId is declared as a numeric but an integer is also fine
dataTypesMatch <- (xDataTypes[1] %in% c('integer', 'double') && all(xDataTypes[2:4] == "character"))
# Create the cohortDefinitionSetSpec from the names/data types for reference
cohortDefinitionSetSpec <- data.frame(
columnName = names(xDataTypes),
dataType = xDataTypes
)
if (!dataTypesMatch && emitWarning) {
dataTypesMismatch <- setdiff(x = cohortDefinitionSetDataTypes, y = xDataTypes)
# Create a column for the warning message
Expand Down Expand Up @@ -145,50 +178,6 @@ checkAndFixCohortDefinitionSetDataTypes <- function(x, fixDataTypes = TRUE, emit
))
}

.cohortDefinitionSetHasRequiredColumns <- function(x, emitWarning = FALSE) {
checkmate::assert_data_frame(x)
df <- createEmptyCohortDefinitionSet(verbose = FALSE)
cohortDefinitionSetSpec <- .getCohortDefinitionSetSpecification()

# Compare the column names from the input x to an empty cohort
# definition set to ensure the required columns are present
cohortDefinitionSetColumns <- colnames(df)
matchingColumns <- intersect(x = colnames(x), y = cohortDefinitionSetColumns)
columnNamesMatch <- setequal(matchingColumns, cohortDefinitionSetColumns)

if (!columnNamesMatch && emitWarning) {
columnsMissing <- setdiff(x = cohortDefinitionSetColumns, y = colnames(x))
warningMessage <- paste0(
"The following columns were missing in your cohortDefinitionSet: ",
paste(columnsMissing, collapse = ","),
". A cohortDefinitionSet requires the following columns: ",
paste(cohortDefinitionSetColumns, collapse = ",")
)
warning(warningMessage)
}
invisible(columnNamesMatch)
}

#' Helper function to return the specification description of a
#' cohortDefinitionSet
#'
#' @description
#' This function reads from the cohortDefinitionSetSpecificationDescription.csv
#' to return a data.frame that describes the required columns in a
#' cohortDefinitionSet
#'
#' @return
#' Returns a data.frame that defines a cohortDefinitionSet
#'
#' @noRd
#' @keywords internal
.getCohortDefinitionSetSpecification <- function() {
return(readCsv(system.file("csv", "cohortDefinitionSetSpecificationDescription.csv",
package = "CohortGenerator",
mustWork = TRUE
)))
}

#' Get a cohort definition set
#'
#' @description
Expand Down Expand Up @@ -262,7 +251,7 @@ getCohortDefinitionSet <- function(settingsFileName = "Cohorts.csv",
rlang::inform("Loading cohortDefinitionSet")
settings <- readCsv(file = getPath(fileName = settingsFileName), warnOnCaseMismatch = FALSE)

assert_settings_columns(names(settings), getPath(fileName = settingsFileName))
assertSettingsColumns(names(settings), getPath(fileName = settingsFileName))
checkmate::assert_true(all(cohortFileNameValue %in% names(settings)))
checkmate::assert_true((!all(.getFileDataColumns() %in% names(settings))))

Expand Down Expand Up @@ -382,7 +371,7 @@ saveCohortDefinitionSet <- function(cohortDefinitionSet,
checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
checkmate::assert_vector(cohortFileNameValue)
checkmate::assert_true(length(cohortFileNameValue) > 0)
assert_settings_columns(names(cohortDefinitionSet))
assertSettingsColumns(names(cohortDefinitionSet))
checkmate::assert_true(all(cohortFileNameValue %in% names(cohortDefinitionSet)))
settingsFolder <- dirname(settingsFileName)
if (!dir.exists(settingsFolder)) {
Expand Down Expand Up @@ -507,15 +496,37 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
}
}

.createEmptyDataFrameFromSpecification <- function(specifications) {
# Build the data.frame dynamically from the cohort definition set spec
df <- data.frame()
for (i in 1:nrow(specifications)) {
colName <- specifications$columnName[i]
dataType <- specifications$dataType[i]
df <- df %>% dplyr::mutate(!!colName := do.call(what = dataType, args = list()))
#' Custom checkmate assertion for ensuring a vector contains only integer numbers,
#' including large ones
#'
#' @description
#' This function is used to provide a more informative message to inform
#' a user that their number must be an integer. Since the
#' cohort definition set allows for storing `numeric` data types, we need
#' to make sure that there are no digits in the mantissa of the cohort ID.
#' NOTE: This function is necessary since checkmate::assert_integerish
#' will still throw an error even in the case where you have a large
#' integer which was not desirable.
#'
#' @param x The vector containing integer/numeric values
#'
#' @param columnName The name of the column where this vector came from. This
#' is used when displaying the error message.
#' @return
#' Returns TRUE if all the values in x are integers
#' @noRd
#' @keywords internal
checkLargeInteger <- function(x, columnName = "cohortId") {
# NOTE: suppressWarnings used to mask
# warning from R which may happen for
# large values in X.
res <- all(suppressWarnings(x%%1) == 0)
if (!isTRUE(res)) {
errorMessage <- paste0("The column ", columnName, " included non-integer values. Please update and re-try")
return(errorMessage)
} else {
return(TRUE)
}
invisible(df)
}

.copySubsetDefinitions <- function(copyToCds, copyFromCds) {
Expand Down
3 changes: 2 additions & 1 deletion R/CohortGenerator.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@
NULL

# Add custom assertions
assert_settings_columns <- checkmate::makeAssertionFunction(checkSettingsColumns)
assertSettingsColumns <- checkmate::makeAssertionFunction(checkSettingsColumns)
assertLargeInteger <- checkmate::makeAssertionFunction(checkLargeInteger)
37 changes: 10 additions & 27 deletions R/NegativeControlCohorts.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,36 +29,17 @@
#' @export
createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) {
checkmate::assert_logical(verbose)
negativeControlOutcomeCohortSetSpecification <- .getNegativeControlOutcomeCohortSetSpecification()
df <- data.frame(
cohortId = numeric(),
cohortName = character(),
outcomeConceptId = numeric()
)
if (verbose) {
print(negativeControlOutcomeCohortSetSpecification)
print(df)
}
# Build the data.frame dynamically
df <- .createEmptyDataFrameFromSpecification(negativeControlOutcomeCohortSetSpecification)
invisible(df)
}

#' Helper function to return the specification description of a
#' negativeControlOutcomeCohortSet
#'
#' @description
#' This function reads from the negativeControlOutcomeCohortSetSpecificationDescription.csv
#' to return a data.frame that describes the required columns in a
#' negativeControlOutcomeCohortSet
#'
#' @return
#' Returns a data.frame that defines a negativeControlOutcomeCohortSet
#'
#' @noRd
#' @keywords internal
.getNegativeControlOutcomeCohortSetSpecification <- function() {
return(readCsv(system.file("csv", "negativeControlOutcomeCohortSetSpecificationDescription.csv",
package = "CohortGenerator",
mustWork = TRUE
)))
}


#' Generate a set of negative control outcome cohorts
#'
#' @description
Expand Down Expand Up @@ -111,13 +92,15 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first"))
checkmate::assert_logical(detectOnDescendants)
checkmate::assertNames(colnames(negativeControlOutcomeCohortSet),
must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName
must.include = names(createEmptyNegativeControlOutcomeCohortSet())
)
checkmate::assert_data_frame(
x = negativeControlOutcomeCohortSet,
min.rows = 1
)

assertLargeInteger(negativeControlOutcomeCohortSet$cohortId)
assertLargeInteger(negativeControlOutcomeCohortSet$outcomeConceptId, columnName = "outcomeConceptId")

# Verify that cohort IDs are not repeated in the negative control
# cohort definition set before generating
if (length(unique(negativeControlOutcomeCohortSet$cohortId)) != length(negativeControlOutcomeCohortSet$cohortId)) {
Expand Down
5 changes: 0 additions & 5 deletions inst/csv/cohortDefinitionSetSpecificationDescription.csv

This file was deleted.

This file was deleted.

19 changes: 18 additions & 1 deletion tests/testthat/test-CohortConstructionAndStats.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,23 @@ test_that("Call instatiateCohortSet with malformed cohortDefinitionSet parameter
)
})

test_that("Call instatiateCohortSet with cohortDefinitionSet with non-integer data type", {
cohortDefinitionSet <- createEmptyCohortDefinitionSet()
cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(
cohortId = 1.2,
cohortName = "Test",
sql = "sql",
foo = "foo"
))
expect_error(
generateCohortSet(
connectionDetails = connectionDetails,
cohortDefinitionSet = cohortDefinitionSet
),
message = "(included non-integer)"
)
})

test_that("Call instatiateCohortSet with cohortDefinitionSet with extra columns", {
cohortDefinitionSet <- createEmptyCohortDefinitionSet()
cohortDefinitionSet <- rbind(cohortDefinitionSet, data.frame(
Expand All @@ -66,7 +83,7 @@ test_that("Call instatiateCohortSet with cohortDefinitionSet with extra columns"
expect_error(
generateCohortSet(
connectionDetails = connectionDetails,
cohortDefinitionSet = data.frame()
cohortDefinitionSet = cohortDefinitionSet
),
message = "(must contain the following columns)"
)
Expand Down
6 changes: 6 additions & 0 deletions tests/testthat/test-CohortDefinitionSet.R
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,12 @@ test_that("Call isCohortDefinitionSet with incorrect cohort definition set and e
expect_warning(expect_false(isCohortDefinitionSet(cohortDefinitionSetError)))
})

test_that("Call isCohortDefinitionSet with cohort definition set with integer data type for cohort ID and expect TRUE", {
cohortDefinitionSet <- createEmptyCohortDefinitionSet()
cohortDefinitionSet$cohortId <- as.integer(cohortDefinitionSet$cohortId)
expect_true(isCohortDefinitionSet(cohortDefinitionSet))
})

test_that("Call isCohortDefinitionSet with cohort definition set with incorrect data type and expect FALSE", {
cohortDefinitionSet <- createEmptyCohortDefinitionSet()
cohortDefinitionSet$cohortName <- as.integer(cohortDefinitionSet$cohortName)
Expand Down
32 changes: 32 additions & 0 deletions tests/testthat/test-NegativeControlCohorts.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,38 @@ test_that("Call generateNegativeControlOutcomeCohorts without connection or conn
expect_error(generateNegativeControlOutcomeCohorts())
})

test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer cohort ID", {
negativeControlOutcomeCohortSet <- data.frame(
cohortId = 1.2,
cohortName = "invalid cohort id",
outcomeConceptId = 1
)
expect_error(
generateNegativeControlOutcomeCohorts(
connectionDetails = connectionDetails,
cdmDatabaseSchema = "main",
negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
),
message = "(non-integer values)"
)
})

test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing non-integer outcome concept ID", {
negativeControlOutcomeCohortSet <- data.frame(
cohortId = 1,
cohortName = "invalid outcome concept id",
outcomeConceptId = 1.2
)
expect_error(
generateNegativeControlOutcomeCohorts(
connectionDetails = connectionDetails,
cdmDatabaseSchema = "main",
negativeControlOutcomeCohortSet = negativeControlOutcomeCohortSet
),
message = "(non-integer values)"
)
})

test_that("Call generateNegativeControlOutcomeCohorts with negativeControlOutcomeCohortSet containing duplicate IDs", {
negativeControlOutcomeCohortSet <- data.frame(
cohortId = 1,
Expand Down
Loading