Skip to content

Commit

Permalink
V0.9 release prep (#148)
Browse files Browse the repository at this point in the history
* Bump version and add NEWS.md
* Copyright info
* styler
* Fix spelling errors and regenerate documentation
* Regenerate manual
* Regenerate vignettes
* Regenerate site documentation
  • Loading branch information
anthonysena authored May 28, 2024
1 parent 54a6421 commit e29a2d5
Show file tree
Hide file tree
Showing 81 changed files with 1,136 additions and 419 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: CohortGenerator
Type: Package
Title: An R Package for Cohort Generation Against the OMOP CDM
Version: 0.8.1
Date: 2023-10-10
Version: 0.9.0
Date: 2024-05-28
Authors@R: c(
person("Anthony", "Sena", email = "[email protected]", role = c("aut", "cre")),
person("Jamie", "Gilbert", role = c("aut")),
Expand Down
15 changes: 14 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
CohortGenerator 0.9.0
=======================
- Random sample functionality (for development only) (Issue #129)
- Incremental mode for negative control cohort generation (Issue #137)
- Fixes getCohortCounts() if cohortIds is not specified, but cohortDefinitionSet is. (Issue #136)
- Add cohort ID to generation output messages (Issue #132)
- Add databaseId to output of getStatsTable() (Issue #116)
- Prevent duplicate cohort IDs in cohortDefinitionSet (Issue #130)
- Fix cohort stats query for Oracle (Issue #143)
- Ensure databaseId applied to all returned cohort counts (Issue #144)
- Preserve backwards compatibility if cohort sample table is not in the list of cohort table names (Issue #147)


CohortGenerator 0.8.1
=======================
- Include cohorts with 0 people in cohort counts (Issue #91).
- Use numeric for cohort ID (Issue #98)
- Allow big ints for target pairs (#103)
- Pass `tempEmulationSchema` when creating negative controlc ohorts (#104)
- Pass `tempEmulationSchema` when creating negative control cohorts (#104)
- Target CDM v5.4 for unit tests (#119)
- Fix for subset references (#115)
- Allow for subset cohort name templating (#118)
Expand Down
6 changes: 3 additions & 3 deletions R/CohortConstruction.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down Expand Up @@ -80,8 +80,8 @@ generateCohortSet <- function(connectionDetails = NULL,
# set before generating
if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) {
duplicatedCohortIds <- cohortDefinitionSet$cohortId[duplicated(cohortDefinitionSet$cohortId)]
stop("Cannot generate! Duplicate cohort IDs found in your cohortDefinitionSet: ", paste(duplicatedCohortIds, sep=","), ". Please fix your cohortDefinitionSet and try again.")
}
stop("Cannot generate! Duplicate cohort IDs found in your cohortDefinitionSet: ", paste(duplicatedCohortIds, sep = ","), ". Please fix your cohortDefinitionSet and try again.")
}
if (is.null(connection) && is.null(connectionDetails)) {
stop("You must provide either a database connection or the connection details.")
}
Expand Down
2 changes: 1 addition & 1 deletion R/CohortCount.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down
4 changes: 2 additions & 2 deletions R/CohortDefinitionSet.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down Expand Up @@ -534,4 +534,4 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
}

copyToCds
}
}
2 changes: 1 addition & 1 deletion R/CohortGenerator.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down
137 changes: 76 additions & 61 deletions R/CohortSample.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down Expand Up @@ -31,10 +31,11 @@
countSql <- "SELECT COUNT(DISTINCT SUBJECT_ID) as cnt FROM @cohort_database_schema.@target_table
WHERE cohort_definition_id = @target_cohort_id"
count <- DatabaseConnector::renderTranslateQuerySql(connection,
countSql,
cohort_database_schema = cohortDatabaseSchema,
target_cohort_id = targetCohortId,
target_table = targetTable) %>%
countSql,
cohort_database_schema = cohortDatabaseSchema,
target_cohort_id = targetCohortId,
target_table = targetTable
) %>%
dplyr::pull()

if (!is.null(sampleFraction)) {
Expand Down Expand Up @@ -68,26 +69,28 @@
sampleTable,
seed,
tempEmulationSchema) {

randSampleTableName <- paste0("#SAMPLE_TABLE_", seed)
DatabaseConnector::insertTable(connection = connection,
data = sampleTable,
dropTableIfExists = TRUE,
tempTable = TRUE,
tempEmulationSchema = tempEmulationSchema,
tableName = randSampleTableName)
DatabaseConnector::insertTable(
connection = connection,
data = sampleTable,
dropTableIfExists = TRUE,
tempTable = TRUE,
tempEmulationSchema = tempEmulationSchema,
tableName = randSampleTableName
)

execSql <- SqlRender::readSql(system.file("sql", "sql_server", "sampling", "RandomSample.sql", package = "CohortGenerator"))
DatabaseConnector::renderTranslateExecuteSql(connection,
execSql,
tempEmulationSchema = tempEmulationSchema,
random_sample_table = randSampleTableName,
target_cohort_id = targetCohortId,
output_cohort_id = outputCohortId,
cohort_database_schema = cohortDatabaseSchema,
output_database_schema = outputDatabaseSchema,
output_table = outputTable,
target_table = targetTable)
execSql,
tempEmulationSchema = tempEmulationSchema,
random_sample_table = randSampleTableName,
target_cohort_id = targetCohortId,
output_cohort_id = outputCohortId,
cohort_database_schema = cohortDatabaseSchema,
output_database_schema = outputDatabaseSchema,
output_table = outputTable,
target_table = targetTable
)
}


Expand Down Expand Up @@ -115,7 +118,7 @@
idSet <- c(idSet, cohortIds)
}
errorMessage <- "identifier expression does not produce unique output for cohort ids"
if(length(unique(idSet)) != length(idSet)) stop(errorMessage)
if (length(unique(idSet)) != length(idSet)) stop(errorMessage)
invisible(NULL)
}

Expand All @@ -125,7 +128,7 @@
#' Create 1 or more sample of size n of a cohort definition set
#'
#' Subsetted cohorts can be sampled, as with any other subset form.
#' However, subsetting a sampled cohort is not reccomended and not currently supported at this time.
#' However, subsetting a sampled cohort is not recommended and not currently supported at this time.
#' In the case where n > cohort count the entire cohort is copied unmodified
#'
#' As different databases have different forms of randomness, the random selection is computed in
Expand All @@ -140,7 +143,7 @@
#' @param identifierExpression Optional string R expression used to compute output cohort id. Can only use variables
#' cohortId and seed. Default is "cohortId * 1000 + seed", which is substituted and evaluated
#' @param cohortIds Optional subset of cohortIds to generate. By default this function will sample all cohorts
#' @param seed Vector of seeds to give to the R psuedorandom number generator
#' @param seed Vector of seeds to give to the R pseudorandom number generator
#' @param seedArgs optional arguments to pass to set.seed
#' @param outputDatabaseSchema optional schema to output cohorts to (if different from cohortDatabaseSchema)
#' @export
Expand All @@ -161,21 +164,21 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
identifierExpression = "cohortId * 1000 + seed",
incremental = FALSE,
incrementalFolder = NULL) {

checkmate::assertIntegerish(n, len = 1, null.ok = TRUE)
checkmate::assertNumeric(sampleFraction, len = 1, null.ok = TRUE, lower = 0, upper = 1.0)
checkmate::assertIntegerish(seed, min.len = 1)
checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
checkmate::assertNames(colnames(cohortDefinitionSet),
must.include = c(
"cohortId",
"cohortName",
"sql"
)
must.include = c(
"cohortId",
"cohortName",
"sql"
)
)

if (is.null(n) && is.null(sampleFraction))
if (is.null(n) && is.null(sampleFraction)) {
stop("Must specificy n or fraction size")
}

if (is.null(connection) && is.null(connectionDetails)) {
stop("You must provide either a database connection or the connection details.")
Expand Down Expand Up @@ -208,27 +211,35 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,

sampledCohortDefinition$isSample <- TRUE
sampledCohortDefinition$status <- "ungenerated"
outputCohortId <- .computeIdentifierExpression(identifierExpression,
sampledCohortDefinition$cohortId,
seed)
outputCohortId <- .computeIdentifierExpression(
identifierExpression,
sampledCohortDefinition$cohortId,
seed
)
sampledCohortDefinition$sampleTargetCohortId <- sampledCohortDefinition$cohortId
sampledCohortDefinition$cohortId <- outputCohortId

if (!is.null(sampleFraction)) {
sampledCohortDefinition$cohortName <- sprintf("%s [%s%% SAMPLE seed=%s]",
sampledCohortDefinition$cohortName, seed, sampleFraction * 100)
sampledCohortDefinition$cohortName <- sprintf(
"%s [%s%% SAMPLE seed=%s]",
sampledCohortDefinition$cohortName, seed, sampleFraction * 100
)
} else {
sampledCohortDefinition$cohortName <- sprintf("%s [SAMPLE seed=%s n=%s]",
sampledCohortDefinition$cohortName, seed, n)
sampledCohortDefinition$cohortName <- sprintf(
"%s [SAMPLE seed=%s n=%s]",
sampledCohortDefinition$cohortName, seed, n
)
}

if (hasSubsetDefinitions(cohortDefinitionSet)) {
# must maintain mapping for subset parent ids
sampledCohortDefinition$subsetParent <- .computeIdentifierExpression(identifierExpression,
sampledCohortDefinition$subsetParent,
seed)
sampledCohortDefinition$subsetParent <- .computeIdentifierExpression(
identifierExpression,
sampledCohortDefinition$subsetParent,
seed
)
}

if (incremental && !isTaskRequired(
cohortId = outputCohortId,
seed = seed,
Expand All @@ -239,30 +250,34 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
return(sampledCohortDefinition)
}
# check incremental task for cohort sampling
sampleTable <- .getSampleSet(connection = connection,
n = n,
sampleFraction = sampleFraction,
seed = seed + targetCohortId, # Seed is unique to each target cohort
seedArgs = seedArgs,
cohortDatabaseSchema = cohortDatabaseSchema,
targetCohortId = targetCohortId,
targetTable = cohortTableNames$cohortTable)
sampleTable <- .getSampleSet(
connection = connection,
n = n,
sampleFraction = sampleFraction,
seed = seed + targetCohortId, # Seed is unique to each target cohort
seedArgs = seedArgs,
cohortDatabaseSchema = cohortDatabaseSchema,
targetCohortId = targetCohortId,
targetTable = cohortTableNames$cohortTable
)

if (nrow(sampleTable) == 0) {
ParallelLogger::logInfo("No entires found for ", targetCohortId, " was it generated?")
return(sampledCohortDefinition)
}
# Called only for side effects
.sampleCohort(connection = connection,
targetCohortId = targetCohortId,
targetTable = cohortTableNames$cohortTable,
outputCohortId = outputCohortId,
outputTable = cohortTableNames$cohortSampleTable,
cohortDatabaseSchema = cohortDatabaseSchema,
outputDatabaseSchema = outputDatabaseSchema,
sampleTable = sampleTable,
seed = seed + targetCohortId, # Seed is unique to each target cohort
tempEmulationSchema = tempEmulationSchema)
.sampleCohort(
connection = connection,
targetCohortId = targetCohortId,
targetTable = cohortTableNames$cohortTable,
outputCohortId = outputCohortId,
outputTable = cohortTableNames$cohortSampleTable,
cohortDatabaseSchema = cohortDatabaseSchema,
outputDatabaseSchema = outputDatabaseSchema,
sampleTable = sampleTable,
seed = seed + targetCohortId, # Seed is unique to each target cohort
tempEmulationSchema = tempEmulationSchema
)

sampledCohortDefinition$status <- "generated"
if (incremental) {
Expand All @@ -275,7 +290,7 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
}
return(sampledCohortDefinition)
}, seed, cohortIds) %>%
dplyr::bind_rows()
dplyr::bind_rows()



Expand Down
2 changes: 1 addition & 1 deletion R/CohortStats.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down
10 changes: 5 additions & 5 deletions R/CohortTables.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down Expand Up @@ -196,9 +196,9 @@ dropCohortStatsTables <- function(connectionDetails = NULL,
}
}

.checkCohortTables <- function (connection,
cohortDatabaseSchema,
cohortTableNames) {
.checkCohortTables <- function(connection,
cohortDatabaseSchema,
cohortTableNames) {
# Verify the cohort tables exist and if they do not
# stop the generation process
tableExistsFlagList <- lapply(cohortTableNames, FUN = function(x) {
Expand All @@ -221,4 +221,4 @@ dropCohortStatsTables <- function(connectionDetails = NULL,
errorMsg <- paste(errorMsg, "Please use the createCohortTables function to ensure all tables exist before generating cohorts.", sep = "\n")
stop(errorMsg)
}
}
}
2 changes: 1 addition & 1 deletion R/CsvHelper.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down
2 changes: 1 addition & 1 deletion R/Export.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down
2 changes: 1 addition & 1 deletion R/Incremental.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down
14 changes: 7 additions & 7 deletions R/NegativeControlCohorts.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Observational Health Data Sciences and Informatics
# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
Expand Down Expand Up @@ -53,8 +53,8 @@ createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) {
#' @keywords internal
.getNegativeControlOutcomeCohortSetSpecification <- function() {
return(readCsv(system.file("negativeControlOutcomeCohortSetSpecificationDescription.csv",
package = "CohortGenerator",
mustWork = TRUE
package = "CohortGenerator",
mustWork = TRUE
)))
}

Expand Down Expand Up @@ -111,7 +111,7 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first"))
checkmate::assert_logical(detectOnDescendants)
checkmate::assertNames(colnames(negativeControlOutcomeCohortSet),
must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName
must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName
)
checkmate::assert_data_frame(
x = negativeControlOutcomeCohortSet,
Expand All @@ -122,9 +122,9 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
# cohort definition set before generating
if (length(unique(negativeControlOutcomeCohortSet$cohortId)) != length(negativeControlOutcomeCohortSet$cohortId)) {
duplicatedCohortIds <- negativeControlOutcomeCohortSet$cohortId[duplicated(negativeControlOutcomeCohortSet$cohortId)]
stop("Cannot generate! Duplicate cohort IDs found in your negativeControlOutcomeCohortSet: ", paste(duplicatedCohortIds, sep=","), ". Please fix your negativeControlOutcomeCohortSet and try again.")
}
stop("Cannot generate! Duplicate cohort IDs found in your negativeControlOutcomeCohortSet: ", paste(duplicatedCohortIds, sep = ","), ". Please fix your negativeControlOutcomeCohortSet and try again.")
}

if (incremental) {
if (is.null(incrementalFolder)) {
stop("Must specify incrementalFolder when incremental = TRUE")
Expand Down
Loading

0 comments on commit e29a2d5

Please sign in to comment.