V0.9 release prep (#148)

* Bump version and add NEWS.md * Copyright info * styler * Fix spelling errors and regenerate documentation * Regenerate manual * Regenerate vignettes * Regenerate site documentation
OHDSI · May 28, 2024 · e29a2d5 · e29a2d5
1 parent 54a6421
commit e29a2d5
Show file tree

Hide file tree

Showing 81 changed files with 1,136 additions and 419 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: CohortGenerator
 Type: Package
 Title: An R Package for Cohort Generation Against the OMOP CDM
-Version: 0.8.1
-Date: 2023-10-10
+Version: 0.9.0
+Date: 2024-05-28
 Authors@R: c(
   person("Anthony", "Sena", email = "[email protected]", role = c("aut", "cre")),
   person("Jamie", "Gilbert", role = c("aut")),

diff --git a/NEWS.md b/NEWS.md
@@ -1,9 +1,22 @@
+CohortGenerator 0.9.0
+=======================
+- Random sample functionality (for development only) (Issue #129)
+- Incremental mode for negative control cohort generation (Issue #137)
+- Fixes getCohortCounts() if cohortIds is not specified, but cohortDefinitionSet is. (Issue #136)
+- Add cohort ID to generation output messages (Issue #132)
+- Add databaseId to output of getStatsTable() (Issue #116)
+- Prevent duplicate cohort IDs in cohortDefinitionSet (Issue #130)
+- Fix cohort stats query for Oracle (Issue #143)
+- Ensure databaseId applied to all returned cohort counts (Issue #144)
+- Preserve backwards compatibility if cohort sample table is not in the list of cohort table names (Issue #147) 
+
+
 CohortGenerator 0.8.1
 =======================
 - Include cohorts with 0 people in cohort counts (Issue #91).
 - Use numeric for cohort ID (Issue #98)
 - Allow big ints for target pairs (#103)
-- Pass `tempEmulationSchema` when creating negative controlc ohorts (#104)
+- Pass `tempEmulationSchema` when creating negative control cohorts (#104)
 - Target CDM v5.4 for unit tests (#119)
 - Fix for subset references (#115)
 - Allow for subset cohort name templating (#118)

diff --git a/R/CohortConstruction.R b/R/CohortConstruction.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #
@@ -80,8 +80,8 @@ generateCohortSet <- function(connectionDetails = NULL,
   # set before generating
   if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) {
     duplicatedCohortIds <- cohortDefinitionSet$cohortId[duplicated(cohortDefinitionSet$cohortId)]
-    stop("Cannot generate! Duplicate cohort IDs found in your cohortDefinitionSet: ", paste(duplicatedCohortIds, sep=","), ". Please fix your cohortDefinitionSet and try again.")
-  } 
+    stop("Cannot generate! Duplicate cohort IDs found in your cohortDefinitionSet: ", paste(duplicatedCohortIds, sep = ","), ". Please fix your cohortDefinitionSet and try again.")
+  }
   if (is.null(connection) && is.null(connectionDetails)) {
     stop("You must provide either a database connection or the connection details.")
   }

diff --git a/R/CohortCount.R b/R/CohortCount.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #

diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #
@@ -534,4 +534,4 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
   }
 
   copyToCds
-}
+}
diff --git a/R/CohortGenerator.R b/R/CohortGenerator.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #

diff --git a/R/CohortSample.R b/R/CohortSample.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #
@@ -31,10 +31,11 @@
   countSql <- "SELECT COUNT(DISTINCT SUBJECT_ID) as cnt FROM  @cohort_database_schema.@target_table
    WHERE cohort_definition_id = @target_cohort_id"
   count <- DatabaseConnector::renderTranslateQuerySql(connection,
-                                                      countSql,
-                                                      cohort_database_schema = cohortDatabaseSchema,
-                                                      target_cohort_id = targetCohortId,
-                                                      target_table = targetTable) %>%
+    countSql,
+    cohort_database_schema = cohortDatabaseSchema,
+    target_cohort_id = targetCohortId,
+    target_table = targetTable
+  ) %>%
     dplyr::pull()
 
   if (!is.null(sampleFraction)) {
@@ -68,26 +69,28 @@
                           sampleTable,
                           seed,
                           tempEmulationSchema) {
-
   randSampleTableName <- paste0("#SAMPLE_TABLE_", seed)
-  DatabaseConnector::insertTable(connection = connection,
-                                 data = sampleTable,
-                                 dropTableIfExists = TRUE,
-                                 tempTable = TRUE,
-                                 tempEmulationSchema = tempEmulationSchema,
-                                 tableName = randSampleTableName)
+  DatabaseConnector::insertTable(
+    connection = connection,
+    data = sampleTable,
+    dropTableIfExists = TRUE,
+    tempTable = TRUE,
+    tempEmulationSchema = tempEmulationSchema,
+    tableName = randSampleTableName
+  )
 
   execSql <- SqlRender::readSql(system.file("sql", "sql_server", "sampling", "RandomSample.sql", package = "CohortGenerator"))
   DatabaseConnector::renderTranslateExecuteSql(connection,
-                                               execSql,
-                                               tempEmulationSchema = tempEmulationSchema,
-                                               random_sample_table = randSampleTableName,
-                                               target_cohort_id = targetCohortId,
-                                               output_cohort_id = outputCohortId,
-                                               cohort_database_schema = cohortDatabaseSchema,
-                                               output_database_schema = outputDatabaseSchema,
-                                               output_table = outputTable,
-                                               target_table = targetTable)
+    execSql,
+    tempEmulationSchema = tempEmulationSchema,
+    random_sample_table = randSampleTableName,
+    target_cohort_id = targetCohortId,
+    output_cohort_id = outputCohortId,
+    cohort_database_schema = cohortDatabaseSchema,
+    output_database_schema = outputDatabaseSchema,
+    output_table = outputTable,
+    target_table = targetTable
+  )
 }
 
 
@@ -115,7 +118,7 @@
     idSet <- c(idSet, cohortIds)
   }
   errorMessage <- "identifier expression does not produce unique output for cohort ids"
-  if(length(unique(idSet)) != length(idSet)) stop(errorMessage)
+  if (length(unique(idSet)) != length(idSet)) stop(errorMessage)
   invisible(NULL)
 }
 
@@ -125,7 +128,7 @@
 #' Create 1 or more sample of size n of a cohort definition set
 #'
 #' Subsetted cohorts can be sampled, as with any other subset form.
-#' However, subsetting a sampled cohort is not reccomended and not currently supported at this time.
+#' However, subsetting a sampled cohort is not recommended and not currently supported at this time.
 #' In the case where n > cohort count the entire cohort is copied unmodified
 #'
 #' As different databases have different forms of randomness, the random selection is computed in
@@ -140,7 +143,7 @@
 #' @param identifierExpression  Optional string R expression used to compute output cohort id. Can only use variables
 #'                              cohortId and seed. Default is "cohortId * 1000 + seed", which is substituted and evaluated
 #' @param cohortIds             Optional subset of cohortIds to generate. By default this function will sample all cohorts
-#' @param seed                  Vector of seeds to give to the R psuedorandom number generator
+#' @param seed                  Vector of seeds to give to the R pseudorandom number generator
 #' @param seedArgs              optional arguments to pass to set.seed
 #' @param outputDatabaseSchema  optional schema to output cohorts to (if different from cohortDatabaseSchema)
 #' @export
@@ -161,21 +164,21 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
                                       identifierExpression = "cohortId * 1000 + seed",
                                       incremental = FALSE,
                                       incrementalFolder = NULL) {
-
   checkmate::assertIntegerish(n, len = 1, null.ok = TRUE)
   checkmate::assertNumeric(sampleFraction, len = 1, null.ok = TRUE, lower = 0, upper = 1.0)
   checkmate::assertIntegerish(seed, min.len = 1)
   checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
   checkmate::assertNames(colnames(cohortDefinitionSet),
-                         must.include = c(
-                           "cohortId",
-                           "cohortName",
-                           "sql"
-                         )
+    must.include = c(
+      "cohortId",
+      "cohortName",
+      "sql"
+    )
   )
 
-  if (is.null(n) && is.null(sampleFraction))
+  if (is.null(n) && is.null(sampleFraction)) {
     stop("Must specificy n or fraction size")
+  }
 
   if (is.null(connection) && is.null(connectionDetails)) {
     stop("You must provide either a database connection or the connection details.")
@@ -208,27 +211,35 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
 
       sampledCohortDefinition$isSample <- TRUE
       sampledCohortDefinition$status <- "ungenerated"
-      outputCohortId <-  .computeIdentifierExpression(identifierExpression,
-                                                      sampledCohortDefinition$cohortId,
-                                                      seed)
+      outputCohortId <- .computeIdentifierExpression(
+        identifierExpression,
+        sampledCohortDefinition$cohortId,
+        seed
+      )
       sampledCohortDefinition$sampleTargetCohortId <- sampledCohortDefinition$cohortId
       sampledCohortDefinition$cohortId <- outputCohortId
 
       if (!is.null(sampleFraction)) {
-        sampledCohortDefinition$cohortName <- sprintf("%s [%s%% SAMPLE seed=%s]",
-                                                      sampledCohortDefinition$cohortName, seed, sampleFraction * 100)
+        sampledCohortDefinition$cohortName <- sprintf(
+          "%s [%s%% SAMPLE seed=%s]",
+          sampledCohortDefinition$cohortName, seed, sampleFraction * 100
+        )
       } else {
-        sampledCohortDefinition$cohortName <- sprintf("%s [SAMPLE seed=%s n=%s]",
-                                                      sampledCohortDefinition$cohortName, seed, n)
+        sampledCohortDefinition$cohortName <- sprintf(
+          "%s [SAMPLE seed=%s n=%s]",
+          sampledCohortDefinition$cohortName, seed, n
+        )
       }
 
       if (hasSubsetDefinitions(cohortDefinitionSet)) {
         # must maintain mapping for subset parent ids
-        sampledCohortDefinition$subsetParent <- .computeIdentifierExpression(identifierExpression,
-                                                                             sampledCohortDefinition$subsetParent,
-                                                                             seed)
+        sampledCohortDefinition$subsetParent <- .computeIdentifierExpression(
+          identifierExpression,
+          sampledCohortDefinition$subsetParent,
+          seed
+        )
       }
-      
+
       if (incremental && !isTaskRequired(
         cohortId = outputCohortId,
         seed = seed,
@@ -239,30 +250,34 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
         return(sampledCohortDefinition)
       }
       # check incremental task for cohort sampling
-      sampleTable <- .getSampleSet(connection = connection,
-                                   n = n,
-                                   sampleFraction = sampleFraction,
-                                   seed = seed + targetCohortId, # Seed is unique to each target cohort
-                                   seedArgs = seedArgs,
-                                   cohortDatabaseSchema = cohortDatabaseSchema,
-                                   targetCohortId = targetCohortId,
-                                   targetTable = cohortTableNames$cohortTable)
+      sampleTable <- .getSampleSet(
+        connection = connection,
+        n = n,
+        sampleFraction = sampleFraction,
+        seed = seed + targetCohortId, # Seed is unique to each target cohort
+        seedArgs = seedArgs,
+        cohortDatabaseSchema = cohortDatabaseSchema,
+        targetCohortId = targetCohortId,
+        targetTable = cohortTableNames$cohortTable
+      )
 
       if (nrow(sampleTable) == 0) {
         ParallelLogger::logInfo("No entires found for ", targetCohortId, " was it generated?")
         return(sampledCohortDefinition)
       }
       # Called only for side effects
-      .sampleCohort(connection = connection,
-                    targetCohortId = targetCohortId,
-                    targetTable = cohortTableNames$cohortTable,
-                    outputCohortId = outputCohortId,
-                    outputTable = cohortTableNames$cohortSampleTable,
-                    cohortDatabaseSchema = cohortDatabaseSchema,
-                    outputDatabaseSchema = outputDatabaseSchema,
-                    sampleTable = sampleTable,
-                    seed = seed + targetCohortId, # Seed is unique to each target cohort
-                    tempEmulationSchema = tempEmulationSchema)
+      .sampleCohort(
+        connection = connection,
+        targetCohortId = targetCohortId,
+        targetTable = cohortTableNames$cohortTable,
+        outputCohortId = outputCohortId,
+        outputTable = cohortTableNames$cohortSampleTable,
+        cohortDatabaseSchema = cohortDatabaseSchema,
+        outputDatabaseSchema = outputDatabaseSchema,
+        sampleTable = sampleTable,
+        seed = seed + targetCohortId, # Seed is unique to each target cohort
+        tempEmulationSchema = tempEmulationSchema
+      )
 
       sampledCohortDefinition$status <- "generated"
       if (incremental) {
@@ -275,7 +290,7 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
       }
       return(sampledCohortDefinition)
     }, seed, cohortIds) %>%
-      dplyr::bind_rows()
+    dplyr::bind_rows()
 
 
 

diff --git a/R/CohortStats.R b/R/CohortStats.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #

diff --git a/R/CohortTables.R b/R/CohortTables.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #
@@ -196,9 +196,9 @@ dropCohortStatsTables <- function(connectionDetails = NULL,
   }
 }
 
-.checkCohortTables <- function (connection,
-                                cohortDatabaseSchema,
-                                cohortTableNames) {
+.checkCohortTables <- function(connection,
+                               cohortDatabaseSchema,
+                               cohortTableNames) {
   # Verify the cohort tables exist and if they do not
   # stop the generation process
   tableExistsFlagList <- lapply(cohortTableNames, FUN = function(x) {
@@ -221,4 +221,4 @@ dropCohortStatsTables <- function(connectionDetails = NULL,
     errorMsg <- paste(errorMsg, "Please use the createCohortTables function to ensure all tables exist before generating cohorts.", sep = "\n")
     stop(errorMsg)
   }
-}
+}
diff --git a/R/CsvHelper.R b/R/CsvHelper.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #

diff --git a/R/Export.R b/R/Export.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #

diff --git a/R/Incremental.R b/R/Incremental.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #

diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
 #
 # This file is part of CohortGenerator
 #
@@ -53,8 +53,8 @@ createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) {
 #' @keywords internal
 .getNegativeControlOutcomeCohortSetSpecification <- function() {
   return(readCsv(system.file("negativeControlOutcomeCohortSetSpecificationDescription.csv",
-                             package = "CohortGenerator",
-                             mustWork = TRUE
+    package = "CohortGenerator",
+    mustWork = TRUE
   )))
 }
 
@@ -111,7 +111,7 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
   checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first"))
   checkmate::assert_logical(detectOnDescendants)
   checkmate::assertNames(colnames(negativeControlOutcomeCohortSet),
-                         must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName
+    must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName
   )
   checkmate::assert_data_frame(
     x = negativeControlOutcomeCohortSet,
@@ -122,9 +122,9 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
   # cohort definition set before generating
   if (length(unique(negativeControlOutcomeCohortSet$cohortId)) != length(negativeControlOutcomeCohortSet$cohortId)) {
     duplicatedCohortIds <- negativeControlOutcomeCohortSet$cohortId[duplicated(negativeControlOutcomeCohortSet$cohortId)]
-    stop("Cannot generate! Duplicate cohort IDs found in your negativeControlOutcomeCohortSet: ", paste(duplicatedCohortIds, sep=","), ". Please fix your negativeControlOutcomeCohortSet and try again.")
-  } 
-  
+    stop("Cannot generate! Duplicate cohort IDs found in your negativeControlOutcomeCohortSet: ", paste(duplicatedCohortIds, sep = ","), ". Please fix your negativeControlOutcomeCohortSet and try again.")
+  }
+
   if (incremental) {
     if (is.null(incrementalFolder)) {
       stop("Must specify incrementalFolder when incremental = TRUE")