diff --git a/DESCRIPTION b/DESCRIPTION
index 6bfbbba..dd768aa 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
Package: CohortGenerator
Type: Package
Title: An R Package for Cohort Generation Against the OMOP CDM
-Version: 0.8.1
-Date: 2023-10-10
+Version: 0.9.0
+Date: 2024-05-28
Authors@R: c(
person("Anthony", "Sena", email = "sena@ohdsi.org", role = c("aut", "cre")),
person("Jamie", "Gilbert", role = c("aut")),
diff --git a/NEWS.md b/NEWS.md
index f1cb257..d68e06c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,9 +1,22 @@
+CohortGenerator 0.9.0
+=======================
+- Random sample functionality (for development only) (Issue #129)
+- Incremental mode for negative control cohort generation (Issue #137)
+- Fix getCohortCounts() when cohortDefinitionSet is specified but cohortIds is not (Issue #136)
+- Add cohort ID to generation output messages (Issue #132)
+- Add databaseId to output of getStatsTable() (Issue #116)
+- Prevent duplicate cohort IDs in cohortDefinitionSet (Issue #130)
+- Fix cohort stats query for Oracle (Issue #143)
+- Ensure databaseId applied to all returned cohort counts (Issue #144)
+- Preserve backwards compatibility if cohort sample table is not in the list of cohort table names (Issue #147)
+
+
CohortGenerator 0.8.1
=======================
- Include cohorts with 0 people in cohort counts (Issue #91).
- Use numeric for cohort ID (Issue #98)
- Allow big ints for target pairs (#103)
-- Pass `tempEmulationSchema` when creating negative controlc ohorts (#104)
+- Pass `tempEmulationSchema` when creating negative control cohorts (#104)
- Target CDM v5.4 for unit tests (#119)
- Fix for subset references (#115)
- Allow for subset cohort name templating (#118)
diff --git a/R/CohortConstruction.R b/R/CohortConstruction.R
index 600b3f6..2dfb86a 100644
--- a/R/CohortConstruction.R
+++ b/R/CohortConstruction.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
@@ -80,8 +80,8 @@ generateCohortSet <- function(connectionDetails = NULL,
# set before generating
if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) {
duplicatedCohortIds <- cohortDefinitionSet$cohortId[duplicated(cohortDefinitionSet$cohortId)]
- stop("Cannot generate! Duplicate cohort IDs found in your cohortDefinitionSet: ", paste(duplicatedCohortIds, sep=","), ". Please fix your cohortDefinitionSet and try again.")
- }
+ stop("Cannot generate! Duplicate cohort IDs found in your cohortDefinitionSet: ", paste(duplicatedCohortIds, collapse = ","), ". Please fix your cohortDefinitionSet and try again.") # collapse joins the vector's elements; sep only separates distinct arguments
+ }
if (is.null(connection) && is.null(connectionDetails)) {
stop("You must provide either a database connection or the connection details.")
}
diff --git a/R/CohortCount.R b/R/CohortCount.R
index ca381f4..f14244f 100644
--- a/R/CohortCount.R
+++ b/R/CohortCount.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R
index 38d1428..7b09547 100644
--- a/R/CohortDefinitionSet.R
+++ b/R/CohortDefinitionSet.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
@@ -534,4 +534,4 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) {
}
copyToCds
-}
\ No newline at end of file
+}
diff --git a/R/CohortGenerator.R b/R/CohortGenerator.R
index 3a1cc51..b82cdd1 100644
--- a/R/CohortGenerator.R
+++ b/R/CohortGenerator.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
diff --git a/R/CohortSample.R b/R/CohortSample.R
index dadc552..6cb393b 100644
--- a/R/CohortSample.R
+++ b/R/CohortSample.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
@@ -31,10 +31,11 @@
countSql <- "SELECT COUNT(DISTINCT SUBJECT_ID) as cnt FROM @cohort_database_schema.@target_table
WHERE cohort_definition_id = @target_cohort_id"
count <- DatabaseConnector::renderTranslateQuerySql(connection,
- countSql,
- cohort_database_schema = cohortDatabaseSchema,
- target_cohort_id = targetCohortId,
- target_table = targetTable) %>%
+ countSql,
+ cohort_database_schema = cohortDatabaseSchema,
+ target_cohort_id = targetCohortId,
+ target_table = targetTable
+ ) %>%
dplyr::pull()
if (!is.null(sampleFraction)) {
@@ -68,26 +69,28 @@
sampleTable,
seed,
tempEmulationSchema) {
-
randSampleTableName <- paste0("#SAMPLE_TABLE_", seed)
- DatabaseConnector::insertTable(connection = connection,
- data = sampleTable,
- dropTableIfExists = TRUE,
- tempTable = TRUE,
- tempEmulationSchema = tempEmulationSchema,
- tableName = randSampleTableName)
+ DatabaseConnector::insertTable(
+ connection = connection,
+ data = sampleTable,
+ dropTableIfExists = TRUE,
+ tempTable = TRUE,
+ tempEmulationSchema = tempEmulationSchema,
+ tableName = randSampleTableName
+ )
execSql <- SqlRender::readSql(system.file("sql", "sql_server", "sampling", "RandomSample.sql", package = "CohortGenerator"))
DatabaseConnector::renderTranslateExecuteSql(connection,
- execSql,
- tempEmulationSchema = tempEmulationSchema,
- random_sample_table = randSampleTableName,
- target_cohort_id = targetCohortId,
- output_cohort_id = outputCohortId,
- cohort_database_schema = cohortDatabaseSchema,
- output_database_schema = outputDatabaseSchema,
- output_table = outputTable,
- target_table = targetTable)
+ execSql,
+ tempEmulationSchema = tempEmulationSchema,
+ random_sample_table = randSampleTableName,
+ target_cohort_id = targetCohortId,
+ output_cohort_id = outputCohortId,
+ cohort_database_schema = cohortDatabaseSchema,
+ output_database_schema = outputDatabaseSchema,
+ output_table = outputTable,
+ target_table = targetTable
+ )
}
@@ -115,7 +118,7 @@
idSet <- c(idSet, cohortIds)
}
errorMessage <- "identifier expression does not produce unique output for cohort ids"
- if(length(unique(idSet)) != length(idSet)) stop(errorMessage)
+ if (length(unique(idSet)) != length(idSet)) stop(errorMessage)
invisible(NULL)
}
@@ -125,7 +128,7 @@
#' Create 1 or more sample of size n of a cohort definition set
#'
#' Subsetted cohorts can be sampled, as with any other subset form.
-#' However, subsetting a sampled cohort is not reccomended and not currently supported at this time.
+#' However, subsetting a sampled cohort is not recommended and not currently supported at this time.
#' In the case where n > cohort count the entire cohort is copied unmodified
#'
#' As different databases have different forms of randomness, the random selection is computed in
@@ -140,7 +143,7 @@
#' @param identifierExpression Optional string R expression used to compute output cohort id. Can only use variables
#' cohortId and seed. Default is "cohortId * 1000 + seed", which is substituted and evaluated
#' @param cohortIds Optional subset of cohortIds to generate. By default this function will sample all cohorts
-#' @param seed Vector of seeds to give to the R psuedorandom number generator
+#' @param seed Vector of seeds to give to the R pseudorandom number generator
#' @param seedArgs optional arguments to pass to set.seed
#' @param outputDatabaseSchema optional schema to output cohorts to (if different from cohortDatabaseSchema)
#' @export
@@ -161,21 +164,21 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
identifierExpression = "cohortId * 1000 + seed",
incremental = FALSE,
incrementalFolder = NULL) {
-
checkmate::assertIntegerish(n, len = 1, null.ok = TRUE)
checkmate::assertNumeric(sampleFraction, len = 1, null.ok = TRUE, lower = 0, upper = 1.0)
checkmate::assertIntegerish(seed, min.len = 1)
checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named")
checkmate::assertNames(colnames(cohortDefinitionSet),
- must.include = c(
- "cohortId",
- "cohortName",
- "sql"
- )
+ must.include = c(
+ "cohortId",
+ "cohortName",
+ "sql"
+ )
)
- if (is.null(n) && is.null(sampleFraction))
+ if (is.null(n) && is.null(sampleFraction)) {
stop("Must specificy n or fraction size")
+ }
if (is.null(connection) && is.null(connectionDetails)) {
stop("You must provide either a database connection or the connection details.")
@@ -208,27 +211,35 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
sampledCohortDefinition$isSample <- TRUE
sampledCohortDefinition$status <- "ungenerated"
- outputCohortId <- .computeIdentifierExpression(identifierExpression,
- sampledCohortDefinition$cohortId,
- seed)
+ outputCohortId <- .computeIdentifierExpression(
+ identifierExpression,
+ sampledCohortDefinition$cohortId,
+ seed
+ )
sampledCohortDefinition$sampleTargetCohortId <- sampledCohortDefinition$cohortId
sampledCohortDefinition$cohortId <- outputCohortId
if (!is.null(sampleFraction)) {
- sampledCohortDefinition$cohortName <- sprintf("%s [%s%% SAMPLE seed=%s]",
- sampledCohortDefinition$cohortName, seed, sampleFraction * 100)
+ sampledCohortDefinition$cohortName <- sprintf(
+ "%s [%s%% SAMPLE seed=%s]", # placeholders in order: cohort name, sample percent, seed
+ sampledCohortDefinition$cohortName, sampleFraction * 100, seed
+ )
} else {
- sampledCohortDefinition$cohortName <- sprintf("%s [SAMPLE seed=%s n=%s]",
- sampledCohortDefinition$cohortName, seed, n)
+ sampledCohortDefinition$cohortName <- sprintf(
+ "%s [SAMPLE seed=%s n=%s]",
+ sampledCohortDefinition$cohortName, seed, n
+ )
}
if (hasSubsetDefinitions(cohortDefinitionSet)) {
# must maintain mapping for subset parent ids
- sampledCohortDefinition$subsetParent <- .computeIdentifierExpression(identifierExpression,
- sampledCohortDefinition$subsetParent,
- seed)
+ sampledCohortDefinition$subsetParent <- .computeIdentifierExpression(
+ identifierExpression,
+ sampledCohortDefinition$subsetParent,
+ seed
+ )
}
-
+
if (incremental && !isTaskRequired(
cohortId = outputCohortId,
seed = seed,
@@ -239,30 +250,34 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
return(sampledCohortDefinition)
}
# check incremental task for cohort sampling
- sampleTable <- .getSampleSet(connection = connection,
- n = n,
- sampleFraction = sampleFraction,
- seed = seed + targetCohortId, # Seed is unique to each target cohort
- seedArgs = seedArgs,
- cohortDatabaseSchema = cohortDatabaseSchema,
- targetCohortId = targetCohortId,
- targetTable = cohortTableNames$cohortTable)
+ sampleTable <- .getSampleSet(
+ connection = connection,
+ n = n,
+ sampleFraction = sampleFraction,
+ seed = seed + targetCohortId, # Seed is unique to each target cohort
+ seedArgs = seedArgs,
+ cohortDatabaseSchema = cohortDatabaseSchema,
+ targetCohortId = targetCohortId,
+ targetTable = cohortTableNames$cohortTable
+ )
if (nrow(sampleTable) == 0) {
ParallelLogger::logInfo("No entires found for ", targetCohortId, " was it generated?")
return(sampledCohortDefinition)
}
# Called only for side effects
- .sampleCohort(connection = connection,
- targetCohortId = targetCohortId,
- targetTable = cohortTableNames$cohortTable,
- outputCohortId = outputCohortId,
- outputTable = cohortTableNames$cohortSampleTable,
- cohortDatabaseSchema = cohortDatabaseSchema,
- outputDatabaseSchema = outputDatabaseSchema,
- sampleTable = sampleTable,
- seed = seed + targetCohortId, # Seed is unique to each target cohort
- tempEmulationSchema = tempEmulationSchema)
+ .sampleCohort(
+ connection = connection,
+ targetCohortId = targetCohortId,
+ targetTable = cohortTableNames$cohortTable,
+ outputCohortId = outputCohortId,
+ outputTable = cohortTableNames$cohortSampleTable,
+ cohortDatabaseSchema = cohortDatabaseSchema,
+ outputDatabaseSchema = outputDatabaseSchema,
+ sampleTable = sampleTable,
+ seed = seed + targetCohortId, # Seed is unique to each target cohort
+ tempEmulationSchema = tempEmulationSchema
+ )
sampledCohortDefinition$status <- "generated"
if (incremental) {
@@ -275,7 +290,7 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet,
}
return(sampledCohortDefinition)
}, seed, cohortIds) %>%
- dplyr::bind_rows()
+ dplyr::bind_rows()
diff --git a/R/CohortStats.R b/R/CohortStats.R
index f9caa5d..9b1c2b5 100644
--- a/R/CohortStats.R
+++ b/R/CohortStats.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
diff --git a/R/CohortTables.R b/R/CohortTables.R
index 27816fc..bf0e2b9 100644
--- a/R/CohortTables.R
+++ b/R/CohortTables.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
@@ -196,9 +196,9 @@ dropCohortStatsTables <- function(connectionDetails = NULL,
}
}
-.checkCohortTables <- function (connection,
- cohortDatabaseSchema,
- cohortTableNames) {
+.checkCohortTables <- function(connection,
+ cohortDatabaseSchema,
+ cohortTableNames) {
# Verify the cohort tables exist and if they do not
# stop the generation process
tableExistsFlagList <- lapply(cohortTableNames, FUN = function(x) {
@@ -221,4 +221,4 @@ dropCohortStatsTables <- function(connectionDetails = NULL,
errorMsg <- paste(errorMsg, "Please use the createCohortTables function to ensure all tables exist before generating cohorts.", sep = "\n")
stop(errorMsg)
}
-}
\ No newline at end of file
+}
diff --git a/R/CsvHelper.R b/R/CsvHelper.R
index a469507..25d3d10 100644
--- a/R/CsvHelper.R
+++ b/R/CsvHelper.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
diff --git a/R/Export.R b/R/Export.R
index ea958b6..f351821 100644
--- a/R/Export.R
+++ b/R/Export.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
diff --git a/R/Incremental.R b/R/Incremental.R
index ff9ce9c..de400e4 100644
--- a/R/Incremental.R
+++ b/R/Incremental.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R
index bef2695..516662c 100644
--- a/R/NegativeControlCohorts.R
+++ b/R/NegativeControlCohorts.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
@@ -53,8 +53,8 @@ createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) {
#' @keywords internal
.getNegativeControlOutcomeCohortSetSpecification <- function() {
return(readCsv(system.file("negativeControlOutcomeCohortSetSpecificationDescription.csv",
- package = "CohortGenerator",
- mustWork = TRUE
+ package = "CohortGenerator",
+ mustWork = TRUE
)))
}
@@ -111,7 +111,7 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first"))
checkmate::assert_logical(detectOnDescendants)
checkmate::assertNames(colnames(negativeControlOutcomeCohortSet),
- must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName
+ must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName
)
checkmate::assert_data_frame(
x = negativeControlOutcomeCohortSet,
@@ -122,9 +122,9 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL,
# cohort definition set before generating
if (length(unique(negativeControlOutcomeCohortSet$cohortId)) != length(negativeControlOutcomeCohortSet$cohortId)) {
duplicatedCohortIds <- negativeControlOutcomeCohortSet$cohortId[duplicated(negativeControlOutcomeCohortSet$cohortId)]
- stop("Cannot generate! Duplicate cohort IDs found in your negativeControlOutcomeCohortSet: ", paste(duplicatedCohortIds, sep=","), ". Please fix your negativeControlOutcomeCohortSet and try again.")
- }
-
+ stop("Cannot generate! Duplicate cohort IDs found in your negativeControlOutcomeCohortSet: ", paste(duplicatedCohortIds, collapse = ","), ". Please fix your negativeControlOutcomeCohortSet and try again.") # collapse joins the vector's elements; sep only separates distinct arguments
+ }
+
if (incremental) {
if (is.null(incrementalFolder)) {
stop("Must specify incrementalFolder when incremental = TRUE")
diff --git a/R/SubsetDefinitions.R b/R/SubsetDefinitions.R
index f328812..4453fd6 100644
--- a/R/SubsetDefinitions.R
+++ b/R/SubsetDefinitions.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
diff --git a/R/SubsetQueryBuilders.R b/R/SubsetQueryBuilders.R
index 195d9b3..4edef34 100644
--- a/R/SubsetQueryBuilders.R
+++ b/R/SubsetQueryBuilders.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
diff --git a/R/Subsets.R b/R/Subsets.R
index 1444448..c50a0f8 100644
--- a/R/Subsets.R
+++ b/R/Subsets.R
@@ -1,4 +1,4 @@
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 8ee695f..d0aa7e1 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -102,6 +102,13 @@ reference:
- isTaskRequired
- saveIncremental
- computeChecksum
+
+ - title: "Cohort Sampling"
+ desc: >
+ Functions that support sampling a cohort. Please note this is only for
+ software development purposes and NOT for running studies.
+ contents:
+ - sampleCohortDefinitionSet
navbar:
structure:
diff --git a/docs/404.html b/docs/404.html
index 81aa5d3..1fb5c5e 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -6,7 +6,7 @@
Page not found (404) • CohortGenerator
-
+
@@ -32,7 +32,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -54,6 +54,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
@@ -106,7 +109,7 @@ Page not found (404)
diff --git a/docs/articles/CreatingCohortSubsetDefinitions.html b/docs/articles/CreatingCohortSubsetDefinitions.html
index 835a6cc..5e8b709 100644
--- a/docs/articles/CreatingCohortSubsetDefinitions.html
+++ b/docs/articles/CreatingCohortSubsetDefinitions.html
@@ -6,7 +6,7 @@
Creating Cohort Subset Definitions • CohortGenerator
-
+
@@ -33,7 +33,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -55,6 +55,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
@@ -88,7 +91,7 @@ Creating Cohort Subset Definitions
James P.
Gilbert and Anthony G. Sena
- 2023-10-10
+ 2024-05-28
Source: vignettes/CreatingCohortSubsetDefinitions.Rmd
CreatingCohortSubsetDefinitions.Rmd
@@ -533,7 +536,7 @@
@@ -88,7 +91,7 @@ Generating Cohorts
Anthony G. Sena
and Martijn J. Schuemie
- 2023-10-10
+ 2024-05-28
Source: vignettes/GeneratingCohorts.Rmd
GeneratingCohorts.Rmd
@@ -260,7 +263,7 @@
Changelog
@@ -70,6 +73,8 @@ All vignettes
Generating Cohorts
+ Sampling Cohorts
+
@@ -80,7 +85,7 @@ All vignettes
diff --git a/docs/authors.html b/docs/authors.html
index 5984e02..934e4a3 100644
--- a/docs/authors.html
+++ b/docs/authors.html
@@ -1,5 +1,5 @@
-Authors and Citation • CohortGenerator Authors and Citation • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -60,7 +63,7 @@
@@ -93,14 +96,14 @@
Citation
-
Sena A, Gilbert J, Rao G, Schuemie M (2023).
+
Sena A, Gilbert J, Rao G, Schuemie M (2024).
CohortGenerator: An R Package for Cohort Generation Against the OMOP CDM .
https://ohdsi.github.io/CohortGenerator/, https://github.com/OHDSI/CohortGenerator.
@Manual{,
title = {CohortGenerator: An R Package for Cohort Generation Against the OMOP CDM},
author = {Anthony Sena and Jamie Gilbert and Gowtham Rao and Martijn Schuemie},
- year = {2023},
+ year = {2024},
note = {https://ohdsi.github.io/CohortGenerator/, https://github.com/OHDSI/CohortGenerator},
}
@@ -115,7 +118,7 @@
Citation
diff --git a/docs/index.html b/docs/index.html
index bc36f4a..e325d4e 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -6,7 +6,7 @@
An R Package for Cohort Generation Against the OMOP CDM • CohortGenerator
-
+
@@ -33,7 +33,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -55,6 +55,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
@@ -269,7 +272,7 @@ Developers
diff --git a/docs/news/index.html b/docs/news/index.html
index 844ca9a..abf1fc5 100644
--- a/docs/news/index.html
+++ b/docs/news/index.html
@@ -1,5 +1,5 @@
-Changelog • CohortGenerator Changelog • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -63,12 +66,24 @@ Changelog
Source: NEWS.md
+
+
+
Random sample functionality (for development only) (Issue #129 )
+Incremental mode for negative control cohort generation (Issue #137 )
+Fix getCohortCounts() when cohortDefinitionSet is specified but cohortIds is not (Issue #136 )
+Add cohort ID to generation output messages (Issue #132 )
+Add databaseId to output of getStatsTable() (Issue #116 )
+Prevent duplicate cohort IDs in cohortDefinitionSet (Issue #130 )
+Fix cohort stats query for Oracle (Issue #143 )
+Ensure databaseId applied to all returned cohort counts (Issue #144 )
+Preserve backwards compatibility if cohort sample table is not in the list of cohort table names (Issue #147 )
+
Include cohorts with 0 people in cohort counts (Issue #91 ).
Use numeric for cohort ID (Issue #98 )
Allow big ints for target pairs (#103 )
-Pass tempEmulationSchema
when creating negative controlc ohorts (#104 )
+Pass tempEmulationSchema
when creating negative control cohorts (#104 )
Target CDM v5.4 for unit tests (#119 )
Fix for subset references (#115 )
Allow for subset cohort name templating (#118 )
@@ -151,7 +166,7 @@
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
index 3100ae0..ec62a11 100644
--- a/docs/pkgdown.yml
+++ b/docs/pkgdown.yml
@@ -1,8 +1,9 @@
-pandoc: 3.1.1
-pkgdown: 2.0.7
+pandoc: 3.1.11
+pkgdown: 2.0.9
pkgdown_sha: ~
articles:
CreatingCohortSubsetDefinitions: CreatingCohortSubsetDefinitions.html
GeneratingCohorts: GeneratingCohorts.html
-last_built: 2023-10-10T18:25Z
+ SamplingCohorts: SamplingCohorts.html
+last_built: 2024-05-28T17:58Z
diff --git a/docs/reference/CohortGenerator-package.html b/docs/reference/CohortGenerator-package.html
index 2742e5f..94e32b7 100644
--- a/docs/reference/CohortGenerator-package.html
+++ b/docs/reference/CohortGenerator-package.html
@@ -1,5 +1,5 @@
-CohortGenerator: An R Package for Cohort Generation Against the OMOP CDM — CohortGenerator-package • CohortGenerator CohortGenerator: An R Package for Cohort Generation Against the OMOP CDM — CohortGenerator-package • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -97,7 +100,7 @@ Author
diff --git a/docs/reference/CohortSubsetDefinition.html b/docs/reference/CohortSubsetDefinition.html
index 69d04ab..3540129 100644
--- a/docs/reference/CohortSubsetDefinition.html
+++ b/docs/reference/CohortSubsetDefinition.html
@@ -1,5 +1,5 @@
-Cohort Subset Definition — CohortSubsetDefinition • CohortGenerator Cohort Subset Definition — CohortSubsetDefinition • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -285,7 +288,7 @@ Arguments
-
Site built with pkgdown 2.0.7.
+
Site built with pkgdown 2.0.9.
diff --git a/docs/reference/CohortSubsetOperator.html b/docs/reference/CohortSubsetOperator.html
index 64ffa53..eb899f8 100644
--- a/docs/reference/CohortSubsetOperator.html
+++ b/docs/reference/CohortSubsetOperator.html
@@ -1,5 +1,5 @@
-Cohort Subset Operator — CohortSubsetOperator • CohortGenerator Cohort Subset Operator — CohortSubsetOperator • CohortGenerator Criteria Subset — DemographicSubsetOperator • CohortGenerator Criteria Subset — DemographicSubsetOperator • CohortGenerator Limit Subset Operator — LimitSubsetOperator • CohortGenerator Limit Subset Operator — LimitSubsetOperator • CohortGenerator to List — SubsetCohortWindow • CohortGenerator to List — SubsetCohortWindow • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -165,7 +168,7 @@ Arguments
-
Site built with pkgdown 2.0.7.
+
Site built with pkgdown 2.0.9.
diff --git a/docs/reference/SubsetOperator.html b/docs/reference/SubsetOperator.html
index 9c7c3f7..b9629d1 100644
--- a/docs/reference/SubsetOperator.html
+++ b/docs/reference/SubsetOperator.html
@@ -1,5 +1,5 @@
-SubsetOperator — SubsetOperator • CohortGenerator SubsetOperator — SubsetOperator • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -235,7 +238,7 @@ Arguments
-
Site built with pkgdown 2.0.7.
+
Site built with pkgdown 2.0.9.
diff --git a/docs/reference/addCohortSubsetDefinition.html b/docs/reference/addCohortSubsetDefinition.html
index 1e97727..90567f9 100644
--- a/docs/reference/addCohortSubsetDefinition.html
+++ b/docs/reference/addCohortSubsetDefinition.html
@@ -1,5 +1,5 @@
-Add cohort subset definition to a cohort definition set — addCohortSubsetDefinition • CohortGenerator Add cohort subset definition to a cohort definition set — addCohortSubsetDefinition • CohortGenerator Check if a cohort definition set is using the proper data types — checkAndFixCohortDefinitionSetDataTypes • CohortGenerator Check if a cohort definition set is using the proper data types — checkAndFixCohortDefinitionSetDataTypes • CohortGenerator Computes the checksum for a value — computeChecksum • CohortGenerator Computes the checksum for a value — computeChecksum • CohortGenerator A definition of subset functions to be applied to a set of cohorts — createCohortSubset • CohortGenerator A definition of subset functions to be applied to a set of cohorts — createCohortSubset • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -125,7 +128,7 @@ Value
diff --git a/docs/reference/createCohortSubsetDefinition.html b/docs/reference/createCohortSubsetDefinition.html
index b89eb3b..73a692c 100644
--- a/docs/reference/createCohortSubsetDefinition.html
+++ b/docs/reference/createCohortSubsetDefinition.html
@@ -1,5 +1,5 @@
-Create Subset Definition — createCohortSubsetDefinition • CohortGenerator Create Subset Definition — createCohortSubsetDefinition • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -99,14 +102,14 @@ Arguments
operatorNameConcatString
-(optional) SqlRender string template for formatting names of resulting subset cohorts
-Can use the variables @baseCohortName, @subsetDefinitionName and @operatorNames.
-This is applied when adding the subset definition to a cohort definition set.
+(optional) String to concatenate operator names together when outputting resulting cohort
+name
subsetCohortNameTemplate
-(optional) String to concatenate operator names together when outputting resulting cohort
-name
+(optional) SqlRender string template for formatting names of resulting subset cohorts
+Can use the variables @baseCohortName, @subsetDefinitionName and @operatorNames.
+This is applied when adding the subset definition to a cohort definition set.
@@ -122,7 +125,7 @@ Arguments
diff --git a/docs/reference/createCohortTables.html b/docs/reference/createCohortTables.html
index 756056d..c28cf0c 100644
--- a/docs/reference/createCohortTables.html
+++ b/docs/reference/createCohortTables.html
@@ -1,5 +1,5 @@
-Create cohort tables — createCohortTables • CohortGenerator Create cohort tables — createCohortTables • CohortGenerator Create createDemographicSubset Subset — createDemographicSubset • CohortGenerator Create createDemographicSubset Subset — createDemographicSubset • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -121,7 +124,7 @@ Arguments
diff --git a/docs/reference/createEmptyCohortDefinitionSet.html b/docs/reference/createEmptyCohortDefinitionSet.html
index a71456d..31f3f6a 100644
--- a/docs/reference/createEmptyCohortDefinitionSet.html
+++ b/docs/reference/createEmptyCohortDefinitionSet.html
@@ -1,5 +1,5 @@
-Create an empty cohort definition set — createEmptyCohortDefinitionSet • CohortGenerator Create an empty cohort definition set — createEmptyCohortDefinitionSet • CohortGenerator Create an empty negative control outcome cohort set — createEmptyNegativeControlOutcomeCohortSet • CohortGenerator Create an empty negative control outcome cohort set — createEmptyNegativeControlOutcomeCohortSet • CohortGenerator Create Limit Subset — createLimitSubset • CohortGenerator Create Limit Subset — createLimitSubset • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -126,7 +129,7 @@ Arguments
diff --git a/docs/reference/createSubsetCohortWindow.html b/docs/reference/createSubsetCohortWindow.html
index 5e80cc7..27f90e4 100644
--- a/docs/reference/createSubsetCohortWindow.html
+++ b/docs/reference/createSubsetCohortWindow.html
@@ -1,5 +1,5 @@
-A definition of subset functions to be applied to a set of cohorts — createSubsetCohortWindow • CohortGenerator A definition of subset functions to be applied to a set of cohorts — createSubsetCohortWindow • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -105,7 +108,7 @@ Value
diff --git a/docs/reference/dropCohortStatsTables.html b/docs/reference/dropCohortStatsTables.html
index 06b1b53..4c9e8cd 100644
--- a/docs/reference/dropCohortStatsTables.html
+++ b/docs/reference/dropCohortStatsTables.html
@@ -1,5 +1,5 @@
-Drop cohort statistics tables — dropCohortStatsTables • CohortGenerator Drop cohort statistics tables — dropCohortStatsTables • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -123,7 +126,7 @@ Arguments
diff --git a/docs/reference/exportCohortStatsTables.html b/docs/reference/exportCohortStatsTables.html
index 66cd202..258095b 100644
--- a/docs/reference/exportCohortStatsTables.html
+++ b/docs/reference/exportCohortStatsTables.html
@@ -1,5 +1,5 @@
-Export the cohort statistics tables to the file system — exportCohortStatsTables • CohortGenerator Export the cohort statistics tables to the file system — exportCohortStatsTables • CohortGenerator Generate a set of cohorts — generateCohortSet • CohortGenerator Generate a set of cohorts — generateCohortSet • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -203,7 +206,7 @@ Value
diff --git a/docs/reference/generateNegativeControlOutcomeCohorts.html b/docs/reference/generateNegativeControlOutcomeCohorts.html
index 6ccbe8b..3d94e46 100644
--- a/docs/reference/generateNegativeControlOutcomeCohorts.html
+++ b/docs/reference/generateNegativeControlOutcomeCohorts.html
@@ -1,5 +1,5 @@
-Generate a set of negative control outcome cohorts — generateNegativeControlOutcomeCohorts • CohortGenerator Generate a set of negative control outcome cohorts — generateNegativeControlOutcomeCohorts • CohortGenerator Count the cohort(s) — getCohortCounts • CohortGenerator Count the cohort(s) — getCohortCounts • CohortGenerator Get a cohort definition set — getCohortDefinitionSet • CohortGenerator Get a cohort definition set — getCohortDefinitionSet • CohortGenerator Get Cohort Inclusion Stats Table Data — getCohortStats • CohortGenerator Get Cohort Inclusion Stats Table Data — getCohortStats • CohortGenerator
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -47,6 +47,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -156,7 +159,7 @@ Arguments
diff --git a/docs/reference/getCohortTableNames.html b/docs/reference/getCohortTableNames.html
index d3c70bd..24eaf0a 100644
--- a/docs/reference/getCohortTableNames.html
+++ b/docs/reference/getCohortTableNames.html
@@ -1,7 +1,5 @@
-Used to get a list of cohort table names to use when creating the cohort
-tables — getCohortTableNames • CohortGenerator Used to get a list of cohort table names to use when creating the cohort tables — getCohortTableNames • CohortGenerator Get a list of tasks required when running in incremental mode — getRequiredTasks • CohortGenerator Get a list of tasks required when running in incremental mode — getRequiredTasks • CohortGenerator Get cohort subset definitions from a cohort definition set — getSubsetDefinitions • CohortGenerator Get cohort subset definitions from a cohort definition set — getSubsetDefinitions • CohortGenerator
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -41,6 +41,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -105,7 +108,7 @@ Value
diff --git a/docs/reference/index.html b/docs/reference/index.html
index 83f7675..f8b1ac2 100644
--- a/docs/reference/index.html
+++ b/docs/reference/index.html
@@ -1,5 +1,5 @@
-Function reference • CohortGenerator Function reference • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -73,8 +76,7 @@ Cohort Tables
getCohortTableNames()
- Used to get a list of cohort table names to use when creating the cohort
-tables
+ Used to get a list of cohort table names to use when creating the cohort tables
Cohort Defintion Set
Functions that support working with a cohort definition set
@@ -190,8 +192,7 @@ Cohort Statistics
insertInclusionRuleNames()
- Used to insert the inclusion rule names from a cohort definition set
-when generating cohorts that include cohort statistics
+ Used to insert the inclusion rule names from a cohort definition set when generating cohorts that include cohort statistics
exportCohortStatsTables()
@@ -260,6 +261,14 @@ Record Keeping computeChecksum()
Computes the checksum for a value
+
+ Cohort Sampling
+
Functions that support sampling a cohort. Please note this is only for software development purposes and NOT for running studies.
+
+
+ sampleCohortDefinitionSet()
+
+ Sample Cohort Definition Set
diff --git a/docs/reference/insertInclusionRuleNames.html b/docs/reference/insertInclusionRuleNames.html
index 1cb25b4..4e47142 100644
--- a/docs/reference/insertInclusionRuleNames.html
+++ b/docs/reference/insertInclusionRuleNames.html
@@ -1,7 +1,5 @@
-Used to insert the inclusion rule names from a cohort definition set
-when generating cohorts that include cohort statistics — insertInclusionRuleNames • CohortGenerator Used to insert the inclusion rule names from a cohort definition set when generating cohorts that include cohort statistics — insertInclusionRuleNames • CohortGenerator
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -43,6 +41,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -65,8 +66,7 @@
@@ -153,7 +153,7 @@
Value
diff --git a/docs/reference/isCamelCase.html b/docs/reference/isCamelCase.html
index c64a0dc..6c716f1 100644
--- a/docs/reference/isCamelCase.html
+++ b/docs/reference/isCamelCase.html
@@ -1,5 +1,5 @@
-Used to check if a string is in lower camel case — isCamelCase • CohortGenerator Used to check if a string is in lower camel case — isCamelCase • CohortGenerator Is the data.frame a cohort definition set? — isCohortDefinitionSet • CohortGenerator Is the data.frame a cohort definition set? — isCohortDefinitionSet • CohortGenerator Is the data.frame formatted for uploading to a database? — isFormattedForDatabaseUpload • CohortGenerator Is the data.frame formatted for uploading to a database? — isFormattedForDatabaseUpload • CohortGenerator Used to check if a string is in snake case — isSnakeCase • CohortGenerator Used to check if a string is in snake case — isSnakeCase • CohortGenerator Is a task required when running in incremental mode — isTaskRequired • CohortGenerator Is a task required when running in incremental mode — isTaskRequired • CohortGenerator Used to read a .csv file — readCsv • CohortGenerator Used to read a .csv file — readCsv • CohortGenerator
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -41,6 +41,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -110,7 +113,7 @@ Value
diff --git a/docs/reference/recordTasksDone.html b/docs/reference/recordTasksDone.html
index be2d1f2..b934c79 100644
--- a/docs/reference/recordTasksDone.html
+++ b/docs/reference/recordTasksDone.html
@@ -1,5 +1,5 @@
-Record a task as complete — recordTasksDone • CohortGenerator Record a task as complete — recordTasksDone • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -106,7 +109,7 @@ Arguments
diff --git a/docs/reference/sampleCohortDefinitionSet.html b/docs/reference/sampleCohortDefinitionSet.html
new file mode 100644
index 0000000..41c171e
--- /dev/null
+++ b/docs/reference/sampleCohortDefinitionSet.html
@@ -0,0 +1,227 @@
+
+Sample Cohort Definition Set — sampleCohortDefinitionSet • CohortGenerator
+
+
+
+
+
+
+
+
+
Create 1 or more samples of size n of a cohort definition set
+
Subsetted cohorts can be sampled, as with any other subset form.
+However, subsetting a sampled cohort is not recommended and is not currently supported.
+In the case where n > cohort count, the entire cohort is copied unmodified.
+
As different databases have different forms of randomness, the random selection is computed in
+R, based on the count for each cohort. This is, therefore, db platform independent
+
Note, this function assumes cohorts have already been generated.
+
Lifecycle Note: This functionality is considered experimental and not intended for use inside analytic packages
+
+
+
+
sampleCohortDefinitionSet (
+ cohortDefinitionSet ,
+ cohortIds = cohortDefinitionSet $ cohortId ,
+ connectionDetails = NULL ,
+ connection = NULL ,
+ tempEmulationSchema = getOption ( "sqlRenderTempEmulationSchema" ) ,
+ cohortDatabaseSchema ,
+ outputDatabaseSchema = cohortDatabaseSchema ,
+ cohortTableNames = getCohortTableNames ( ) ,
+ n = NULL ,
+ sampleFraction = NULL ,
+ seed = 64374 ,
+ seedArgs = NULL ,
+ identifierExpression = "cohortId * 1000 + seed" ,
+ incremental = FALSE ,
+ incrementalFolder = NULL
+)
+
+
+
+
Arguments
+
cohortDefinitionSet
+The cohortDefinitionSet
argument must be a data frame with
+the following columns:
cohortId
+The unique integer identifier of the cohort
+
+cohortName
+The cohort's name
+
+sql
+The OHDSI-SQL used to generate the cohort
+
+Optionally, this data frame may contain:
json
+The Circe JSON representation of the cohort
+
+
+
+
+cohortIds
+Optional subset of cohortIds to generate. By default this function will sample all cohorts
+
+
+connectionDetails
+An object of type connectionDetails
as created using the
+createConnectionDetails
function in the
+DatabaseConnector package. Can be left NULL if connection
is
+provided.
+
+
+connection
+An object of type connection
as created using the
+connect
function in the
+DatabaseConnector package. Can be left NULL if connectionDetails
+is provided, in which case a new connection will be opened at the start
+of the function, and closed when the function finishes.
+
+
+tempEmulationSchema
+Some database platforms like Oracle and Impala do not truly support
+temp tables. To emulate temp tables, provide a schema with write
+privileges where temp tables can be created.
+
+
+cohortDatabaseSchema
+Schema name where your cohort tables reside. Note that for SQL Server,
+this should include both the database and schema name, for example
+'scratch.dbo'.
+
+
+outputDatabaseSchema
+optional schema to output cohorts to (if different from cohortDatabaseSchema)
+
+
+cohortTableNames
+The names of the cohort tables. See getCohortTableNames
+for more details.
+
+
+n
+Sample size. Ignored if sample fraction is set
+
+
+sampleFraction
+Fraction of cohort to sample
+
+
+seed
+Vector of seeds to give to the R pseudorandom number generator
+
+
+seedArgs
+optional arguments to pass to set.seed
+
+
+identifierExpression
+Optional string R expression used to compute output cohort id. Can only use variables
+cohortId and seed. Default is "cohortId * 1000 + seed", which is substituted and evaluated
+
+
+incremental
+Create only cohorts that haven't been created before?
+
+
+incrementalFolder
+If incremental = TRUE
, specify a folder where records are
+kept of which definition has been executed.
+
+
+
+
Value
+
+
+
sampledCohortDefinitionSet - a data.frame-like object that contains the resulting identifiers and modified names of cohorts
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/reference/saveCohortDefinitionSet.html b/docs/reference/saveCohortDefinitionSet.html
index 6e911f3..22c8814 100644
--- a/docs/reference/saveCohortDefinitionSet.html
+++ b/docs/reference/saveCohortDefinitionSet.html
@@ -1,5 +1,5 @@
-Save the cohort definition set to the file system — saveCohortDefinitionSet • CohortGenerator Save the cohort definition set to the file system — saveCohortDefinitionSet • CohortGenerator
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -43,6 +43,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -160,7 +163,7 @@ Arguments
diff --git a/docs/reference/saveCohortSubsetDefinition.html b/docs/reference/saveCohortSubsetDefinition.html
index be3f292..d2291c1 100644
--- a/docs/reference/saveCohortSubsetDefinition.html
+++ b/docs/reference/saveCohortSubsetDefinition.html
@@ -1,5 +1,5 @@
-Save cohort subset definitions to json — saveCohortSubsetDefinition • CohortGenerator Save cohort subset definitions to json — saveCohortSubsetDefinition • CohortGenerator
@@ -17,7 +17,7 @@
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -37,6 +37,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -98,7 +101,7 @@ Arguments
diff --git a/docs/reference/saveIncremental.html b/docs/reference/saveIncremental.html
index 91b4a65..0f48925 100644
--- a/docs/reference/saveIncremental.html
+++ b/docs/reference/saveIncremental.html
@@ -1,5 +1,5 @@
-Used in incremental mode to save values to a file — saveIncremental • CohortGenerator Used in incremental mode to save values to a file — saveIncremental • CohortGenerator Used to write a .csv file — writeCsv • CohortGenerator Used to write a .csv file — writeCsv • CohortGenerator
CohortGenerator
- 0.8.1
+ 0.9.0
@@ -48,6 +48,9 @@
Generating Cohorts
+
+ Sampling Cohorts
+
Changelog
@@ -150,7 +153,7 @@ Value
diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index d68963b..3808b80 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -12,6 +12,9 @@
/articles/index.html
+
+ /articles/SamplingCohorts.html
+
/authors.html
@@ -144,6 +147,9 @@
/reference/recordTasksDone.html
+
+ /reference/sampleCohortDefinitionSet.html
+
/reference/saveCohortDefinitionSet.html
diff --git a/extras/CohortGenerator.pdf b/extras/CohortGenerator.pdf
index 3248287..0822960 100644
Binary files a/extras/CohortGenerator.pdf and b/extras/CohortGenerator.pdf differ
diff --git a/extras/PackageMaintenance.R b/extras/PackageMaintenance.R
index 1995c62..9e73dbe 100644
--- a/extras/PackageMaintenance.R
+++ b/extras/PackageMaintenance.R
@@ -1,6 +1,6 @@
# @file PackageMaintenance
#
-# Copyright 2023 Observational Health Data Sciences and Informatics
+# Copyright 2024 Observational Health Data Sciences and Informatics
#
# This file is part of CohortGenerator
#
@@ -20,9 +20,9 @@
OhdsiRTools::formatRFolder("./R") #(note: this function has been impacted by change in formatR)
OhdsiRTools::checkUsagePackage("CohortGenerator")
OhdsiRTools::updateCopyrightYearFolder()
+devtools::spell_check()
styler::style_pkg()
devtools::document()
-devtools::spell_check()
# Create manual and vignettes:
unlink("extras/CohortGenerator.pdf")
@@ -41,6 +41,12 @@ rmarkdown::render("vignettes/CreatingCohortSubsetDefinitions.Rmd",
toc = TRUE,
number_sections = TRUE))
+rmarkdown::render("vignettes/SamplingCohorts.Rmd",
+ output_file = "../inst/doc/SamplingCohorts.pdf",
+ rmarkdown::pdf_document(latex_engine = "pdflatex",
+ toc = TRUE,
+ number_sections = TRUE))
+
unloadNamespace("CohortGenerator")
pkgdown::build_site()
OhdsiRTools::fixHadesLogo()
diff --git a/inst/doc/CreatingCohortSubsetDefinitions.pdf b/inst/doc/CreatingCohortSubsetDefinitions.pdf
new file mode 100644
index 0000000..61986cb
Binary files /dev/null and b/inst/doc/CreatingCohortSubsetDefinitions.pdf differ
diff --git a/inst/doc/GeneratingCohorts.pdf b/inst/doc/GeneratingCohorts.pdf
new file mode 100644
index 0000000..bcb7ad4
Binary files /dev/null and b/inst/doc/GeneratingCohorts.pdf differ
diff --git a/inst/doc/SamplingCohorts.pdf b/inst/doc/SamplingCohorts.pdf
new file mode 100644
index 0000000..a27ffb3
Binary files /dev/null and b/inst/doc/SamplingCohorts.pdf differ
diff --git a/man/sampleCohortDefinitionSet.Rd b/man/sampleCohortDefinitionSet.Rd
index 34a0210..8612bc5 100644
--- a/man/sampleCohortDefinitionSet.Rd
+++ b/man/sampleCohortDefinitionSet.Rd
@@ -61,7 +61,7 @@ for more details.}
\item{sampleFraction}{Fraction of cohort to sample}
-\item{seed}{Vector of seeds to give to the R psuedorandom number generator}
+\item{seed}{Vector of seeds to give to the R pseudorandom number generator}
\item{seedArgs}{optional arguments to pass to set.seed}
@@ -80,7 +80,7 @@ sampledCohortDefinitionSet - a data.frame like object that contains the resultin
Create 1 or more sample of size n of a cohort definition set
Subsetted cohorts can be sampled, as with any other subset form.
-However, subsetting a sampled cohort is not reccomended and not currently supported at this time.
+However, subsetting a sampled cohort is not recommended and not currently supported at this time.
In the case where n > cohort count the entire cohort is copied unmodified
As different databases have different forms of randomness, the random selection is computed in
diff --git a/tests/testthat/test-CohortCount.R b/tests/testthat/test-CohortCount.R
index 03c27bd..c796a63 100644
--- a/tests/testthat/test-CohortCount.R
+++ b/tests/testthat/test-CohortCount.R
@@ -129,19 +129,19 @@ test_that("Call getCohortCounts with no cohortId specified and cohortDefinitionS
packageName = "CohortGenerator",
verbose = TRUE
)
-
+
cohortDefinitionSet <- rbind(
cohortDefinitionSet,
cohortDefinitionSet[1, ] |> transform(atlasId = 100, cohortId = 100, cohortName = "not in cohort table", logicDescription = "not in cohort table")
)
-
+
testCohortCounts <- getCohortCounts(
connectionDetails = connectionDetails,
cohortDatabaseSchema = "main",
cohortTable = "cohort",
cohortDefinitionSet = cohortDefinitionSet
)
-
+
expect_true(nrow(testCohortCounts) == 4)
expect_true(testCohortCounts[testCohortCounts$cohortId == 100, "cohortEntries"] == 0)
expect_true(testCohortCounts[testCohortCounts$cohortId == 100, "cohortSubjects"] == 0)
diff --git a/tests/testthat/test-CohortSample.R b/tests/testthat/test-CohortSample.R
index 4b740cd..047df53 100644
--- a/tests/testthat/test-CohortSample.R
+++ b/tests/testthat/test-CohortSample.R
@@ -1,11 +1,12 @@
-
test_that("sampleCohortDefinitionSet", {
connectionDetails <- Eunomia::getEunomiaConnectionDetails()
conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
on.exit(DatabaseConnector::disconnect(conn))
- cohortTableNames <- getCohortTableNames(cohortTable = "cohort",
- cohortSampleTable = "cohort_sample")
+ cohortTableNames <- getCohortTableNames(
+ cohortTable = "cohort",
+ cohortSampleTable = "cohort_sample"
+ )
recordKeepingFolder <- file.path(outputFolder, "RecordKeepingSamples")
createCohortTables(
@@ -41,10 +42,12 @@ test_that("sampleCohortDefinitionSet", {
expect_true(all(cds$cohortId * 1000 + 64374 == sampledCohorts$cohortId))
# Sample table pouplated
- res <- DatabaseConnector::renderTranslateQuerySql(connection = conn,
- "SELECT cohort_definition_id, count(*) as ct FROM cohort_sample
+ res <- DatabaseConnector::renderTranslateQuerySql(
+ connection = conn,
+ "SELECT cohort_definition_id, count(*) as ct FROM cohort_sample
GROUP BY cohort_definition_id
- ")
+ "
+ )
expect_true(all(res$ct == 10))
expect_true(all(sampledCohorts$status == "generated"))
# Test incrmental logic works
@@ -80,11 +83,15 @@ test_that(".getSampleSet", {
connection <- DatabaseConnector::connect(dbms = "sqlite", server = ":memory:")
on.exit(DatabaseConnector::disconnect(connection))
- DatabaseConnector::insertTable(connection = connection,
- tableName = "cohort",
- camelCaseToSnakeCase = TRUE,
- data = data.frame(cohortDefinitionId = 1,
- subjectId = 1:1e5))
+ DatabaseConnector::insertTable(
+ connection = connection,
+ tableName = "cohort",
+ camelCaseToSnakeCase = TRUE,
+ data = data.frame(
+ cohortDefinitionId = 1,
+ subjectId = 1:1e5
+ )
+ )
cohortDatabaseSchema <- "main"
targetTable <- "cohort"
@@ -94,62 +101,71 @@ test_that(".getSampleSet", {
seedArgs <- NULL
res <- .getSampleSet(connection,
- n,
- sampleFraction = NULL,
- seed,
- seedArgs,
- cohortDatabaseSchema,
- targetCohortId,
- targetTable)
+ n,
+ sampleFraction = NULL,
+ seed,
+ seedArgs,
+ cohortDatabaseSchema,
+ targetCohortId,
+ targetTable
+ )
checkmate::expect_data_frame(res, types = "integer", nrows = n)
res2 <- .getSampleSet(connection,
- n,
- sampleFraction = NULL,
- seed,
- seedArgs,
- cohortDatabaseSchema,
- targetCohortId,
- targetTable)
+ n,
+ sampleFraction = NULL,
+ seed,
+ seedArgs,
+ cohortDatabaseSchema,
+ targetCohortId,
+ targetTable
+ )
# use of the same seed should produce the same result
expect_true(all(res$rand_id == res2$rand_id))
res2 <- .getSampleSet(connection,
- n,
- sampleFraction = NULL,
- seed + 1,
- seedArgs,
- cohortDatabaseSchema,
- targetCohortId,
- targetTable)
+ n,
+ sampleFraction = NULL,
+ seed + 1,
+ seedArgs,
+ cohortDatabaseSchema,
+ targetCohortId,
+ targetTable
+ )
expect_false(all(res$rand_id == res2$rand_id))
- DatabaseConnector::insertTable(connection = connection,
- tableName = "cohort",
- camelCaseToSnakeCase = TRUE,
- data = data.frame(cohortDefinitionId = 2,
- subjectId = 1:25))
+ DatabaseConnector::insertTable(
+ connection = connection,
+ tableName = "cohort",
+ camelCaseToSnakeCase = TRUE,
+ data = data.frame(
+ cohortDefinitionId = 2,
+ subjectId = 1:25
+ )
+ )
# Where n > count should return count rows
res3 <- .getSampleSet(connection,
- n,
- sampleFraction = NULL,
- seed,
- seedArgs,
- cohortDatabaseSchema,
- targetCohortId = 2,
- targetTable)
+ n,
+ sampleFraction = NULL,
+ seed,
+ seedArgs,
+ cohortDatabaseSchema,
+ targetCohortId = 2,
+ targetTable
+ )
checkmate::expect_data_frame(res3, types = "integer", nrows = 25)
res4 <- .getSampleSet(connection,
- n = NULL,
- sampleFraction = 0.5,
- seed,
- seedArgs,
- cohortDatabaseSchema,
- targetCohortId = 2,
- targetTable)
+ n = NULL,
+ sampleFraction = 0.5,
+ seed,
+ seedArgs,
+ cohortDatabaseSchema,
+ targetCohortId = 2,
+ targetTable
+ )
checkmate::expect_data_frame(res4, types = "integer", nrows = 12)
})
@@ -160,35 +176,41 @@ test_that(".sampleCohort", {
on.exit(DatabaseConnector::disconnect(connection))
cohortCount <- 1000
- startDates <- sample(seq(as.Date('2001/01/01'), as.Date('2023/01/01'), by = "day"), cohortCount)
+ startDates <- sample(seq(as.Date("2001/01/01"), as.Date("2023/01/01"), by = "day"), cohortCount)
endDates <- startDates + sample(1:800, cohortCount, replace = TRUE)
- tData <- data.frame(cohortDefinitionId = 1,
- subjectId = 1:cohortCount,
- cohortStartDate = startDates,
- cohortEndDate = endDates)
+ tData <- data.frame(
+ cohortDefinitionId = 1,
+ subjectId = 1:cohortCount,
+ cohortStartDate = startDates,
+ cohortEndDate = endDates
+ )
# dupes ensures that dense_rank allows selection of multiple cohort entries for the same subject
tData <- rbind(tData, tData)
- DatabaseConnector::insertTable(connection = connection,
- tableName = "cohort",
- camelCaseToSnakeCase = TRUE,
- data = tData)
+ DatabaseConnector::insertTable(
+ connection = connection,
+ tableName = "cohort",
+ camelCaseToSnakeCase = TRUE,
+ data = tData
+ )
sampleTable <- data.frame(rand_id = c(7, 8, 9, 10, 33, 198))
.sampleCohort(connection,
- targetCohortId = 1,
- targetTable = "cohort",
- outputCohortId = 999,
- outputTable = "cohort",
- cohortDatabaseSchema = "main",
- outputDatabaseSchema = "main",
- sampleTable = sampleTable,
- seed = 1,
- tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"))
+ targetCohortId = 1,
+ targetTable = "cohort",
+ outputCohortId = 999,
+ outputTable = "cohort",
+ cohortDatabaseSchema = "main",
+ outputDatabaseSchema = "main",
+ sampleTable = sampleTable,
+ seed = 1,
+ tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")
+ )
resCohort <- DatabaseConnector::renderTranslateQuerySql(connection,
- "SELECT * FROM main.cohort WHERE cohort_definition_id = 999",
- snakeCaseToCamelCase = TRUE)
+ "SELECT * FROM main.cohort WHERE cohort_definition_id = 999",
+ snakeCaseToCamelCase = TRUE
+ )
checkmate::expect_data_frame(resCohort, nrows = nrow(sampleTable) * 2)
expect_true(all(resCohort$subjectId %in% sampleTable$rand_id))
@@ -208,8 +230,10 @@ test_that("checkUniqueOutputIds returns error when duplicate ids are present", {
identifierExpression <- "cohortId"
cohortTableNames <- list(cohortTable = "cohort", cohortSampleTable = "cohort")
- expect_error(.checkUniqueOutputIds(cohortIds, seed, identifierExpression, cohortTableNames),
- "identifier expression does not produce unique output")
+ expect_error(
+ .checkUniqueOutputIds(cohortIds, seed, identifierExpression, cohortTableNames),
+ "identifier expression does not produce unique output"
+ )
})
test_that("checkUniqueOutputIds does not return error when all ids are unique", {
@@ -234,45 +258,45 @@ test_that("checkUniqueOutputIds does not return error when cohortTable and cohor
test_that("Error on bad params", {
# No connection details
expect_error({
- sampledCohorts <- sampleCohortDefinitionSet(
- cohortDefinitionSet = cds,
- connection = NULL,
- n = 10,
- sampleFraction = NULL,
- seed = 64374,
- cohortDatabaseSchema = "main",
- cohortTableNames = cohortTableNames,
- incremental = TRUE,
- incrementalFolder = recordKeepingFolder
- )
+ sampledCohorts <- sampleCohortDefinitionSet(
+ cohortDefinitionSet = cds,
+ connection = NULL,
+ n = 10,
+ sampleFraction = NULL,
+ seed = 64374,
+ cohortDatabaseSchema = "main",
+ cohortTableNames = cohortTableNames,
+ incremental = TRUE,
+ incrementalFolder = recordKeepingFolder
+ )
})
expect_error({
- sampledCohorts <- sampleCohortDefinitionSet(
- cohortDefinitionSet = cds,
- connectionDetails = Eunomia::getEunomiaConnectionDetails(),
- n = NULL,
- sampleFraction = NULL,
- seed = 64374,
- cohortDatabaseSchema = "main",
- cohortTableNames = cohortTableNames,
- incremental = TRUE,
- incrementalFolder = recordKeepingFolder
- )
+ sampledCohorts <- sampleCohortDefinitionSet(
+ cohortDefinitionSet = cds,
+ connectionDetails = Eunomia::getEunomiaConnectionDetails(),
+ n = NULL,
+ sampleFraction = NULL,
+ seed = 64374,
+ cohortDatabaseSchema = "main",
+ cohortTableNames = cohortTableNames,
+ incremental = TRUE,
+ incrementalFolder = recordKeepingFolder
+ )
})
expect_error({
- sampledCohorts <- sampleCohortDefinitionSet(
- cohortDefinitionSet = cds,
- connectionDetails = Eunomia::getEunomiaConnectionDetails(),
- n = 10,
- sampleFraction = NULL,
- seed = 64374,
- cohortDatabaseSchema = "main",
- cohortTableNames = cohortTableNames,
- incremental = TRUE,
- incrementalFolder = NULL
- )
+ sampledCohorts <- sampleCohortDefinitionSet(
+ cohortDefinitionSet = cds,
+ connectionDetails = Eunomia::getEunomiaConnectionDetails(),
+ n = 10,
+ sampleFraction = NULL,
+ seed = 64374,
+ cohortDatabaseSchema = "main",
+ cohortTableNames = cohortTableNames,
+ incremental = TRUE,
+ incrementalFolder = NULL
+ )
})
-})
\ No newline at end of file
+})
diff --git a/tests/testthat/test-CohortTables.R b/tests/testthat/test-CohortTables.R
index d05e4b6..a22d953 100644
--- a/tests/testthat/test-CohortTables.R
+++ b/tests/testthat/test-CohortTables.R
@@ -163,10 +163,10 @@ test_that("Create cohort tables with incremental = TRUE and partial table creati
test_that("Cohort sample table does not exist for backwards compatibility", {
cohortTableNames <- getCohortTableNames(cohortTable = "cohortSampleTable")
-
+
# Remove the sample table to make sure the create cohort table works
cohortTableNames <- cohortTableNames[-which(names(cohortTableNames) == "cohortSampleTable")]
-
+
# Create the cohort tables
expect_invisible(
createCohortTables(
diff --git a/tests/testthat/test-NegativeControlCohorts.R b/tests/testthat/test-NegativeControlCohorts.R
index 19c4d02..f2444e6 100644
--- a/tests/testthat/test-NegativeControlCohorts.R
+++ b/tests/testthat/test-NegativeControlCohorts.R
@@ -240,9 +240,9 @@ test_that("incremental mode", {
incrementalFolder = incrementalFolder,
incremental = TRUE
)
-
+
expect_equal(res, "SKIPPED")
-
+
# Test changing other params regenerates
res <- generateNegativeControlOutcomeCohorts(
connection = connection,
@@ -255,9 +255,9 @@ test_that("incremental mode", {
incrementalFolder = incrementalFolder,
incremental = TRUE
)
-
+
expect_equal(res, "FINISHED")
-
+
# Test changing other params regenerates
res <- generateNegativeControlOutcomeCohorts(
connection = connection,
@@ -270,10 +270,10 @@ test_that("incremental mode", {
incrementalFolder = incrementalFolder,
incremental = TRUE
)
-
+
expect_equal(res, "SKIPPED")
-
-
+
+
res <- generateNegativeControlOutcomeCohorts(
connection = connection,
cdmDatabaseSchema = "main",
@@ -285,10 +285,10 @@ test_that("incremental mode", {
incrementalFolder = incrementalFolder,
incremental = TRUE
)
-
+
expect_equal(res, "FINISHED")
-
-
+
+
res <- generateNegativeControlOutcomeCohorts(
connection = connection,
cdmDatabaseSchema = "main",
@@ -300,7 +300,6 @@ test_that("incremental mode", {
incrementalFolder = incrementalFolder,
incremental = TRUE
)
-
+
expect_equal(res, "SKIPPED")
-
-})
\ No newline at end of file
+})
diff --git a/tests/testthat/test-dbms-platforms.R b/tests/testthat/test-dbms-platforms.R
index cdb5caa..4332f74 100644
--- a/tests/testthat/test-dbms-platforms.R
+++ b/tests/testthat/test-dbms-platforms.R
@@ -34,7 +34,7 @@ testPlatform <- function(dbmsDetails) {
incrementalFolder = file.path(outputFolder, "RecordKeeping", dbmsDetails$connectionDetails$dbms)
)
expect_equal(nrow(cohortsGenerated), nrow(cohortsWithStats))
-
+
# Get the cohort counts
cohortCounts <- getCohortCounts(
connectionDetails = dbmsDetails$connectionDetails,
@@ -42,9 +42,9 @@ testPlatform <- function(dbmsDetails) {
cohortTable = cohortTableNames$cohortTable,
databaseId = dbmsDetails$dbmsPlatform,
cohortDefinitionSet = cohortsWithStats
- )
+ )
expect_equal(nrow(cohortsGenerated), nrow(cohortCounts))
-
+
# Insert the inclusion rule names before exporting the stats tables
insertInclusionRuleNames(
connectionDetails = dbmsDetails$connectionDetails,
@@ -52,7 +52,7 @@ testPlatform <- function(dbmsDetails) {
cohortDatabaseSchema = dbmsDetails$cohortDatabaseSchema,
cohortInclusionTable = cohortTableNames$cohortInclusionTable
)
-
+
exportCohortStatsTables(
connectionDetails = dbmsDetails$connectionDetails,
cohortTableNames = cohortTableNames,
@@ -63,7 +63,7 @@ testPlatform <- function(dbmsDetails) {
incremental = TRUE,
databaseId = dbmsDetails$dbmsPlatform
)
-
+
subsetOperations <- list(
createCohortSubset(
cohortIds = 2,
diff --git a/vignettes/SamplingCohorts.Rmd b/vignettes/SamplingCohorts.Rmd
index 82f7f4d..eb7de00 100644
--- a/vignettes/SamplingCohorts.Rmd
+++ b/vignettes/SamplingCohorts.Rmd
@@ -27,13 +27,12 @@ someFolder <- tempdir()
packageRoot <- tempdir()
baseUrl <- "https://api.ohdsi.org/WebAPI"
library(CohortGenerator)
-
```
# Sampling with CohortGenerator
-Large populations of individuals (e.g. all subjects recieving a COVID-19 vaccination) can often be too large to work with when
+Large populations of individuals (e.g. all subjects receiving a COVID-19 vaccination) can often be too large to work with when
pulling down a large collection of covariates for further analysis.
-This is prohibitive when designing studies or atteming to generate phenotypes.
+This is prohibitive when designing studies or attempting to generate phenotypes.
This guide aims to demonstrate how one can use the `sampleCohortDefinitionSet` functionality to produce sufficiently
large sample cohorts from a base `cohortDefinitionSet`.
@@ -80,20 +79,19 @@ generateCohortSet(
incremental = TRUE,
incrementalFolder = recordKeepingFolder
)
-
```
We can then create a new cohort definition set from the original sample.
```{r eval=F}
sampledCohortDefinitionSet <- sampleCohortDefinitionSet(
- cohortDefinitionSet = cds,
- connection = conn,
- sampleFraction = 0.33,
- seed = 64374, # OHDSI
- cohortDatabaseSchema = "main",
- cohortTableNames = cohortTableNames,
- incremental = TRUE,
- incrementalFolder = recordKeepingFolder
- )
+ cohortDefinitionSet = cds,
+ connection = conn,
+ sampleFraction = 0.33,
+ seed = 64374, # OHDSI
+ cohortDatabaseSchema = "main",
+ cohortTableNames = cohortTableNames,
+ incremental = TRUE,
+ incrementalFolder = recordKeepingFolder
+)
```
The resulting `sampledCohortDefinitionSet` is nearly identical to the base cohort set, however a few changes occur:
@@ -109,15 +107,15 @@ To generate multiple samples, simply specify multiple seed variables as follows:
```{r eval=F}
# Generate 800 samples of size n
sampledCohortDefinitionSet <- sampleCohortDefinitionSet(
- cohortDefinitionSet = cds,
- connection = conn,
- n = 1000,
- seed = 1:800 * 64374, # OHDSI
- cohortDatabaseSchema = "main",
- cohortTableNames = cohortTableNames,
- incremental = TRUE,
- incrementalFolder = recordKeepingFolder
- )
+ cohortDefinitionSet = cds,
+ connection = conn,
+ n = 1000,
+ seed = 1:800 * 64374, # OHDSI
+ cohortDatabaseSchema = "main",
+ cohortTableNames = cohortTableNames,
+ incremental = TRUE,
+ incrementalFolder = recordKeepingFolder
+)
```
Note that using incremental mode for your sampled cohorts will also work.
In this case, a cohort will only be re-generated if the checksum of the base cohort has changed (the checksum is based