diff --git a/DESCRIPTION b/DESCRIPTION index 6bfbbba..dd768aa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: CohortGenerator Type: Package Title: An R Package for Cohort Generation Against the OMOP CDM -Version: 0.8.1 -Date: 2023-10-10 +Version: 0.9.0 +Date: 2024-05-28 Authors@R: c( person("Anthony", "Sena", email = "sena@ohdsi.org", role = c("aut", "cre")), person("Jamie", "Gilbert", role = c("aut")), diff --git a/NEWS.md b/NEWS.md index f1cb257..d68e06c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,22 @@ +CohortGenerator 0.9.0 +======================= +- Random sample functionality (for development only) (Issue #129) +- Incremental mode for negative control cohort generation (Issue #137) +- Fixes getCohortCounts() if cohortIds is not specified, but cohortDefinitionSet is. (Issue #136) +- Add cohort ID to generation output messages (Issue #132) +- Add databaseId to output of getStatsTable() (Issue #116) +- Prevent duplicate cohort IDs in cohortDefinitionSet (Issue #130) +- Fix cohort stats query for Oracle (Issue #143) +- Ensure databaseId applied to all returned cohort counts (Issue #144) +- Preserve backwards compatibility if cohort sample table is not in the list of cohort table names (Issue #147) + + CohortGenerator 0.8.1 ======================= - Include cohorts with 0 people in cohort counts (Issue #91). 
- Use numeric for cohort ID (Issue #98) - Allow big ints for target pairs (#103) -- Pass `tempEmulationSchema` when creating negative controlc ohorts (#104) +- Pass `tempEmulationSchema` when creating negative control cohorts (#104) - Target CDM v5.4 for unit tests (#119) - Fix for subset references (#115) - Allow for subset cohort name templating (#118) diff --git a/R/CohortConstruction.R b/R/CohortConstruction.R index 600b3f6..2dfb86a 100644 --- a/R/CohortConstruction.R +++ b/R/CohortConstruction.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # @@ -80,8 +80,8 @@ generateCohortSet <- function(connectionDetails = NULL, # set before generating if (length(unique(cohortDefinitionSet$cohortId)) != length(cohortDefinitionSet$cohortId)) { duplicatedCohortIds <- cohortDefinitionSet$cohortId[duplicated(cohortDefinitionSet$cohortId)] - stop("Cannot generate! Duplicate cohort IDs found in your cohortDefinitionSet: ", paste(duplicatedCohortIds, sep=","), ". Please fix your cohortDefinitionSet and try again.") - } + stop("Cannot generate! Duplicate cohort IDs found in your cohortDefinitionSet: ", paste(duplicatedCohortIds, sep = ","), ". 
Please fix your cohortDefinitionSet and try again.") + } if (is.null(connection) && is.null(connectionDetails)) { stop("You must provide either a database connection or the connection details.") } diff --git a/R/CohortCount.R b/R/CohortCount.R index ca381f4..f14244f 100644 --- a/R/CohortCount.R +++ b/R/CohortCount.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # diff --git a/R/CohortDefinitionSet.R b/R/CohortDefinitionSet.R index 38d1428..7b09547 100644 --- a/R/CohortDefinitionSet.R +++ b/R/CohortDefinitionSet.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # @@ -534,4 +534,4 @@ checkSettingsColumns <- function(columnNames, settingsFileName = NULL) { } copyToCds -} \ No newline at end of file +} diff --git a/R/CohortGenerator.R b/R/CohortGenerator.R index 3a1cc51..b82cdd1 100644 --- a/R/CohortGenerator.R +++ b/R/CohortGenerator.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # diff --git a/R/CohortSample.R b/R/CohortSample.R index dadc552..6cb393b 100644 --- a/R/CohortSample.R +++ b/R/CohortSample.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # @@ -31,10 +31,11 @@ countSql <- "SELECT COUNT(DISTINCT SUBJECT_ID) as cnt FROM @cohort_database_schema.@target_table WHERE cohort_definition_id = @target_cohort_id" count <- DatabaseConnector::renderTranslateQuerySql(connection, - countSql, - cohort_database_schema = cohortDatabaseSchema, - target_cohort_id = targetCohortId, - 
target_table = targetTable) %>% + countSql, + cohort_database_schema = cohortDatabaseSchema, + target_cohort_id = targetCohortId, + target_table = targetTable + ) %>% dplyr::pull() if (!is.null(sampleFraction)) { @@ -68,26 +69,28 @@ sampleTable, seed, tempEmulationSchema) { - randSampleTableName <- paste0("#SAMPLE_TABLE_", seed) - DatabaseConnector::insertTable(connection = connection, - data = sampleTable, - dropTableIfExists = TRUE, - tempTable = TRUE, - tempEmulationSchema = tempEmulationSchema, - tableName = randSampleTableName) + DatabaseConnector::insertTable( + connection = connection, + data = sampleTable, + dropTableIfExists = TRUE, + tempTable = TRUE, + tempEmulationSchema = tempEmulationSchema, + tableName = randSampleTableName + ) execSql <- SqlRender::readSql(system.file("sql", "sql_server", "sampling", "RandomSample.sql", package = "CohortGenerator")) DatabaseConnector::renderTranslateExecuteSql(connection, - execSql, - tempEmulationSchema = tempEmulationSchema, - random_sample_table = randSampleTableName, - target_cohort_id = targetCohortId, - output_cohort_id = outputCohortId, - cohort_database_schema = cohortDatabaseSchema, - output_database_schema = outputDatabaseSchema, - output_table = outputTable, - target_table = targetTable) + execSql, + tempEmulationSchema = tempEmulationSchema, + random_sample_table = randSampleTableName, + target_cohort_id = targetCohortId, + output_cohort_id = outputCohortId, + cohort_database_schema = cohortDatabaseSchema, + output_database_schema = outputDatabaseSchema, + output_table = outputTable, + target_table = targetTable + ) } @@ -115,7 +118,7 @@ idSet <- c(idSet, cohortIds) } errorMessage <- "identifier expression does not produce unique output for cohort ids" - if(length(unique(idSet)) != length(idSet)) stop(errorMessage) + if (length(unique(idSet)) != length(idSet)) stop(errorMessage) invisible(NULL) } @@ -125,7 +128,7 @@ #' Create 1 or more sample of size n of a cohort definition set #' #' Subsetted cohorts 
can be sampled, as with any other subset form. -#' However, subsetting a sampled cohort is not reccomended and not currently supported at this time. +#' However, subsetting a sampled cohort is not recommended and not currently supported at this time. #' In the case where n > cohort count the entire cohort is copied unmodified #' #' As different databases have different forms of randomness, the random selection is computed in @@ -140,7 +143,7 @@ #' @param identifierExpression Optional string R expression used to compute output cohort id. Can only use variables #' cohortId and seed. Default is "cohortId * 1000 + seed", which is substituted and evaluated #' @param cohortIds Optional subset of cohortIds to generate. By default this function will sample all cohorts -#' @param seed Vector of seeds to give to the R psuedorandom number generator +#' @param seed Vector of seeds to give to the R pseudorandom number generator #' @param seedArgs optional arguments to pass to set.seed #' @param outputDatabaseSchema optional schema to output cohorts to (if different from cohortDatabaseSchema) #' @export @@ -161,21 +164,21 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet, identifierExpression = "cohortId * 1000 + seed", incremental = FALSE, incrementalFolder = NULL) { - checkmate::assertIntegerish(n, len = 1, null.ok = TRUE) checkmate::assertNumeric(sampleFraction, len = 1, null.ok = TRUE, lower = 0, upper = 1.0) checkmate::assertIntegerish(seed, min.len = 1) checkmate::assertDataFrame(cohortDefinitionSet, min.rows = 1, col.names = "named") checkmate::assertNames(colnames(cohortDefinitionSet), - must.include = c( - "cohortId", - "cohortName", - "sql" - ) + must.include = c( + "cohortId", + "cohortName", + "sql" + ) ) - if (is.null(n) && is.null(sampleFraction)) + if (is.null(n) && is.null(sampleFraction)) { stop("Must specificy n or fraction size") + } if (is.null(connection) && is.null(connectionDetails)) { stop("You must provide either a database connection or the 
connection details.") @@ -208,27 +211,35 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet, sampledCohortDefinition$isSample <- TRUE sampledCohortDefinition$status <- "ungenerated" - outputCohortId <- .computeIdentifierExpression(identifierExpression, - sampledCohortDefinition$cohortId, - seed) + outputCohortId <- .computeIdentifierExpression( + identifierExpression, + sampledCohortDefinition$cohortId, + seed + ) sampledCohortDefinition$sampleTargetCohortId <- sampledCohortDefinition$cohortId sampledCohortDefinition$cohortId <- outputCohortId if (!is.null(sampleFraction)) { - sampledCohortDefinition$cohortName <- sprintf("%s [%s%% SAMPLE seed=%s]", - sampledCohortDefinition$cohortName, seed, sampleFraction * 100) + sampledCohortDefinition$cohortName <- sprintf( + "%s [%s%% SAMPLE seed=%s]", + sampledCohortDefinition$cohortName, seed, sampleFraction * 100 + ) } else { - sampledCohortDefinition$cohortName <- sprintf("%s [SAMPLE seed=%s n=%s]", - sampledCohortDefinition$cohortName, seed, n) + sampledCohortDefinition$cohortName <- sprintf( + "%s [SAMPLE seed=%s n=%s]", + sampledCohortDefinition$cohortName, seed, n + ) } if (hasSubsetDefinitions(cohortDefinitionSet)) { # must maintain mapping for subset parent ids - sampledCohortDefinition$subsetParent <- .computeIdentifierExpression(identifierExpression, - sampledCohortDefinition$subsetParent, - seed) + sampledCohortDefinition$subsetParent <- .computeIdentifierExpression( + identifierExpression, + sampledCohortDefinition$subsetParent, + seed + ) } - + if (incremental && !isTaskRequired( cohortId = outputCohortId, seed = seed, @@ -239,30 +250,34 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet, return(sampledCohortDefinition) } # check incremental task for cohort sampling - sampleTable <- .getSampleSet(connection = connection, - n = n, - sampleFraction = sampleFraction, - seed = seed + targetCohortId, # Seed is unique to each target cohort - seedArgs = seedArgs, - cohortDatabaseSchema = 
cohortDatabaseSchema, - targetCohortId = targetCohortId, - targetTable = cohortTableNames$cohortTable) + sampleTable <- .getSampleSet( + connection = connection, + n = n, + sampleFraction = sampleFraction, + seed = seed + targetCohortId, # Seed is unique to each target cohort + seedArgs = seedArgs, + cohortDatabaseSchema = cohortDatabaseSchema, + targetCohortId = targetCohortId, + targetTable = cohortTableNames$cohortTable + ) if (nrow(sampleTable) == 0) { ParallelLogger::logInfo("No entires found for ", targetCohortId, " was it generated?") return(sampledCohortDefinition) } # Called only for side effects - .sampleCohort(connection = connection, - targetCohortId = targetCohortId, - targetTable = cohortTableNames$cohortTable, - outputCohortId = outputCohortId, - outputTable = cohortTableNames$cohortSampleTable, - cohortDatabaseSchema = cohortDatabaseSchema, - outputDatabaseSchema = outputDatabaseSchema, - sampleTable = sampleTable, - seed = seed + targetCohortId, # Seed is unique to each target cohort - tempEmulationSchema = tempEmulationSchema) + .sampleCohort( + connection = connection, + targetCohortId = targetCohortId, + targetTable = cohortTableNames$cohortTable, + outputCohortId = outputCohortId, + outputTable = cohortTableNames$cohortSampleTable, + cohortDatabaseSchema = cohortDatabaseSchema, + outputDatabaseSchema = outputDatabaseSchema, + sampleTable = sampleTable, + seed = seed + targetCohortId, # Seed is unique to each target cohort + tempEmulationSchema = tempEmulationSchema + ) sampledCohortDefinition$status <- "generated" if (incremental) { @@ -275,7 +290,7 @@ sampleCohortDefinitionSet <- function(cohortDefinitionSet, } return(sampledCohortDefinition) }, seed, cohortIds) %>% - dplyr::bind_rows() + dplyr::bind_rows() diff --git a/R/CohortStats.R b/R/CohortStats.R index f9caa5d..9b1c2b5 100644 --- a/R/CohortStats.R +++ b/R/CohortStats.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational 
Health Data Sciences and Informatics # # This file is part of CohortGenerator # diff --git a/R/CohortTables.R b/R/CohortTables.R index 27816fc..bf0e2b9 100644 --- a/R/CohortTables.R +++ b/R/CohortTables.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # @@ -196,9 +196,9 @@ dropCohortStatsTables <- function(connectionDetails = NULL, } } -.checkCohortTables <- function (connection, - cohortDatabaseSchema, - cohortTableNames) { +.checkCohortTables <- function(connection, + cohortDatabaseSchema, + cohortTableNames) { # Verify the cohort tables exist and if they do not # stop the generation process tableExistsFlagList <- lapply(cohortTableNames, FUN = function(x) { @@ -221,4 +221,4 @@ dropCohortStatsTables <- function(connectionDetails = NULL, errorMsg <- paste(errorMsg, "Please use the createCohortTables function to ensure all tables exist before generating cohorts.", sep = "\n") stop(errorMsg) } -} \ No newline at end of file +} diff --git a/R/CsvHelper.R b/R/CsvHelper.R index a469507..25d3d10 100644 --- a/R/CsvHelper.R +++ b/R/CsvHelper.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # diff --git a/R/Export.R b/R/Export.R index ea958b6..f351821 100644 --- a/R/Export.R +++ b/R/Export.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # diff --git a/R/Incremental.R b/R/Incremental.R index ff9ce9c..de400e4 100644 --- a/R/Incremental.R +++ b/R/Incremental.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of 
CohortGenerator # diff --git a/R/NegativeControlCohorts.R b/R/NegativeControlCohorts.R index bef2695..516662c 100644 --- a/R/NegativeControlCohorts.R +++ b/R/NegativeControlCohorts.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # @@ -53,8 +53,8 @@ createEmptyNegativeControlOutcomeCohortSet <- function(verbose = FALSE) { #' @keywords internal .getNegativeControlOutcomeCohortSetSpecification <- function() { return(readCsv(system.file("negativeControlOutcomeCohortSetSpecificationDescription.csv", - package = "CohortGenerator", - mustWork = TRUE + package = "CohortGenerator", + mustWork = TRUE ))) } @@ -111,7 +111,7 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL, checkmate::assert_choice(x = tolower(occurrenceType), choices = c("all", "first")) checkmate::assert_logical(detectOnDescendants) checkmate::assertNames(colnames(negativeControlOutcomeCohortSet), - must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName + must.include = .getNegativeControlOutcomeCohortSetSpecification()$columnName ) checkmate::assert_data_frame( x = negativeControlOutcomeCohortSet, @@ -122,9 +122,9 @@ generateNegativeControlOutcomeCohorts <- function(connectionDetails = NULL, # cohort definition set before generating if (length(unique(negativeControlOutcomeCohortSet$cohortId)) != length(negativeControlOutcomeCohortSet$cohortId)) { duplicatedCohortIds <- negativeControlOutcomeCohortSet$cohortId[duplicated(negativeControlOutcomeCohortSet$cohortId)] - stop("Cannot generate! Duplicate cohort IDs found in your negativeControlOutcomeCohortSet: ", paste(duplicatedCohortIds, sep=","), ". Please fix your negativeControlOutcomeCohortSet and try again.") - } - + stop("Cannot generate! Duplicate cohort IDs found in your negativeControlOutcomeCohortSet: ", paste(duplicatedCohortIds, sep = ","), ". 
Please fix your negativeControlOutcomeCohortSet and try again.") + } + if (incremental) { if (is.null(incrementalFolder)) { stop("Must specify incrementalFolder when incremental = TRUE") diff --git a/R/SubsetDefinitions.R b/R/SubsetDefinitions.R index f328812..4453fd6 100644 --- a/R/SubsetDefinitions.R +++ b/R/SubsetDefinitions.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # diff --git a/R/SubsetQueryBuilders.R b/R/SubsetQueryBuilders.R index 195d9b3..4edef34 100644 --- a/R/SubsetQueryBuilders.R +++ b/R/SubsetQueryBuilders.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # diff --git a/R/Subsets.R b/R/Subsets.R index 1444448..c50a0f8 100644 --- a/R/Subsets.R +++ b/R/Subsets.R @@ -1,4 +1,4 @@ -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # diff --git a/_pkgdown.yml b/_pkgdown.yml index 8ee695f..d0aa7e1 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -102,6 +102,13 @@ reference: - isTaskRequired - saveIncremental - computeChecksum + + - title: "Cohort Sampling" + desc: > + Functions that support sampling a cohort. Please note this is only for + software development purposes and NOT for running studies. + contents: + - sampleCohortDefinitionSet navbar: structure: diff --git a/docs/404.html b/docs/404.html index 81aa5d3..1fb5c5e 100644 --- a/docs/404.html +++ b/docs/404.html @@ -6,7 +6,7 @@ Page not found (404) • CohortGenerator - + @@ -32,7 +32,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -54,6 +54,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • @@ -106,7 +109,7 @@

    Page not found (404)

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/articles/CreatingCohortSubsetDefinitions.html b/docs/articles/CreatingCohortSubsetDefinitions.html index 835a6cc..5e8b709 100644 --- a/docs/articles/CreatingCohortSubsetDefinitions.html +++ b/docs/articles/CreatingCohortSubsetDefinitions.html @@ -6,7 +6,7 @@ Creating Cohort Subset Definitions • CohortGenerator - + @@ -33,7 +33,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -55,6 +55,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • @@ -88,7 +91,7 @@

    Creating Cohort Subset Definitions

    James P. Gilbert and Anthony G. Sena

    -

    2023-10-10

    +

    2024-05-28

    Source: vignettes/CreatingCohortSubsetDefinitions.Rmd @@ -533,7 +536,7 @@

    Writing json objects

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/articles/GeneratingCohorts.html b/docs/articles/GeneratingCohorts.html index aa97899..2762151 100644 --- a/docs/articles/GeneratingCohorts.html +++ b/docs/articles/GeneratingCohorts.html @@ -6,7 +6,7 @@ Generating Cohorts • CohortGenerator - + @@ -33,7 +33,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -55,6 +55,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • @@ -88,7 +91,7 @@

    Generating Cohorts

    Anthony G. Sena and Martijn J. Schuemie

    -

    2023-10-10

    +

    2024-05-28

    Source: vignettes/GeneratingCohorts.Rmd @@ -260,7 +263,7 @@

    Generating Cohorts= cohortTableNames$cohortTable )
    #> Connecting using SQLite driver
    -
    #> Counting cohorts took 0.124 secs
    +
    #> Counting cohorts took 0.126 secs
    #>   cohortId cohortEntries cohortSubjects
     #> 1  1778211          1800           1800
     #> 2  1778212           569            569
    @@ -382,6 +385,7 @@ 

    Incremental Mode)

    #> Connecting using SQLite driver
    #> Table "cohort" already exists and in incremental mode, so not recreating it.
    +#> Table "cohort" already exists and in incremental mode, so not recreating it.
     #> Table "cohort_inclusion" already exists and in incremental mode, so not recreating it.
     #> Table "cohort_inclusion_result" already exists and in incremental mode, so not recreating it.
     #> Table "cohort_inclusion_stats" already exists and in incremental mode, so not recreating it.
    @@ -429,7 +433,7 @@ 

    Incremental Mode#> Skipping cohortId = '1778211' because it is unchanged from earlier run #> Skipping cohortId = '1778212' because it is unchanged from earlier run #> Skipping cohortId = '1778213' because it is unchanged from earlier run -#> Generating cohort set took 0.17 secs

    +#> Generating cohort set took 0.11 secs
    Potential Pitfalls of Incremental Mode
    @@ -469,7 +473,7 @@
    Potential Pitfalls of Incrementa

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/articles/SamplingCohorts.html b/docs/articles/SamplingCohorts.html new file mode 100644 index 0000000..76bc9ec --- /dev/null +++ b/docs/articles/SamplingCohorts.html @@ -0,0 +1,243 @@ + + + + + + + +Sampling Cohorts • CohortGenerator + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + +
    +

    Sampling with CohortGenerator +

    +

    Large populations of individuals (e.g. all subjects receiving a +COVID-19 vaccination) can often be too large to work with when pulling +down a large collection of covariates for further analysis. This is +prohibitive when designing studies or attempting to generate phenotypes. +This guide aims to demonstrate how one can use the +sampleCohortDefinitionSet functionality to produce +sufficiently large sample cohorts from a base +cohortDefinitionSet.

    +
    +

    Sampling method +

    +

    The approach taken in this method is to sample individuals within a +cohort without replacement. Different database platforms implement a +variety of different approaches for random sampling and random number +generation that make cross platform sql difficult. Consequently, all the +randomness computed here happens inside R itself simply +from the count of unique individuals within a cohort.

    +
    +
    +

    Using the sampler functions +

    +

    To create a single sample the approach below will create cohorts in +the same base table.

    +

    First we need to load the initial cohort definition set

    + +

    We then need to create the cohort tables and cohorts in the usual +manner.

    +
    +connectionDetails <- Eunomia::getEunomiaConnectionDetails()
    +conn <- DatabaseConnector::connect(connectionDetails = connectionDetails)
    +on.exit(DatabaseConnector::disconnect(conn))
    +
    +
    +cds <- getCohortDefinitionSet(...)
    +cohortTableNames <- getCohortTableNames(cohortTable = "cohort")
    +recordKeepingFolder <- file.path(outputFolder, "RecordKeepingSamples")
    +
    +createCohortTables(
    +  connectionDetails = connectionDetails,
    +  cohortDatabaseSchema = "main",
    +  cohortTableNames = cohortTableNames
    +)
    +
    +generateCohortSet(
    +  cohortDefinitionSet = cds,
    +  connection = conn,
    +  cdmDatabaseSchema = "main",
    +  cohortDatabaseSchema = "main",
    +  cohortTableNames = cohortTableNames,
    +  incremental = TRUE,
    +  incrementalFolder = recordKeepingFolder
    +)
    +

    We can then create a new, sampled cohort definition set from the original +cohort definition set.

    +
    +sampledCohortDefinitionSet <- sampleCohortDefinitionSet(
    +  cohortDefinitionSet = cds,
    +  connection = conn,
    +  sampleFraction = 0.33,
    +  seed = 64374, # OHDSI
    +  cohortDatabaseSchema = "main",
    +  cohortTableNames = cohortTableNames,
    +  incremental = TRUE,
    +  incrementalFolder = recordKeepingFolder
    +)
    +

    The resulting sampledCohortDefinitionSet is nearly +identical to the base cohort set, however a few changes occur:

    +
      +
    • The name now includes the postfix +[sample 33% seed=64374] +
    • +
    • The optional parameter identifierExpression determines the output cohort +id. By default this is set to cohortId * 1000 + seed; +however, this will throw an error if the resulting ids are not unique
    • +
    • The
    • +
    • The base cohortDefinitionSet is attached as an attribute to the +sampledCohortDefinitionSet +
    • +
    +

    To generate multiple samples, simply specify multiple seed variables +as follows:

    +
    +# Generate 800 samples of size n
    +sampledCohortDefinitionSet <- sampleCohortDefinitionSet(
    +  cohortDefinitionSet = cds,
    +  connection = conn,
    +  n = 1000,
    +  seed = 1:800 * 64374, # OHDSI
    +  cohortDatabaseSchema = "main",
    +  cohortTableNames = cohortTableNames,
    +  incremental = TRUE,
    +  incrementalFolder = recordKeepingFolder
    +)
    +

    Note that using incremental mode for your sampled cohorts will also +work. In this case, a cohort will only be re-generated if the checksum +of the base cohort has changed (the checksum is based on the cohort +SQL). The checksum applies to the pseudorandom seed of the cohort and +the sample size (n).

    +

    As the sampledCohortDefinitionSet is, for all intents and purposes, a +set of cohortDefinitions it can be passed as a parameter to all other +OHDSI packages with minimal issues. For example, +FeatureExtraction will be able to use this sample +unchanged.

    +
    +
    +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/articles/index.html b/docs/articles/index.html index 0d0e955..c0fbc19 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -1,5 +1,5 @@ -Articles • CohortGeneratorArticles • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0
    @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -70,6 +73,8 @@

    All vignettes

    Generating Cohorts
    +
    Sampling Cohorts
    +
    @@ -80,7 +85,7 @@

    All vignettes

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/authors.html b/docs/authors.html index 5984e02..934e4a3 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -1,5 +1,5 @@ -Authors and Citation • CohortGeneratorAuthors and Citation • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -60,7 +63,7 @@
    @@ -93,14 +96,14 @@

    Citation

    -

    Sena A, Gilbert J, Rao G, Schuemie M (2023). +

    Sena A, Gilbert J, Rao G, Schuemie M (2024). CohortGenerator: An R Package for Cohort Generation Against the OMOP CDM. https://ohdsi.github.io/CohortGenerator/, https://github.com/OHDSI/CohortGenerator.

    @Manual{,
       title = {CohortGenerator: An R Package for Cohort Generation Against the OMOP CDM},
       author = {Anthony Sena and Jamie Gilbert and Gowtham Rao and Martijn Schuemie},
    -  year = {2023},
    +  year = {2024},
       note = {https://ohdsi.github.io/CohortGenerator/, https://github.com/OHDSI/CohortGenerator},
     }
    @@ -115,7 +118,7 @@

    Citation

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/index.html b/docs/index.html index bc36f4a..e325d4e 100644 --- a/docs/index.html +++ b/docs/index.html @@ -6,7 +6,7 @@ An R Package for Cohort Generation Against the OMOP CDM • CohortGenerator - + @@ -33,7 +33,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -55,6 +55,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • @@ -269,7 +272,7 @@

    Developers

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/news/index.html b/docs/news/index.html index 844ca9a..abf1fc5 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -1,5 +1,5 @@ -Changelog • CohortGeneratorChangelog • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -63,12 +66,24 @@

    Changelog

    Source: NEWS.md +
    + +
    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 3100ae0..ec62a11 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,8 +1,9 @@ -pandoc: 3.1.1 -pkgdown: 2.0.7 +pandoc: 3.1.11 +pkgdown: 2.0.9 pkgdown_sha: ~ articles: CreatingCohortSubsetDefinitions: CreatingCohortSubsetDefinitions.html GeneratingCohorts: GeneratingCohorts.html -last_built: 2023-10-10T18:25Z + SamplingCohorts: SamplingCohorts.html +last_built: 2024-05-28T17:58Z diff --git a/docs/reference/CohortGenerator-package.html b/docs/reference/CohortGenerator-package.html index 2742e5f..94e32b7 100644 --- a/docs/reference/CohortGenerator-package.html +++ b/docs/reference/CohortGenerator-package.html @@ -1,5 +1,5 @@ -CohortGenerator: An R Package for Cohort Generation Against the OMOP CDM — CohortGenerator-package • CohortGeneratorCohortGenerator: An R Package for Cohort Generation Against the OMOP CDM — CohortGenerator-package • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -97,7 +100,7 @@

    Author

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/CohortSubsetDefinition.html b/docs/reference/CohortSubsetDefinition.html index 69d04ab..3540129 100644 --- a/docs/reference/CohortSubsetDefinition.html +++ b/docs/reference/CohortSubsetDefinition.html @@ -1,5 +1,5 @@ -Cohort Subset Definition — CohortSubsetDefinition • CohortGeneratorCohort Subset Definition — CohortSubsetDefinition • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -285,7 +288,7 @@

    Arguments -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/CohortSubsetOperator.html b/docs/reference/CohortSubsetOperator.html index 64ffa53..eb899f8 100644 --- a/docs/reference/CohortSubsetOperator.html +++ b/docs/reference/CohortSubsetOperator.html @@ -1,5 +1,5 @@ -Cohort Subset Operator — CohortSubsetOperator • CohortGeneratorCohort Subset Operator — CohortSubsetOperator • CohortGeneratorCriteria Subset — DemographicSubsetOperator • CohortGeneratorCriteria Subset — DemographicSubsetOperator • CohortGeneratorLimit Subset Operator — LimitSubsetOperator • CohortGeneratorLimit Subset Operator — LimitSubsetOperator • CohortGeneratorto List — SubsetCohortWindow • CohortGeneratorto List — SubsetCohortWindow • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -165,7 +168,7 @@

    Arguments -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/SubsetOperator.html b/docs/reference/SubsetOperator.html index 9c7c3f7..b9629d1 100644 --- a/docs/reference/SubsetOperator.html +++ b/docs/reference/SubsetOperator.html @@ -1,5 +1,5 @@ -SubsetOperator — SubsetOperator • CohortGeneratorSubsetOperator — SubsetOperator • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -235,7 +238,7 @@

    Arguments -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/addCohortSubsetDefinition.html b/docs/reference/addCohortSubsetDefinition.html index 1e97727..90567f9 100644 --- a/docs/reference/addCohortSubsetDefinition.html +++ b/docs/reference/addCohortSubsetDefinition.html @@ -1,5 +1,5 @@ -Add cohort subset definition to a cohort definition set — addCohortSubsetDefinition • CohortGeneratorAdd cohort subset definition to a cohort definition set — addCohortSubsetDefinition • CohortGeneratorCheck if a cohort definition set is using the proper data types — checkAndFixCohortDefinitionSetDataTypes • CohortGeneratorCheck if a cohort definition set is using the proper data types — checkAndFixCohortDefinitionSetDataTypes • CohortGeneratorComputes the checksum for a value — computeChecksum • CohortGeneratorComputes the checksum for a value — computeChecksum • CohortGeneratorA definition of subset functions to be applied to a set of cohorts — createCohortSubset • CohortGeneratorA definition of subset functions to be applied to a set of cohorts — createCohortSubset • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -125,7 +128,7 @@

    Value

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/createCohortSubsetDefinition.html b/docs/reference/createCohortSubsetDefinition.html index b89eb3b..73a692c 100644 --- a/docs/reference/createCohortSubsetDefinition.html +++ b/docs/reference/createCohortSubsetDefinition.html @@ -1,5 +1,5 @@ -Create Subset Definition — createCohortSubsetDefinition • CohortGeneratorCreate Subset Definition — createCohortSubsetDefinition • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -99,14 +102,14 @@

    Arguments

    operatorNameConcatString
    -

    (optional) SqlRender string template for formatting names of resulting subset cohorts -Can use the variables @baseCohortName, @subsetDefinitionName and @operatorNames. -This is applied when adding the subset definition to a cohort definition set.

    +

    (optional) String to concatenate operator names together when outputting resulting cohort +name

    subsetCohortNameTemplate
    -

    (optional) String to concatenate operator names together when outputting resulting cohort -name

    +

    (optional) SqlRender string template for formatting names of resulting subset cohorts +Can use the variables @baseCohortName, @subsetDefinitionName and @operatorNames. +This is applied when adding the subset definition to a cohort definition set.

    @@ -122,7 +125,7 @@

    Arguments

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/createCohortTables.html b/docs/reference/createCohortTables.html index 756056d..c28cf0c 100644 --- a/docs/reference/createCohortTables.html +++ b/docs/reference/createCohortTables.html @@ -1,5 +1,5 @@ -Create cohort tables — createCohortTables • CohortGeneratorCreate cohort tables — createCohortTables • CohortGeneratorCreate createDemographicSubset Subset — createDemographicSubset • CohortGeneratorCreate createDemographicSubset Subset — createDemographicSubset • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -121,7 +124,7 @@

    Arguments

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/createEmptyCohortDefinitionSet.html b/docs/reference/createEmptyCohortDefinitionSet.html index a71456d..31f3f6a 100644 --- a/docs/reference/createEmptyCohortDefinitionSet.html +++ b/docs/reference/createEmptyCohortDefinitionSet.html @@ -1,5 +1,5 @@ -Create an empty cohort definition set — createEmptyCohortDefinitionSet • CohortGeneratorCreate an empty cohort definition set — createEmptyCohortDefinitionSet • CohortGeneratorCreate an empty negative control outcome cohort set — createEmptyNegativeControlOutcomeCohortSet • CohortGeneratorCreate an empty negative control outcome cohort set — createEmptyNegativeControlOutcomeCohortSet • CohortGeneratorCreate Limit Subset — createLimitSubset • CohortGeneratorCreate Limit Subset — createLimitSubset • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -126,7 +129,7 @@

    Arguments

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/createSubsetCohortWindow.html b/docs/reference/createSubsetCohortWindow.html index 5e80cc7..27f90e4 100644 --- a/docs/reference/createSubsetCohortWindow.html +++ b/docs/reference/createSubsetCohortWindow.html @@ -1,5 +1,5 @@ -A definition of subset functions to be applied to a set of cohorts — createSubsetCohortWindow • CohortGeneratorA definition of subset functions to be applied to a set of cohorts — createSubsetCohortWindow • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -105,7 +108,7 @@

    Value

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/dropCohortStatsTables.html b/docs/reference/dropCohortStatsTables.html index 06b1b53..4c9e8cd 100644 --- a/docs/reference/dropCohortStatsTables.html +++ b/docs/reference/dropCohortStatsTables.html @@ -1,5 +1,5 @@ -Drop cohort statistics tables — dropCohortStatsTables • CohortGeneratorDrop cohort statistics tables — dropCohortStatsTables • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -123,7 +126,7 @@

    Arguments

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/exportCohortStatsTables.html b/docs/reference/exportCohortStatsTables.html index 66cd202..258095b 100644 --- a/docs/reference/exportCohortStatsTables.html +++ b/docs/reference/exportCohortStatsTables.html @@ -1,5 +1,5 @@ -Export the cohort statistics tables to the file system — exportCohortStatsTables • CohortGeneratorExport the cohort statistics tables to the file system — exportCohortStatsTables • CohortGeneratorGenerate a set of cohorts — generateCohortSet • CohortGeneratorGenerate a set of cohorts — generateCohortSet • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -203,7 +206,7 @@

    Value

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/generateNegativeControlOutcomeCohorts.html b/docs/reference/generateNegativeControlOutcomeCohorts.html index 6ccbe8b..3d94e46 100644 --- a/docs/reference/generateNegativeControlOutcomeCohorts.html +++ b/docs/reference/generateNegativeControlOutcomeCohorts.html @@ -1,5 +1,5 @@ -Generate a set of negative control outcome cohorts — generateNegativeControlOutcomeCohorts • CohortGeneratorGenerate a set of negative control outcome cohorts — generateNegativeControlOutcomeCohorts • CohortGeneratorCount the cohort(s) — getCohortCounts • CohortGeneratorCount the cohort(s) — getCohortCounts • CohortGeneratorGet a cohort definition set — getCohortDefinitionSet • CohortGeneratorGet a cohort definition set — getCohortDefinitionSet • CohortGeneratorGet Cohort Inclusion Stats Table Data — getCohortStats • CohortGeneratorGet Cohort Inclusion Stats Table Data — getCohortStats • CohortGenerator CohortGenerator - 0.8.1 + 0.9.0 @@ -47,6 +47,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -156,7 +159,7 @@

    Arguments

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/getCohortTableNames.html b/docs/reference/getCohortTableNames.html index d3c70bd..24eaf0a 100644 --- a/docs/reference/getCohortTableNames.html +++ b/docs/reference/getCohortTableNames.html @@ -1,7 +1,5 @@ -Used to get a list of cohort table names to use when creating the cohort -tables — getCohortTableNames • CohortGeneratorUsed to get a list of cohort table names to use when creating the cohort tables — getCohortTableNames • CohortGeneratorGet a list of tasks required when running in incremental mode — getRequiredTasks • CohortGeneratorGet a list of tasks required when running in incremental mode — getRequiredTasks • CohortGeneratorGet cohort subset definitions from a cohort definition set — getSubsetDefinitions • CohortGeneratorGet cohort subset definitions from a cohort definition set — getSubsetDefinitions • CohortGenerator CohortGenerator - 0.8.1 + 0.9.0 @@ -41,6 +41,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -105,7 +108,7 @@

    Value

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/index.html b/docs/reference/index.html index 83f7675..f8b1ac2 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -1,5 +1,5 @@ -Function reference • CohortGeneratorFunction reference • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -73,8 +76,7 @@

    Cohort Tables

    getCohortTableNames()

    -

    Used to get a list of cohort table names to use when creating the cohort -tables

    +

    Used to get a list of cohort table names to use when creating the cohort tables

    Cohort Defintion Set

    Functions that support working with a cohort definition set

    @@ -190,8 +192,7 @@

    Cohort Statistics

    insertInclusionRuleNames()

    -

    Used to insert the inclusion rule names from a cohort definition set -when generating cohorts that include cohort statistics

    +

    Used to insert the inclusion rule names from a cohort definition set when generating cohorts that include cohort statistics

    exportCohortStatsTables()

    @@ -260,6 +261,14 @@

    Record Keeping computeChecksum()

    Computes the checksum for a value

    + +

    Cohort Sampling

    +

    Functions that support sampling a cohort. Please note this is only for software development purposes and NOT for running studies.

    + + +

    sampleCohortDefinitionSet()

    + +

    Sample Cohort Definition Set

    diff --git a/docs/reference/insertInclusionRuleNames.html b/docs/reference/insertInclusionRuleNames.html index 1cb25b4..4e47142 100644 --- a/docs/reference/insertInclusionRuleNames.html +++ b/docs/reference/insertInclusionRuleNames.html @@ -1,7 +1,5 @@ -Used to insert the inclusion rule names from a cohort definition set -when generating cohorts that include cohort statistics — insertInclusionRuleNames • CohortGeneratorUsed to insert the inclusion rule names from a cohort definition set when generating cohorts that include cohort statistics — insertInclusionRuleNames • CohortGenerator CohortGenerator - 0.8.1 + 0.9.0 @@ -43,6 +41,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -65,8 +66,7 @@
    @@ -153,7 +153,7 @@

    Value

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/isCamelCase.html b/docs/reference/isCamelCase.html index c64a0dc..6c716f1 100644 --- a/docs/reference/isCamelCase.html +++ b/docs/reference/isCamelCase.html @@ -1,5 +1,5 @@ -Used to check if a string is in lower camel case — isCamelCase • CohortGeneratorUsed to check if a string is in lower camel case — isCamelCase • CohortGeneratorIs the data.frame a cohort definition set? — isCohortDefinitionSet • CohortGeneratorIs the data.frame a cohort definition set? — isCohortDefinitionSet • CohortGeneratorIs the data.frame formatted for uploading to a database? — isFormattedForDatabaseUpload • CohortGeneratorIs the data.frame formatted for uploading to a database? — isFormattedForDatabaseUpload • CohortGeneratorUsed to check if a string is in snake case — isSnakeCase • CohortGeneratorUsed to check if a string is in snake case — isSnakeCase • CohortGeneratorIs a task required when running in incremental mode — isTaskRequired • CohortGeneratorIs a task required when running in incremental mode — isTaskRequired • CohortGeneratorUsed to read a .csv file — readCsv • CohortGeneratorUsed to read a .csv file — readCsv • CohortGenerator CohortGenerator - 0.8.1 + 0.9.0 @@ -41,6 +41,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -110,7 +113,7 @@

    Value

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/recordTasksDone.html b/docs/reference/recordTasksDone.html index be2d1f2..b934c79 100644 --- a/docs/reference/recordTasksDone.html +++ b/docs/reference/recordTasksDone.html @@ -1,5 +1,5 @@ -Record a task as complete — recordTasksDone • CohortGeneratorRecord a task as complete — recordTasksDone • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -106,7 +109,7 @@

    Arguments

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/sampleCohortDefinitionSet.html b/docs/reference/sampleCohortDefinitionSet.html new file mode 100644 index 0000000..41c171e --- /dev/null +++ b/docs/reference/sampleCohortDefinitionSet.html @@ -0,0 +1,227 @@ + +Sample Cohort Definition Set — sampleCohortDefinitionSet • CohortGenerator + + +
    +
    + + + +
    +
    + + +
    +

    Create 1 or more sample of size n of a cohort definition set

    +

    Subsetted cohorts can be sampled, as with any other subset form. +However, subsetting a sampled cohort is not recommended and not currently supported at this time. +In the case where n > cohort count the entire cohort is copied unmodified

    +

    As different databases have different forms of randomness, the random selection is computed in +R, based on the count for each cohort. This is, therefore, db platform independent

    +

    Note, this function assumes cohorts have already been generated.

    +

    Lifecycle Note: This functionality is considered experimental and not intended for use inside analytic packages

    +
    + +
    +
    sampleCohortDefinitionSet(
    +  cohortDefinitionSet,
    +  cohortIds = cohortDefinitionSet$cohortId,
    +  connectionDetails = NULL,
    +  connection = NULL,
    +  tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
    +  cohortDatabaseSchema,
    +  outputDatabaseSchema = cohortDatabaseSchema,
    +  cohortTableNames = getCohortTableNames(),
    +  n = NULL,
    +  sampleFraction = NULL,
    +  seed = 64374,
    +  seedArgs = NULL,
    +  identifierExpression = "cohortId * 1000 + seed",
    +  incremental = FALSE,
    +  incrementalFolder = NULL
    +)
    +
    + +
    +

    Arguments

    +
    cohortDefinitionSet
    +

    The cohortDefinitionSet argument must be a data frame with +the following columns:

    cohortId
    +

    The unique integer identifier of the cohort

    + +
    cohortName
    +

    The cohort's name

    + +
    sql
    +

    The OHDSI-SQL used to generate the cohort

    + +

    Optionally, this data frame may contain:

    json
    +

    The Circe JSON representation of the cohort

    + +
    + + +
    cohortIds
    +

    Optional subset of cohortIds to generate. By default this function will sample all cohorts

    + + +
    connectionDetails
    +

    An object of type connectionDetails as created using the +createConnectionDetails function in the +DatabaseConnector package. Can be left NULL if connection is +provided.

    + + +
    connection
    +

    An object of type connection as created using the +connect function in the +DatabaseConnector package. Can be left NULL if connectionDetails +is provided, in which case a new connection will be opened at the start +of the function, and closed when the function finishes.

    + + +
    tempEmulationSchema
    +

    Some database platforms like Oracle and Impala do not truly support +temp tables. To emulate temp tables, provide a schema with write +privileges where temp tables can be created.

    + + +
    cohortDatabaseSchema
    +

    Schema name where your cohort tables reside. Note that for SQL Server, +this should include both the database and schema name, for example +'scratch.dbo'.

    + + +
    outputDatabaseSchema
    +

    optional schema to output cohorts to (if different from cohortDatabaseSchema)

    + + +
    cohortTableNames
    +

    The names of the cohort tables. See getCohortTableNames +for more details.

    + + +
    n
    +

    Sample size. Ignored if sample fraction is set

    + + +
    sampleFraction
    +

    Fraction of cohort to sample

    + + +
    seed
    +

    Vector of seeds to give to the R pseudorandom number generator

    + + +
    seedArgs
    +

    optional arguments to pass to set.seed

    + + +
    identifierExpression
    +

    Optional string R expression used to compute output cohort id. Can only use variables +cohortId and seed. Default is "cohortId * 1000 + seed", which is substituted and evaluated

    + + +
    incremental
    +

    Create only cohorts that haven't been created before?

    + + +
    incrementalFolder
    +

    If incremental = TRUE, specify a folder where records are +kept of which definition has been executed.

    + +
    +
    +

    Value

    + + +

    sampledCohortDefinitionSet - a data.frame like object that contains the resulting identifiers and modified names of cohorts

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/saveCohortDefinitionSet.html b/docs/reference/saveCohortDefinitionSet.html index 6e911f3..22c8814 100644 --- a/docs/reference/saveCohortDefinitionSet.html +++ b/docs/reference/saveCohortDefinitionSet.html @@ -1,5 +1,5 @@ -Save the cohort definition set to the file system — saveCohortDefinitionSet • CohortGeneratorSave the cohort definition set to the file system — saveCohortDefinitionSet • CohortGenerator CohortGenerator - 0.8.1 + 0.9.0 @@ -43,6 +43,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -160,7 +163,7 @@

    Arguments

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/saveCohortSubsetDefinition.html b/docs/reference/saveCohortSubsetDefinition.html index be3f292..d2291c1 100644 --- a/docs/reference/saveCohortSubsetDefinition.html +++ b/docs/reference/saveCohortSubsetDefinition.html @@ -1,5 +1,5 @@ -Save cohort subset definitions to json — saveCohortSubsetDefinition • CohortGeneratorSave cohort subset definitions to json — saveCohortSubsetDefinition • CohortGenerator @@ -17,7 +17,7 @@ CohortGenerator - 0.8.1 + 0.9.0 @@ -37,6 +37,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -98,7 +101,7 @@

    Arguments

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/reference/saveIncremental.html b/docs/reference/saveIncremental.html index 91b4a65..0f48925 100644 --- a/docs/reference/saveIncremental.html +++ b/docs/reference/saveIncremental.html @@ -1,5 +1,5 @@ -Used in incremental mode to save values to a file — saveIncremental • CohortGeneratorUsed in incremental mode to save values to a file — saveIncremental • CohortGeneratorUsed to write a .csv file — writeCsv • CohortGeneratorUsed to write a .csv file — writeCsv • CohortGenerator CohortGenerator - 0.8.1 + 0.9.0 @@ -48,6 +48,9 @@
  • Generating Cohorts
  • +
  • + Sampling Cohorts +
  • Changelog @@ -150,7 +153,7 @@

    Value

    -

    Site built with pkgdown 2.0.7.

    +

    Site built with pkgdown 2.0.9.

    diff --git a/docs/sitemap.xml b/docs/sitemap.xml index d68963b..3808b80 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -12,6 +12,9 @@ /articles/index.html + + /articles/SamplingCohorts.html + /authors.html @@ -144,6 +147,9 @@ /reference/recordTasksDone.html + + /reference/sampleCohortDefinitionSet.html + /reference/saveCohortDefinitionSet.html diff --git a/extras/CohortGenerator.pdf b/extras/CohortGenerator.pdf index 3248287..0822960 100644 Binary files a/extras/CohortGenerator.pdf and b/extras/CohortGenerator.pdf differ diff --git a/extras/PackageMaintenance.R b/extras/PackageMaintenance.R index 1995c62..9e73dbe 100644 --- a/extras/PackageMaintenance.R +++ b/extras/PackageMaintenance.R @@ -1,6 +1,6 @@ # @file PackageMaintenance # -# Copyright 2023 Observational Health Data Sciences and Informatics +# Copyright 2024 Observational Health Data Sciences and Informatics # # This file is part of CohortGenerator # @@ -20,9 +20,9 @@ OhdsiRTools::formatRFolder("./R") #(note: this function has been impacted by change in formatR) OhdsiRTools::checkUsagePackage("CohortGenerator") OhdsiRTools::updateCopyrightYearFolder() +devtools::spell_check() styler::style_pkg() devtools::document() -devtools::spell_check() # Create manual and vignettes: unlink("extras/CohortGenerator.pdf") @@ -41,6 +41,12 @@ rmarkdown::render("vignettes/CreatingCohortSubsetDefinitions.Rmd", toc = TRUE, number_sections = TRUE)) +rmarkdown::render("vignettes/SamplingCohorts.Rmd", + output_file = "../inst/doc/SamplingCohorts.pdf", + rmarkdown::pdf_document(latex_engine = "pdflatex", + toc = TRUE, + number_sections = TRUE)) + unloadNamespace("CohortGenerator") pkgdown::build_site() OhdsiRTools::fixHadesLogo() diff --git a/inst/doc/CreatingCohortSubsetDefinitions.pdf b/inst/doc/CreatingCohortSubsetDefinitions.pdf new file mode 100644 index 0000000..61986cb Binary files /dev/null and b/inst/doc/CreatingCohortSubsetDefinitions.pdf differ diff --git a/inst/doc/GeneratingCohorts.pdf 
b/inst/doc/GeneratingCohorts.pdf new file mode 100644 index 0000000..bcb7ad4 Binary files /dev/null and b/inst/doc/GeneratingCohorts.pdf differ diff --git a/inst/doc/SamplingCohorts.pdf b/inst/doc/SamplingCohorts.pdf new file mode 100644 index 0000000..a27ffb3 Binary files /dev/null and b/inst/doc/SamplingCohorts.pdf differ diff --git a/man/sampleCohortDefinitionSet.Rd b/man/sampleCohortDefinitionSet.Rd index 34a0210..8612bc5 100644 --- a/man/sampleCohortDefinitionSet.Rd +++ b/man/sampleCohortDefinitionSet.Rd @@ -61,7 +61,7 @@ for more details.} \item{sampleFraction}{Fraction of cohort to sample} -\item{seed}{Vector of seeds to give to the R psuedorandom number generator} +\item{seed}{Vector of seeds to give to the R pseudorandom number generator} \item{seedArgs}{optional arguments to pass to set.seed} @@ -80,7 +80,7 @@ sampledCohortDefinitionSet - a data.frame like object that contains the resultin Create 1 or more sample of size n of a cohort definition set Subsetted cohorts can be sampled, as with any other subset form. -However, subsetting a sampled cohort is not reccomended and not currently supported at this time. +However, subsetting a sampled cohort is not recommended and not currently supported at this time. 
In the case where n > cohort count the entire cohort is copied unmodified As different databases have different forms of randomness, the random selection is computed in diff --git a/tests/testthat/test-CohortCount.R b/tests/testthat/test-CohortCount.R index 03c27bd..c796a63 100644 --- a/tests/testthat/test-CohortCount.R +++ b/tests/testthat/test-CohortCount.R @@ -129,19 +129,19 @@ test_that("Call getCohortCounts with no cohortId specified and cohortDefinitionS packageName = "CohortGenerator", verbose = TRUE ) - + cohortDefinitionSet <- rbind( cohortDefinitionSet, cohortDefinitionSet[1, ] |> transform(atlasId = 100, cohortId = 100, cohortName = "not in cohort table", logicDescription = "not in cohort table") ) - + testCohortCounts <- getCohortCounts( connectionDetails = connectionDetails, cohortDatabaseSchema = "main", cohortTable = "cohort", cohortDefinitionSet = cohortDefinitionSet ) - + expect_true(nrow(testCohortCounts) == 4) expect_true(testCohortCounts[testCohortCounts$cohortId == 100, "cohortEntries"] == 0) expect_true(testCohortCounts[testCohortCounts$cohortId == 100, "cohortSubjects"] == 0) diff --git a/tests/testthat/test-CohortSample.R b/tests/testthat/test-CohortSample.R index 4b740cd..047df53 100644 --- a/tests/testthat/test-CohortSample.R +++ b/tests/testthat/test-CohortSample.R @@ -1,11 +1,12 @@ - test_that("sampleCohortDefinitionSet", { connectionDetails <- Eunomia::getEunomiaConnectionDetails() conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) on.exit(DatabaseConnector::disconnect(conn)) - cohortTableNames <- getCohortTableNames(cohortTable = "cohort", - cohortSampleTable = "cohort_sample") + cohortTableNames <- getCohortTableNames( + cohortTable = "cohort", + cohortSampleTable = "cohort_sample" + ) recordKeepingFolder <- file.path(outputFolder, "RecordKeepingSamples") createCohortTables( @@ -41,10 +42,12 @@ test_that("sampleCohortDefinitionSet", { expect_true(all(cds$cohortId * 1000 + 64374 == sampledCohorts$cohortId)) # 
Sample table pouplated - res <- DatabaseConnector::renderTranslateQuerySql(connection = conn, - "SELECT cohort_definition_id, count(*) as ct FROM cohort_sample + res <- DatabaseConnector::renderTranslateQuerySql( + connection = conn, + "SELECT cohort_definition_id, count(*) as ct FROM cohort_sample GROUP BY cohort_definition_id - ") + " + ) expect_true(all(res$ct == 10)) expect_true(all(sampledCohorts$status == "generated")) # Test incrmental logic works @@ -80,11 +83,15 @@ test_that(".getSampleSet", { connection <- DatabaseConnector::connect(dbms = "sqlite", server = ":memory:") on.exit(DatabaseConnector::disconnect(connection)) - DatabaseConnector::insertTable(connection = connection, - tableName = "cohort", - camelCaseToSnakeCase = TRUE, - data = data.frame(cohortDefinitionId = 1, - subjectId = 1:1e5)) + DatabaseConnector::insertTable( + connection = connection, + tableName = "cohort", + camelCaseToSnakeCase = TRUE, + data = data.frame( + cohortDefinitionId = 1, + subjectId = 1:1e5 + ) + ) cohortDatabaseSchema <- "main" targetTable <- "cohort" @@ -94,62 +101,71 @@ test_that(".getSampleSet", { seedArgs <- NULL res <- .getSampleSet(connection, - n, - sampleFraction = NULL, - seed, - seedArgs, - cohortDatabaseSchema, - targetCohortId, - targetTable) + n, + sampleFraction = NULL, + seed, + seedArgs, + cohortDatabaseSchema, + targetCohortId, + targetTable + ) checkmate::expect_data_frame(res, types = "integer", nrows = n) res2 <- .getSampleSet(connection, - n, - sampleFraction = NULL, - seed, - seedArgs, - cohortDatabaseSchema, - targetCohortId, - targetTable) + n, + sampleFraction = NULL, + seed, + seedArgs, + cohortDatabaseSchema, + targetCohortId, + targetTable + ) # use of the same seed should produce the same result expect_true(all(res$rand_id == res2$rand_id)) res2 <- .getSampleSet(connection, - n, - sampleFraction = NULL, - seed + 1, - seedArgs, - cohortDatabaseSchema, - targetCohortId, - targetTable) + n, + sampleFraction = NULL, + seed + 1, + seedArgs, + 
cohortDatabaseSchema, + targetCohortId, + targetTable + ) expect_false(all(res$rand_id == res2$rand_id)) - DatabaseConnector::insertTable(connection = connection, - tableName = "cohort", - camelCaseToSnakeCase = TRUE, - data = data.frame(cohortDefinitionId = 2, - subjectId = 1:25)) + DatabaseConnector::insertTable( + connection = connection, + tableName = "cohort", + camelCaseToSnakeCase = TRUE, + data = data.frame( + cohortDefinitionId = 2, + subjectId = 1:25 + ) + ) # Where n > count should return count rows res3 <- .getSampleSet(connection, - n, - sampleFraction = NULL, - seed, - seedArgs, - cohortDatabaseSchema, - targetCohortId = 2, - targetTable) + n, + sampleFraction = NULL, + seed, + seedArgs, + cohortDatabaseSchema, + targetCohortId = 2, + targetTable + ) checkmate::expect_data_frame(res3, types = "integer", nrows = 25) res4 <- .getSampleSet(connection, - n = NULL, - sampleFraction = 0.5, - seed, - seedArgs, - cohortDatabaseSchema, - targetCohortId = 2, - targetTable) + n = NULL, + sampleFraction = 0.5, + seed, + seedArgs, + cohortDatabaseSchema, + targetCohortId = 2, + targetTable + ) checkmate::expect_data_frame(res4, types = "integer", nrows = 12) }) @@ -160,35 +176,41 @@ test_that(".sampleCohort", { on.exit(DatabaseConnector::disconnect(connection)) cohortCount <- 1000 - startDates <- sample(seq(as.Date('2001/01/01'), as.Date('2023/01/01'), by = "day"), cohortCount) + startDates <- sample(seq(as.Date("2001/01/01"), as.Date("2023/01/01"), by = "day"), cohortCount) endDates <- startDates + sample(1:800, cohortCount, replace = TRUE) - tData <- data.frame(cohortDefinitionId = 1, - subjectId = 1:cohortCount, - cohortStartDate = startDates, - cohortEndDate = endDates) + tData <- data.frame( + cohortDefinitionId = 1, + subjectId = 1:cohortCount, + cohortStartDate = startDates, + cohortEndDate = endDates + ) # dupes ensures that dense_rank allows selection of multiple cohort entries for the same subject tData <- rbind(tData, tData) - 
DatabaseConnector::insertTable(connection = connection, - tableName = "cohort", - camelCaseToSnakeCase = TRUE, - data = tData) + DatabaseConnector::insertTable( + connection = connection, + tableName = "cohort", + camelCaseToSnakeCase = TRUE, + data = tData + ) sampleTable <- data.frame(rand_id = c(7, 8, 9, 10, 33, 198)) .sampleCohort(connection, - targetCohortId = 1, - targetTable = "cohort", - outputCohortId = 999, - outputTable = "cohort", - cohortDatabaseSchema = "main", - outputDatabaseSchema = "main", - sampleTable = sampleTable, - seed = 1, - tempEmulationSchema = getOption("sqlRenderTempEmulationSchema")) + targetCohortId = 1, + targetTable = "cohort", + outputCohortId = 999, + outputTable = "cohort", + cohortDatabaseSchema = "main", + outputDatabaseSchema = "main", + sampleTable = sampleTable, + seed = 1, + tempEmulationSchema = getOption("sqlRenderTempEmulationSchema") + ) resCohort <- DatabaseConnector::renderTranslateQuerySql(connection, - "SELECT * FROM main.cohort WHERE cohort_definition_id = 999", - snakeCaseToCamelCase = TRUE) + "SELECT * FROM main.cohort WHERE cohort_definition_id = 999", + snakeCaseToCamelCase = TRUE + ) checkmate::expect_data_frame(resCohort, nrows = nrow(sampleTable) * 2) expect_true(all(resCohort$subjectId %in% sampleTable$rand_id)) @@ -208,8 +230,10 @@ test_that("checkUniqueOutputIds returns error when duplicate ids are present", { identifierExpression <- "cohortId" cohortTableNames <- list(cohortTable = "cohort", cohortSampleTable = "cohort") - expect_error(.checkUniqueOutputIds(cohortIds, seed, identifierExpression, cohortTableNames), - "identifier expression does not produce unique output") + expect_error( + .checkUniqueOutputIds(cohortIds, seed, identifierExpression, cohortTableNames), + "identifier expression does not produce unique output" + ) }) test_that("checkUniqueOutputIds does not return error when all ids are unique", { @@ -234,45 +258,45 @@ test_that("checkUniqueOutputIds does not return error when cohortTable 
and cohor test_that("Error on bad params", { # No connection details expect_error({ - sampledCohorts <- sampleCohortDefinitionSet( - cohortDefinitionSet = cds, - connection = NULL, - n = 10, - sampleFraction = NULL, - seed = 64374, - cohortDatabaseSchema = "main", - cohortTableNames = cohortTableNames, - incremental = TRUE, - incrementalFolder = recordKeepingFolder - ) + sampledCohorts <- sampleCohortDefinitionSet( + cohortDefinitionSet = cds, + connection = NULL, + n = 10, + sampleFraction = NULL, + seed = 64374, + cohortDatabaseSchema = "main", + cohortTableNames = cohortTableNames, + incremental = TRUE, + incrementalFolder = recordKeepingFolder + ) }) expect_error({ - sampledCohorts <- sampleCohortDefinitionSet( - cohortDefinitionSet = cds, - connectionDetails = Eunomia::getEunomiaConnectionDetails(), - n = NULL, - sampleFraction = NULL, - seed = 64374, - cohortDatabaseSchema = "main", - cohortTableNames = cohortTableNames, - incremental = TRUE, - incrementalFolder = recordKeepingFolder - ) + sampledCohorts <- sampleCohortDefinitionSet( + cohortDefinitionSet = cds, + connectionDetails = Eunomia::getEunomiaConnectionDetails(), + n = NULL, + sampleFraction = NULL, + seed = 64374, + cohortDatabaseSchema = "main", + cohortTableNames = cohortTableNames, + incremental = TRUE, + incrementalFolder = recordKeepingFolder + ) }) expect_error({ - sampledCohorts <- sampleCohortDefinitionSet( - cohortDefinitionSet = cds, - connectionDetails = Eunomia::getEunomiaConnectionDetails(), - n = 10, - sampleFraction = NULL, - seed = 64374, - cohortDatabaseSchema = "main", - cohortTableNames = cohortTableNames, - incremental = TRUE, - incrementalFolder = NULL - ) + sampledCohorts <- sampleCohortDefinitionSet( + cohortDefinitionSet = cds, + connectionDetails = Eunomia::getEunomiaConnectionDetails(), + n = 10, + sampleFraction = NULL, + seed = 64374, + cohortDatabaseSchema = "main", + cohortTableNames = cohortTableNames, + incremental = TRUE, + incrementalFolder = NULL + ) }) -}) \ No 
newline at end of file +}) diff --git a/tests/testthat/test-CohortTables.R b/tests/testthat/test-CohortTables.R index d05e4b6..a22d953 100644 --- a/tests/testthat/test-CohortTables.R +++ b/tests/testthat/test-CohortTables.R @@ -163,10 +163,10 @@ test_that("Create cohort tables with incremental = TRUE and partial table creati test_that("Cohort sample table does not exist for backwards compatibility", { cohortTableNames <- getCohortTableNames(cohortTable = "cohortSampleTable") - + # Remove the sample table to make sure the create cohort table works cohortTableNames <- cohortTableNames[-which(names(cohortTableNames) == "cohortSampleTable")] - + # Create the cohort tables expect_invisible( createCohortTables( diff --git a/tests/testthat/test-NegativeControlCohorts.R b/tests/testthat/test-NegativeControlCohorts.R index 19c4d02..f2444e6 100644 --- a/tests/testthat/test-NegativeControlCohorts.R +++ b/tests/testthat/test-NegativeControlCohorts.R @@ -240,9 +240,9 @@ test_that("incremental mode", { incrementalFolder = incrementalFolder, incremental = TRUE ) - + expect_equal(res, "SKIPPED") - + # Test changing other params regenerates res <- generateNegativeControlOutcomeCohorts( connection = connection, @@ -255,9 +255,9 @@ test_that("incremental mode", { incrementalFolder = incrementalFolder, incremental = TRUE ) - + expect_equal(res, "FINISHED") - + # Test changing other params regenerates res <- generateNegativeControlOutcomeCohorts( connection = connection, @@ -270,10 +270,10 @@ test_that("incremental mode", { incrementalFolder = incrementalFolder, incremental = TRUE ) - + expect_equal(res, "SKIPPED") - - + + res <- generateNegativeControlOutcomeCohorts( connection = connection, cdmDatabaseSchema = "main", @@ -285,10 +285,10 @@ test_that("incremental mode", { incrementalFolder = incrementalFolder, incremental = TRUE ) - + expect_equal(res, "FINISHED") - - + + res <- generateNegativeControlOutcomeCohorts( connection = connection, cdmDatabaseSchema = "main", @@ -300,7 
+300,6 @@ test_that("incremental mode", { incrementalFolder = incrementalFolder, incremental = TRUE ) - + expect_equal(res, "SKIPPED") - -}) \ No newline at end of file +}) diff --git a/tests/testthat/test-dbms-platforms.R b/tests/testthat/test-dbms-platforms.R index cdb5caa..4332f74 100644 --- a/tests/testthat/test-dbms-platforms.R +++ b/tests/testthat/test-dbms-platforms.R @@ -34,7 +34,7 @@ testPlatform <- function(dbmsDetails) { incrementalFolder = file.path(outputFolder, "RecordKeeping", dbmsDetails$connectionDetails$dbms) ) expect_equal(nrow(cohortsGenerated), nrow(cohortsWithStats)) - + # Get the cohort counts cohortCounts <- getCohortCounts( connectionDetails = dbmsDetails$connectionDetails, @@ -42,9 +42,9 @@ testPlatform <- function(dbmsDetails) { cohortTable = cohortTableNames$cohortTable, databaseId = dbmsDetails$dbmsPlatform, cohortDefinitionSet = cohortsWithStats - ) + ) expect_equal(nrow(cohortsGenerated), nrow(cohortCounts)) - + # Insert the inclusion rule names before exporting the stats tables insertInclusionRuleNames( connectionDetails = dbmsDetails$connectionDetails, @@ -52,7 +52,7 @@ testPlatform <- function(dbmsDetails) { cohortDatabaseSchema = dbmsDetails$cohortDatabaseSchema, cohortInclusionTable = cohortTableNames$cohortInclusionTable ) - + exportCohortStatsTables( connectionDetails = dbmsDetails$connectionDetails, cohortTableNames = cohortTableNames, @@ -63,7 +63,7 @@ testPlatform <- function(dbmsDetails) { incremental = TRUE, databaseId = dbmsDetails$dbmsPlatform ) - + subsetOperations <- list( createCohortSubset( cohortIds = 2, diff --git a/vignettes/SamplingCohorts.Rmd b/vignettes/SamplingCohorts.Rmd index 82f7f4d..eb7de00 100644 --- a/vignettes/SamplingCohorts.Rmd +++ b/vignettes/SamplingCohorts.Rmd @@ -27,13 +27,12 @@ someFolder <- tempdir() packageRoot <- tempdir() baseUrl <- "https://api.ohdsi.org/WebAPI" library(CohortGenerator) - ``` # Sampling with CohortGenerator -Large populations of individuals (e.g. 
all subjects recieving a COVID-19 vaccination) can often be too large to work with when +Large populations of individuals (e.g. all subjects receiving a COVID-19 vaccination) can often be too large to work with when pulling down a large collection of covariates for further analysis. -This is prohibitive when designing studies or atteming to generate phenotypes. +This is prohibitive when designing studies or attempting to generate phenotypes. This guide aims to demonstrate how one can use the `sampleCohortDefinitionSet` functionality to produce sufficiently large sample cohorts from a base `cohortDefinitionSet`. @@ -80,20 +79,19 @@ generateCohortSet( incremental = TRUE, incrementalFolder = recordKeepingFolder ) - ``` We can then create a new cohort definition set from the original sample. ```{r eval=F} sampledCohortDefinitionSet <- sampleCohortDefinitionSet( - cohortDefinitionSet = cds, - connection = conn, - sampleFraction = 0.33, - seed = 64374, # OHDSI - cohortDatabaseSchema = "main", - cohortTableNames = cohortTableNames, - incremental = TRUE, - incrementalFolder = recordKeepingFolder - ) + cohortDefinitionSet = cds, + connection = conn, + sampleFraction = 0.33, + seed = 64374, # OHDSI + cohortDatabaseSchema = "main", + cohortTableNames = cohortTableNames, + incremental = TRUE, + incrementalFolder = recordKeepingFolder +) ``` The resulting `sampledCohortDefinitionSet` is nearly identical to the base cohort set, however a few changes occur: @@ -109,15 +107,15 @@ To generate multiple samples, simply specify multiple seed variables as follows: ```{r eval=F} # Generate 800 samples of size n sampledCohortDefinitionSet <- sampleCohortDefinitionSet( - cohortDefinitionSet = cds, - connection = conn, - n = 1000, - seed = 1:800 * 64374, # OHDSI - cohortDatabaseSchema = "main", - cohortTableNames = cohortTableNames, - incremental = TRUE, - incrementalFolder = recordKeepingFolder - ) + cohortDefinitionSet = cds, + connection = conn, + n = 1000, + seed = 1:800 * 64374, # 
OHDSI + cohortDatabaseSchema = "main", + cohortTableNames = cohortTableNames, + incremental = TRUE, + incrementalFolder = recordKeepingFolder +) ``` Note that using incremental mode for your sampled cohorts will also work. In this case, a cohort will only be re-generated if the checksum of the base cohort has changed (the checksum is based