From 2f30ef86179dc951e7f6a603ab65481ed5735272 Mon Sep 17 00:00:00 2001 From: Anthony Sena Date: Thu, 20 Jun 2024 09:18:48 -0400 Subject: [PATCH] Fix bug in right join for negate subset operator (#168) * Fix bug in right join for negate subset operator * Add subset logic checks * Fix R CMD checks --- .../subsets/CohortSubsetOperator.sql | 3 +- tests/testthat/test-Subsets.R | 444 ++++++++++++++++++ 2 files changed, 446 insertions(+), 1 deletion(-) diff --git a/inst/sql/sql_server/subsets/CohortSubsetOperator.sql b/inst/sql/sql_server/subsets/CohortSubsetOperator.sql index 892fb5f..d93448d 100644 --- a/inst/sql/sql_server/subsets/CohortSubsetOperator.sql +++ b/inst/sql/sql_server/subsets/CohortSubsetOperator.sql @@ -20,6 +20,7 @@ FROM ( HAVING COUNT (DISTINCT S.COHORT_DEFINITION_ID) >= @subset_length ) A {@negate == '1'}?{ -RIGHT JOIN @target_table B ON B.subject_id = A.subject_id +RIGHT JOIN @target_table B ON B.subject_id = A.subject_id + AND b.cohort_start_date = a.cohort_start_date WHERE A.subject_id IS NULL } diff --git a/tests/testthat/test-Subsets.R b/tests/testthat/test-Subsets.R index 518806c..c9ffe9c 100644 --- a/tests/testthat/test-Subsets.R +++ b/tests/testthat/test-Subsets.R @@ -461,3 +461,447 @@ test_that("Subset name templates function", { checkmate::expect_list(attr(cds2, "cohortSubsetDefinitions")) expect_true(attr(cds2, "hasSubsetDefinitions")) }) + +test_that("Subset logic checks", { + databaseFile <- tempfile(fileext = ".sqlite") + sqliteConnectionDetails <- DatabaseConnector::createConnectionDetails( + dbms = "sqlite", + server = databaseFile + ) + sqliteResultsDatabaseSchema <- "main" + connection <- DatabaseConnector::connect(sqliteConnectionDetails) + withr::defer({ + DatabaseConnector::disconnect(connection) + unlink(databaseFile, force = TRUE) + }, + testthat::teardown_env() + ) + + # Create dummy OMOP data for testing ------------------ + DatabaseConnector::insertTable( + connection = connection, + databaseSchema = sqliteResultsDatabaseSchema, + tableName = "observation_period", + data = data.frame( + observation_period_id = 1, + person_id = 1, + observation_period_start_date = lubridate::date("2000-01-01"), + observation_period_end_date = lubridate::date("2008-12-31") + ) + ) + + DatabaseConnector::insertTable( + connection = connection, + databaseSchema = sqliteResultsDatabaseSchema, + tableName = "person", + data = data.frame( + person_id = 1, + gender_concept_id = 8532, + year_of_birth = 2000, + race_concept_id = 0, + ethnicity_concept_id = 0 + ) + ) + + + # Define limit subsets for tests ------------- + lsd1 <- createCohortSubsetDefinition( + name = "first ever", + definitionId = 101, + subsetOperators = list( + createLimitSubset( + name = "first ever", + limitTo = "firstEver" + ) + ) + ) + + lsd2 <- createCohortSubsetDefinition( + name = "earliestRemaining", + definitionId = 102, + subsetOperators = list( + createLimitSubset( + name = "earliestRemaining", + limitTo = "earliestRemaining", + priorTime = 500 + ) + ) + ) + + lsd3 <- createCohortSubsetDefinition( + name = "latestRemaining", + definitionId = 103, + subsetOperators = list( + createLimitSubset( + name = "latestRemaining", + limitTo = "latestRemaining", + followUpTime = 800 + ) + ) + ) + + lsd4 <- createCohortSubsetDefinition( + name = "lastEver", + definitionId = 104, + subsetOperators = list( + createLimitSubset( + name = "lastEver", + limitTo = "lastEver" + ) + ) + ) + + lsd5 <- createCohortSubsetDefinition( + name = "calendar", + definitionId = 105, + subsetOperators = list( + createLimitSubset( + name = "2003 - 2006", + calendarStartDate = "2003-01-01", + calendarEndDate = "2006-12-31", + ) + ) + ) + + lsd6 <- createCohortSubsetDefinition( + name = "firstEver + calendar", + definitionId = 106, + subsetOperators = list( + createLimitSubset( + limitTo = "firstEver", + name = "2003 - 2006", + calendarStartDate = "2003-01-01", + calendarEndDate = "2006-12-31", + ) + ) + ) + + lsd7 <- createCohortSubsetDefinition( + name = "earliestRemaining + calendar", + definitionId = 107, + subsetOperators = list( + createLimitSubset( + limitTo = "earliestRemaining", + name = "2003 - 2006", + priorTime = 500, + calendarStartDate = "2003-01-01", + calendarEndDate = "2006-12-31", + ) + ) + ) + + # Define demographics subsets for tests ------------- + ds1 <- createCohortSubsetDefinition( + name = "Age subset", + definition = 201, + subsetOperators = list( + createDemographicSubset( + name = "Age 2-5", + ageMin = 2, + ageMax = 5 + ) + ) + ) + + ds2 <- createCohortSubsetDefinition( + name = "Gender subset", + definition = 202, + subsetOperators = list( + createDemographicSubset( + name = "Gender = 8532", + gender = 8532 + ) + ) + ) + + ds3 <- createCohortSubsetDefinition( + name = "Race subset", + definition = 203, + subsetOperators = list( + createDemographicSubset( + name = "Race = 0", + race = 0 + ) + ) + ) + + ds4 <- createCohortSubsetDefinition( + name = "Race subset", + definition = 204, + subsetOperators = list( + createDemographicSubset( + name = "Ethnicity = 0", + ethnicity = 0 + ) + ) + ) + + # Define cohort subsets for tests ------------- + cs1 <- createCohortSubsetDefinition( + name = "Subset overlaps cohort start", + definition = 301, + subsetOperators = list( + createCohortSubset( + name = "subsetOverlapTargetStart", + cohortIds = c(2), + negate = F, + cohortCombinationOperator = "any", + startWindow = createSubsetCohortWindow(-99999, 0, "cohortStart"), + endWindow = createSubsetCohortWindow(0, 99999, "cohortStart") + ) + ) + ) + + cs2 <- createCohortSubsetDefinition( + name = "Subset overlaps entire target cohort period", + definition = 302, + subsetOperators = list( + createCohortSubset( + name = "subsetSubsumesTarget", + cohortIds = c(3), + negate = F, + cohortCombinationOperator = "any", + startWindow = createSubsetCohortWindow(-99999, -1, "cohortStart"), + endWindow = createSubsetCohortWindow(1, 99999, "cohortEnd") + ) + ) + ) + + cs3 <- createCohortSubsetDefinition( + name = "Subset subsumed by entire target cohort period", + definition = 303, + subsetOperators = list( + createCohortSubset( + name = "targetSubsumesSubset", + cohortIds = c(4), + negate = F, + cohortCombinationOperator = "any", + startWindow = createSubsetCohortWindow(1, 99999, "cohortStart"), + endWindow = createSubsetCohortWindow(-99999, 1, "cohortEnd") + ) + ) + ) + + cs4 <- createCohortSubsetDefinition( + name = "Subset overlaps cohort end", + definition = 304, + subsetOperators = list( + createCohortSubset( + name = "subsetOverlapTargetEnd", + cohortIds = c(5), + negate = F, + cohortCombinationOperator = "any", + startWindow = createSubsetCohortWindow(-99999, 0, "cohortEnd"), + endWindow = createSubsetCohortWindow(0, 99999, "cohortEnd") + ) + ) + ) + + cs5 <- createCohortSubsetDefinition( + name = "Subset does NOT overlap cohort end - negate", + definition = 305, + subsetOperators = list( + createCohortSubset( + name = "subsetOverlapTargetEndNegate", + cohortIds = c(5), + negate = T, + cohortCombinationOperator = "any", + startWindow = createSubsetCohortWindow(-99999, 0, "cohortEnd"), + endWindow = createSubsetCohortWindow(0, 99999, "cohortEnd") + ) + ) + ) + + cs6 <- createCohortSubsetDefinition( + name = "Subset overlaps target start - tests combo == all", + definition = 306, + subsetOperators = list( + createCohortSubset( + name = "subsetOverlapTargetStartComboAll", + cohortIds = c(2,3), + negate = F, + cohortCombinationOperator = "all", + startWindow = createSubsetCohortWindow(-99999, 0, "cohortStart"), + endWindow = createSubsetCohortWindow(0, 99999, "cohortStart") + ) + ) + ) + + # Create cohort def. set and apply subset definitions --------- + cohortDefinitionSet <- data.frame( + cohortId = 1, + cohortName = "Test Target Cohort", + sql = " + INSERT INTO @results_database_schema.@target_cohort_table ( + cohort_definition_id, + subject_id, + cohort_start_date, + cohort_end_date + ) + SELECT @target_cohort_id, 1, DATEFROMPARTS(2001, 01, 01), DATEFROMPARTS(2002, 01, 01) + UNION + SELECT @target_cohort_id, 1, DATEFROMPARTS(2003, 01, 01), DATEFROMPARTS(2004, 01, 01) + UNION + SELECT @target_cohort_id, 1, DATEFROMPARTS(2005, 01, 01), DATEFROMPARTS(2006, 01, 01) + UNION + SELECT @target_cohort_id, 1, DATEFROMPARTS(2007, 01, 01), DATEFROMPARTS(2008, 01, 01) + ;", + json = "" + ) + cohortDefinitionSet <- rbind( + cohortDefinitionSet, + data.frame( + cohortId = 2, + cohortName = "Test Subset 1 - Subset Overlaps Target Start Date", + sql = " + INSERT INTO @results_database_schema.@target_cohort_table ( + cohort_definition_id, + subject_id, + cohort_start_date, + cohort_end_date + ) + SELECT @target_cohort_id, 1, DATEFROMPARTS(2000, 01, 01), DATEFROMPARTS(2001, 12, 31) + UNION + SELECT @target_cohort_id, 1, DATEFROMPARTS(2002, 01, 01), DATEFROMPARTS(2003, 12, 31) + UNION + -- NOTE: DOES NOT OVERLAP COHORT ID = 1 FOR TESTING + SELECT @target_cohort_id, 1, DATEFROMPARTS(2004, 01, 01), DATEFROMPARTS(2004, 12, 31) + ;", + json = "" + )) + + cohortDefinitionSet <- rbind( + cohortDefinitionSet, + data.frame( + cohortId = 3, + cohortName = "Test Subset 2 - Subset start+end subsumes target start+end", + sql = " + INSERT INTO @results_database_schema.@target_cohort_table ( + cohort_definition_id, + subject_id, + cohort_start_date, + cohort_end_date + ) + SELECT @target_cohort_id, 1, DATEFROMPARTS(2000, 01, 01), DATEFROMPARTS(2003, 12, 31) + UNION + SELECT @target_cohort_id, 1, DATEFROMPARTS(2002, 01, 01), DATEFROMPARTS(2005, 12, 31) + UNION + -- NOTE: DOES NOT FULLY SUBSUME COHORT ID = 1 FOR TESTING + SELECT @target_cohort_id, 1, DATEFROMPARTS(2004, 01, 01), DATEFROMPARTS(2005, 12, 31) + ;", + json = "" + )) + + cohortDefinitionSet <- rbind( + cohortDefinitionSet, + data.frame( + cohortId = 4, + cohortName = "Test Subset 3 - Target start+end subsumes Subset start+end", + sql = " + INSERT INTO @results_database_schema.@target_cohort_table ( + cohort_definition_id, + subject_id, + cohort_start_date, + cohort_end_date + ) + SELECT @target_cohort_id, 1, DATEFROMPARTS(2001, 02, 01), DATEFROMPARTS(2001, 12, 31) + UNION + SELECT @target_cohort_id, 1, DATEFROMPARTS(2003, 02, 01), DATEFROMPARTS(2003, 12, 31) + UNION + -- NOTE: IS NOT FULLY SUBSUMED BY COHORT ID = 1 FOR TESTING + SELECT @target_cohort_id, 1, DATEFROMPARTS(2004, 01, 01), DATEFROMPARTS(2005, 12, 31) + ;", + json = "" + )) + + cohortDefinitionSet <- rbind( + cohortDefinitionSet, + data.frame( + cohortId = 5, + cohortName = "Test Subset 4 - Subset Overlaps Target End Date", + sql = " + INSERT INTO @results_database_schema.@target_cohort_table ( + cohort_definition_id, + subject_id, + cohort_start_date, + cohort_end_date + ) + SELECT @target_cohort_id, 1, DATEFROMPARTS(2001, 02, 01), DATEFROMPARTS(2002, 02, 01) + UNION + SELECT @target_cohort_id, 1, DATEFROMPARTS(2003, 02, 01), DATEFROMPARTS(2004, 02, 01) + UNION + -- NOTE: DOES NOT OVERLAP ANY END DATE ENTRIES IN COHORT ID = 1 FOR TESTING + SELECT @target_cohort_id, 1, DATEFROMPARTS(2003, 02, 01), DATEFROMPARTS(2003, 03, 01) + ;", + json = "" + )) + + cohortDefinitionSet <- cohortDefinitionSet |> + addCohortSubsetDefinition(lsd1, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(lsd2, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(lsd3, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(lsd4, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(lsd5, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(lsd6, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(lsd7, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(ds1, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(ds2, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(ds3, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(ds4, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(cs1, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(cs2, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(cs3, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(cs4, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(cs5, targetCohortIds = c(1)) |> + addCohortSubsetDefinition(cs6, targetCohortIds = c(1)) + + # Generate cohorts ------------ + cohortTableNames <- getCohortTableNames() + + createCohortTables( + connection = connection, + cohortDatabaseSchema = "main", + cohortTableNames = cohortTableNames + ) + + generateCohortSet( + connection = connection, + cdmDatabaseSchema = "main", + cohortDatabaseSchema = "main", + cohortTableNames = getCohortTableNames(), + cohortDefinitionSet = cohortDefinitionSet + ) + + + cohorts <- DatabaseConnector::querySql( + connection = connection, + sql = "SELECT * FROM main.cohort ORDER BY COHORT_DEFINITION_ID, SUBJECT_ID, COHORT_START_DATE;" + ) + + # Check the cohort counts to verify the logic worked as expected --------- + # cohorts # <------ USE TO SEE THE COHORTS TO VERIFY THE INFO BELOW + + # Limit subsets cohort definition 1100 range ------ + expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1101,]$COHORT_START_DATE[[1]], lubridate::date("2001-01-01")) # 1101 - First Ever + expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1102,]$COHORT_START_DATE[[1]], lubridate::date("2003-01-01")) # 1102 - Earliest Remaining + expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1103,]$COHORT_START_DATE[[1]], lubridate::date("2005-01-01")) # 1103 - Latest Remaining + expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1104,]$COHORT_START_DATE[[1]], lubridate::date("2007-01-01")) # 1104 - Last Ever + expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1105,]$COHORT_START_DATE[[1]], lubridate::date("2003-01-01")) # 1105 - Calendar #1 + expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1105,]$COHORT_START_DATE[[2]], lubridate::date("2005-01-01")) # 1105 - Calendar #2 + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1106,]), 0) # 1106 - First ever + calendar time that restricts to no one + expect_equal(cohorts[cohorts$COHORT_DEFINITION_ID == 1107,]$COHORT_START_DATE[[1]], lubridate::date("2003-01-01")) # 1107 - Earliest remaining+calendar restriction + + # Demographic subsets cohort definition 1200 range ------ + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1201,]), 2) # 1201 - Age 2-5 + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1202,]), 4) # 1202 - Gender + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1203,]), 4) # 1203 - Race + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1204,]), 4) # 1204 - Ethnicity + + # Cohort subsets cohort definition 1300 range ------ + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1301,]), 2) # 1301 - Subset overlaps cohort start + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1302,]), 2) # 1302 - Subset overlaps entire target cohort period + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1303,]), 2) # 1303 - Subset subsumed by entire target cohort period + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1304,]), 2) # 1304 - Subset overlaps cohort end + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1305,]), 2) # 1305 - Subset does NOT overlap cohort end - negate + expect_equal(nrow(cohorts[cohorts$COHORT_DEFINITION_ID == 1306,]), 2) # 1306 - Subset overlaps target start - tests combo == all +}) \ No newline at end of file