-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #224 from OHDSI/issue_198
Issue 198
- Loading branch information
Showing
3 changed files
with
209 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
#' Summarise missing data in omop tables | ||
#' | ||
#' @param cdm A cdm object | ||
#' @param omopTableName A character vector of the names of the tables to | ||
#' summarise in the cdm object. | ||
#' @param col A character vector of column names to check for missing values. | ||
#' If `NULL`, all columns in the specified tables are checked. Default is `NULL`. | ||
#' @param sex TRUE or FALSE. If TRUE code use will be summarised by sex. | ||
#' @param ageGroup A list of ageGroup vectors of length two. Code use will be | ||
#' thus summarised by age groups. | ||
#' @return A summarised_result object with results overall and, if specified, by | ||
#' strata. | ||
#' @export | ||
|
||
summariseMissingData <- function(cdm, | ||
omopTableName, | ||
col = NULL, | ||
sex = FALSE, | ||
year = FALSE, | ||
ageGroup = NULL){ | ||
|
||
|
||
omopgenerics::validateCdmArgument(cdm) | ||
|
||
omopgenerics::assertLogical(sex, length = 1) | ||
omopgenerics::assertChoice(omopTableName,choices = omopgenerics::omopTables(), unique = TRUE) | ||
|
||
|
||
ageGroup <- omopgenerics::validateAgeGroupArgument(ageGroup, ageGroupName = "")[[1]] | ||
|
||
strata <- my_getStrataList(sex = sex, ageGroup = ageGroup, year = year) | ||
stratification <- c(list(character()),omopgenerics::combineStrata(strata)) | ||
|
||
result_tables <- purrr::map(omopTableName, function(table) { | ||
|
||
if (omopgenerics::isTableEmpty(cdm[[table]])){ | ||
cli::cli_warn(paste0(table, " omop table is empty.")) | ||
return(NULL) | ||
} | ||
|
||
omopTable <- cdm[[table]] | ||
col_table <- intersect(col, colnames(omopTable)) | ||
if (is.null(col_table) | rlang::is_empty(col_table)){ | ||
col_table<-colnames(omopTable) | ||
} | ||
|
||
indexDate <- startDate(omopgenerics::tableName(omopTable)) | ||
x <- omopTable |> PatientProfiles::addDemographicsQuery(age = FALSE, ageGroup = ageGroup, sex = sex, indexDate = indexDate) | ||
if (year){ | ||
x <- x|> dplyr::mutate(year = as.character(clock::get_year(.data[[indexDate]]))) | ||
} | ||
|
||
result_columns <- purrr::map(col_table, function(c) { | ||
|
||
stratified_result <- x |> | ||
dplyr::group_by(dplyr::across(dplyr::all_of(strata))) |> | ||
dplyr::summarise( | ||
na_count = sum(as.integer(is.na(.data[[c]])), na.rm = TRUE), | ||
total_count = dplyr::n(), | ||
.groups = "drop" | ||
) |> | ||
dplyr::collect() | ||
|
||
# Group results for each level of stratification | ||
grouped_results <- purrr::map(stratification, function(g) { | ||
stratified_result |> | ||
dplyr::group_by(dplyr::across(dplyr::all_of(g))) |> | ||
dplyr::summarise( | ||
na_count = sum(na_count, na.rm = TRUE), | ||
total_count = sum(total_count, na.rm = TRUE), | ||
colName = c, | ||
.groups = "drop" | ||
) |> | ||
dplyr::mutate(na_percentage = dplyr::if_else(.data$total_count > 0, (.data$na_count / .data$total_count) * 100, 0)) | ||
}) | ||
|
||
return(purrr::reduce(grouped_results, dplyr::bind_rows)) | ||
|
||
}) | ||
|
||
res <- purrr::reduce(result_columns, dplyr::union)|> | ||
dplyr::mutate(omop_table = table) | ||
|
||
warningDataRequire(cdm = cdm, res = res, table = table) | ||
|
||
return(res) | ||
}) | ||
if (rlang::is_empty(purrr::compact(result_tables))){ | ||
return(omopgenerics::emptySummarisedResult()) | ||
} | ||
|
||
|
||
result <- purrr::compact(result_tables) |> | ||
purrr::reduce(dplyr::union)|> | ||
dplyr::mutate(dplyr::across(dplyr::all_of(strata), ~ dplyr::coalesce(., "overall")))|> | ||
dplyr::mutate( | ||
na_count = as.double(na_count), # Cast na_count to double | ||
na_percentage = as.double(na_percentage) | ||
)|> | ||
tidyr::pivot_longer( | ||
cols = c(na_count, na_percentage), | ||
names_to = "estimate_name", | ||
values_to = "estimate_value" | ||
) | ||
|
||
|
||
sr <- result |> | ||
dplyr::mutate( | ||
result_id = 1L, | ||
cdm_name = omopgenerics::cdmName(cdm), | ||
) |> | ||
visOmopResults::uniteGroup(cols = "omop_table") |> | ||
visOmopResults::uniteStrata(cols = strata) |> | ||
visOmopResults::uniteAdditional() |> | ||
dplyr::mutate( | ||
"estimate_value" = as.character(.data$estimate_value), | ||
"estimate_type" = "integer", | ||
"variable_level" = NA_character_ | ||
) |> | ||
dplyr::rename("variable_name" = "colName") |> | ||
dplyr::select(!c(total_count)) | ||
|
||
settings <- dplyr::tibble( | ||
result_id = unique(sr$result_id), | ||
package_name = "omopSketch", | ||
package_version = as.character(utils::packageVersion("OmopSketch")), | ||
result_type = "summarise_missing_data" | ||
) | ||
sr <- sr |> | ||
omopgenerics::newSummarisedResult(settings = settings) | ||
|
||
|
||
return(sr) | ||
|
||
} | ||
|
||
warningDataRequire <- function(cdm, table, res){ | ||
required_cols <- omopgenerics::omopTableFields(CDMConnector::cdmVersion(cdm))|> | ||
dplyr::filter(.data$cdm_table_name==table)|> | ||
dplyr::filter(.data$is_required==TRUE)|> | ||
dplyr::pull(.data$cdm_field_name) | ||
warning_columns <- res |> | ||
dplyr::filter(.data$colName %in% required_cols)|> | ||
dplyr::filter(.data$na_count>0)|> | ||
dplyr::distinct(.data$colName)|> | ||
dplyr::pull() | ||
|
||
if (length(warning_columns) > 0) { | ||
cli::cli_warn(c( | ||
"These columns contain missing values, which are not permitted:", | ||
"{.val {warning_columns}}" | ||
)) | ||
} | ||
} | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
test_that("summariseMissingData() works", { | ||
skip_on_cran() | ||
# Load mock database ---- | ||
cdm <- cdmEunomia() | ||
|
||
# Check all tables work ---- | ||
expect_true(inherits(summariseMissingData(cdm, "drug_exposure"),"summarised_result")) | ||
expect_no_error(y<-summariseMissingData(cdm, "observation_period")) | ||
expect_no_error(x<-summariseMissingData(cdm, "visit_occurrence")) | ||
expect_no_error(summariseMissingData(cdm, "condition_occurrence")) | ||
expect_no_error(summariseMissingData(cdm, "drug_exposure")) | ||
expect_no_error(summariseMissingData(cdm, "procedure_occurrence", year = TRUE)) | ||
expect_warning(summariseMissingData(cdm, "device_exposure")) | ||
expect_no_error(z<-summariseMissingData(cdm, "measurement")) | ||
expect_no_error(s<-summariseMissingData(cdm, "observation")) | ||
expect_warning(summariseMissingData(cdm, "death")) | ||
|
||
|
||
expect_no_error(all <- summariseMissingData(cdm, c("observation_period", "visit_occurrence", "measurement"))) | ||
expect_equal(all, dplyr::bind_rows(y, x, z)) | ||
expect_equal(summariseMissingData(cdm, "observation"), summariseMissingData(cdm, "observation", col = colnames(cdm[['observation']]))) | ||
x<-summariseMissingData(cdm, "procedure_occurrence", col = "procedure_date") | ||
|
||
expect_equal(summariseMissingData(cdm, c("procedure_occurrence","observation" ), col = "procedure_date"), dplyr::bind_rows(x,s)) | ||
y<-summariseMissingData(cdm, "observation",col = "observation_date") | ||
expect_equal(summariseMissingData(cdm, c("procedure_occurrence","observation" ), col = c("procedure_date", "observation_date")), dplyr::bind_rows(x,y)) | ||
|
||
# Check inputs ---- | ||
expect_true(summariseMissingData(cdm, "procedure_occurrence", col="person_id")|> | ||
dplyr::select(estimate_value)|> | ||
dplyr::mutate(estimate_value = as.numeric(estimate_value)) |> | ||
dplyr::summarise(sum = sum(estimate_value)) |> | ||
dplyr::pull() == 0) | ||
|
||
expect_true(summariseMissingData(cdm, "procedure_occurrence", col="person_id", sex = TRUE, ageGroup = list(c(0,50), c(51,Inf)))|> | ||
dplyr::distinct(.data$strata_level)|> | ||
dplyr::tally()|> | ||
dplyr::pull()==9) | ||
|
||
expect_true(summariseMissingData(cdm, "procedure_occurrence", col="person_id", ageGroup = list(c(0,50)))|> | ||
dplyr::distinct(.data$strata_level)|> | ||
dplyr::tally()|> | ||
dplyr::pull()==3) | ||
|
||
cdm$procedure_occurrence <- cdm$procedure_occurrence |> | ||
dplyr::mutate(procedure_concept_id = NA_integer_) |> | ||
dplyr::compute(name = "procedure_occurrence", temporary = FALSE) | ||
|
||
expect_warning(summariseMissingData(cdm, "procedure_occurrence", col="procedure_concept_id", ageGroup = list(c(0,50)))) | ||
|
||
PatientProfiles::mockDisconnect(cdm = cdm) | ||
}) |