Skip to content

Commit

Permalink
Merge pull request #224 from OHDSI/issue_198
Browse files Browse the repository at this point in the history
Issue 198
  • Loading branch information
martaalcalde authored Nov 6, 2024
2 parents b5c04d5 + 45b80dd commit cc0bf1b
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 1 deletion.
2 changes: 1 addition & 1 deletion R/summariseAllConceptCounts.R
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ summariseAllConceptCounts <- function(cdm,
result<- result |>
dplyr::mutate("omop_table" = table,
"variable_level" = as.character(.data[[conceptId]])) |>
dplyr::select(!c(conceptId))
dplyr::select(-dplyr::all_of(conceptId))
return(result)
})
if (rlang::is_empty(purrr::compact(result_tables))){
Expand Down
156 changes: 156 additions & 0 deletions R/summariseMissingData.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
#' Summarise missing data in omop tables
#'
#' @param cdm A cdm object
#' @param omopTableName A character vector of the names of the tables to
#' summarise in the cdm object.
#' @param col A character vector of column names to check for missing values.
#' If `NULL`, all columns in the specified tables are checked. Default is `NULL`.
#' @param sex TRUE or FALSE. If TRUE code use will be summarised by sex.
#' @param ageGroup A list of ageGroup vectors of length two. Code use will be
#' thus summarised by age groups.
#' @return A summarised_result object with results overall and, if specified, by
#' strata.
#' @export

summariseMissingData <- function(cdm,
omopTableName,
col = NULL,
sex = FALSE,
year = FALSE,
ageGroup = NULL){


omopgenerics::validateCdmArgument(cdm)

omopgenerics::assertLogical(sex, length = 1)
omopgenerics::assertChoice(omopTableName,choices = omopgenerics::omopTables(), unique = TRUE)


ageGroup <- omopgenerics::validateAgeGroupArgument(ageGroup, ageGroupName = "")[[1]]

strata <- my_getStrataList(sex = sex, ageGroup = ageGroup, year = year)
stratification <- c(list(character()),omopgenerics::combineStrata(strata))

result_tables <- purrr::map(omopTableName, function(table) {

if (omopgenerics::isTableEmpty(cdm[[table]])){
cli::cli_warn(paste0(table, " omop table is empty."))
return(NULL)
}

omopTable <- cdm[[table]]
col_table <- intersect(col, colnames(omopTable))
if (is.null(col_table) | rlang::is_empty(col_table)){
col_table<-colnames(omopTable)
}

indexDate <- startDate(omopgenerics::tableName(omopTable))
x <- omopTable |> PatientProfiles::addDemographicsQuery(age = FALSE, ageGroup = ageGroup, sex = sex, indexDate = indexDate)
if (year){
x <- x|> dplyr::mutate(year = as.character(clock::get_year(.data[[indexDate]])))
}

result_columns <- purrr::map(col_table, function(c) {

stratified_result <- x |>
dplyr::group_by(dplyr::across(dplyr::all_of(strata))) |>
dplyr::summarise(
na_count = sum(as.integer(is.na(.data[[c]])), na.rm = TRUE),
total_count = dplyr::n(),
.groups = "drop"
) |>
dplyr::collect()

# Group results for each level of stratification
grouped_results <- purrr::map(stratification, function(g) {
stratified_result |>
dplyr::group_by(dplyr::across(dplyr::all_of(g))) |>
dplyr::summarise(
na_count = sum(na_count, na.rm = TRUE),
total_count = sum(total_count, na.rm = TRUE),
colName = c,
.groups = "drop"
) |>
dplyr::mutate(na_percentage = dplyr::if_else(.data$total_count > 0, (.data$na_count / .data$total_count) * 100, 0))
})

return(purrr::reduce(grouped_results, dplyr::bind_rows))

})

res <- purrr::reduce(result_columns, dplyr::union)|>
dplyr::mutate(omop_table = table)

warningDataRequire(cdm = cdm, res = res, table = table)

return(res)
})
if (rlang::is_empty(purrr::compact(result_tables))){
return(omopgenerics::emptySummarisedResult())
}


result <- purrr::compact(result_tables) |>
purrr::reduce(dplyr::union)|>
dplyr::mutate(dplyr::across(dplyr::all_of(strata), ~ dplyr::coalesce(., "overall")))|>
dplyr::mutate(
na_count = as.double(na_count), # Cast na_count to double
na_percentage = as.double(na_percentage)
)|>
tidyr::pivot_longer(
cols = c(na_count, na_percentage),
names_to = "estimate_name",
values_to = "estimate_value"
)


sr <- result |>
dplyr::mutate(
result_id = 1L,
cdm_name = omopgenerics::cdmName(cdm),
) |>
visOmopResults::uniteGroup(cols = "omop_table") |>
visOmopResults::uniteStrata(cols = strata) |>
visOmopResults::uniteAdditional() |>
dplyr::mutate(
"estimate_value" = as.character(.data$estimate_value),
"estimate_type" = "integer",
"variable_level" = NA_character_
) |>
dplyr::rename("variable_name" = "colName") |>
dplyr::select(!c(total_count))

settings <- dplyr::tibble(
result_id = unique(sr$result_id),
package_name = "omopSketch",
package_version = as.character(utils::packageVersion("OmopSketch")),
result_type = "summarise_missing_data"
)
sr <- sr |>
omopgenerics::newSummarisedResult(settings = settings)


return(sr)

}

warningDataRequire <- function(cdm, table, res){
required_cols <- omopgenerics::omopTableFields(CDMConnector::cdmVersion(cdm))|>
dplyr::filter(.data$cdm_table_name==table)|>
dplyr::filter(.data$is_required==TRUE)|>
dplyr::pull(.data$cdm_field_name)
warning_columns <- res |>
dplyr::filter(.data$colName %in% required_cols)|>
dplyr::filter(.data$na_count>0)|>
dplyr::distinct(.data$colName)|>
dplyr::pull()

if (length(warning_columns) > 0) {
cli::cli_warn(c(
"These columns contain missing values, which are not permitted:",
"{.val {warning_columns}}"
))
}
}


52 changes: 52 additions & 0 deletions tests/testthat/test-summariseMissingData.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
test_that("summariseMissingData() works", {
skip_on_cran()
# Load mock database ----
cdm <- cdmEunomia()

# Check all tables work ----
expect_true(inherits(summariseMissingData(cdm, "drug_exposure"),"summarised_result"))
expect_no_error(y<-summariseMissingData(cdm, "observation_period"))
expect_no_error(x<-summariseMissingData(cdm, "visit_occurrence"))
expect_no_error(summariseMissingData(cdm, "condition_occurrence"))
expect_no_error(summariseMissingData(cdm, "drug_exposure"))
expect_no_error(summariseMissingData(cdm, "procedure_occurrence", year = TRUE))
expect_warning(summariseMissingData(cdm, "device_exposure"))
expect_no_error(z<-summariseMissingData(cdm, "measurement"))
expect_no_error(s<-summariseMissingData(cdm, "observation"))
expect_warning(summariseMissingData(cdm, "death"))


expect_no_error(all <- summariseMissingData(cdm, c("observation_period", "visit_occurrence", "measurement")))
expect_equal(all, dplyr::bind_rows(y, x, z))
expect_equal(summariseMissingData(cdm, "observation"), summariseMissingData(cdm, "observation", col = colnames(cdm[['observation']])))
x<-summariseMissingData(cdm, "procedure_occurrence", col = "procedure_date")

expect_equal(summariseMissingData(cdm, c("procedure_occurrence","observation" ), col = "procedure_date"), dplyr::bind_rows(x,s))
y<-summariseMissingData(cdm, "observation",col = "observation_date")
expect_equal(summariseMissingData(cdm, c("procedure_occurrence","observation" ), col = c("procedure_date", "observation_date")), dplyr::bind_rows(x,y))

# Check inputs ----
expect_true(summariseMissingData(cdm, "procedure_occurrence", col="person_id")|>
dplyr::select(estimate_value)|>
dplyr::mutate(estimate_value = as.numeric(estimate_value)) |>
dplyr::summarise(sum = sum(estimate_value)) |>
dplyr::pull() == 0)

expect_true(summariseMissingData(cdm, "procedure_occurrence", col="person_id", sex = TRUE, ageGroup = list(c(0,50), c(51,Inf)))|>
dplyr::distinct(.data$strata_level)|>
dplyr::tally()|>
dplyr::pull()==9)

expect_true(summariseMissingData(cdm, "procedure_occurrence", col="person_id", ageGroup = list(c(0,50)))|>
dplyr::distinct(.data$strata_level)|>
dplyr::tally()|>
dplyr::pull()==3)

cdm$procedure_occurrence <- cdm$procedure_occurrence |>
dplyr::mutate(procedure_concept_id = NA_integer_) |>
dplyr::compute(name = "procedure_occurrence", temporary = FALSE)

expect_warning(summariseMissingData(cdm, "procedure_occurrence", col="procedure_concept_id", ageGroup = list(c(0,50))))

PatientProfiles::mockDisconnect(cdm = cdm)
})

0 comments on commit cc0bf1b

Please sign in to comment.