From 9542588ea59a0e9a6eea9cd57d839b649f87946d Mon Sep 17 00:00:00 2001 From: Rachael Stickland <50215726+RayStick@users.noreply.github.com> Date: Thu, 21 Mar 2024 11:54:10 +0000 Subject: [PATCH] add_lookup_file --- R/data-look_up.R | 18 +++++++++ R/domain_mapping.R | 85 +++++++++++------------------------------- data-raw/look_up.csv | 28 ++++++++++++++ data/look_up.rda | Bin 0 -> 479 bytes man/domain_mapping.Rd | 15 +++----- man/look_up.Rd | 26 +++++++++++++ 6 files changed, 98 insertions(+), 74 deletions(-) create mode 100644 R/data-look_up.R create mode 100644 data-raw/look_up.csv create mode 100644 data/look_up.rda create mode 100644 man/look_up.Rd diff --git a/R/data-look_up.R b/R/data-look_up.R new file mode 100644 index 00000000..5359d627 --- /dev/null +++ b/R/data-look_up.R @@ -0,0 +1,18 @@ +#' Auto-categorisations +#' +#' A list of pre-defined pairings between DataElement (variable) and domain code. \cr \cr +#' For each DataElement that domain_mapping.R processes: \cr \cr +#' If it is contained within this look-up table, it uses the auto-categorised domain code rather than asking the user to categorise.\cr\cr +#' This data was created with these two steps: +#' \enumerate{ +#' \item \code{look_up <- read.csv('browseMetadata/data-raw/look_up.csv')} +#' \item \code{usethis::use_data(look_up)} +#' } +#' @docType data +# +#' @usage data(look_up) +#' +#' @format A data frame with a variable number of rows and 3 columns +#' +#' @source The csv was manually created +"look_up" diff --git a/R/domain_mapping.R b/R/domain_mapping.R index 83fd05d9..5533099a 100755 --- a/R/domain_mapping.R +++ b/R/domain_mapping.R @@ -3,27 +3,21 @@ #' This function will read in the metadata file for a chosen dataset, loop through all the variables, and ask the user to catergorise/label each variable as belonging to one or more domains.\cr \cr #' The domains will appear in the Plots tab and dataset information will be printed to the R console, for the user's reference in making these 
categorisations. \cr \cr #' A log file will be saved with the catergorisations made. -#' To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables; -#' these auto-categorisations should be verified by the user by checking the csv log file. \cr \cr +#' To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables. \cr \cr #' Example inputs are provided within the package data, for the user to run this function in a demo mode. #' @param json_file The metadata file. This should be downloaded from the metadata catalogue as a json file. See 'data-raw/maternity_indicators_dataset_(mids)_20240105T132210.json' for an example download. #' @param domain_file The domain list file. This should be a csv file created by the user, with each domain listed on a separate line. See 'data-raw/domain_list_demo.csv' for a template. +#' @param look_up_file The look-up table file, with auto-categorisations. By default, the code uses 'data/look_up.rda'. The user can provide their own look-up table in the same format as 'data-raw/look_up.csv'. #' @return The function will return a log file with the mapping between dataset variables and domains, alongside details about the dataset. #' @examples #' # Run in demo mode by providing no inputs: domain_mapping() #' # Demo mode will use the /data files provided in this package -#' # Respond with your initials when prompted. -#' # Respond 'Demo List ' for the description of domain list. -#' # Respond 'Y' if you want to see the descriptions printed out. -#' # Respond '1,10' to the RANGE OF VARIABLES prompt (or process the full 93 variables if you like!) -#' # Reference the plot tab and categorise each variable into a single ('1') domain -#' # or multiple ('1,2') domains. -#' # Write a note explaining your category choice (optional). +#' # For detailed instructions, refer to the package README.md file and the R manual files ('man' directory). 
#' @export #' @importFrom graphics plot.new #' @importFrom utils read.csv write.csv -domain_mapping <- function(json_file = NULL, domain_file = NULL) { +domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = NULL) { # Load data: Check if demo data should be used if (is.null(json_file) && is.null(domain_file)) { # If both json_file and domain_file are NULL, use demo data @@ -45,11 +39,19 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL) { DomainListDesc <- tools::file_path_sans_ext(basename(domain_file)) } + # Check if user has provided a look-up table + if (is.null(look_up_file)) { + cli_alert_info("Using the default look-up table in data/look_up.rda") + lookup <- get("look_up") + } else { + lookup <- read.csv(look_up_file) + } + # Present domains plots panel for user's reference ---- graphics::plot.new() domains_extend <- rbind(c("*NO MATCH / UNSURE*"), c("*METADATA*"), c("*ALF ID*"), c("*OTHER ID*"), c("*DEMOGRAPHICS*"), domains) gridExtra::grid.table(domains_extend[1], cols = "Domain", rows = 0:(nrow(domains_extend) - 1)) - + # Get user and demo list info for log file ---- User_Initials <- "" while (User_Initials == "") { @@ -147,64 +149,19 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL) { # Loop through each variable, request response from the user to match to a domain ---- for (datavar in start_var:end_var) { - # auto categorise (full string and partial string matches) - if (selectDataClass_df$Label[datavar] == "NA") { - Output[nrow(Output) + 1, ] <- NA - Output$DataElement[datavar] + datavar_index <- which(lookup$DataElement == selectDataClass_df$Label[datavar]) # we should ignore the case + look_up_subset <- lookup[datavar_index,] + if (nrow(look_up_subset) == 1) { + # auto categorisations + Output[nrow(Output) + 1, ] <- NA #why? 
Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- "0" + Output$Domain_code[datavar] <- look_up_subset$DomainCode Output$Note[datavar] <- "AUTO CATEGORISED" - } else if (selectDataClass_df$Label[datavar] == "AVAIL_FROM_DT") { - Output[nrow(Output) + 1, ] <- NA - Output$DataElement[datavar] - Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- "1" - Output$Note[datavar] <- "AUTO CATEGORISED" - } else if ((selectDataClass_df$Label[datavar] == "ALF_E") || - (selectDataClass_df$Label[datavar] == "RALF") || - (selectDataClass_df$Label[datavar] == "ALF_STS_CD") || - (selectDataClass_df$Label[datavar] == "ALF_MTCH_PCT") || - (grepl("_ALF_E", selectDataClass_df$Label[datavar], ignore.case = TRUE)) # grepl because of MOTHER_ALF_E and CHILD_ALF_E etc. - || (grepl("_RALF", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("_ALF_STS_CD", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("_ALF_MTCH_PCT", selectDataClass_df$Label[datavar], ignore.case = TRUE))) { - Output[nrow(Output) + 1, ] <- NA - Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- "2" - Output$Note[datavar] <- "AUTO CATEGORISED" - } else if (grepl("_ID_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) { # picking up generic IDs - Output[nrow(Output) + 1, ] <- NA - Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- "3" - Output$Note[datavar] <- "AUTO CATEGORISED" - } else if ((selectDataClass_df$Label[datavar] == "AGE") # likely to be a better way to code this section with fewer lines - || (selectDataClass_df$Label[datavar] == "DOB") || - (selectDataClass_df$Label[datavar] == "WOB") || - (selectDataClass_df$Label[datavar] == "SEX") || - (selectDataClass_df$Label[datavar] == "GENDER") || - (selectDataClass_df$Label[datavar] == "GNDR") || - (grepl("_AGE", 
selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("_DOB", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("_WOB", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("_SEX", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("_GENDER", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("_GNDR", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("AGE_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("DOB_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("WOB_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("SEX_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("GENDER_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) || - (grepl("GNDR_", selectDataClass_df$Label[datavar], ignore.case = TRUE))) { - Output[nrow(Output) + 1, ] <- NA - Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- "4" - Output$Note[datavar] <- "AUTO CATEGORISED" - } else { - + } else { # collect user responses decision_output <- user_categorisation(selectDataClass_df$Label[datavar],selectDataClass_df$Description[datavar],selectDataClass_df$Type[datavar]) # input user responses into output - Output[nrow(Output) + 1, ] <- NA + Output[nrow(Output) + 1, ] <- NA #why? 
Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] Output$Domain_code[datavar] <- decision_output$decision Output$Note[datavar] <- decision_output$decision_note diff --git a/data-raw/look_up.csv b/data-raw/look_up.csv new file mode 100644 index 00000000..c72ac04c --- /dev/null +++ b/data-raw/look_up.csv @@ -0,0 +1,28 @@ +DataElement,DomainLabel,DomainCode +NA,No Match / Unsure,0 +AVAIL_FROM_DT,Metadata,1 +ALF_E,ALF ID,2 +MOTHER_ALF_E,ALF ID,2 +CHILD_ALF_E,ALF ID,2 +RALF,ALF ID,2 +ALF_STS_CD,ALF ID,2 +MOTHER_ALF_STS_CD,ALF ID,2 +CHILD_ALF_STS_CD,ALF ID,2 +ALF_MTCH_PCT,ALF ID,2 +MOTHER_ALF_MTCH_PCT,ALF ID,2 +CHILD_ALF_MTCH_PCT,ALF ID,2 +SERVICE_USER_LOCAL_ID_E,OTHER ID,3 +MAT_SERVICE_USER_LOCAL_ID_E,OTHER ID,3 +CLIENT_ID_E,OTHER ID,3 +AGE,Demographics,4 +MAT_AGE,Demographics,4 +MAT_AGE_AT_ASS,Demographics,4 +CONTACT_AGE,Demographics,4 +WOB,Demographics,4 +MAT_WOB,Demographics,4 +SEX,Demographics,4 +SERVICE_USER_SEX_CD,Demographics,4 +NENONATE_SEX_CD,Demographics,4 +GENDER,Demographics,4 +GNDR,Demographics,4 +GNDR_CD,Demographics,4 \ No newline at end of file diff --git a/data/look_up.rda b/data/look_up.rda new file mode 100644 index 0000000000000000000000000000000000000000..4a3b8aa7670813978fe82856588bf2ccae246eea GIT binary patch literal 479 zcmV<50U-WDT4*^jL0KkKSqapmo&W*(|H1$N_=rFOkN`jD-@w1`-he;=0ssI3zyZ7| zLK-50wHg4>8UWFupa#-p4H^eD#L1IJ69mN50GI?YMg(9vrY209FqkGLm;}Hff-oZh zibT?6o=pVOdW@O}q&*-8hM5eRrc+GAHBv(CC5Yj!%#B#oi6G;FPSO}B%QeSgFAdwc z6+59_{aGg{R7~)x&8|LC^r898B`IAjqb;4Xtov#3#o|&ak|gTD>58Zva08tcg;N~l zlL%5FB>CuGMeM5}iB4SiL>Vj2L`M~nVk&(NR@GA`dwLvE;BMAHJw4_5IqdoF27Emh zUBTaVz|`H}U^{=n{(-!!vYcNUic$y%F>ZZzdVN>}Zc{dm_k+85Gv9WO=}Amngfov^ VwqueoWDQ@%+>uTcBm#9P=YY#R+vWfO literal 0 HcmV?d00001 diff --git a/man/domain_mapping.Rd b/man/domain_mapping.Rd index 5d7280fd..3df3bd1b 100644 --- a/man/domain_mapping.Rd +++ b/man/domain_mapping.Rd @@ -4,12 +4,14 @@ \alias{domain_mapping} \title{domain_mapping} \usage{ -domain_mapping(json_file = NULL, domain_file = 
NULL) +domain_mapping(json_file = NULL, domain_file = NULL, look_up_file = NULL) } \arguments{ \item{json_file}{The metadata file. This should be downloaded from the metadata catalogue as a json file. See 'data-raw/maternity_indicators_dataset_(mids)_20240105T132210.json' for an example download.} \item{domain_file}{The domain list file. This should be a csv file created by the user, with each domain listed on a separate line. See 'data-raw/domain_list_demo.csv' for a template.} + +\item{look_up_file}{The look-up table file, with auto-categorisations. By default, the code uses 'data/look_up.rda'. The user can provide their own look-up table in the same format as 'data-raw/look_up.csv'.} } \value{ The function will return a log file with the mapping between dataset variables and domains, alongside details about the dataset. @@ -18,18 +20,11 @@ The function will return a log file with the mapping between dataset variables a This function will read in the metadata file for a chosen dataset, loop through all the variables, and ask the user to catergorise/label each variable as belonging to one or more domains.\cr \cr The domains will appear in the Plots tab and dataset information will be printed to the R console, for the user's reference in making these categorisations. \cr \cr A log file will be saved with the catergorisations made. -To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables; -these auto-categorisations should be verified by the user by checking the csv log file. \cr \cr +To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables. \cr \cr Example inputs are provided within the package data, for the user to run this function in a demo mode. } \examples{ # Run in demo mode by providing no inputs: domain_mapping() # Demo mode will use the /data files provided in this package -# Respond with your initials when prompted. 
-# Respond 'Demo List ' for the description of domain list. -# Respond 'Y' if you want to see the descriptions printed out. -# Respond '1,10' to the RANGE OF VARIABLES prompt (or process the full 93 variables if you like!) -# Reference the plot tab and categorise each variable into a single ('1') domain -# or multiple ('1,2') domains. -# Write a note explaining your category choice (optional). +# For detailed instructions, refer to the package README.md file and the R manual files ('man' directory). } diff --git a/man/look_up.Rd b/man/look_up.Rd new file mode 100644 index 00000000..7eeae2d7 --- /dev/null +++ b/man/look_up.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data-look_up.R +\docType{data} +\name{look_up} +\alias{look_up} +\title{Auto-categorisations} +\format{ +A data frame with a variable number of rows and 3 columns +} +\source{ +The csv was manually created +} +\usage{ +data(look_up) +} +\description{ +A list of pre-defined pairings between DataElement (variable) and domain code. \cr \cr +For each DataElement that domain_mapping.R processes: \cr \cr +If it is contained within this look-up table, it uses the auto-categorised domain code rather than asking the user to categorise.\cr\cr +This data was created with these two steps: +\enumerate{ + \item \code{look_up <- read.csv('browseMetadata/data-raw/look_up.csv')} + \item \code{usethis::use_data(look_up)} +} +} +\keyword{datasets}