From 9542588ea59a0e9a6eea9cd57d839b649f87946d Mon Sep 17 00:00:00 2001
From: Rachael Stickland <50215726+RayStick@users.noreply.github.com>
Date: Thu, 21 Mar 2024 11:54:10 +0000
Subject: [PATCH] add_lookup_file

---
 R/data-look_up.R      |  18 +++++++++
 R/domain_mapping.R    |  85 +++++++++++-------------------------------
 data-raw/look_up.csv  |  28 ++++++++++++++
 data/look_up.rda      | Bin 0 -> 479 bytes
 man/domain_mapping.Rd |  15 +++-----
 man/look_up.Rd        |  26 +++++++++++++
 6 files changed, 98 insertions(+), 74 deletions(-)
 create mode 100644 R/data-look_up.R
 create mode 100644 data-raw/look_up.csv
 create mode 100644 data/look_up.rda
 create mode 100644 man/look_up.Rd

diff --git a/R/data-look_up.R b/R/data-look_up.R
new file mode 100644
index 00000000..5359d627
--- /dev/null
+++ b/R/data-look_up.R
@@ -0,0 +1,18 @@
+#' Auto-categorisations
+#'
+#' A list of pre-defined pairings between DataElement (variable) and domain code. \cr \cr
+#' For each DataElement that domain_mapping.R processes: \cr \cr
+#' If it is contained within this look-up table, it uses the auto-categorised domain code rather than asking the user to categorise.\cr\cr
+#' This data was created with these two steps:
+#' \enumerate{
+#'  \item \code{look_up <- read.csv('browseMetadata/data-raw/look_up.csv')}
+#'  \item \code{usethis::use_data(look_up)}
+#' }
+#' @docType data
+#'
+#' @usage data(look_up)
+#'
+#' @format A data frame with a variable number of rows and 3 columns
+#'
+#' @source The csv was manually created
+"look_up"
diff --git a/R/domain_mapping.R b/R/domain_mapping.R
index 83fd05d9..5533099a 100755
--- a/R/domain_mapping.R
+++ b/R/domain_mapping.R
@@ -3,27 +3,21 @@
 #' This function will read in the metadata file for a chosen dataset, loop through all the variables, and ask the user to catergorise/label each variable as belonging to one or more domains.\cr \cr
 #' The domains will appear in the Plots tab and dataset information will be printed to the R console, for the user's reference in making these categorisations. \cr \cr
 #' A log file will be saved with the catergorisations made.
-#' To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables;
-#' these auto-categorisations should be verified by the user by checking the csv log file. \cr \cr
+#' To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables. \cr \cr
 #' Example inputs are provided within the package data, for the user to run this function in a demo mode.
 #' @param json_file The metadata file. This should be downloaded from the metadata catalogue as a json file. See 'data-raw/maternity_indicators_dataset_(mids)_20240105T132210.json' for an example download.
 #' @param domain_file The domain list file. This should be a csv file created by the user, with each domain listed on a separate line. See 'data-raw/domain_list_demo.csv' for a template.
+#' @param look_up_file The look-up table file, with auto-categorisations. By default, the code uses 'data/look_up.rda'. The user can provide the path to their own look-up table as a csv file, in the same format as 'data-raw/look_up.csv'.
 #' @return The function will return a log file with the mapping between dataset variables and domains, alongside details about the dataset.
 #' @examples
 #' # Run in demo mode by providing no inputs: domain_mapping()
 #' # Demo mode will use the /data files provided in this package
-#' # Respond with your initials when prompted.
-#' # Respond 'Demo List ' for the description of domain list.
-#' # Respond 'Y' if you want to see the descriptions printed out.
-#' # Respond '1,10' to the RANGE OF VARIABLES prompt (or process the full 93 variables if you like!)
-#' # Reference the plot tab and categorise each variable into a single ('1') domain
-#' # or multiple ('1,2') domains.
-#' # Write a note explaining your category choice (optional).
+#' # For detailed instructions, refer to the package README.md file and the R manual files ('man' directory).
 #' @export
 #' @importFrom graphics plot.new
 #' @importFrom utils read.csv write.csv
 
-domain_mapping <- function(json_file = NULL, domain_file = NULL) {
+domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = NULL) {
   # Load data: Check if demo data should be used
   if (is.null(json_file) && is.null(domain_file)) {
     # If both json_file and domain_file are NULL, use demo data
@@ -45,11 +39,19 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL) {
     DomainListDesc <- tools::file_path_sans_ext(basename(domain_file))
   }
 
+  # Check if user has provided a look-up table
+  if (is.null(look_up_file)) {
+    cli_alert_info("Using the default look-up table in data/look-up.rda")
+    lookup <- get("look_up")
+  } else {
+    lookup <- read.csv(look_up_file)
+  }
+
   # Present domains plots panel for user's reference ----
   graphics::plot.new()
   domains_extend <- rbind(c("*NO MATCH / UNSURE*"), c("*METADATA*"), c("*ALF ID*"), c("*OTHER ID*"), c("*DEMOGRAPHICS*"), domains)
   gridExtra::grid.table(domains_extend[1], cols = "Domain", rows = 0:(nrow(domains_extend) - 1))
-  
+
   # Get user and demo list info for log file ----
   User_Initials <- ""
   while (User_Initials == "") {
@@ -147,64 +149,19 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL) {
 
     # Loop through each variable, request response from the user to match to a domain ----
     for  (datavar in start_var:end_var) {
-      # auto categorise (full string and partial string matches)
-      if (selectDataClass_df$Label[datavar] == "NA") {
-        Output[nrow(Output) + 1, ] <- NA
-        Output$DataElement[datavar]
+      datavar_index <- which(look_up$DataElement == selectDataClass_df$Label[datavar]) # we should ignore the case
+      look_up_subset <- look_up[datavar_index,]
+      if (nrow(look_up_subset) == 1) {
+        # auto categorisations
+        Output[nrow(Output) + 1, ] <- NA #why?
         Output$DataElement[datavar] <- selectDataClass_df$Label[datavar]
-        Output$Domain_code[datavar] <- "0"
+        Output$Domain_code[datavar] <- look_up_subset$DomainCode
         Output$Note[datavar] <- "AUTO CATEGORISED"
-      } else if (selectDataClass_df$Label[datavar] == "AVAIL_FROM_DT") {
-        Output[nrow(Output) + 1, ] <- NA
-        Output$DataElement[datavar]
-        Output$DataElement[datavar] <- selectDataClass_df$Label[datavar]
-        Output$Domain_code[datavar] <- "1"
-        Output$Note[datavar] <- "AUTO CATEGORISED"
-      } else if ((selectDataClass_df$Label[datavar] == "ALF_E") ||
-        (selectDataClass_df$Label[datavar] == "RALF") ||
-        (selectDataClass_df$Label[datavar] == "ALF_STS_CD") ||
-        (selectDataClass_df$Label[datavar] == "ALF_MTCH_PCT") ||
-        (grepl("_ALF_E", selectDataClass_df$Label[datavar], ignore.case = TRUE)) # grepl because of MOTHER_ALF_E and CHILD_ALF_E etc.
-      || (grepl("_RALF", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("_ALF_STS_CD", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("_ALF_MTCH_PCT", selectDataClass_df$Label[datavar], ignore.case = TRUE))) {
-        Output[nrow(Output) + 1, ] <- NA
-        Output$DataElement[datavar] <- selectDataClass_df$Label[datavar]
-        Output$Domain_code[datavar] <- "2"
-        Output$Note[datavar] <- "AUTO CATEGORISED"
-      } else if (grepl("_ID_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) { # picking up generic IDs
-        Output[nrow(Output) + 1, ] <- NA
-        Output$DataElement[datavar] <- selectDataClass_df$Label[datavar]
-        Output$Domain_code[datavar] <- "3"
-        Output$Note[datavar] <- "AUTO CATEGORISED"
-      } else if ((selectDataClass_df$Label[datavar] == "AGE") # likely to be a better way to code this section with fewer lines
-      || (selectDataClass_df$Label[datavar] == "DOB") ||
-        (selectDataClass_df$Label[datavar] == "WOB") ||
-        (selectDataClass_df$Label[datavar] == "SEX") ||
-        (selectDataClass_df$Label[datavar] == "GENDER") ||
-        (selectDataClass_df$Label[datavar] == "GNDR") ||
-        (grepl("_AGE", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("_DOB", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("_WOB", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("_SEX", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("_GENDER", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("_GNDR", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("AGE_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("DOB_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("WOB_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("SEX_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("GENDER_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) ||
-        (grepl("GNDR_", selectDataClass_df$Label[datavar], ignore.case = TRUE))) {
-        Output[nrow(Output) + 1, ] <- NA
-        Output$DataElement[datavar] <- selectDataClass_df$Label[datavar]
-        Output$Domain_code[datavar] <- "4"
-        Output$Note[datavar] <- "AUTO CATEGORISED"
-      } else {
-
+        } else {
         # collect user responses
         decision_output <- user_categorisation(selectDataClass_df$Label[datavar],selectDataClass_df$Description[datavar],selectDataClass_df$Type[datavar])
         # input user responses into output
-        Output[nrow(Output) + 1, ] <- NA
+        Output[nrow(Output) + 1, ] <- NA #why?
         Output$DataElement[datavar] <- selectDataClass_df$Label[datavar]
         Output$Domain_code[datavar] <- decision_output$decision
         Output$Note[datavar] <- decision_output$decision_note
diff --git a/data-raw/look_up.csv b/data-raw/look_up.csv
new file mode 100644
index 00000000..c72ac04c
--- /dev/null
+++ b/data-raw/look_up.csv
@@ -0,0 +1,28 @@
+DataElement,DomainLabel,DomainCode
+NA,No Match / Unsure,0
+AVAIL_FROM_DT,Metadata,1
+ALF_E,ALF ID,2
+MOTHER_ALF_E,ALF ID,2
+CHILD_ALF_E,ALF ID,2
+RALF,ALF ID,2
+ALF_STS_CD,ALF ID,2
+MOTHER_ALF_STS_CD,ALF ID,2
+CHILD_ALF_STS_CD,ALF ID,2
+ALF_MTCH_PCT,ALF ID,2
+MOTHER_ALF_MTCH_PCT,ALF ID,2
+CHILD_ALF_MTCH_PCT,ALF ID,2
+SERVICE_USER_LOCAL_ID_E,OTHER ID,3
+MAT_SERVICE_USER_LOCAL_ID_E,OTHER ID,3
+CLIENT_ID_E,OTHER ID,3
+AGE,Demographics,4
+MAT_AGE,Demographics,4
+MAT_AGE_AT_ASS,Demographics,4
+CONTACT_AGE,Demographics,4
+WOB,Demographics,4
+MAT_WOB,Demographics,4
+SEX,Demographics,4
+SERVICE_USER_SEX_CD,Demographics,4
+NENONATE_SEX_CD,Demographics,4
+GENDER,Demographics,4
+GNDR,Demographics,4
+GNDR_CD,Demographics,4
\ No newline at end of file
diff --git a/data/look_up.rda b/data/look_up.rda
new file mode 100644
index 0000000000000000000000000000000000000000..4a3b8aa7670813978fe82856588bf2ccae246eea
GIT binary patch
literal 479
zcmV<50U-WDT4*^jL0KkKSqapmo&W*(|H1$N_=rFOkN`jD-@w1`-he;=0ssI3zyZ7|
zLK-50wHg4>8UWFupa#-p4H^eD#L1IJ69mN50GI?YMg(9vrY209FqkGLm;}Hff-oZh
zibT?6o=pVOdW@O}q&*-8hM5eRrc+GAHBv(CC5Yj!%#B#oi6G;FPSO}B%QeSgFAdwc
z6+59_{aGg{R7~)x&8|LC^r898B`IAjqb;4Xtov#3#o|&ak|gTD>58Zva08tcg;N~l
zlL%5FB><vV2f7dnLiS}7L$~3@z3KF#pkfs`2qZXqOtbI^TEYxF00{)L+r{dHk<_1(
z`w)m?A-%_+t)p&Wb4tFbh+-nj`MTYVFf_}1<-JUBhF4T`ee4?tjpatx+uLwp6BP-}
z(x=@+T_9kaqO@yhK!lv$!mz)3x{Dz)Og3e4dXfrJmI;vX{~eVakontia^TmbZfQ1<
zSn8IBd>CuGMeM5}iB4SiL>Vj2L`M~nVk&(NR@GA`dwLvE;BMAHJw4_5IqdoF27Emh
zUBTaVz|`H}U^{=n{(-!!vYcNUic$y%F>ZZzdVN>}Zc{dm_k+85Gv9WO=}Amngfov^
VwqueoWDQ@%+>uTcBm#9P=YY#R+vWfO

literal 0
HcmV?d00001

diff --git a/man/domain_mapping.Rd b/man/domain_mapping.Rd
index 5d7280fd..3df3bd1b 100644
--- a/man/domain_mapping.Rd
+++ b/man/domain_mapping.Rd
@@ -4,12 +4,14 @@
 \alias{domain_mapping}
 \title{domain_mapping}
 \usage{
-domain_mapping(json_file = NULL, domain_file = NULL)
+domain_mapping(json_file = NULL, domain_file = NULL, look_up_file = NULL)
 }
 \arguments{
 \item{json_file}{The metadata file. This should be downloaded from the metadata catalogue as a json file. See 'data-raw/maternity_indicators_dataset_(mids)_20240105T132210.json' for an example download.}
 
 \item{domain_file}{The domain list file. This should be a csv file created by the user, with each domain listed on a separate line. See 'data-raw/domain_list_demo.csv' for a template.}
+
+\item{look_up_file}{The look-up table file, with auto-categorisations. By default, the code uses 'data/look_up.rda'. The user can provide the path to their own look-up table as a csv file, in the same format as 'data-raw/look_up.csv'.}
 }
 \value{
 The function will return a log file with the mapping between dataset variables and domains, alongside details about the dataset.
@@ -18,18 +20,11 @@ The function will return a log file with the mapping between dataset variables a
 This function will read in the metadata file for a chosen dataset, loop through all the variables, and ask the user to catergorise/label each variable as belonging to one or more domains.\cr \cr
 The domains will appear in the Plots tab and dataset information will be printed to the R console, for the user's reference in making these categorisations. \cr \cr
 A log file will be saved with the catergorisations made.
-To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables;
-these auto-categorisations should be verified by the user by checking the csv log file. \cr \cr
+To speed up this process, some auto-categorisations will be made by the function for commonly occurring variables. \cr \cr
 Example inputs are provided within the package data, for the user to run this function in a demo mode.
 }
 \examples{
 # Run in demo mode by providing no inputs: domain_mapping()
 # Demo mode will use the /data files provided in this package
-# Respond with your initials when prompted.
-# Respond 'Demo List ' for the description of domain list.
-# Respond 'Y' if you want to see the descriptions printed out.
-# Respond '1,10' to the RANGE OF VARIABLES prompt (or process the full 93 variables if you like!)
-# Reference the plot tab and categorise each variable into a single ('1') domain
-# or multiple ('1,2') domains.
-# Write a note explaining your category choice (optional).
+# For detailed instructions, refer to the package README.md file and the R manual files ('man' directory).
 }
diff --git a/man/look_up.Rd b/man/look_up.Rd
new file mode 100644
index 00000000..7eeae2d7
--- /dev/null
+++ b/man/look_up.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data-look_up.R
+\docType{data}
+\name{look_up}
+\alias{look_up}
+\title{Auto-categorisations}
+\format{
+A data frame with a variable number of rows and 3 columns
+}
+\source{
+The csv was manually created
+}
+\usage{
+data(look_up)
+}
+\description{
+A list of pre-defined pairings between DataElement (variable) and domain code. \cr \cr
+For each DataElement that domain_mapping.R processes: \cr \cr
+If it is contained within this look-up table, it uses the auto-categorised domain code rather than asking the user to categorise.\cr\cr
+This data was created with these two steps:
+\enumerate{
+ \item \code{look_up <- read.csv('browseMetadata/data-raw/look_up.csv')}
+ \item \code{usethis::use_data(look_up)}
+}
+}
+\keyword{datasets}