diff --git a/NAMESPACE b/NAMESPACE index d5be7486..d1fe3961 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(domain_mapping) +export(user_categorisation) import(cli) import(devtools) import(grid) diff --git a/R/domain_mapping.R b/R/domain_mapping.R index f5e3d1dc..2292463f 100755 --- a/R/domain_mapping.R +++ b/R/domain_mapping.R @@ -16,8 +16,8 @@ #' # Respond 'Demo List ' for the description of domain list. #' # Respond 'Y' if you want to see the descriptions printed out. #' # Respond '1,10' to the RANGE OF VARIABLES prompt (or process the full 93 variables if you like!) -#' # Reference the plot tab and categorise each variable into a single ('1') -#' # or multiple ('1,2') domain. +#' # Reference the plot tab and categorise each variable into a single ('1') domain +#' # or multiple ('1,2') domains. #' # Write a note explaining your category choice (optional). #' @export #' @importFrom graphics plot.new @@ -50,11 +50,14 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL) { domains_extend <- rbind(c("*NO MATCH / UNSURE*"), c("*METADATA*"), c("*ALF ID*"), c("*OTHER ID*"), c("*DEMOGRAPHICS*"), domains) gridExtra::grid.table(domains_extend[1], cols = "Domain", rows = 0:(nrow(domains_extend) - 1)) + # temp - delete later + cat("\n You are in the improve-auto branch \n") + # Get user and demo list info for log file ---- User_Initials <- "" while (User_Initials == "") { cat("\n \n") - User_Initials <- readline(prompt = "ENTER INITIALS: ") + User_Initials <- readline(prompt = "Enter Initials: ") } # Print information about Data Asset ---- @@ -135,7 +138,7 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL) { # User inputs ---- cat("\n \n") - select_vars_n <- readline(prompt = "RANGE OF VARIABLES (DATA ELEMENTS) TO PROCESS (write as 'start_var,end_var' or press Enter to process all): ") + select_vars_n <- readline(prompt = "Enter the range of variables (data elements) to process. 
Press Enter to process all: ") if (select_vars_n == "") { start_var <- 1 end_var <- length(thisDataClass) @@ -173,7 +176,6 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL) { Output$Domain_code[datavar] <- "2" Output$Note[datavar] <- "AUTO CATEGORISED" } else if (grepl("_ID_", selectDataClass_df$Label[datavar], ignore.case = TRUE)) { # picking up generic IDs - Output[nrow(Output) + 1, ] <- NA Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] Output$Domain_code[datavar] <- "3" @@ -201,50 +203,88 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL) { Output$Domain_code[datavar] <- "4" Output$Note[datavar] <- "AUTO CATEGORISED" } else { - # user response - cat(paste( - "\nDATA ELEMENT -----> ", selectDataClass_df$Label[datavar], - "\n\nDESCRIPTION -----> ", selectDataClass_df$Description[datavar], - "\n\nDATA TYPE -----> ", selectDataClass_df$Type[datavar], "\n" - )) - - decision <- "" - while (decision == "") { - cat("\n \n") - decision <- readline(prompt = "CATEGORISE THIS VARIABLE (input a comma separated list of domain numbers): ") - } - - decision_note <- "" - while (decision_note == "") { - cat("\n \n") - decision_note <- readline(prompt = "NOTES (write 'N' if no notes): ") - } + # collect user responses + decision_output <- user_categorisation(selectDataClass_df$Label[datavar],selectDataClass_df$Description[datavar],selectDataClass_df$Type[datavar]) + # input user responses into output Output[nrow(Output) + 1, ] <- NA Output$DataElement[datavar] <- selectDataClass_df$Label[datavar] - Output$Domain_code[datavar] <- decision - Output$Note[datavar] <- decision_note + Output$Domain_code[datavar] <- decision_output$decision + Output$Note[datavar] <- decision_output$decision_note + } + + # Fill in columns that have all rows identical + Output$Initials <- User_Initials + Output$MetaDataVersion <- meta_json$dataModel$documentationVersion + Output$MetaDataLastUpdated <- meta_json$dataModel$lastUpdated + 
Output$DomainListDesc <- DomainListDesc + Output$DataAsset <- meta_json$dataModel$label + Output$DataClass <- meta_json$dataModel$childDataClasses[[dc]]$label + + # Save as we go in case session terminates prematurely + Output[Output == ""] <- NA + utils::write.csv(Output, output_fname, row.names = FALSE) # save as we go in case session terminates prematurely + } # end of loop for variable + + # Print the AUTO CATEGORISED responses for this DataClass - request review + Output_auto <- subset(Output, Note == 'AUTO CATEGORISED') + cat("\n \n") + cli_alert_warning("Please check the auto categorised data elements are accurate:") + cat("\n \n") + print(Output_auto[, c("DataClass", "DataElement", "Domain_code")]) + cat("\n \n") + auto_row_str <- readline(prompt = "Enter row numbers you'd like to edit or press enter to accept the auto categorisations: ") + + if (auto_row_str != "") { + + auto_row <- as.integer(unlist(strsplit(auto_row_str,","))) #probably sub-optimal coding + + for (datavar_auto in auto_row) { + + # collect user responses + decision_output <- user_categorisation(selectDataClass_df$Label[datavar_auto],selectDataClass_df$Description[datavar_auto],selectDataClass_df$Type[datavar_auto]) + # input user responses into output + Output$Domain_code[datavar_auto] <- decision_output$decision + Output$Note[datavar_auto] <- decision_output$decision_note } } - # Fill in columns that have all rows identical - Output$Initials <- User_Initials - Output$MetaDataVersion <- meta_json$dataModel$documentationVersion - Output$MetaDataLastUpdated <- meta_json$dataModel$lastUpdated - Output$DomainListDesc <- DomainListDesc - Output$DataAsset <- meta_json$dataModel$label - Output$DataClass <- meta_json$dataModel$childDataClasses[[dc]]$label + # Ask if user wants to review their responses for this DataClass + review_cats <- "" + while (review_cats != "Y" & review_cats != "N") { + cat("\n \n") + review_cats <- readline(prompt = "Would you like to review your categorisations? 
(Y/N) ") + } + + if (review_cats == 'Y') { + + Output_not_auto <- subset(Output, Note != 'AUTO CATEGORISED') + cat("\n \n") + print(Output_not_auto[, c("DataClass", "DataElement", "Domain_code")]) + cat("\n \n") + not_auto_row_str <- readline(prompt = "Enter row numbers you'd like to edit or press enter to accept: ") + + if (not_auto_row_str != "") { + + not_auto_row <- as.integer(unlist(strsplit(not_auto_row_str,","))) #probably sub-optimal coding + + for (datavar_not_auto in not_auto_row) { - # Save file & print the responses to be saved + # collect user responses + decision_output <- user_categorisation(selectDataClass_df$Label[datavar_not_auto],selectDataClass_df$Description[datavar_not_auto],selectDataClass_df$Type[datavar_not_auto]) + # input user responses into output + Output$Domain_code[datavar_not_auto] <- decision_output$decision + Output$Note[datavar_not_auto] <- decision_output$decision_note + } + } + } + + # Save final categorisations for this DataClass Output[Output == ""] <- NA - utils::write.csv(Output, output_fname, row.names = FALSE) # save as we go in case session terminates prematurely + utils::write.csv(Output, output_fname, row.names = FALSE) cat("\n") - cli_alert_info("The below responses will be saved to {output_fname}") - cat("\n") - print(Output[, c("DataClass", "DataElement", "Domain_code", "Note")]) - } + cli_alert_info("Your final categorisations have been saved to {output_fname}") + + } # end of loop for each data class - cat("\n \n") - cli_alert_warning("Please check the auto categorised data elements are accurate!") - cli_alert_warning("Manually edit csv file to correct errors, if needed.") -} +} # end of function diff --git a/R/user_categorisation.R b/R/user_categorisation.R new file mode 100644 index 00000000..6424ea28 --- /dev/null +++ b/R/user_categorisation.R @@ -0,0 +1,47 @@ +#' user_categorisation +#' +#' This function is used within the domain_mapping function. 
\cr \cr +#' It displays data properties to the user and requests a categorisation into a domain. \cr \cr +#' An optional note can be included with the categorisation. +#' +#' @param data_element Name of the variable +#' @param data_desc Description of the variable +#' @param data_type Data type of the variable +#' @return It returns a list containing the decision and decision note +#' @export + +user_categorisation <- function(data_element,data_desc,data_type) { + + # show the data element name, description and data type on the R console + cat(paste( + "\nDATA ELEMENT -----> ", data_element, + "\n\nDESCRIPTION -----> ", data_desc, + "\n\nDATA TYPE -----> ", data_type, "\n" + )) + + state <- "redo" + while (state == "redo") { + + # ask user for categorisation; the prompt repeats until the answer is non-empty + decision <- "" + while (decision == "") { + cat("\n \n") + decision <- readline(prompt = "Categorise this variable: ") + } + + # ask user for note on categorisation; the prompt repeats until the answer is non-empty + decision_note <- "" + while (decision_note == "") { + cat("\n \n") + decision_note <- readline(prompt = "Notes (write 'N' if no notes): ") + } + + # only the exact answer 'redo' repeats both prompts; any other answer (including Enter) exits the loop + cat("\n \n") + state <- readline(prompt = "Press enter to continue or write 'redo' to correct previous answer: ") + + } + +return(list(decision = decision,decision_note = decision_note)) + +} diff --git a/README.md b/README.md index c70e6adf..9ee59cb1 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ The R console will show: ``` ℹ Running domain_mapping in demo mode using package data files -ENTER INITIALS: +Enter Initials: RS ``` Respond with your initials and press enter. @@ -111,19 +111,19 @@ Rachael Stickland at 2024-01-05T13:22:09.774Z ℹ Found 2 Data Classes (2 tables) in this Data Asset -Would you like to read a description of the Data Asset? (Y/N) +Would you like to read a description of the Data Asset? (Y/N) Y ``` -Press Y to read these descriptions, for the purpose of the demo. +Enter Y to read these descriptions, for the purpose of the demo. 
For this example, the Data Asset is called MIDS and the tables inside this Data Class are BIRTH and INITIAL_ASSESSMENT. -It will then ask which variables to process: +For each table, it will ask which variables to process: ``` -RANGE OF VARIABLES (DATA ELEMENTS) TO PROCESS (write as 'start_var,end_var' or press Enter to process all): 1,10 +Enter the range of variables (data elements) to process. Press Enter to process all: 1,10 ``` -If you press enter it will process all the variables, so use a smaller number like 10 for this demo. +If you press enter it will process all the variables, so use a smaller range like 1 to 10 the first time you run this demo. For each data element (variable) you will be shown this structure: @@ -136,19 +136,58 @@ DATA TYPE -----> CHARACTER ``` By referencing the plots tab, and other info you may have, categorise this variable with a number(s). -A variable can map to more than one domain. +A variable can map to more than one domain so a comma separated list of numbers can be given (7,8). There is an (optional) note field to explain your choice. ``` -CATEGORISE THIS VARIABLE (input a comma separated list of domain numbers): 8 +Categorise this variable: 8 -NOTES (write 'N' if no notes): N +Notes (write 'N' if no notes): N ``` +If you make a mistake, the next prompt allows you to redo. Or press enter if you are happy to continue. + +``` +Press enter to continue or write 'redo' to correct previous answer: +``` + +When you get to the end of your requested number of variables it will show you variables that have been auto categorised. + +If you want to change any of these auto categorisations and do them manually, enter their row numbers as a comma separated list (e.g. 1,9). + +``` +! 
Please check the auto categorised data elements are accurate: + + DataClass DataElement Domain_code +1 BIRTH AVAIL_FROM_DT 1 +9 BIRTH CHILD_ALF_E 2 +10 BIRTH CHILD_ALF_STS_CD 2 + +Enter row numbers you'd like to edit or press enter to accept the auto categorisations: +``` + +Finally, it will ask you if you want to review the categorisations you previously made. + +``` +Would you like to review your categorisations? (Y/N) +``` + +If you say yes (with Y) it will take you through the same review process you just did for auto categorisations. + +At the end of processing that Data Class (table) it will show: + +``` +ℹ Your final categorisations have been saved to LOG_MaternityIndicatorsDataset(MIDS)_BIRTH_2024-01-30_10-42-15.csv +``` + +The function will then repeat the same steps for the next Data Class (table). + +#### Understanding the domain list + For this demo, a simple list of domains are provided, see [data-raw/domain_list_demo.csv](data-raw/domain_list_demo.csv). -This list is in this plot tab: +This list shows up in the R plot tab: - [0] *NO MATCH / UNSURE* - [1] *METADATA* @@ -162,12 +201,11 @@ This list is in this plot tab: There are 5 default domains always included [0-4], appended on to any domain list given. -For a research study, your domains will likely be more specific e.g. 'Prenatal, antenatal, neonatal and birth' or 'Health behaviours and diet'. +For a research study, your domains are likely to be much more specific e.g. 'Prenatal, antenatal, neonatal and birth' or 'Health behaviours and diet'. #### Output -The output of your decisions will be pasted to the R console. -These decisions will also be saved to a csv file. +The output of your decisions will be saved to a csv file. The csv file name includes the data asset, data class, and date stamp. 
This csv file, in addition to what is shown on the console, contains: - user initials (from user input) @@ -179,40 +217,35 @@ The intended use case for this log file is to be loaded up, compared across users, and used as an input in later analysis steps when working out which variables can be used to represent which research domains. +A subset of columns from the csv outputs are shown below, running with '1,10' data elements: + ``` -ℹ The below responses will be saved to LOG_MaternityIndicatorsDataset(MIDS)_BIRTH_2024-01-30_10-42-15.csv - - DataClass DataElement Domain_code Note -1 BIRTH AVAIL_FROM_DT 1 AUTO CATEGORISED -2 BIRTH BABY_BIRTH_DT 4 N -3 BIRTH BIRTH_APGAR_SCORE 8 N -4 BIRTH BIRTH_MODE_CD 8 N -5 BIRTH BIRTH_ORDER 8 N -6 BIRTH BIRTH_OUTCOME_CD 8 N -7 BIRTH BIRTH_TREAT_CD 0 No description given -8 BIRTH BIRTH_TREAT_SITE_CD 6 N -9 BIRTH CHILD_ALF_E 2 AUTO CATEGORISED -10 BIRTH CHILD_ALF_STS_CD 2 AUTO CATEGORISED + DataClass DataElement Domain_code Note +1 BIRTH AVAIL_FROM_DT 1 AUTO CATEGORISED +2 BIRTH BABY_BIRTH_DT 4 N +3 BIRTH BIRTH_APGAR_SCORE 8 N +4 BIRTH BIRTH_MODE_CD 8 N +5 BIRTH BIRTH_ORDER 8 N +6 BIRTH BIRTH_OUTCOME_CD 8 N +7 BIRTH BIRTH_TREAT_CD 0 No description given +8 BIRTH BIRTH_TREAT_SITE_CD 6 N +9 BIRTH CHILD_ALF_E 2 AUTO CATEGORISED +10 BIRTH CHILD_ALF_STS_CD 2 AUTO CATEGORISED ``` ``` -ℹ The below responses will be saved to LOG_MaternityIndicatorsDataset(MIDS)_INITIAL_ASSESSMENT_2024-01-30_10-43-05.csv - - DataClass DataElement Domain_code Note -1 INITIAL_ASSESSMENT AVAIL_FROM_DT 1 AUTO CATEGORISED -2 INITIAL_ASSESSMENT GEST_WEEKS 8 N -3 INITIAL_ASSESSMENT INITIAL_ASS_DT 8 Date of health visit -4 INITIAL_ASSESSMENT MAT_AGE_AT_ASS 4 AUTO CATEGORISED -5 INITIAL_ASSESSMENT MOTHER_ALF_E 2 AUTO CATEGORISED -6 INITIAL_ASSESSMENT MOTHER_ALF_STS_CD 2 AUTO CATEGORISED -7 INITIAL_ASSESSMENT PROV_CD 6,8 Org code for health provider -8 INITIAL_ASSESSMENT SERVICE_USER_GRAVIDA_CD 8 N -9 INITIAL_ASSESSMENT SERVICE_USER_HAS_MENTAL_HEALTH_CARE_PLAN_CD 8 N 
-10 INITIAL_ASSESSMENT SERVICE_USER_HAS_MENTAL_HEALTH_CONDITION_CD 8 N - -! Please check the auto categorised data elements are accurate! -! Manually edit csv file to correct errors, if needed. -``` + DataClass DataElement Domain_code Note +1 INITIAL_ASSESSMENT AVAIL_FROM_DT 1 AUTO CATEGORISED +2 INITIAL_ASSESSMENT GEST_WEEKS 8 N +3 INITIAL_ASSESSMENT INITIAL_ASS_DT 8 Date of health visit +4 INITIAL_ASSESSMENT MAT_AGE_AT_ASS 4 AUTO CATEGORISED +5 INITIAL_ASSESSMENT MOTHER_ALF_E 2 AUTO CATEGORISED +6 INITIAL_ASSESSMENT MOTHER_ALF_STS_CD 2 AUTO CATEGORISED +7 INITIAL_ASSESSMENT PROV_CD 6,8 Org code for health provider +8 INITIAL_ASSESSMENT SERVICE_USER_GRAVIDA_CD 8 N +9 INITIAL_ASSESSMENT SERVICE_USER_HAS_MENTAL_HEALTH_CARE_PLAN_CD 8 N +10 INITIAL_ASSESSMENT SERVICE_USER_HAS_MENTAL_HEALTH_CONDITION_CD 8 N + ``` ### Using your own input files diff --git a/man/domain_mapping.Rd b/man/domain_mapping.Rd index bf4ae21a..5d7280fd 100644 --- a/man/domain_mapping.Rd +++ b/man/domain_mapping.Rd @@ -29,7 +29,7 @@ Example inputs are provided within the package data, for the user to run this fu # Respond 'Demo List ' for the description of domain list. # Respond 'Y' if you want to see the descriptions printed out. # Respond '1,10' to the RANGE OF VARIABLES prompt (or process the full 93 variables if you like!) -# Reference the plot tab and categorise each variable into a single ('1') -# or multiple ('1,2') domain. +# Reference the plot tab and categorise each variable into a single ('1') domain +# or multiple ('1,2') domains. # Write a note explaining your category choice (optional). 
} diff --git a/man/user_categorisation.Rd b/man/user_categorisation.Rd new file mode 100644 index 00000000..d7e64427 --- /dev/null +++ b/man/user_categorisation.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/user_categorisation.R +\name{user_categorisation} +\alias{user_categorisation} +\title{user_categorisation} +\usage{ +user_categorisation(data_element, data_desc, data_type) +} +\arguments{ +\item{data_element}{Name of the variable} + +\item{data_desc}{Description of the variable} + +\item{data_type}{Data type of the variable} +} +\value{ +It returns a list containing the decision and decision note +} +\description{ +This function is used within the domain_mapping function. \cr \cr +It displays data properties to the user and requests a categorisation into a domain. \cr \cr +An optional note can be included with the categorisation. +}