From bf64d0b16f6227f3bf5a3cad810f47f500ef72b0 Mon Sep 17 00:00:00 2001 From: Rachael Stickland <50215726+RayStick@users.noreply.github.com> Date: Wed, 3 Jul 2024 16:32:52 +0100 Subject: [PATCH 1/9] update to latest version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index d459ed58..9b77a7fb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: browseMetadata Type: Package Title: Browses available metadata, to catergorise or label each variable in a dataset -Version: 1.0.0 +Version: 1.0.1 Authors@R: person("Rachael", "Stickland", email = "rstickland@turing.ac.uk", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-3398-4272")) Maintainer: Rachael Stickland From 0b53bbfa96ee8cc4eb437a031e50cc62d947f597 Mon Sep 17 00:00:00 2001 From: Rachael Stickland <50215726+RayStick@users.noreply.github.com> Date: Wed, 3 Jul 2024 16:33:30 +0100 Subject: [PATCH 2/9] split outputs & track package version --- R/domain_mapping.R | 48 ++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/R/domain_mapping.R b/R/domain_mapping.R index eb688b1c..ebe2dd9a 100755 --- a/R/domain_mapping.R +++ b/R/domain_mapping.R @@ -139,6 +139,9 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = readline(prompt = "Press any key to proceed") } + cat("\n \n") + table_note <- readline("Optional free text note about this table (or press enter to continue): ") + thisTable <- meta_json$dataModel$childDataClasses[[dc]]$childDataElements # probably a better way of dealing with complex json files in R ... 
thisTable_df <- data.frame(do.call(rbind, thisTable)) # nested list to dataframe dataType_df <- data.frame(do.call(rbind, thisTable_df$dataType)) # nested list to dataframe @@ -155,16 +158,24 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = timestamp_now <- gsub(" ", "_", Sys.time()) timestamp_now <- gsub(":", "-", timestamp_now) - output_fname_csv <- paste0("LOG_", gsub(" ", "", meta_json$dataModel$label), "_", gsub(" ", "", meta_json$dataModel$childDataClasses[[dc]]$label), "_", timestamp_now, ".csv") + output_fname_csv <- paste0("OUTPUT_", gsub(" ", "", meta_json$dataModel$label), "_", gsub(" ", "", meta_json$dataModel$childDataClasses[[dc]]$label), "_", timestamp_now, ".csv") + output_fname_log_csv <- paste0("LOG_", gsub(" ", "", meta_json$dataModel$label), "_", gsub(" ", "", meta_json$dataModel$childDataClasses[[dc]]$label), "_", timestamp_now, ".csv") output_fname_png <- paste0("PLOT_", gsub(" ", "", meta_json$dataModel$label), "_", gsub(" ", "", meta_json$dataModel$childDataClasses[[dc]]$label), "_", timestamp_now, ".png") + log_Output <- data.frame( + timestamp = character(1), + browseMetadata = character(1), + Initials = character(1), + MetaDataVersion = character(1), + MetaDataLastUpdated = character(1), + DomainListDesc = character(1), + Dataset = character(1), + Table = character(1), + Table_note = character(1) + ) + row_Output <- data.frame( - Initials = character(0), - MetaDataVersion = character(0), - MetaDataLastUpdated = character(0), - DomainListDesc = character(0), - Dataset = character(0), - Table = character(0), + timestamp = character(0), DataElement_N = character(0), DataElement = character(0), Domain_code = character(0), @@ -293,20 +304,26 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = } ## Fill in columns that have all rows identical ---- - Output$Initials <- User_Initials - Output$MetaDataVersion <- meta_json$dataModel$documentationVersion - Output$MetaDataLastUpdated <- 
meta_json$dataModel$lastUpdated - Output$DomainListDesc <- DomainListDesc - Output$Dataset <- meta_json$dataModel$label - Output$Table <- meta_json$dataModel$childDataClasses[[dc]]$label + log_Output$timestamp <- timestamp_now + log_Output$browseMetadata <- packageVersion("browseMetadata") + log_Output$Initials <- User_Initials + log_Output$MetaDataVersion <- meta_json$dataModel$documentationVersion + log_Output$MetaDataLastUpdated <- meta_json$dataModel$lastUpdated + log_Output$DomainListDesc <- DomainListDesc + log_Output$Dataset <- meta_json$dataModel$label + log_Output$Table <- meta_json$dataModel$childDataClasses[[dc]]$label + log_Output$Table_note <- table_note ## Save final categorisations for this Table ---- + Output$timestamp <- timestamp_now if (is.null(output_dir)) { output_dir = getwd() } utils::write.csv(Output, paste(output_dir,output_fname_csv,sep='/'), row.names = FALSE) + utils::write.csv(log_Output, paste(output_dir,output_fname_log_csv,sep='/'), row.names = FALSE) cat("\n") - cli_alert_success("Your final categorisations have been saved to {output_fname_csv}") + cli_alert_success("Your final categorisations have been saved:\n{output_fname_csv}") + cli_alert_success("Your session log has been saved:\n{output_fname_log_csv}") ## Create and save a summary plot counts <- Output %>% group_by(Domain_code) %>% count() %>% arrange(n) @@ -323,8 +340,7 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = full_plot <- grid.arrange(Domain_plot, Domain_table,nrow=1,ncol=2) ggsave(plot = full_plot,paste(output_dir,output_fname_png,sep='/'),width = 14, height = 8, units = "in") - cat("\n") - cli_alert_success("A summary plot has been saved to {output_fname_png}") + cli_alert_success("A summary plot has been saved:\n{output_fname_png}") } # end of loop for each table From e576c1f61287442decd6a8b7243b93156459af03 Mon Sep 17 00:00:00 2001 From: Rachael Stickland <50215726+RayStick@users.noreply.github.com> Date: Thu, 4 Jul 2024 
09:50:17 +0100 Subject: [PATCH 3/9] change to match 2 csv inputs and fix bugs --- R/compare_csv_outputs.R | 129 ++++++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 57 deletions(-) diff --git a/R/compare_csv_outputs.R b/R/compare_csv_outputs.R index cf41ae1f..9442d3e8 100644 --- a/R/compare_csv_outputs.R +++ b/R/compare_csv_outputs.R @@ -1,83 +1,99 @@ #' compare_csv_outputs #' #' This function is to be used after running the domain_mapping function. \cr \cr -#' It compares two csv outputs, finds their differences, and asks for a consensus. \cr \cr +#' It compares outputs from two sessions, finds their differences, and asks for a consensus. \cr \cr #' -#' @param csv_file_1 CSV output file from running domain_mapping -#' @param csv_file_2 CSV output file from running domain_mapping (different to csv_file_1) -#' @param json_file The metadata file used when running domain_mapping (should be the same for csv_file_1 and csv_file_2) -#' @return It returns csv_3, with consensus decisions +#' @param csv_file_1a CSV log output file from running domain_mapping (session 1) +#' @param csv_file_1b CSV file from running domain_mapping (session 1) +#' @param csv_file_2a CSV log output file from running domain_mapping (session 2) +#' @param csv_file_2b CSV file from running domain_mapping (session 2) +#' @param json_file The metadata file used when running domain_mapping (should be the same for session 1 and session 2) +#' @return It returns a csv output, which represents the consensus decisions between session 1 and session 2 #' @importFrom dplyr left_join select join_by #' @export -compare_csv_outputs <- function(csv_file_1,csv_file_2,json_file) { +compare_csv_outputs <- function(csv_file_1a,csv_file_1b,csv_file_2a,csv_file_2b,json_file) { timestamp_now <- gsub(" ", "_", Sys.time()) timestamp_now <- gsub(":", "-", timestamp_now) - # read in input files - csv_1 <- read.csv(csv_file_1) - csv_2 <- read.csv(csv_file_2) + # read in the log input files: + + csv_1a 
<- read.csv(csv_file_1a) + csv_2a <- read.csv(csv_file_2a) + csv_1b <- read.csv(csv_file_1b) + csv_2b <- read.csv(csv_file_2b) meta_json <- rjson::fromJSON(file = json_file) - # check the csv files can be compared - if (csv_1$MetaDataVersion[1] != csv_2$MetaDataVersion[1]){ + # check the session csvs can be compared to each other and to the json (min requirements): + + if (csv_1a$DomainListDesc[1] != csv_2a$DomainListDesc[1]){ cat("\n\n") - cli_alert_danger("Cannot compare csv 1 and csv 2: different metadata versions") + cli_alert_danger("Cannot compare session 1 and 2: different domain lists") stop()} - if (csv_1$MetaDataLastUpdated[1] != csv_2$MetaDataLastUpdated[1]){ + if (csv_1a$Dataset[1] != csv_2a$Dataset[1]){ cat("\n\n") - cli_alert_danger("Cannot compare csv 1 and csv 2: different dates for metadata") + cli_alert_danger("Cannot compare session 1 and 2: different datasets") stop()} - if (csv_1$DomainListDesc[1] != csv_2$DomainListDesc[1]){ + if (csv_1a$Table[1] != csv_2a$Table[1]){ cat("\n\n") - cli_alert_danger("Cannot compare csv 1 and csv 2: different domain lists") + cli_alert_danger("Cannot compare session 1 and 2: different tables") stop()} - if (csv_1$Dataset[1] != csv_2$Dataset[1]){ + if (csv_1a$Dataset[1] != meta_json$dataModel$label){ cat("\n\n") - cli_alert_danger("Cannot compare csv 1 and csv 2: different datasets") + cli_alert_danger("The csv files do not match the json: different datasets") stop()} - if (csv_1$Table[1] != csv_2$Table[1]){ + # check the session csvs can be compared to each other and to the json (warnings for user to check): + + if (csv_1a$browseMetadata[1] != csv_2a$browseMetadata[1]){ cat("\n\n") - cli_alert_danger("Cannot compare csv 1 and csv 2: different tables") - stop()} + cli_alert_warning("Different version of browseMetadata for session 1 and 2!\nValid comparison may not be possible - please check!") + continue <- readline("Press enter to continue or Esc to cancel: ")} - if (length(csv_1) != length(csv_2)){ + if 
(csv_1a$MetaDataVersion[1] != csv_2a$MetaDataVersion[1]){ cat("\n\n") - cli_alert_warning("Different number of rows in csv 1 and csv 2 - please check")} + cli_alert_warning("Different metadata versions for session 1 and 2\nValid comparison may not be possible - please check!") + continue <- readline("Press enter to continue or Esc to cancel: ")} - # check the csv files can be compared to the meta_json - if (csv_1$MetaDataVersion[1] != meta_json$dataModel$documentationVersion){ + if (csv_1a$MetaDataVersion[1] != meta_json$dataModel$documentationVersion){ cat("\n\n") - cli_alert_danger("The csv files do not match the json: different metadata versions") - stop()} + cli_alert_warning("The session files do not match the json (different metadata versions)\nValid comparison may not be possible - please check!") + continue <- readline("Press enter to continue or Esc to cancel: ")} - if (csv_1$MetaDataLastUpdated[1] != meta_json$dataModel$lastUpdated){ + if (csv_1a$MetaDataLastUpdated[1] != csv_2a$MetaDataLastUpdated[1]){ cat("\n\n") - cli_alert_danger("The csv files do not match the json: different dates for metadata") - stop()} + cli_alert_warning("Different metadata date for session 1 and 2\nValid comparison may not be possible - please check!") + continue <- readline("Press enter to continue or Esc to cancel: ")} - if (csv_1$Dataset[1] != meta_json$dataModel$label){ + if (csv_1a$MetaDataLastUpdated[1] != meta_json$dataModel$lastUpdated){ cat("\n\n") - cli_alert_danger("The csv files do not match the json: different datasets") - stop()} + cli_alert_warning("The session files do not match the json (different dates for metadata)\nValid comparison may not be possible - please check!") + continue <- readline("Press enter to continue or Esc to cancel: ")} - # print details about each csv - cat("\n\n") - cli_alert_success("Comparing csv 1 and csv 2") - cli_alert_success("csv_1 -> {csv_file_1}") - cli_alert_success("csv_2 -> {csv_file_2}") - cli_alert_success("csv_1 created by 
{csv_1$Initials[1]} and csv_2 created by {csv_2$Initials[2]}") + if (nrow(csv_1b) != nrow(csv_2b)){ + cat("\n\n") + cli_alert_warning("Different number of data elements for session 1 and 2\nValid comparison may not be possible - please check!") + continue <- readline("Press enter to continue or Esc to cancel: ")} - # join csv_1 and csv_2 in order to compare - csv_join <- left_join(csv_1,csv_2,suffix = c("_csv1","_csv2"),join_by(MetaDataVersion,MetaDataLastUpdated,DomainListDesc,Dataset,Table,DataElement)) - csv_join$Domain_code_join <- NA - csv_join$Note_join <- NA - csv_join <- select(csv_join,2,3,4,5,6,7,1,8,9,10,11,12,13,14) + # print details about each session + cat("\n\n") + cli_alert_success("Comparing session 1 and session 2") + cli_alert_success("Session 1 created by {csv_1a$Initials[1]} and session 2 created by {csv_2a$Initials[1]}") + + # join csv_1b and csv_2b in order to compare + ses_join <- left_join(csv_1b,csv_2b,suffix = c("_ses1","_ses2"),join_by(DataElement)) + ses_join$Domain_code_join <- NA + ses_join$Note_join <- NA + ses_join <- select(ses_join, + 'timestamp_ses1','timestamp_ses2', + 'DataElement_N_ses1','DataElement_N_ses2', + 'Domain_code_ses1','Domain_code_ses2', + 'Note_ses1','Note_ses2', + 'Domain_code_join','Note_join') # extract table from meta_json - same code as domain_mapping table_find <- data.frame(table_n = numeric(length(meta_json$dataModel$childDataClasses)),table_label = character(length(meta_json$dataModel$childDataClasses))) @@ -86,7 +102,7 @@ compare_csv_outputs <- function(csv_file_1,csv_file_2,json_file) { table_find$table_label[t] = meta_json$dataModel$childDataClasses[[t]]$label } - table_n = table_find$table_n[table_find$table_label == csv_1$Table[1]] + table_n = table_find$table_n[table_find$table_label == csv_1a$Table[1]] thisTable <- meta_json$dataModel$childDataClasses[[table_n]]$childDataElements thisTable_df <- data.frame(do.call(rbind, thisTable)) # nested list to dataframe dataType_df <- 
data.frame(do.call(rbind, thisTable_df$dataType)) # nested list to dataframe @@ -100,30 +116,29 @@ compare_csv_outputs <- function(csv_file_1,csv_file_2,json_file) { selectTable_df <- selectTable_df[order(selectTable_df$Label), ] # find the mismatches and ask for consensus decisions - for (datavar in 1:nrow(csv_join)) { - cat("\n \n") + for (datavar in 1:nrow(ses_join)) { # collect user responses - if (csv_join$Domain_code_csv1[datavar] != csv_join$Domain_code_csv2[datavar]){ + if (ses_join$Domain_code_ses1[datavar] != ses_join$Domain_code_ses2[datavar]){ cat("\n\n") - cli_alert_info("Mismatch of DataElement {csv_join$DataElement[datavar]}") + cli_alert_info("Mismatch of DataElement {ses_join$DataElement[datavar]}") cat(paste( - "\nDOMAIN CODE (note) for csv 1 --> ",csv_join$Domain_code_csv1[datavar],'(',csv_join$Note_csv1[datavar],')', - "\nDOMAIN CODE (note) for csv 2 --> ",csv_join$Domain_code_csv2[datavar],'(',csv_join$Note_csv2[datavar],')' + "\nDOMAIN CODE (note) for session 1 --> ",ses_join$Domain_code_ses1[datavar],'(',ses_join$Note_ses1[datavar],')', + "\nDOMAIN CODE (note) for session 2 --> ",ses_join$Domain_code_ses2[datavar],'(',ses_join$Note_ses2[datavar],')' )) cat("\n\n") cli_alert_info("Provide concensus decision for this DataElement:") decision_output <- user_categorisation(selectTable_df$Label[datavar],selectTable_df$Description[datavar],selectTable_df$Type[datavar]) - csv_join$Domain_code_join <- decision_output$decision - csv_join$Note_join <- decision_output$decision_note + ses_join$Domain_code_join[datavar] <- decision_output$decision + ses_join$Note_join[datavar] <- decision_output$decision_note } else { - csv_join$Domain_code_join <- csv_join$Domain_code_csv1[datavar] - csv_join$Note_join <- 'No mismatch' + ses_join$Domain_code_join[datavar] <- ses_join$Domain_code_ses1[datavar] + ses_join$Note_join[datavar] <- 'No mismatch!' 
} } # end of loop for DataElement # save to new csv - output_fname <- paste0("CONCENSUS_LOG_", gsub(" ", "", meta_json$dataModel$label), "_", table_find$table_label[table_n], "_", timestamp_now, ".csv") - utils::write.csv(csv_join, output_fname, row.names = FALSE) + output_fname <- paste0("CONCENSUS_OUTPUT_", gsub(" ", "", meta_json$dataModel$label), "_", table_find$table_label[table_n], "_", timestamp_now, ".csv") + utils::write.csv(ses_join, output_fname, row.names = FALSE) cat("\n") cli_alert_success("Your concensus categorisations have been saved to {output_fname}") } From 1e2c0e7f8cf6b99859eee0480493070e20bc0166 Mon Sep 17 00:00:00 2001 From: Rachael Stickland <50215726+RayStick@users.noreply.github.com> Date: Thu, 4 Jul 2024 09:51:54 +0100 Subject: [PATCH 4/9] update docs --- man/compare_csv_outputs.Rd | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/man/compare_csv_outputs.Rd b/man/compare_csv_outputs.Rd index 97a93a2e..86609cd0 100644 --- a/man/compare_csv_outputs.Rd +++ b/man/compare_csv_outputs.Rd @@ -4,19 +4,29 @@ \alias{compare_csv_outputs} \title{compare_csv_outputs} \usage{ -compare_csv_outputs(csv_file_1, csv_file_2, json_file) +compare_csv_outputs( + csv_file_1a, + csv_file_1b, + csv_file_2a, + csv_file_2b, + json_file +) } \arguments{ -\item{csv_file_1}{CSV output file from running domain_mapping} +\item{csv_file_1a}{CSV log output file from running domain_mapping (session 1)} -\item{csv_file_2}{CSV output file from running domain_mapping (different to csv_file_1)} +\item{csv_file_1b}{CSV file from running domain_mapping (session 1)} -\item{json_file}{The metadata file used when running domain_mapping (should be the same for csv_file_1 and csv_file_2)} +\item{csv_file_2a}{CSV log output file from running domain_mapping (session 2)} + +\item{csv_file_2b}{CSV file from running domain_mapping (session 2)} + +\item{json_file}{The metadata file used when running domain_mapping (should be the same for session 1 
and session 2)} } \value{ -It returns csv_3, with consensus decisions +It returns a csv output, which represents the consensus decisions between session 1 and session 2 } \description{ This function is to be used after running the domain_mapping function. \cr \cr -It compares two csv outputs, finds their differences, and asks for a consensus. \cr \cr +It compares outputs from two sessions, finds their differences, and asks for a consensus. \cr \cr } From 3532aa517cebcba037fbb4470a75ffd48814cb7c Mon Sep 17 00:00:00 2001 From: Rachael Stickland <50215726+RayStick@users.noreply.github.com> Date: Thu, 4 Jul 2024 10:07:17 +0100 Subject: [PATCH 5/9] update to match code changes --- README.md | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 5deb8ea8..7856dc38 100644 --- a/README.md +++ b/README.md @@ -286,32 +286,19 @@ If you want to change your categorisation, enter in the row number (e.g. 8 for B It will then take you through the same process as before, and you can over-write your previous categorisation. -All finished! Take a look at the csv log output: +All finished! Take a look at the outputs: ``` -✔ Your final categorisations have been saved to LOG_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05_14-37-36.csv +✔ Your final categorisations have been saved: +OUTPUT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05_14-37-36.csv +✔ Your session log has been saved: +LOG_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05_14-37-36.csv +✔ A summary plot has been saved: +PLOT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05_14-37-36.png ``` -There is an accompanying png output saved which plots the count of domain codes for each table: - -``` -✔ A summary plot has been saved to PLOT_NationalCommunityChildHealthDatabase(NCCHD)_CHILD_2024-04-05_14-37-36.png -``` - -#### Output - -The output of your decisions will be saved to a csv file. 
-The csv file name includes the dataset, table, and date stamp. -This csv file, in addition to what is shown on the console, contains: - -- user initials (from user input) -- metadata version (from json) -- date time stamp the metadata was last updated (from json) -- dataset name (from json) - -Intended use-cases: -- Compare categorisations across researchers (see the function [R/compare_csv_outputs.R](R/compare_csv_outputs.R)) -- Use as an input in later analysis steps to filter variables and visualise how they map to research domains +The OUTPUT csv contains the categorisations you made. The LOG csv contains information about the session as a whole, including various metadata. +These two csv files contain the same timestamp column. The PLOT png file saves a simple plot displaying the count of domain codes for that table. ### Using your own input files @@ -341,9 +328,15 @@ The lookup file: - a [default lookup file](dataraw/look_up.csv) is used by the domain_mapping function - optional: a csv can be created by the user (using the same format as the default) and provided as the input - - the lookup file makes autocategorisations intended for variables that come up regularly in health datasets (e.g. IDs and demographics) + - the lookup file makes auto-categorisations intended for variables that come up regularly in health datasets (e.g. IDs and demographics) - the lookup file only works for 1:1 mappings right now, i.e. the DataElement should only be listed once in the lookup file +### Potential use-cases for the output files + +The csv output file containing the categorisation for each data element could be used as an input in later analysis steps to filter variables and visualise how each variable maps to research domains of interest. + +Categorisations across researchers can be compared by using the function [R/compare_csv_outputs.R](R/compare_csv_outputs.R). Type `?compare_csv_outputs` to read the manual on how to run this function. 
In brief, it compares outputs from two sessions, finds their differences, and asks for a consensus. + ## License This project is licensed under the GNU General Public License v3.0 - see From 0dc8c36eb39748bfe943dc3a8320b1a36c3b7b45 Mon Sep 17 00:00:00 2001 From: Rachael Stickland <50215726+RayStick@users.noreply.github.com> Date: Thu, 4 Jul 2024 14:19:56 +0100 Subject: [PATCH 6/9] validate user input --- R/domain_mapping.R | 6 +++--- R/user_categorisation.R | 30 +++++++++++++++++++++++++----- man/user_categorisation.Rd | 4 +++- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/R/domain_mapping.R b/R/domain_mapping.R index ebe2dd9a..9cb50155 100755 --- a/R/domain_mapping.R +++ b/R/domain_mapping.R @@ -214,7 +214,7 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = Output <- rbind(Output,this_Output) } else { # collect user responses - decision_output <- user_categorisation(selectTable_df$Label[datavar],selectTable_df$Description[datavar],selectTable_df$Type[datavar]) + decision_output <- user_categorisation(selectTable_df$Label[datavar],selectTable_df$Description[datavar],selectTable_df$Type[datavar],max(Code$Code)) # input user responses into output this_Output <- row_Output this_Output[nrow(this_Output) + 1 , ] <- NA @@ -254,7 +254,7 @@ domain_mapping <- function(json_file = NULL, domain_file = NULL, look_up_file = for (datavar_auto in unique(auto_row)) { # collect user responses - decision_output <- user_categorisation(selectTable_df$Label[datavar_auto],selectTable_df$Description[datavar_auto],selectTable_df$Type[datavar_auto]) + decision_output <- user_categorisation(selectTable_df$Label[datavar_auto],selectTable_df$Description[datavar_auto],selectTable_df$Type[datavar_auto],max(Code$Code)) # input user responses into output Output$Domain_code[datavar_auto] <- decision_output$decision Output$Note[datavar_auto] <- decision_output$decision_note @@ -295,7 +295,7 @@ domain_mapping <- function(json_file = NULL, domain_file 
= NULL, look_up_file = for (datavar_not_auto in unique(not_auto_row)) { # collect user responses - decision_output <- user_categorisation(selectTable_df$Label[datavar_not_auto],selectTable_df$Description[datavar_not_auto],selectTable_df$Type[datavar_not_auto]) + decision_output <- user_categorisation(selectTable_df$Label[datavar_not_auto],selectTable_df$Description[datavar_not_auto],selectTable_df$Type[datavar_not_auto],max(Code$Code)) # input user responses into output Output$Domain_code[datavar_not_auto] <- decision_output$decision Output$Note[datavar_not_auto] <- decision_output$decision_note diff --git a/R/user_categorisation.R b/R/user_categorisation.R index d275067b..567846f9 100644 --- a/R/user_categorisation.R +++ b/R/user_categorisation.R @@ -7,10 +7,11 @@ #' @param data_element Name of the variable #' @param data_desc Description of the variable #' @param data_type Data type of the variable +#' @param domain_code_max Max code in the domain list (0-3 auto included, then N included via domain_file) #' @return It returns a list containing the decision and decision note #' @export -user_categorisation <- function(data_element,data_desc,data_type) { +user_categorisation <- function(data_element,data_desc,data_type,domain_code_max) { first_run = TRUE go_back = '' @@ -25,21 +26,40 @@ user_categorisation <- function(data_element,data_desc,data_type) { "\n\nDATA TYPE -----> ", data_type, "\n" )) - # ask user for categorisation + # ask user for categorisation: decision <- "" + validated = FALSE cat("\n \n") - while (decision == "") { + + while (decision == "" | validated == FALSE) { decision <- readline("Categorise data element into domain(s). E.g. 
3 or 3,4: ") + + # validate input given by user + decision_int <- as.integer(unlist(strsplit(decision,","))) + decision_int_NA <- any(is.na((decision_int))) + decision_int_max <- max(decision_int,na.rm=TRUE) + decision_int_min <- min(decision_int,na.rm=TRUE) + if (decision_int_NA == TRUE | decision_int_max > domain_code_max | decision_int_min < 0){ + cli_alert_warning("Formatting is invalid or integer out of range. Provide one integer or a comma seperated list of integers.") + validated = FALSE} + else { + validated = TRUE + # standardize output + decision_int <- sort(decision_int) + decision <- paste(decision_int, collapse = ',') + } + } - # ask user for note on categorisation + # ask user for note on categorisation: + cat("\n \n") decision_note <- readline("Categorisation note (or press enter to continue): ") while (go_back != "Y" & go_back != "y" & go_back != "N" & go_back != "n") { cat("\n \n") - go_back <- readline(prompt = "Re-do last categorisation? (y/n): ") + go_back <- readline(prompt = paste0("Respone to be saved is '",decision,"'. Would you like to re-do? 
(y/n): ")) } first_run = FALSE diff --git a/man/user_categorisation.Rd b/man/user_categorisation.Rd index d7e64427..13b7b46d 100644 --- a/man/user_categorisation.Rd +++ b/man/user_categorisation.Rd @@ -4,7 +4,7 @@ \alias{user_categorisation} \title{user_categorisation} \usage{ -user_categorisation(data_element, data_desc, data_type) +user_categorisation(data_element, data_desc, data_type, domain_code_max) } \arguments{ \item{data_element}{Name of the variable} @@ -12,6 +12,8 @@ user_categorisation(data_element, data_desc, data_type) \item{data_desc}{Description of the variable} \item{data_type}{Data type of the variable} + +\item{domain_code_max}{Max code in the domain list (0-3 auto included, then N included via domain_file)} } \value{ It returns a list containing the decision and decision note From b4c7c12f366bb0b90b8d3101c24b1bccb93bf76c Mon Sep 17 00:00:00 2001 From: Rachael Stickland <50215726+RayStick@users.noreply.github.com> Date: Thu, 4 Jul 2024 14:35:00 +0100 Subject: [PATCH 7/9] correct sp mistake & update README --- R/user_categorisation.R | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/user_categorisation.R b/R/user_categorisation.R index 567846f9..027eb5d6 100644 --- a/R/user_categorisation.R +++ b/R/user_categorisation.R @@ -59,7 +59,7 @@ user_categorisation <- function(data_element,data_desc,data_type,domain_code_max while (go_back != "Y" & go_back != "y" & go_back != "N" & go_back != "n") { cat("\n \n") - go_back <- readline(prompt = paste0("Respone to be saved is '",decision,"'. Would you like to re-do? (y/n): ")) + go_back <- readline(prompt = paste0("Response to be saved is '",decision,"'. Would you like to re-do? (y/n): ")) } first_run = FALSE diff --git a/README.md b/README.md index 7856dc38..0f1109c9 100644 --- a/README.md +++ b/README.md @@ -231,7 +231,7 @@ A note can be included to explain why a categorisation has been made. 
Or press e You have the option to re-do the categorisation you just made, by replying 'y' to the question: ``` -Re-do last categorisation? (y/n): y +Response to be saved is '7'. Would you like to re-do? (y/n): y ``` After completing 20, it will then ask you to review the auto-categorisations it made. From 9cad2f6a43126662ec250356478b5d25bbdfb2db Mon Sep 17 00:00:00 2001 From: Rachael Stickland <50215726+RayStick@users.noreply.github.com> Date: Mon, 8 Jul 2024 13:34:17 +0100 Subject: [PATCH 8/9] correct error & simplify inputs --- R/compare_csv_outputs.R | 38 +++++++++++++++++++++----------------- man/compare_csv_outputs.Rd | 20 ++++++++++---------- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/R/compare_csv_outputs.R b/R/compare_csv_outputs.R index 9442d3e8..10609b95 100644 --- a/R/compare_csv_outputs.R +++ b/R/compare_csv_outputs.R @@ -3,35 +3,31 @@ #' This function is to be used after running the domain_mapping function. \cr \cr #' It compares outputs from two sessions, finds their differences, and asks for a consensus. \cr \cr #' -#' @param csv_file_1a CSV log output file from running domain_mapping (session 1) -#' @param csv_file_1b CSV file from running domain_mapping (session 1) -#' @param csv_file_2a CSV log output file from running domain_mapping (session 2) -#' @param csv_file_2b CSV file from running domain_mapping (session 2) -#' @param json_file The metadata file used when running domain_mapping (should be the same for session 1 and session 2) +#' @param session_dir This directory should contain 2 csv files for each session (LOG_ and OUTPUT_), 4 csv files in total. +#' @param session1_base Base file name for session 1 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-05_16-07-38.599493' +#' @param session2_base Base file name for session 2 e.g. 
'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-08_12-03-30.429336' +#' @param json_file The full path to the metadata file used when running domain_mapping (should be the same for session 1 and session 2) +#' @param domain_file The full path to the domain file used when running domain_mapping (should be the same for session 1 and session 2) #' @return It returns a csv output, which represents the consensus decisions between session 1 and session 2 #' @importFrom dplyr left_join select join_by #' @export -compare_csv_outputs <- function(csv_file_1a,csv_file_1b,csv_file_2a,csv_file_2b,json_file) { +compare_csv_outputs <- function(session_dir,session1_base,session2_base,json_file,domain_file) { timestamp_now <- gsub(" ", "_", Sys.time()) timestamp_now <- gsub(":", "-", timestamp_now) - # read in the log input files: + # read in the input files: - csv_1a <- read.csv(csv_file_1a) - csv_2a <- read.csv(csv_file_2a) - csv_1b <- read.csv(csv_file_1b) - csv_2b <- read.csv(csv_file_2b) + csv_1a <- read.csv(paste0(session_dir,'/','LOG_',session1_base,'.csv')) + csv_2a <- read.csv(paste0(session_dir,'/','LOG_',session2_base,'.csv')) + csv_1b <- read.csv(paste0(session_dir,'/','OUTPUT_',session1_base,'.csv')) + csv_2b <- read.csv(paste0(session_dir,'/','OUTPUT_',session2_base,'.csv')) meta_json <- rjson::fromJSON(file = json_file) + domains <- read.csv(domain_file, header = FALSE) # check the session csvs can be compared to each other and to the json (min requirements): - if (csv_1a$DomainListDesc[1] != csv_2a$DomainListDesc[1]){ - cat("\n\n") - cli_alert_danger("Cannot compare session 1 and 2: different domain lists") - stop()} - if (csv_1a$Dataset[1] != csv_2a$Dataset[1]){ cat("\n\n") cli_alert_danger("Cannot compare session 1 and 2: different datasets") @@ -84,6 +80,14 @@ compare_csv_outputs <- function(csv_file_1a,csv_file_1b,csv_file_2a,csv_file_2b, cli_alert_success("Comparing session 1 and session 2") cli_alert_success("Session 1 created by 
{csv_1a$Initials[1]} and session 2 created by {csv_2a$Initials[1]}") + # Present domains plots panel for user's reference (as in domain_mapping) + colnames(domains)[1] = "Domain Name" + graphics::plot.new() + domains_extend <- rbind(c("*NO MATCH / UNSURE*"), c("*METADATA*"), c("*ID*"), c("*DEMOGRAPHICS*"), domains) + Code <- data.frame(Code = 0:(nrow(domains_extend) - 1)) + Domain_table <- tableGrob(cbind(Code,domains_extend),rows = NULL,theme = ttheme_default()) + grid.arrange(Domain_table,nrow=1,ncol=1) + # join csv_1b and csv_2b in order to compare ses_join <- left_join(csv_1b,csv_2b,suffix = c("_ses1","_ses2"),join_by(DataElement)) ses_join$Domain_code_join <- NA @@ -127,7 +131,7 @@ compare_csv_outputs <- function(csv_file_1a,csv_file_1b,csv_file_2a,csv_file_2b, )) cat("\n\n") cli_alert_info("Provide concensus decision for this DataElement:") - decision_output <- user_categorisation(selectTable_df$Label[datavar],selectTable_df$Description[datavar],selectTable_df$Type[datavar]) + decision_output <- user_categorisation(selectTable_df$Label[datavar],selectTable_df$Description[datavar],selectTable_df$Type[datavar],max(Code$Code)) ses_join$Domain_code_join[datavar] <- decision_output$decision ses_join$Note_join[datavar] <- decision_output$decision_note } else { diff --git a/man/compare_csv_outputs.Rd b/man/compare_csv_outputs.Rd index 86609cd0..018be309 100644 --- a/man/compare_csv_outputs.Rd +++ b/man/compare_csv_outputs.Rd @@ -5,23 +5,23 @@ \title{compare_csv_outputs} \usage{ compare_csv_outputs( - csv_file_1a, - csv_file_1b, - csv_file_2a, - csv_file_2b, - json_file + session_dir, + session1_base, + session2_base, + json_file, + domain_file ) } \arguments{ -\item{csv_file_1a}{CSV log output file from running domain_mapping (session 1)} +\item{session_dir}{This directory should contain 2 csv files for each session (LOG_ and OUTPUT_), 4 csv files in total.} -\item{csv_file_1b}{CSV file from running domain_mapping (session 1)} +\item{session1_base}{Base file name 
for session 1 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-05_16-07-38.599493'} -\item{csv_file_2a}{CSV log output file from running domain_mapping (session 2)} +\item{session2_base}{Base file name for session 2 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-08_12-03-30.429336'} -\item{csv_file_2b}{CSV file from running domain_mapping (session 2)} +\item{json_file}{The full path to the metadata file used when running domain_mapping (should be the same for session 1 and session 2)} -\item{json_file}{The metadata file used when running domain_mapping (should be the same for session 1 and session 2)} +\item{domain_file}{The full path to the domain file used when running domain_mapping (should be the same for session 1 and session 2)} } \value{ It returns a csv output, which represents the consensus decisions between session 1 and session 2 From 7fac9ed2037f069f6ca9621e4f40f64d876532ed Mon Sep 17 00:00:00 2001 From: Rachael Stickland <50215726+RayStick@users.noreply.github.com> Date: Mon, 8 Jul 2024 13:38:23 +0100 Subject: [PATCH 9/9] rename --- NAMESPACE | 2 +- R/{compare_csv_outputs.R => compare_sessions.R} | 6 +++--- README.md | 2 +- man/{compare_csv_outputs.Rd => compare_sessions.Rd} | 12 ++++++------ 4 files changed, 11 insertions(+), 11 deletions(-) rename R/{compare_csv_outputs.R => compare_sessions.R} (97%) rename man/{compare_csv_outputs.Rd => compare_sessions.Rd} (79%) diff --git a/NAMESPACE b/NAMESPACE index 1aded647..21344367 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,6 @@ # Generated by roxygen2: do not edit by hand -export(compare_csv_outputs) +export(compare_sessions) export(domain_mapping) export(user_categorisation) import(cli) diff --git a/R/compare_csv_outputs.R b/R/compare_sessions.R similarity index 97% rename from R/compare_csv_outputs.R rename to R/compare_sessions.R index 10609b95..10da6112 100644 --- a/R/compare_csv_outputs.R +++ b/R/compare_sessions.R @@ -1,7 +1,7 @@ -#' compare_csv_outputs 
+#' compare_sessions #' #' This function is to be used after running the domain_mapping function. \cr \cr -#' It compares outputs from two sessions, finds their differences, and asks for a consensus. \cr \cr +#' It compares csv outputs from two sessions, finds their differences, and asks for a consensus. \cr \cr #' #' @param session_dir This directory should contain 2 csv files for each session (LOG_ and OUTPUT_), 4 csv files in total. #' @param session1_base Base file name for session 1 e.g. 'NationalCommunityChildHealthDatabase(NCCHD)_BLOOD_TEST_2024-07-05_16-07-38.599493' @@ -12,7 +12,7 @@ #' @importFrom dplyr left_join select join_by #' @export -compare_csv_outputs <- function(session_dir,session1_base,session2_base,json_file,domain_file) { +compare_sessions <- function(session_dir,session1_base,session2_base,json_file,domain_file) { timestamp_now <- gsub(" ", "_", Sys.time()) timestamp_now <- gsub(":", "-", timestamp_now) diff --git a/README.md b/README.md index 0f1109c9..76884e83 100644 --- a/README.md +++ b/README.md @@ -335,7 +335,7 @@ The lookup file: The csv output file containing the categorisation for each data element could be used as an input in later analysis steps to filter variables and visualise how each variable maps to research domains of interest. -Categorisations across researchers can be compared by using the function [R/compare_csv_outputs.R](R/compare_csv_outputs.R). Type `?compare_csv_outputs` to read the manual on how to run this function. In brief, it compares outputs from two sessions, finds their differences, and asks for a consensus. +Categorisations across researchers can be compared by using the function [R/compare_sessions.R](R/compare_sessions.R). Type `?compare_sessions` to read the manual on how to run this function. In brief, it compares csv outputs from two sessions, finds their differences, and asks for a consensus. 
## License diff --git a/man/compare_csv_outputs.Rd b/man/compare_sessions.Rd similarity index 79% rename from man/compare_csv_outputs.Rd rename to man/compare_sessions.Rd index 018be309..568114dd 100644 --- a/man/compare_csv_outputs.Rd +++ b/man/compare_sessions.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/compare_csv_outputs.R -\name{compare_csv_outputs} -\alias{compare_csv_outputs} -\title{compare_csv_outputs} +% Please edit documentation in R/compare_sessions.R +\name{compare_sessions} +\alias{compare_sessions} +\title{compare_sessions} \usage{ -compare_csv_outputs( +compare_sessions( session_dir, session1_base, session2_base, @@ -28,5 +28,5 @@ It returns a csv output, which represents the consensus decisions between sessio } \description{ This function is to be used after running the domain_mapping function. \cr \cr -It compares outputs from two sessions, finds their differences, and asks for a consensus. \cr \cr +It compares csv outputs from two sessions, finds their differences, and asks for a consensus. \cr \cr }