Commit

Merge pull request #41 from elpaco-escience/update_reports
Update reporting stats to use basic talkr columns
bvreede authored Dec 20, 2023
2 parents f8cd89a + 18f5c7c commit 3d23d35
Showing 21 changed files with 425 additions and 279 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
@@ -29,5 +29,7 @@ Imports:
stats,
stringx,
tidyr,
tidyselect,
tibble,
viridis
Config/testthat/edition: 3
2 changes: 1 addition & 1 deletion NAMESPACE
@@ -7,7 +7,7 @@ export(geom_turn)
export(GeomTurn)
export(init)
export(inspect_language)
export(report_summaries)
export(report_stats)
export(theme_turnPlot)
import(ggplot2)
import(viridis)
29 changes: 29 additions & 0 deletions R/check.R
@@ -0,0 +1,29 @@
#' Check the presence of necessary columns in a dataset
#'
#' @param data dataset to check
#' @param columns a vector of column names that must be present
#'
#' @return nothing, but throws an error if a column is missing
check_columns <- function(data, columns){
for (column in columns){
if(!column %in% colnames(data)){
stop(paste0("Column `",column,"` was not found in the dataset."))
}
}
}


#' Check the presence of talkr-workflow columns in the dataset.
#'
#' Uses check_columns() to check for:
#' - begin
#' - end
#' - participant
#' - utterance
#' - source
#'
#' @param data dataset to check
check_talkr <- function(data) {
required_cols <- c("begin", "end", "participant", "utterance", "source")
check_columns(data, required_cols)
}
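
A quick sketch of how these internal checks behave (hypothetical usage, not part of this commit; the data frame `df` is invented for the example, and the helpers are called as if loaded in the package namespace):

# Sketch: check_talkr() passes silently when all talkr columns are present,
# check_columns() stops with a descriptive error when one is missing.
df <- data.frame(source = "conv1", begin = 0, end = 1200,
                 participant = "A", utterance = "hello")
check_talkr(df)
check_columns(df, c("begin", "language"))
# Error: Column `language` was not found in the dataset.
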
36 changes: 30 additions & 6 deletions R/init.R
@@ -6,28 +6,52 @@
#' Initializing a talkr dataset is the first step in the talkr workflow.
#'
#' @param data A dataframe object
#' @param source The column name identifying the conversation source (e.g. a filename); used as the unique conversation ID
#' @param begin The column name with the begin time of the utterance (in milliseconds)
#' @param end The column name with the end time of the utterance (in milliseconds)
#' @param participant The column name with the participant who produced the utterance
#' @param utterance The column name with the utterance itself
#' @param format_timestamps The format of the timestamps in the begin and end columns. Default is "ms", which expects milliseconds. `\%H:\%M:\%OS` will convert e.g. 00:00:00.010 to milliseconds (10). See `?strptime` for more format examples.
#'
#' @return A dataframe object with columns needed for the talkr workflow
#' @export
init <- function(data,
source = "source",
begin = "begin",
end = "end",
participant = "participant",
utterance = "utterance"){
utterance = "utterance",
format_timestamps = "ms"){

# verify that column names declared actually exist in the dataset
names_required <- c(begin, end, participant, utterance)
names_required <- c(source, begin, end, participant, utterance)
check_columns(data, names_required)

data <- data |>
dplyr::rename(begin = begin,
end = end,
participant = participant,
utterance = utterance)
dplyr::rename(source = tidyselect::all_of(source),
begin = tidyselect::all_of(begin),
end = tidyselect::all_of(end),
participant = tidyselect::all_of(participant),
utterance = tidyselect::all_of(utterance))

# convert timestamps if necessary
if (format_timestamps != "ms"){
data$begin <- timestamp_to_milliseconds(data$begin, format = format_timestamps)
data$end <- timestamp_to_milliseconds(data$end, format = format_timestamps)
} else {
data$begin <- as.numeric(data$begin)
data$end <- as.numeric(data$end)
}

return(data)
}


timestamp_to_milliseconds <- function(timestamp, format = "%H:%M:%OS"){
timestamp <- strptime(timestamp, format = format)
seconds <- as.numeric(strftime(timestamp, format = "%OS3"))
minutes <- as.numeric(strftime(timestamp, format = "%M"))
hours <- as.numeric(strftime(timestamp, format = "%H"))
time_in_ms <- (seconds + minutes*60 + hours*60*60) * 1000
return(time_in_ms)
}
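
As a sketch of how the new `source` and `format_timestamps` arguments are meant to be used (illustrative only; the input columns in `raw` are invented for the example):

# Sketch: initialize a talkr dataset from columns with other names and
# "%H:%M:%OS"-style timestamps; begin/end are converted to milliseconds.
raw <- data.frame(file = "conv1.eaf",
                  start = c("00:00:00.010", "00:00:01.250"),
                  stop = c("00:00:01.000", "00:00:02.900"),
                  speaker = c("A", "B"),
                  text = c("hi", "hello"))
conv <- init(raw,
             source = "file", begin = "start", end = "stop",
             participant = "speaker", utterance = "text",
             format_timestamps = "%H:%M:%OS")
conv$begin  # 10 1250 (milliseconds)
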
2 changes: 1 addition & 1 deletion R/inspect_language.R
@@ -55,7 +55,7 @@ inspect_language <- function(data_conv,
}

# print summary stats
report_summaries(data_conv, lang, allsources)
report_stats(data_conv)
}


104 changes: 104 additions & 0 deletions R/report.R
@@ -0,0 +1,104 @@
#' Report corpus-level and conversation-level statistics
#'
#' Basic conversation statistics are reported to the console:
#' - Corpus-level statistics, reporting on the dataset as a whole;
#' - Conversation-level statistics, reporting per source.
#'
#' The input for this function must be a `talkr` dataset, containing
#' the columns `source`, `participant`, `begin`, and `end`. Time stamps in the
#' columns `begin` and `end` must be in milliseconds.
#' To easily transform a dataset to a `talkr` dataset, consult `talkr::init()`.
#'
#' @param data talkr dataset
#'
#' @export
report_stats <- function(data) {
summary <- report(data)

# combined
combined <- report_combined(summary)
print_single_line_table(header = "Corpus-level statistics", table = combined)

# basics
basics <- report_basics(summary)
print_summary(header = "Conversation-level statistics (per source)", table = basics)

# turns
turns <- report_turns(summary)
print_summary(header = "Turn statistics (per source)", table = turns)

}

report <- function(data) {
# check if data is a talkr object
check_talkr(data)

# report summary
summary <- data |>
dplyr::mutate(turn_duration = .data$end - .data$begin) |>
dplyr::group_by(.data$source) |>
dplyr::summarize(start = min_na(.data$begin),
end = max_na(.data$end),
n_turns = dplyr::n(),
shortest_turn = min_na(.data$turn_duration),
longest_turn = max_na(.data$turn_duration),
avg_turn = mean_na(.data$turn_duration),
n_participants = dplyr::n_distinct(.data$participant),
totaltime = .data$end - .data$start,
total_speaking = sum_na(.data$turn_duration)
)
return(summary)
}


report_combined <- function(summary) {
combined <- summary |>
dplyr::summarize(`nr of sources` = dplyr::n(),
`nr of participants` = sum_na(.data$n_participants),
`nr of turns` = sum_na(.data$n_turns),
`mean turn duration (ms)` = round(sum_na(.data$total_speaking) / sum_na(.data$n_turns), 0),
`turns per hour` = round(sum_na(.data$n_turns) / (sum_na(.data$totaltime) / (60 * 60 * 1000)), 0),
`total recording (min)` = round((sum_na(.data$totaltime) / (60 * 1000)), 1),
`total recording (hours)` = round(sum_na(.data$totaltime) / (60 * 60 * 1000), 2),
`total speaking time (min)` = round(sum_na(.data$total_speaking) / (60 * 1000), 1),
`total speaking time (hours)` = round(sum_na(.data$total_speaking) / (60 * 60 * 1000), 2)
)
}

report_basics <- function(summary) {
basics <- tibble::tibble(source = summary$source)
basics$`nr of participants` = summary$n_participants
basics$`total recording (ms)` = summary$totaltime
basics$`total recording (min)` = round(summary$totaltime / (60 * 1000), 1)
basics$`total speaking time (ms)` = summary$total_speaking
basics$`total speaking time (min)` = round(summary$total_speaking / (60 * 1000), 1)
return(basics)
}

report_turns <- function(summary) {
turns <- tibble::tibble(source = summary$source)
turns$`nr of turns` = summary$n_turns
turns$`mean turn duration (ms)` = round(summary$avg_turn, 0)
turns$`shortest turn (ms)` = summary$shortest_turn
turns$`longest turn (ms)` = summary$longest_turn
turns$`turns per hour` = round(summary$n_turns / (summary$totaltime / (60 * 60 * 1000)), 0)
return(turns)
}


print_summary <- function(header, table){
cat(paste("###", header))
print(knitr::kable(table))
cat("\n")
}

print_single_line_table <- function(header, table){
cat(paste("###", header, "\n\n"))
for (n in names(table)){
item <- table[1,n]
cat(paste(n, item, sep = ": "))
cat("\n")
}
cat("\n")
}
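
For context, calling the new entry point on an initialized talkr dataset prints the three blocks above; a minimal sketch (the data frame is made up, so the reported values are only illustrative):

# Sketch: report_stats() on a tiny invented dataset.
conv <- data.frame(source = "conv1",
                   participant = c("A", "B", "A"),
                   utterance = c("hi", "hello", "how are you"),
                   begin = c(0, 1200, 2600),
                   end = c(1000, 2400, 3900))
report_stats(conv)
# ### Corpus-level statistics
#
# nr of sources: 1
# nr of participants: 2
# nr of turns: 3
# ... (remaining corpus totals)
# followed by the per-source tables rendered with knitr::kable()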

11 changes: 10 additions & 1 deletion R/small_helper_set.R
@@ -1,8 +1,17 @@
`%notin%` <- function(x,y) !(x %in% y)
#`%notin%` <- function(x,y) !(x %in% y)
mean.na <- function(x) mean(x, na.rm = T)
median.na <- function(x) stats::median(x, na.rm= T)
min.na <- function(x) min(x, na.rm = T)
max.na <- function(x) max(x, na.rm = T)
sd.na <- function(x) stats::sd(x, na.rm = T)
sum.na <- function(x) sum(x, na.rm = T)


`%notin%` <- function(x,y) !(x %in% y)
mean_na <- function(x) mean(x, na.rm = T)
median_na <- function(x) stats::median(x, na.rm= T)
min_na <- function(x) min(x, na.rm = T)
max_na <- function(x) max(x, na.rm = T)
sd_na <- function(x) stats::sd(x, na.rm = T)
sum_na <- function(x) sum(x, na.rm = T)
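
The underscore variants mirror the existing dot-named helpers, simply ignoring NA values; for instance (illustrative; these helpers are internal):

mean_na(c(1, NA, 3))  # 2
sum_na(c(1, NA, 3))   # 4
max_na(c(1, NA, 3))   # 3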

14 changes: 1 addition & 13 deletions R/source_quality.R
@@ -37,16 +37,4 @@ check_quality <- function(data, source){
}


#' Check the presence of necessary columns in a dataset
#'
#' @param data dataset to check
#' @param columns a vector of column names that must be present
#'
#' @return nothing, but throws an error if a column is missing
check_columns <- function(data, columns){
for (column in columns){
if(!column %in% colnames(data)){
stop(paste0("Column `",column,"` was not found in the dataset."))
}
}
}

105 changes: 0 additions & 105 deletions R/summaries.R

This file was deleted.

2 changes: 1 addition & 1 deletion man/check_columns.Rd

19 changes: 19 additions & 0 deletions man/check_talkr.Rd
