Commit

Merge pull request #41 from elpaco-escience/update_reports
Update reporting stats to use basic talkr columns
bvreede authored Dec 20, 2023
2 parents f8cd89a + 18f5c7c commit 3d23d35
Showing 21 changed files with 425 additions and 279 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
@@ -29,5 +29,7 @@ Imports:
stats,
stringx,
tidyr,
tidyselect,
tibble,
viridis
Config/testthat/edition: 3
2 changes: 1 addition & 1 deletion NAMESPACE
@@ -7,7 +7,7 @@ export(geom_turn)
export(GeomTurn)
export(init)
export(inspect_language)
export(report_summaries)
export(report_stats)
export(theme_turnPlot)
import(ggplot2)
import(viridis)
29 changes: 29 additions & 0 deletions R/check.R
@@ -0,0 +1,29 @@
#' Check the presence of necessary columns in a dataset
#'
#' @param data dataset to check
#' @param columns a vector of column names that must be present
#'
#' @return nothing, but throws an error if a column is missing
check_columns <- function(data, columns){
for (column in columns){
if(!column %in% colnames(data)){
stop(paste0("Column `",column,"` was not found in the dataset."))
}
}
}


#' Check the presence of talkr-workflow columns in the dataset.
#'
#' Uses check_columns() to check for:
#' - begin
#' - end
#' - participant
#' - utterance
#' - source
#'
#' @param data dataset to check
check_talkr <- function(data) {
required_cols <- c("begin", "end", "participant", "utterance", "source")
check_columns(data, required_cols)
}
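
A quick sketch of how these internal checks behave (hypothetical usage, not part of this commit; the data frame `df` is invented for the example, and the helpers are called as if loaded in the package namespace):

# Sketch: check_talkr() passes silently when all talkr columns are present,
# check_columns() stops with a descriptive error when one is missing.
df <- data.frame(source = "conv1", begin = 0, end = 1200,
                 participant = "A", utterance = "hello")
check_talkr(df)
check_columns(df, c("begin", "language"))
# Error: Column `language` was not found in the dataset.
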
36 changes: 30 additions & 6 deletions R/init.R
@@ -6,28 +6,52 @@
#' Initializing a talkr dataset is the first step in the talkr workflow.
#'
#' @param data A dataframe object
#' @param source The column name identifying the conversation source (e.g. a filename); used as the unique conversation ID
#' @param begin The column name with the begin time of the utterance (in milliseconds)
#' @param end The column name with the end time of the utterance (in milliseconds)
#' @param participant The column name with the participant who produced the utterance
#' @param utterance The column name with the utterance itself
#' @param format_timestamps The format of the timestamps in the begin and end columns. Default is "ms", which expects milliseconds. `\%H:\%M:\%OS` will convert e.g. 00:00:00.010 to milliseconds (10). See `?strptime` for more format examples.
#'
#' @return A dataframe object with columns needed for the talkr workflow
#' @export
init <- function(data,
source = "source",
begin = "begin",
end = "end",
participant = "participant",
utterance = "utterance"){
utterance = "utterance",
format_timestamps = "ms"){

# verify that column names declared actually exist in the dataset
names_required <- c(begin, end, participant, utterance)
names_required <- c(source, begin, end, participant, utterance)
check_columns(data, names_required)

data <- data |>
dplyr::rename(begin = begin,
end = end,
participant = participant,
utterance = utterance)
dplyr::rename(source = tidyselect::all_of(source),
begin = tidyselect::all_of(begin),
end = tidyselect::all_of(end),
participant = tidyselect::all_of(participant),
utterance = tidyselect::all_of(utterance))

# convert timestamps if necessary
if (format_timestamps != "ms"){
data$begin <- timestamp_to_milliseconds(data$begin, format = format_timestamps)
data$end <- timestamp_to_milliseconds(data$end, format = format_timestamps)
} else {
data$begin <- as.numeric(data$begin)
data$end <- as.numeric(data$end)
}

return(data)
}


timestamp_to_milliseconds <- function(timestamp, format = "%H:%M:%OS"){
timestamp <- strptime(timestamp, format = format)
seconds <- as.numeric(strftime(timestamp, format = "%OS3"))
minutes <- as.numeric(strftime(timestamp, format = "%M"))
hours <- as.numeric(strftime(timestamp, format = "%H"))
time_in_ms <- (seconds + minutes*60 + hours*60*60) * 1000
return(time_in_ms)
}
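
As a sketch of how the new `source` and `format_timestamps` arguments are meant to be used (illustrative only; the input columns in `raw` are invented for the example):

# Sketch: initialize a talkr dataset from columns with other names and
# "%H:%M:%OS"-style timestamps; begin/end are converted to milliseconds.
raw <- data.frame(file = "conv1.eaf",
                  start = c("00:00:00.010", "00:00:01.250"),
                  stop = c("00:00:01.000", "00:00:02.900"),
                  speaker = c("A", "B"),
                  text = c("hi", "hello"))
conv <- init(raw,
             source = "file", begin = "start", end = "stop",
             participant = "speaker", utterance = "text",
             format_timestamps = "%H:%M:%OS")
conv$begin  # 10 1250 (milliseconds)
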
2 changes: 1 addition & 1 deletion R/inspect_language.R
@@ -55,7 +55,7 @@ inspect_language <- function(data_conv,
}

# print summary stats
report_summaries(data_conv, lang, allsources)
report_stats(data_conv)
}


104 changes: 104 additions & 0 deletions R/report.R
@@ -0,0 +1,104 @@
#' Report corpus-level and conversation-level statistics
#'
#' Basic conversation statistics are reported to the console:
#' - Corpus-level statistics, reporting on the dataset as a whole;
#' - Conversation-level statistics, reporting per source.
#'
#' The input for this function must be a `talkr` dataset, containing
#' the columns `source`, `participant`, `begin`, and `end`. Time stamps in the
#' columns `begin` and `end` must be in milliseconds.
#' To easily transform a dataset to a `talkr` dataset, consult `talkr::init()`.
#'
#' @param data talkr dataset
#'
#' @export
report_stats <- function(data) {
summary <- report(data)

# combined
combined <- report_combined(summary)
print_single_line_table(header = "Corpus-level statistics", table = combined)

# basics
basics <- report_basics(summary)
print_summary(header = "Conversation-level statistics (per source)", table = basics)

# turns
turns <- report_turns(summary)
print_summary(header = "Turn statistics (per source)", table = turns)

}

report <- function(data) {
# check if data is a talkr object
check_talkr(data)

# report summary
summary <- data |>
dplyr::mutate(turn_duration = .data$end - .data$begin) |>
dplyr::group_by(.data$source) |>
dplyr::summarize(start = min_na(.data$begin),
end = max_na(.data$end),
n_turns = dplyr::n(),
shortest_turn = min_na(.data$turn_duration),
longest_turn = max_na(.data$turn_duration),
avg_turn = mean_na(.data$turn_duration),
n_participants = dplyr::n_distinct(.data$participant),
totaltime = .data$end - .data$start,
total_speaking = sum_na(.data$turn_duration)
)
return(summary)
}


report_combined <- function(summary) {
combined <- summary |>
dplyr::summarize(`nr of sources` = dplyr::n(),
`nr of participants` = sum_na(.data$n_participants),
`nr of turns` = sum_na(.data$n_turns),
`mean turn duration (ms)` = round(sum_na(.data$total_speaking) / sum_na(.data$n_turns), 0),
`turns per hour` = round(sum_na(.data$n_turns) / (sum_na(.data$totaltime) / (60 * 60 * 1000)), 0),
`total recording (min)` = round((sum_na(.data$totaltime) / (60 * 1000)), 1),
`total recording (hours)` = round(sum_na(.data$totaltime) / (60 * 60 * 1000), 2),
`total speaking time (min)` = round(sum_na(.data$total_speaking) / (60 * 1000), 1),
`total speaking time (hours)` = round(sum_na(.data$total_speaking) / (60 * 60 * 1000), 2)
)
}

report_basics <- function(summary) {
basics <- tibble::tibble(source = summary$source)
basics$`nr of participants` = summary$n_participants
basics$`total recording (ms)` = summary$totaltime
basics$`total recording (min)` = round(summary$totaltime / (60 * 1000), 1)
basics$`total speaking time (ms)` = summary$total_speaking
basics$`total speaking time (min)` = round(summary$total_speaking / (60 * 1000), 1)
return(basics)
}

report_turns <- function(summary) {
turns <- tibble::tibble(source = summary$source)
turns$`nr of turns` = summary$n_turns
turns$`mean turn duration (ms)` = round(summary$avg_turn, 0)
turns$`shortest turn (ms)` = summary$shortest_turn
turns$`longest turn (ms)` = summary$longest_turn
turns$`turns per hour` = round(summary$n_turns / (summary$totaltime / (60 * 60 * 1000)), 0)
return(turns)
}


print_summary <- function(header, table){
cat(paste("###", header))
print(knitr::kable(table))
cat("\n")
}

print_single_line_table <- function(header, table){
cat(paste("###", header, "\n\n"))
for (n in names(table)){
item <- table[1,n]
cat(paste(n, item, sep = ": "))
cat("\n")
}
cat("\n")
}
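
For context, calling the new entry point on an initialized talkr dataset prints the three blocks above; a minimal sketch (the data frame is made up, so the reported values are only illustrative):

# Sketch: report_stats() on a tiny invented dataset.
conv <- data.frame(source = "conv1",
                   participant = c("A", "B", "A"),
                   utterance = c("hi", "hello", "how are you"),
                   begin = c(0, 1200, 2600),
                   end = c(1000, 2400, 3900))
report_stats(conv)
# ### Corpus-level statistics
#
# nr of sources: 1
# nr of participants: 2
# nr of turns: 3
# ... (remaining corpus totals)
# followed by the per-source tables rendered with knitr::kable()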

11 changes: 10 additions & 1 deletion R/small_helper_set.R
@@ -1,8 +1,17 @@
`%notin%` <- function(x,y) !(x %in% y)
#`%notin%` <- function(x,y) !(x %in% y)
mean.na <- function(x) mean(x, na.rm = T)
median.na <- function(x) stats::median(x, na.rm= T)
min.na <- function(x) min(x, na.rm = T)
max.na <- function(x) max(x, na.rm = T)
sd.na <- function(x) stats::sd(x, na.rm = T)
sum.na <- function(x) sum(x, na.rm = T)


`%notin%` <- function(x,y) !(x %in% y)
mean_na <- function(x) mean(x, na.rm = T)
median_na <- function(x) stats::median(x, na.rm= T)
min_na <- function(x) min(x, na.rm = T)
max_na <- function(x) max(x, na.rm = T)
sd_na <- function(x) stats::sd(x, na.rm = T)
sum_na <- function(x) sum(x, na.rm = T)
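
The underscore variants mirror the existing dot-named helpers, simply ignoring NA values; for instance (illustrative; these helpers are internal):

mean_na(c(1, NA, 3))  # 2
sum_na(c(1, NA, 3))   # 4
max_na(c(1, NA, 3))   # 3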

14 changes: 1 addition & 13 deletions R/source_quality.R
@@ -37,16 +37,4 @@ check_quality <- function(data, source){
}


#' Check the presence of necessary columns in a dataset
#'
#' @param data dataset to check
#' @param columns a vector of column names that must be present
#'
#' @return nothing, but throws an error if a column is missing
check_columns <- function(data, columns){
for (column in columns){
if(!column %in% colnames(data)){
stop(paste0("Column `",column,"` was not found in the dataset."))
}
}
}

105 changes: 0 additions & 105 deletions R/summaries.R

This file was deleted.

2 changes: 1 addition & 1 deletion man/check_columns.Rd

19 changes: 19 additions & 0 deletions man/check_talkr.Rd
