-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #41 from elpaco-escience/update_reports
Update reporting stats to use basic talkr columns
- Loading branch information
Showing
21 changed files
with
425 additions
and
279 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,5 +29,7 @@ Imports: | |
stats, | ||
stringx, | ||
tidyr, | ||
tidyselect, | ||
tibble, | ||
viridis | ||
Config/testthat/edition: 3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#' Check the presence of necessary columns in a dataset | ||
#' | ||
#' @param data dataset to check | ||
#' @param columns a vector of column names that must be present | ||
#' | ||
#' @return nothing, but throws an error if a column is missing | ||
check_columns <- function(data, columns){ | ||
for (column in columns){ | ||
if(!column %in% colnames(data)){ | ||
stop(paste0("Column `",column,"` was not found in the dataset.")) | ||
} | ||
} | ||
} | ||
|
||
|
||
#' Check the presence of talkr-workflow columns in the dataset. | ||
#' | ||
#' Uses check_columns() to check for: | ||
#' - begin | ||
#' - end | ||
#' - participant | ||
#' - utterance | ||
#' - source | ||
#' | ||
#' @param data dataset to check | ||
check_talkr <- function(data) { | ||
required_cols <- c("begin", "end", "participant", "utterance", "source") | ||
check_columns(data, required_cols) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
#' Report corpus-level and conversation-level statistics | ||
#' | ||
#' Basic conversation statistics are reported to the console: | ||
#' - Corpus-level statistics, reporting on the dataset as a whole; | ||
#' - Conversation-level statistics, reporting per source. | ||
#' | ||
#' The input for this function must be a `talkr` dataset, containing | ||
#' the columns `source`, `participant`, `begin`, and `end`. Time stamps in the | ||
#' columns `begin` and `end` must be in milliseconds. | ||
#' To easily transform a dataset to a `talkr` dataset, consult `talkr::init()`. | ||
#' | ||
#' @param data talkr dataset | ||
#' | ||
#' @export | ||
report_stats <- function(data) { | ||
summary <- report(data) | ||
|
||
# combined | ||
combined <- report_combined(summary) | ||
print_single_line_table(header = "Corpus-level statistics", table = combined) | ||
|
||
# basics | ||
basics <- report_basics(summary) | ||
print_summary(header = "Conversation-level statistics (per source)", table = basics) | ||
|
||
# turns | ||
turns <- report_turns(summary) | ||
print_summary(header = "Turn statistics (per source)", table = turns) | ||
|
||
} | ||
|
||
report <- function(data) { | ||
# check if data is a talkr object | ||
check_talkr(data) | ||
|
||
# report summary | ||
summary <- data |> | ||
dplyr::mutate(XXX_turn_duration = .data$end - .data$begin) |> | ||
dplyr::group_by(.data$source) |> | ||
dplyr::summarize(start=min.na(.data$begin), | ||
end=max.na(.data$end), | ||
n_turns=dplyr::n(), | ||
shortest_turn=min_na(.data$XXX_turn_duration), | ||
longest_turn=max_na(.data$XXX_turn_duration), | ||
avg_turn=mean_na(.data$XXX_turn_duration), | ||
n_participants=dplyr::n_distinct(.data$participant), | ||
totaltime = .data$end - .data$start, | ||
total_speaking = sum.na(.data$XXX_turn_duration) | ||
) | ||
return(summary) | ||
} | ||
|
||
|
||
report_combined <- function(summary) { | ||
combined <- summary |> | ||
dplyr::summarize(`nr of sources` = dplyr::n(), | ||
`nr of participants` = sum_na(.data$n_participants), | ||
`nr of turns` = sum.na(.data$n_turns), | ||
`mean turn duration (ms)` = round(sum_na(.data$total_speaking) / sum_na(.data$n_turns), 0), | ||
`turns per hour` = round(sum_na(.data$n_turns) / (sum_na(.data$totaltime) / (60 * 60 * 1000)), 0), | ||
`total recording (min)` = round((sum_na(.data$totaltime) / (60 * 1000)), 1), | ||
`total recording (hours)` = round(sum_na(.data$totaltime) / (60 * 60 * 1000), 2), | ||
`total speaking time (min)` = round(sum_na(.data$total_speaking) / (60 * 1000), 1), | ||
`total speaking time (hours)` = round(sum_na(.data$total_speaking) / (60 * 60 * 1000), 2) | ||
) | ||
} | ||
|
||
report_basics <- function(summary) { | ||
basics <- tibble::tibble(source = summary$source) | ||
basics$`nr of participants` = summary$n_participants | ||
basics$`total recording (ms)` = summary$totaltime | ||
basics$`total recording (min)` = round(summary$totaltime / (60 * 1000), 1) | ||
basics$`total speaking time (ms)` = summary$total_speaking | ||
basics$`total speaking time (min)` = round(summary$total_speaking / (60 * 1000), 1) | ||
return(basics) | ||
} | ||
|
||
report_turns <- function(summary) { | ||
turns <- tibble::tibble(source = summary$source) | ||
turns$`nr of turns` = summary$n_turns | ||
turns$`mean turn duration (ms)` = round(summary$avg_turn, 0) | ||
turns$`shortest turn (ms)` = summary$shortest_turn | ||
turns$`longest turn (ms)` = summary$longest_turn | ||
turns$`turns per hour` = round(summary$n_turns / (summary$totaltime / (60 * 60 * 1000)), 0) | ||
return(turns) | ||
} | ||
|
||
|
||
print_summary <- function(header, table){ | ||
cat(paste("###", header)) | ||
print(knitr::kable(table)) | ||
cat("\n") | ||
} | ||
|
||
print_single_line_table <- function(header, table){ | ||
cat(paste("###", header, "\n\n")) | ||
for (n in names(table)){ | ||
item <- table[1,n] | ||
cat(paste(n, item, sep = ": ")) | ||
cat("\n") | ||
} | ||
cat("\n") | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,17 @@ | ||
`%notin%` <- function(x,y) !(x %in% y) | ||
#`%notin%` <- function(x,y) !(x %in% y) | ||
mean.na <- function(x) mean(x, na.rm = T) | ||
median.na <- function(x) stats::median(x, na.rm= T) | ||
min.na <- function(x) min(x, na.rm = T) | ||
max.na <- function(x) max(x, na.rm = T) | ||
sd.na <- function(x) stats::sd(x, na.rm = T) | ||
sum.na <- function(x) sum(x, na.rm = T) | ||
|
||
|
||
`%notin%` <- function(x,y) !(x %in% y) | ||
mean_na <- function(x) mean(x, na.rm = T) | ||
median_na <- function(x) stats::median(x, na.rm= T) | ||
min_na <- function(x) min(x, na.rm = T) | ||
max_na <- function(x) max(x, na.rm = T) | ||
sd_na <- function(x) stats::sd(x, na.rm = T) | ||
sum_na <- function(x) sum(x, na.rm = T) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.