Skip to content

Commit

Permalink
Data explorer: Allow for optional summary stats for dates (#501)
Browse files Browse the repository at this point in the history
* Apply contract changes

* Allow for optional type

* Handle all NA dates nicely

* Make sure Debug is implemented for the error type
  • Loading branch information
dfalbel authored Oct 1, 2024
1 parent c6df0b2 commit 2785a41
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 42 deletions.
20 changes: 10 additions & 10 deletions crates/amalthea/src/comm/data_explorer_comm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -383,38 +383,38 @@ pub struct SummaryStatsString {
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct SummaryStatsDate {
// Diff view: every field below is listed twice. The first declaration of each
// pair is the line removed by this commit, the second is the line that replaces
// it. The change wraps every statistic in `Option` so that a statistic that
// could not be computed is reported as `None` instead of failing the summary.

/// The exact number of distinct values.
/// `None` when the count could not be read from the R-side result.
pub num_unique: i64,
pub num_unique: Option<i64>,

/// Minimum date value as string.
/// `None` when no minimum is available (e.g. every value in the column is NA).
pub min_date: String,
pub min_date: Option<String>,

/// Average date value as string.
/// `None` when no mean is available (e.g. every value in the column is NA).
pub mean_date: String,
pub mean_date: Option<String>,

/// Sample median (50% value) date value as string.
/// `None` when no median is available (e.g. every value in the column is NA).
pub median_date: String,
pub median_date: Option<String>,

/// Maximum date value as string.
/// `None` when no maximum is available (e.g. every value in the column is NA).
pub max_date: String
pub max_date: Option<String>
}

/// SummaryStatsDatetime in Schemas
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
pub struct SummaryStatsDatetime {
/// The exact number of distinct values
pub num_unique: i64,
pub num_unique: Option<i64>,

/// Minimum date value as string
pub min_date: String,
pub min_date: Option<String>,

/// Average date value as string
pub mean_date: String,
pub mean_date: Option<String>,

/// Sample median (50% value) date value as string
pub median_date: String,
pub median_date: Option<String>,

/// Maximum date value as string
pub max_date: String,
pub max_date: Option<String>,

/// Time zone for timestamp with time zone
pub timezone: Option<String>
Expand Down
89 changes: 61 additions & 28 deletions crates/ark/src/data_explorer/summary_stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
//
//

use core::fmt;
use std::collections::HashMap;

use amalthea::comm::data_explorer_comm;
Expand All @@ -24,6 +25,7 @@ use harp::utils::r_names2;
use harp::vector::CharacterVector;
use harp::vector::Vector;
use libr::SEXP;
use stdext::unwrap;

use crate::data_explorer::format::format_string;
use crate::modules::ARK_ENVS;
Expand Down Expand Up @@ -86,8 +88,8 @@ fn summary_stats_string(column: SEXP) -> anyhow::Result<SummaryStatsString> {
let r_stats: HashMap<String, i32> = stats.try_into()?;

Ok(SummaryStatsString {
num_empty: get_stat(&r_stats, "num_empty")? as i64,
num_unique: get_stat(&r_stats, "num_unique")? as i64,
num_empty: get_stat(&r_stats, "num_empty")?,
num_unique: get_stat(&r_stats, "num_unique")?,
})
}

Expand All @@ -96,23 +98,25 @@ fn summary_stats_boolean(column: SEXP) -> anyhow::Result<SummaryStatsBoolean> {
let r_stats: HashMap<String, i32> = stats.try_into()?;

Ok(SummaryStatsBoolean {
true_count: get_stat(&r_stats, "true_count")? as i64,
false_count: get_stat(&r_stats, "false_count")? as i64,
true_count: get_stat(&r_stats, "true_count")?,
false_count: get_stat(&r_stats, "false_count")?,
})
}

/// Builds the [`SummaryStatsDate`] for a date column.
///
/// The statistics are obtained through `call_summary_fn("summary_stats_date", ..)`
/// (presumably the R function of the same name -- confirm against
/// `call_summary_fn`) and converted into a map keyed by statistic name.
///
/// # Errors
/// Returns an error when the summary call or its conversion into the map fails.
/// After this change, a statistic that is missing or cannot be converted no
/// longer produces an error: it is reported as `None`.
//
// Diff view: removed and added lines are interleaved in this block. The lines
// using `?.try_into()?` and `num_unique as i64` are the removed ones; the lines
// using `.ok()` and the bare `num_unique` are their replacements.
fn summary_stats_date(column: SEXP) -> anyhow::Result<SummaryStatsDate> {
let r_stats: HashMap<String, RObject> =
call_summary_fn("summary_stats_date", column)?.try_into()?;

let num_unique: i32 = get_stat(&r_stats, "num_unique")?.try_into()?;
// `.ok()` discards the error from `get_stat`, so both a missing entry and a
// failed conversion end up as `None`.
// NOTE(review): `.and_then(|x| Some(x as i64))` is equivalent to
// `.map(|x| x as i64)` (or `.map(i64::from)`, since i32 -> i64 is lossless).
let num_unique: Option<i64> = get_stat::<i32, RObject>(&r_stats, "num_unique")
.ok()
.and_then(|x| Some(x as i64));

Ok(SummaryStatsDate {
min_date: get_stat(&r_stats, "min_date")?.try_into()?,
mean_date: get_stat(&r_stats, "mean_date")?.try_into()?,
median_date: get_stat(&r_stats, "median_date")?.try_into()?,
max_date: get_stat(&r_stats, "max_date")?.try_into()?,
num_unique: num_unique as i64,
// Each statistic is optional: a failed lookup/conversion becomes `None`
// rather than aborting the whole summary.
min_date: get_stat(&r_stats, "min_date").ok(),
mean_date: get_stat(&r_stats, "mean_date").ok(),
median_date: get_stat(&r_stats, "median_date").ok(),
max_date: get_stat(&r_stats, "max_date").ok(),
num_unique,
})
}

Expand All @@ -122,18 +126,21 @@ fn summary_stats_datetime(column: SEXP) -> anyhow::Result<SummaryStatsDatetime>
let r_stats: HashMap<String, RObject> =
call_summary_fn("summary_stats_date", column)?.try_into()?;

let num_unique: i32 = get_stat(&r_stats, "num_unique")?.try_into()?;
let num_unique: Option<i64> = get_stat::<i32, RObject>(&r_stats, "num_unique")
.ok()
.and_then(|x| Some(x as i64));

let timezone: Option<String> = RFunction::from("summary_stats_get_timezone")
.add(column)
.call_in(ARK_ENVS.positron_ns)?
.try_into()?;

Ok(SummaryStatsDatetime {
min_date: get_stat(&r_stats, "min_date")?.try_into()?,
mean_date: get_stat(&r_stats, "mean_date")?.try_into()?,
median_date: get_stat(&r_stats, "median_date")?.try_into()?,
max_date: get_stat(&r_stats, "max_date")?.try_into()?,
num_unique: num_unique as i64,
min_date: get_stat(&r_stats, "min_date").ok(),
mean_date: get_stat(&r_stats, "mean_date").ok(),
median_date: get_stat(&r_stats, "median_date").ok(),
max_date: get_stat(&r_stats, "max_date").ok(),
num_unique,
timezone,
})
}
Expand All @@ -155,11 +162,20 @@ fn empty_column_summary_stats() -> data_explorer_comm::ColumnSummaryStats {
}
}

// Diff view: removed and added lines are interleaved in this block. The first
// signature and the one-line `Some(value) => Ok(value.clone()),` arm are the
// removed version; the generic signature with the `where` clause and the
// multi-line `Some(value)` arm are the replacement.
//
/// Looks up the statistic called `name` in `stats` and returns it as `Return`.
///
/// The stored value is cloned and then converted with `TryFrom<T>`, so callers
/// pick the target type through the `Return` type parameter (or inference).
///
/// # Errors
/// - `"Missing stat {name}"` when `stats` has no entry for `name`.
/// - `"Can't cast to return type. ..."` when the conversion fails. The
///   conversion error is rendered with `{:?}`, which is why `Return::Error`
///   must implement `fmt::Debug`.
fn get_stat<T: Clone>(stats: &HashMap<String, T>, name: &str) -> anyhow::Result<T> {
fn get_stat<Return, T: Clone>(stats: &HashMap<String, T>, name: &str) -> anyhow::Result<Return>
where
Return: TryFrom<T>,
Return::Error: fmt::Debug,
{
let value = stats.get(name);

match value {
Some(value) => Ok(value.clone()),
Some(value) => {
// `unwrap!` comes from `stdext`; it appears to yield the `Ok` value and run
// the given arm on `Err` -- confirm against the macro definition.
let value: Return = unwrap!(value.clone().try_into(), Err(err) => {
return Err(anyhow!("Can't cast to return type. {err:?}"))
});
Ok(value)
},
None => Err(anyhow!("Missing stat {}", name)),
}
}
Expand Down Expand Up @@ -265,11 +281,11 @@ mod tests {
let stats =
summary_stats(column.sexp, ColumnDisplayType::Date, &default_options()).unwrap();
let expected = SummaryStatsDate {
min_date: "2021-01-01".to_string(),
mean_date: "2021-01-02".to_string(),
median_date: "2021-01-02".to_string(),
max_date: "2021-01-04".to_string(),
num_unique: 5,
min_date: Some("2021-01-01".to_string()),
mean_date: Some("2021-01-02".to_string()),
median_date: Some("2021-01-02".to_string()),
max_date: Some("2021-01-04".to_string()),
num_unique: Some(5),
};
assert_eq!(stats.date_stats, Some(expected));
})
Expand All @@ -285,14 +301,31 @@ mod tests {
let stats = summary_stats(column.sexp, ColumnDisplayType::Datetime, &default_options())
.unwrap();
let expected = SummaryStatsDatetime {
num_unique: 2,
min_date: "2015-07-24 23:15:07".to_string(),
mean_date: "2015-07-24 23:15:07".to_string(),
median_date: "2015-07-24 23:15:07".to_string(),
max_date: "2015-07-24 23:15:07".to_string(),
num_unique: Some(2),
min_date: Some("2015-07-24 23:15:07".to_string()),
mean_date: Some("2015-07-24 23:15:07".to_string()),
median_date: Some("2015-07-24 23:15:07".to_string()),
max_date: Some("2015-07-24 23:15:07".to_string()),
timezone: Some("Japan".to_string()),
};
assert_eq!(stats.datetime_stats, Some(expected));
})
}

#[test]
fn test_date_all_na() {
    // A Date column holding only NA still has one distinct value, but none of
    // the min/mean/median/max statistics are available, so they must be `None`.
    r_test(|| {
        let na_dates = harp::parse_eval_base("as.Date(NA)").unwrap();
        let computed =
            summary_stats(na_dates.sexp, ColumnDisplayType::Date, &default_options()).unwrap();
        assert_eq!(
            computed.date_stats,
            Some(SummaryStatsDate {
                num_unique: Some(1),
                min_date: None,
                mean_date: None,
                median_date: None,
                max_date: None,
            })
        );
    })
}
}
23 changes: 19 additions & 4 deletions crates/ark/src/modules/positron/r_data_explorer.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,31 @@ summary_stats_date <- function(col) {
# When calling `min` on `x` would raise a warning.
# Turns out, some Parquet files might generate malformed timezones too.
suppressWarnings({
# When all values in the column are NA, `min` and `max` return +Inf and -Inf respectively,
# `mean` returns NaN and `median` returns NA. We turn all of these into `NULL` so that
# the values are displayed correctly in the front-end.
min_date <- finite_or_null(min(col, na.rm = TRUE))
max_date <- finite_or_null(max(col, na.rm = TRUE))
mean_date <- finite_or_null(mean(col, na.rm = TRUE))
median_date <- finite_or_null(stats::median(col, na.rm = TRUE))
list(
min_date = as.character(min(col, na.rm = TRUE)),
mean_date = as.character(mean(col, na.rm = TRUE)),
median_date = as.character(stats::median(col, na.rm = TRUE)),
max_date = as.character(max(col, na.rm = TRUE)),
min_date = as.character(min_date),
mean_date = as.character(mean_date),
median_date = as.character(median_date),
max_date = as.character(max_date),
num_unique = length(unique(col))
)
})
}

# Pass `x` through unchanged when it is finite; any non-finite value
# (Inf, -Inf, NaN or NA) is replaced by `NULL`.
finite_or_null <- function(x) {
    if (is.finite(x)) {
        return(x)
    }
    NULL
}

summary_stats_get_timezone <- function(x) {
# this is the implementation in lubridate for POSIXt objects
tz <- function (x) {
Expand Down

0 comments on commit 2785a41

Please sign in to comment.