diff --git a/DESCRIPTION b/DESCRIPTION index 81a8b82..b36f10c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,4 +39,4 @@ Suggests: corrplot Config/testthat/edition: 3 VignetteBuilder: knitr -URL: https://cbhurley.github.io/bullseye/ +URL: https://cbhurley.github.io/bullseye/, https://github.com/cbhurley/bullseye diff --git a/NAMESPACE b/NAMESPACE index 11aa4ae..263d20f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,7 +21,10 @@ export(pair_nmi) export(pair_polychor) export(pair_polyserial) export(pair_scagnostics) -export(pair_tau) +export(pair_tauA) +export(pair_tauB) +export(pair_tauC) +export(pair_tauW) export(pair_uncertainty) export(pairwise) export(pairwise_by) diff --git a/R/pair_methods.R b/R/pair_methods.R index beaf383..bd9f2bd 100644 --- a/R/pair_methods.R +++ b/R/pair_methods.R @@ -39,17 +39,31 @@ pair_cor <- function(d, method="pearson", handle.na=TRUE,...){ } } - # pair_cor <- function(d, method="pearson", handle.na=TRUE,...){ - # check_df(d) - # d <- d[, sapply(d, is.numeric), drop=FALSE] - # if (ncol(d) > 1){ - # if (handle.na) - # dcor <- cor(d,method=method,use="pairwise.complete.obs") - # else dcor <- cor(d,method=method,...) - # pairwise(dcor, score=method, pair_type = "nn") - # } - # } + + +ccor <- function(x,y, handle.na=TRUE){ + if(handle.na){ + pick <- complete.cases(x, y) + x <- x[pick] + y <- y[pick] + } + + if (length(x) <= 2) { + return (NA) + } + if (!is.numeric(x)) + x <- sapply(unique(x), function(u) as.numeric(x ==u))[,-1] + if (!is.numeric(y)) + y <- sapply(unique(y), function(u) as.numeric(y ==u))[,-1] + tryCatch(cancor(x,y)$cor[1], error = function(e) { + # message("Cannot calculate cancor, returning NA") + NA + } + ) +} + + #' Canonical correlation #' #' Calculates canonical correlation for every variable pair in a dataset. @@ -74,24 +88,7 @@ pair_cancor <- function(d,handle.na=TRUE,...){ fn <- function(x,y){ x <- d[[x]] y <- d[[y]] - if(handle.na){ - pick <- complete.cases(x, y) - x <- x[pick] - y <- y[pick] - } - - if (length(x) <= 2) { - return (NA) - } - if (!is.numeric(x)) - x <- sapply(unique(x), function(u) as.numeric(x ==u))[,-1] - if (!is.numeric(y)) - y <- sapply(unique(y), function(u) as.numeric(y ==u))[,-1] - tryCatch(cancor(x,y)$cor[1], error = function(e) { - # message("Cannot calculate cancor, returning NA") - NA - } - ) + ccor(x,y,handle.na=handle.na) } a$value <- mapply(fn, a$x,a$y, USE.NAMES = FALSE) @@ -138,13 +135,33 @@ pair_dcor <- function(d, handle.na=TRUE,...){ } + + + +# pair_mine <- function(d, method="mic",handle.na=TRUE,...){ +# if (!requireNamespace("minerva", quietly = TRUE)) +# stop("Please install package 'minerva' to use pair_mine", call.=FALSE) +# check_df(d) +# +# d <- d[, sapply(d, is.numeric), drop=FALSE] +# if(ncol(d)>1){ +# if (handle.na) +# dcor <- minerva::mine(d,use="pairwise.complete.obs",...) +# else dcor <- minerva::mine(d,...) +# +# dcor <- dcor[[toupper(method)]] +# pairwise(dcor, score=method, pair_type = "nn") +# } +# +# } + + #' MINE family values #' #' Calculates MINE family values for every numeric variable pair in a dataset. #' #' @param d A dataframe -#' @param method character string for the MINE value to be calculated. Either "mic" (default), "mas", "mev", -#' "mcn", or "mic-r2" +#' @param method character vector for the MINE value to be calculated. Subset of "MIC","MAS","MEV","MCN","MICR2", "GMIC", "TIC" #' @param handle.na If TRUE uses pairwise complete observations to calculate score, otherwise NAs not handled. #' @param ... other arguments #' @@ -154,13 +171,12 @@ pair_dcor <- function(d, handle.na=TRUE,...){ #' @details The values are calculated using \code{\link[minerva]{mine}} from \code{minerva} #' @examples #' pair_mine(iris) -#' pair_mine(iris, method="mas") +#' pair_mine(iris, method="MAS") #' @references Reshef, David N., et al. "Detecting novel associations in large data sets." #' science 334.6062 (2011): 1518-1524 - - -pair_mine <- function(d, method="mic",handle.na=TRUE,...){ +#' +pair_mine <- function(d, method="MIC",handle.na=TRUE,...){ if (!requireNamespace("minerva", quietly = TRUE)) stop("Please install package 'minerva' to use pair_mine", call.=FALSE) check_df(d) @@ -168,13 +184,19 @@ pair_mine <- function(d, method="mic",handle.na=TRUE,...){ d <- d[, sapply(d, is.numeric), drop=FALSE] if(ncol(d)>1){ if (handle.na) - dcor <- minerva::mine(d,use="pairwise.complete.obs",...) - else dcor <- minerva::mine(d,...) - - dcor <- dcor[[toupper(method)]] - pairwise(dcor, score=method, pair_type = "nn") + dcor <- minerva::mine(d,use="pairwise.complete.obs",normalization=TRUE,...) + else dcor <- minerva::mine(d,normalization=TRUE,...) + mine_choices <- names(dcor) + sel_mine <- match.arg(toupper(method), mine_choices, several.ok=TRUE) + p <- pairwise(dcor[[sel_mine[1]]], score=sel_mine[1], pair_type = "nn") + for (m in sel_mine[-1]){ + pm <- pairwise(dcor[[m]], score=m, pair_type = "nn") + p <- rbind(p, pm) + } + p |> + dplyr::arrange(.data$x, .data$y) } - + } @@ -296,48 +318,14 @@ pair_polyserial <- function(d,handle.na=TRUE,...){ } -#' Kendall's tau A, B, C and Kendall's W -#' -#' Calculates one of either Kendall's tau A, B, C or Kendall's W for every factor variable pair in a dataset. -#' -#' @param d A dataframe -#' @param method A character string for the correlation coefficient to be calculated, one of "B" (default), -#' "A", "C" or "W". If the value is "all", then all four correlations are calculated. -#' @param ... other arguments -#' -#' @return A tibble of class `pairwise` with factor pairs along with one of either Kendall's tau A, B, C or -#' Kendall's W value, or NULL if there are not at least two factor variables -#' -#' @details The association values Kendall's tau A, B, C or Kendall's W are calculated using \code{\link[DescTools]{KendallTauA}}, -#' \code{\link[DescTools]{KendallTauB}}, \code{\link[DescTools]{StuartTauC}} or \code{\link[DescTools]{KendallW}} respectively,from the -#' \code{DescTools} package, and assumes factor levels are in the given order. NAs are automatically handled by pairwise omit. -#' -#' @export -#' -#' @examples -#' pair_tau(iris) -#' pair_tau(iris, method="A") -#' pair_tau(iris, method="C") -#' pair_tau(iris, method="W") -pair_tau <- function(d,method=c("B","A","C","W"),...){ - if (!requireNamespace("DescTools", quietly = TRUE)) - stop("Please install package 'DescTools' to use pair_tau", call.=FALSE) - check_df(d) - - # automatically does pairwise omit, Kendall - method <- method[1] + +pair_tau <- function(d,method="B",handle.na=TRUE,...){ + # automatically does pairwise omit for A,B,C d <- dplyr::select(d, dplyr::where(is.factor)) if(ncol(d)>1){ - if (method == "all"){ - B <- pair_tau(d,method="B") - A <- pair_tau(d,method="A") - C <- pair_tau(d,method="C") - W <- pair_tau(d,method="W") - dplyr::bind_rows(B,A,C,W) - } else { a <- pairwise(d, score=paste0("tau", method), pair_type = "ff") fns <- c("A"= DescTools::KendallTauA, "B"=DescTools::KendallTauB, "C" = DescTools::StuartTauC, "W"= DescTools::KendallW) @@ -346,22 +334,124 @@ pair_tau <- function(d,method=c("B","A","C","W"),...){ if (length(unique(d[[x]])) <= 1) return(NA) if (length(unique(d[[y]])) <= 1) return(NA) if (method =="W") - fn(d[c(x,y)], correct=TRUE) - else fn(d[[x]],d[[y]]) + fn(d[c(x,y)], correct=TRUE, na.rm=handle.na,...) + else fn(d[[x]],d[[y]],...) } a$value <- mapply(fnlocal, a$x,a$y, USE.NAMES = FALSE) a } - - } } -#' Uncertainty coefficient +#' Kendall's tau B for association between ordinal factors. +#' +#' Calculates Kendall's tau B every factor variable pair in a dataset. +#' +#' @param d A dataframe +#' @param ... other arguments +#' @param handle.na ignored. Pairwise complete observations are used automatically. +#' @return A tibble of class `pairwise` with factor pairs, or NULL if there are not at least two factor variables +#' +#' @details Calculated using \code{\link[DescTools]{KendallTauB}}. Assumes factor levels are in the given order. +#' NAs are automatically handled by pairwise omit. +#' @export +#' @examples + +#' d <- data.frame(x=rnorm(20), +#' y=factor(sample(3,20, replace=TRUE)), +#' z=factor(sample(2,20, replace=TRUE))) +#' pair_tauB(d) + +pair_tauB <- function(d,handle.na=TRUE,...){ + if (!requireNamespace("DescTools", quietly = TRUE)) + stop("Please install package 'DescTools' to use pair_tauB ", call.=FALSE) + check_df(d) + pair_tau(d, method="B",...) +} + +#' Kendall's tau A for association between ordinal factors. +#' +#' Calculates Kendall's tau A for every factor variable pair in a dataset. +#' +#' @param d A dataframe +#' @param handle.na ignored. Pairwise complete observations are used automatically. +#' @param ... other arguments +#' +#' @return A tibble of class `pairwise` with factor pairs, or NULL if there are not at least two factor variables +#' +#' @details Calculated using \code{\link[DescTools]{KendallTauA}}. Assumes factor levels are in the given order. +#' NAs are automatically handled by pairwise omit. +#' @export +#' @examples +#' d <- data.frame(x=rnorm(20), +#' y=factor(sample(3,20, replace=TRUE)), +#' z=factor(sample(2,20, replace=TRUE))) +#' pair_tauA(d) + +pair_tauA <- function(d,handle.na=TRUE,...){ + if (!requireNamespace("DescTools", quietly = TRUE)) + stop("Please install package 'DescTools' to use pair_tauA ", call.=FALSE) + check_df(d) + pair_tau(d, method="A",...) +} + +#' Stuarts's tau C for association between ordinal factors. +#' +#' Calculates Stuarts's tau C every factor variable pair in a dataset. +#' +#' @param d A dataframe +#' @param handle.na ignored. Pairwise complete observations are used automatically. +#' @param ... other arguments +#' +#' @return A tibble of class `pairwise` with factor pairs, or NULL if there are not at least two factor variables +#' +#' @details Calculated using \code{\link[DescTools]{StuartTauC}}. Assumes factor levels are in the given order. +#' NAs are automatically handled by pairwise omit. +#' @export +#' @examples +#' d <- data.frame(x=rnorm(20), +#' y=factor(sample(3,20, replace=TRUE)), +#' z=factor(sample(2,20, replace=TRUE))) +#' pair_tauC(d) + +pair_tauC <- function(d,handle.na=TRUE,...){ + if (!requireNamespace("DescTools", quietly = TRUE)) + stop("Please install package 'DescTools' to use pair_tauC ", call.=FALSE) + check_df(d) + pair_tau(d, method="C") +} + +#' Kendall's W for association between ordinal factors. +#' +#' Calculates Kendall's tau W every factor variable pair in a dataset. +#' +#' @param d A dataframe +#' @param handle.na ignored. Pairwise complete observations are used automatically. +#' @param ... other arguments +#' +#' @return A tibble of class `pairwise` with factor pairs, or NULL if there are not at least two factor variables +#' +#' @details Calculated using \code{\link[DescTools]{KendallW}}. Assumes factor levels are in the given order. +#' NAs are automatically handled by pairwise omit. +#' @export +#' @examples +#' d <- data.frame(x=rnorm(20), +#' y=factor(sample(3,20, replace=TRUE)), +#' z=factor(sample(2,20, replace=TRUE))) +#' pair_tauW(d) + +pair_tauW <- function(d,handle.na=TRUE,...){ + if (!requireNamespace("DescTools", quietly = TRUE)) + stop("Please install package 'DescTools' to use pair_tauW ", call.=FALSE) + check_df(d) + pair_tau(d, method="W") +} + +#' Uncertainty coefficient for association between factors. #' #' Calculates uncertainty coefficient for every factor variable pair in a dataset. #' #' @param d A dataframe -#' @param handle.na If TRUE uses pairwise complete observations to calculate uncertainty coefficient, otherwise NAs not handled. +#' @param handle.na ignored. Pairwise complete observations are used automatically. #' @param ... other arguments #' #' @return A tibble of class `pairwise` with every factor variable pair and uncertainty coefficient value, @@ -390,18 +480,19 @@ pair_uncertainty <- function(d,handle.na=TRUE,...){ } -#' Goodman Kruskal's Tau +#' Goodman Kruskal's Tau for association between ordinal factors. #' #' Calculates Goodman Kruskal's Tau coefficient for every factor variable pair in a dataset. #' #' @param d A dataframe -#' @param handle.na If TRUE uses pairwise complete observations, otherwise NAs not handled. +#' @param handle.na ignored. Pairwise complete observations are used automatically. #' @param ... other arguments #' #' @return A tibble of class `pairwise` with Goodman Kruskal's Tau for every factor variable pair, #' or NULL if there are not at least two factor variables #' @details The Goodman Kruskal's Tau coefficient is calculated using \code{\link[DescTools]{GoodmanKruskalTau}} -#' function from the \code{DescTools} package. +#' function from the \code{DescTools} package. Assumes factor levels are in the given order. +#' NAs are automatically handled by pairwise omit. #' @export #' #' @examples @@ -415,7 +506,8 @@ pair_gkTau <- function(d,handle.na=TRUE,...){ d <- dplyr::select(d, dplyr::where(is.factor)) if(ncol(d)>1){ a <- pairwise(d, score="gkTau", pair_type = "ff") - fnlocal <- function(x,y) max(DescTools::GoodmanKruskalTau(d[[x]],d[[y]]),DescTools::GoodmanKruskalTau(d[[y]],d[[x]])) + fnlocal <- function(x,y) + max(DescTools::GoodmanKruskalTau(d[[x]],d[[y]],...),DescTools::GoodmanKruskalTau(d[[y]],d[[x]],...)) a$value <- mapply(fnlocal, a$x,a$y, USE.NAMES = FALSE) a } @@ -423,19 +515,20 @@ pair_gkTau <- function(d,handle.na=TRUE,...){ } -#' Goodman Kruskal's Gamma +#' Goodman Kruskal's Gamma for association between ordinal factors. #' #' Calculates Goodman Kruskal's Gamma coefficient for every factor variable pair in a dataset. #' #' @param d A dataframe -#' @param handle.na If TRUE uses pairwise complete observations, otherwise NAs not handled. +#' @param handle.na ignored. Pairwise complete observations are used automatically. #' @param ... other arguments #' #' @return A tibble of class `pairwise` with factor variable pairs and Goodman Kruskal's Gamma coefficient, #' or NULL if there are not at least two factor variables #' @details The Goodman Kruskal's Gamma coefficient is calculated using \code{\link[DescTools]{GoodmanKruskalGamma}} -#' function from the \code{DescTools} package,and assumes factor levels are in the given order. +#' function from the \code{DescTools} package. Assumes factor levels are in the given order. +#' NAs are automatically handled by pairwise omit. #' @export #' #' @examples @@ -456,19 +549,19 @@ pair_gkGamma <- function(d,handle.na=TRUE,...){ } -#' Pearson's Contingency Coefficient +#' Pearson's Contingency Coefficient for association between factors. #' #' Calculates Pearson's Contingency coefficient for every factor variable pair in a dataset. #' #' @param d A dataframe -#' @param handle.na If TRUE uses pairwise complete observations. +#' @param handle.na ignored. Pairwise complete observations are used automatically. #' @param ... other arguments #' #' @return A tibble of class `pairwise` with calculated Pearson's contingency coefficient for every factor variable #' pair, or NULL if there are not at least two factor variables #' @export -#' @details The Pearson's contingency coefficient is calculated using \code{\link[DescTools]{ContCoef}} -#' function from the \code{DescTools} package. +#' @details The Pearson's contingency coefficient is calculated using \code{\link[DescTools]{ContCoef}}. +#' NAs are automatically handled by pairwise omit. #' #' @examples #' pair_chi(iris) @@ -575,21 +668,24 @@ pair_ace <- function(d, handle.na = T, ...) { #' pair_methods pair_methods <- dplyr::tribble( - ~name, ~nn, ~ff, ~fn, ~from, ~range, ~comments, - "pair_cor", TRUE, FALSE, FALSE, "cor", "[-1,1]", "", - "pair_dcor", TRUE, FALSE, FALSE, "energy::dcor2d", "[0,1]", "", - "pair_mine", TRUE, FALSE, FALSE, "minerva::mine", "[0,1]","", - "pair_ace", TRUE, TRUE, TRUE, "acepack::ace", "[0,1]","", - "pair_cancor", TRUE, TRUE, TRUE, "cancor", "[0,1]","", - "pair_nmi", TRUE, TRUE, TRUE, "linkspotter::maxNMI", "[0,1]","", - "pair_polychor", FALSE, TRUE, FALSE, "polycor::polychor", "[-1,1]","factors treated as ordinal", - "pair_polyserial", FALSE, FALSE, TRUE, "polycor::polyserial", "[-1,1]","factor treated as ordinal", - "pair_tau", FALSE, TRUE, FALSE, "DescTools::KendalTauA,B,C,W", "[-1,1]","factors treated as ordinal", - "pair_gkGamma", FALSE, TRUE, FALSE, "DescTools::GoodmanKruskalGamma", "[-1,1]","factors treated as ordinal", - "pair_gkTau", FALSE, TRUE, FALSE, "DescTools::GoodmanKruskalTau", "[0,1]","", - "pair_uncertainty", FALSE, TRUE, FALSE, "DescTools::UncertCoef", "[0,1]","", - "pair_chi", FALSE, TRUE, FALSE, "DescTools::ContCoef", "[0,1]","", - "pair_scag", TRUE, FALSE, FALSE, "scagnostics::scagnostics", "[0,1]","", + ~name, ~nn, ~ff, ~fn, ~from, ~range, ~ordinal, + "pair_cor", TRUE, FALSE, FALSE, "cor", "[-1,1]", NA, + "pair_dcor", TRUE, FALSE, FALSE, "energy::dcor2d", "[0,1]", NA, + "pair_mine", TRUE, FALSE, FALSE, "minerva::mine", "[0,1]",NA, + "pair_ace", TRUE, TRUE, TRUE, "acepack::ace", "[0,1]",FALSE, + "pair_cancor", TRUE, TRUE, TRUE, "cancor", "[0,1]",FALSE, + "pair_nmi", TRUE, TRUE, TRUE, "linkspotter::maxNMI", "[0,1]",FALSE, + "pair_polychor", FALSE, TRUE, FALSE, "polycor::polychor", "[-1,1]",TRUE, + "pair_polyserial", FALSE, FALSE, TRUE, "polycor::polyserial", "[-1,1]",TRUE, + "pair_tauB", FALSE, TRUE, FALSE, "DescTools::KendalTauB", "[-1,1]",TRUE, + "pair_tauA", FALSE, TRUE, FALSE, "DescTools::KendalTauA", "[-1,1]",TRUE, + "pair_tauC", FALSE, TRUE, FALSE, "DescTools::StuartTauC", "[-1,1]",TRUE, + "pair_tauW", FALSE, TRUE, FALSE, "DescTools::KendalW", "[-1,1]",TRUE, + "pair_gkGamma", FALSE, TRUE, FALSE, "DescTools::GoodmanKruskalGamma", "[-1,1]",TRUE, + "pair_gkTau", FALSE, TRUE, FALSE, "DescTools::GoodmanKruskalTau", "[0,1]",TRUE, + "pair_uncertainty", FALSE, TRUE, FALSE, "DescTools::UncertCoef", "[0,1]",FALSE, + "pair_chi", FALSE, TRUE, FALSE, "DescTools::ContCoef", "[0,1]",FALSE, + "pair_scag", TRUE, FALSE, FALSE, "scagnostics::scagnostics", "[0,1]",NA, ) @@ -597,7 +693,7 @@ pair_methods <- dplyr::tribble( #' #' @param d a dataframe #' @param by a character string for the name of the conditioning variable. -#' @param pair_fun One of the `pair_` functions +#' @param pair_fun A function returning a `pairwise` from a dataset. #' @param ungrouped If TRUE calculates the ungrouped score in addition to grouped scores. #' #' @return tibble of class "pairwise" @@ -625,5 +721,5 @@ pairwise_by <- function(d, by, pair_fun, ungrouped=TRUE){ pair_fun() result <- rbind(result, overall) } - result + result |> dplyr::arrange(.data$x, .data$y) } \ No newline at end of file diff --git a/R/pair_scagnostics.R b/R/pair_scagnostics.R index bb87c4c..c1b0ce7 100644 --- a/R/pair_scagnostics.R +++ b/R/pair_scagnostics.R @@ -23,7 +23,7 @@ pair_scagnostics <- function(d, scagnostic = c("Outlying","Skewed","Clumpy","Sparse","Striated", "Convex","Skinny","Stringy","Monotonic"), - handle.na = T, ...) { + handle.na = TRUE, ...) { if (!requireNamespace("scagnostics", quietly = TRUE)) stop("Please install package 'scagnostics' to use pair_scagnostics", call.=FALSE) check_df(d) @@ -52,5 +52,6 @@ pair_scagnostics <- function(d, scagnostic = c("Outlying","Skewed","Clumpy","Spa } scag$value <- as.numeric(t(mapply(scag_fn, scag1$x,scag1$y))) - scag + scag |> + dplyr::arrange(.data$x, .data$y) } diff --git a/R/pairwise.R b/R/pairwise.R index 888f576..b871011 100644 --- a/R/pairwise.R +++ b/R/pairwise.R @@ -19,12 +19,11 @@ pairwise <- function(x, score=NA_character_, pair_type=NA_character_){ } -pairwise_to_matrix <- function(scores, stat=function(x) diff(range(x,na.rm=TRUE)), default=NA){ +pairwise_to_matrix <- function(scores, stat=dplyr::first, default=NA){ allvars <- unique(c(scores$x, scores$y)) scores1 <- dplyr::summarise(scores, - n = dplyr::n(), - measure= if (.data$n > 1) stat(.data$value) else .data$value, + measure= stat(.data$value), .by=dplyr::all_of(c("x","y"))) scores1 <- scores1[!is.na(scores1$measure),] m <- matrix(default, nrow=length(allvars), ncol=length(allvars)) diff --git a/R/pairwise_multi.R b/R/pairwise_multi.R index 85c07f0..3aaffa7 100644 --- a/R/pairwise_multi.R +++ b/R/pairwise_multi.R @@ -5,7 +5,7 @@ #' Calculates multiple scores for every variable pair in a dataset. #' #' @param d dataframe -#' @param scores a vector naming pairwise functions. +#' @param scores a vector naming functions returning a `pairwise` from a dataset. #' #' @param handle.na If TRUE uses pairwise complete observations to calculate pairwise score, otherwise NAs not handled. #' @@ -43,6 +43,7 @@ pairwise_multi <- function(d,scores=c("pair_cor", "pair_dcor","pair_mine","pair_ results <- dplyr::bind_rows(results, taua, tauc, tauw) } - results + results |> + dplyr::arrange(.data$x, .data$y) } diff --git a/R/pairwise_scores.R b/R/pairwise_scores.R index 62605cd..da9a3e9 100644 --- a/R/pairwise_scores.R +++ b/R/pairwise_scores.R @@ -72,7 +72,6 @@ pairwise_scores <- function(d, if (ncol(dsub)>1 & !is.null(entry$funName)){ if (is.null(entry$argList)) m <- do.call(get(entry$funName), list(dsub, handle.na=handle.na)) - # else m <- do.call(get(entry$funName), append(list(dsub, handle.na=handle.na), entry$argList)) else m <- do.call(get(entry$funName), list(dsub, handle.na=handle.na, entry$argList)) if (!inherits(m, "pairwise")) stop("Calculated pairwise scores must be of type pairwise") @@ -137,11 +136,10 @@ pairwise_scores <- function(d, #' @export #' -pair_control <- function(nn = c("pair_cor","pair_dcor","pair_mine","pair_ace", - "pair_cancor","pair_nmi", "pair_scagnostics" ), - oo = c("pair_polychor", "pair_tau","pair_gkGamma","pair_gkTau"), - ff = c("pair_cancor","pair_ace","pair_nmi", "pair_uncertainty","pair_chi"), - fn = c("pair_cancor", "pair_nmi","pair_ace"), +pair_control <- function(nn = "pair_cor", + oo = "pair_polychor", + ff = "pair_cancor", + fn = "pair_cancor", nnargs=NULL, ooargs=NULL, ffargs=NULL,fnargs=NULL){ list(nn=nn[1], fn=fn[1],oo=oo[1], ff=ff[1],nnargs=nnargs,fnargs=fnargs,ooargs=NULL, ffargs=ffargs) diff --git a/R/plot_pairwise.R b/R/plot_pairwise.R index db43898..e59a82b 100644 --- a/R/plot_pairwise.R +++ b/R/plot_pairwise.R @@ -14,7 +14,8 @@ #' @param interactive defaults to FALSE #' @return A `girafe` object if interactive==TRUE, otherwise a `ggplot2`. #' -#' If scores has one value for x,y pair, then a filled circle is drawn with fill representing the score value. If there are multiple values for each x,y pair then the filled circle is split into wedges, with the wedge fill representing the values. If some rows have `group=center_level`, then the glyph is drawn as a bullseye. +#' If scores has one value for x,y pair, then a filled circle is drawn with fill representing the score value. If there are multiple values for each x,y pair then the filled circle is split into wedges, with the wedge fill representing the values. +#' If some rows have `group=center_level`, then the glyph is drawn as a bullseye. #' @examples #' plot_pairwise(pair_cor(iris)) #' plot_pairwise(pairwise_scores(iris,by="Species")) @@ -31,12 +32,13 @@ plot_pairwise <- function(scores, var_order="seriate_max", score_limits=NULL, var_order <- prep$var_order score_label <- prep$score_label - if (grepl("seriate", var_order[1])) { - serfn <- if (var_order[1] == "seriate_max_diff") - function(x) diff(range(x, na.rm=TRUE)) else function(x) max(x, na.rm=TRUE) - m <- pairwise_to_matrix(scores, serfn, default=0) - o <- suppressMessages(DendSer::dser(stats::as.dist(-m), cost = DendSer::costLPL)) - var_order <- rownames(m)[o] + if (length(var_order)==1){ + if (grepl("seriate", var_order)) { + serfn <- if (var_order == "seriate_max_diff") ser_max_diff else ser_max + m <- pairwise_to_matrix(scores, serfn, default=0) + o <- suppressMessages(DendSer::dser(stats::as.dist(-m), cost = DendSer::costLPL)) + var_order <- rownames(m)[o] + } } scores$x <- factor(scores$x, levels=var_order) scores$y <- factor(scores$y, levels=var_order) @@ -134,7 +136,7 @@ plot_pairwise_prep <- function(scores, score_limits=NULL, var_order=NULL, ignore allvars <- unique(c(scores$x, scores$y)) if (is.null(var_order)) var_order <- sort(allvars) - else if (!grepl("seriate", var_order)) { + else if ((length(var_order) == 1) && !grepl("seriate", var_order)) { if (length(intersect(allvars, var_order) ==0)) stop("'var_order' must be NULL, 'seriate_max', 'seriate_max_diff' or a subset of the x and y variables in 'scores'") else { @@ -198,8 +200,7 @@ plot_pairwise_linear <- function(scores, if (grepl("seriate", pair_order)){ - serfn <- if (pair_order == "seriate_max_diff") - function(x) diff(range(x)) else function(x) max(abs(x)) + serfn <- if (pair_order == "seriate_max_diff") ser_max_diff else ser_max ord <- dplyr::summarise(scores, n = dplyr::n(), measure= if (.data$n > 1) serfn(.data$value) else .data$value, @@ -239,7 +240,7 @@ plot_pairwise_linear <- function(scores, show.legend = FALSE)}+ ylim(score_limits[1],score_limits[2]) + coord_flip() +scale_x_discrete(limits=rev) + - labs(y = score_label) + labs(y = "scores") } p <- p+ theme(legend.position="bottom", axis.title.y = element_blank()) if (interactive) ggiraph::girafe(ggobj=p) else p @@ -288,6 +289,16 @@ plot.pairwise<- function(x, type=c("matrix", "linear"), ...){ } +ser_max_diff <- function(x){ + if (all(is.na(x))) 0 else diff(range(x, na.rm=TRUE)) +} + +ser_max <- function(x){ + if (all(is.na(x))) 0 else max(abs(x), na.rm=TRUE) +} + + + #' Converts a pairwise to a symmetric matrix. Uses the first entry for each (x,y) pair. #' @param x An object of class pairwise #' @return A symmetric matrix diff --git a/man/bullseye-package.Rd b/man/bullseye-package.Rd index 51cda92..2022746 100644 --- a/man/bullseye-package.Rd +++ b/man/bullseye-package.Rd @@ -12,6 +12,7 @@ We provide a tidy data structure and visualisations for multiple or grouped vari Useful links: \itemize{ \item \url{https://cbhurley.github.io/bullseye/} + \item \url{https://github.com/cbhurley/bullseye} } } diff --git a/man/pair_chi.Rd b/man/pair_chi.Rd index 1c263f3..d55e72a 100644 --- a/man/pair_chi.Rd +++ b/man/pair_chi.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/pair_methods.R \name{pair_chi} \alias{pair_chi} -\title{Pearson's Contingency Coefficient} +\title{Pearson's Contingency Coefficient for association between factors.} \usage{ pair_chi(d, handle.na = TRUE, ...) } \arguments{ \item{d}{A dataframe} -\item{handle.na}{If TRUE uses pairwise complete observations.} +\item{handle.na}{ignored. Pairwise complete observations are used automatically.} \item{...}{other arguments} } @@ -21,8 +21,8 @@ pair, or NULL if there are not at least two factor variables Calculates Pearson's Contingency coefficient for every factor variable pair in a dataset. } \details{ -The Pearson's contingency coefficient is calculated using \code{\link[DescTools]{ContCoef}} -function from the \code{DescTools} package. +The Pearson's contingency coefficient is calculated using \code{\link[DescTools]{ContCoef}}. +NAs are automatically handled by pairwise omit. } \examples{ pair_chi(iris) diff --git a/man/pair_control.Rd b/man/pair_control.Rd index f3e1765..5060a34 100644 --- a/man/pair_control.Rd +++ b/man/pair_control.Rd @@ -5,11 +5,10 @@ \title{Default scores calculated by \code{pairwise_scores}} \usage{ pair_control( - nn = c("pair_cor", "pair_dcor", "pair_mine", "pair_ace", "pair_cancor", "pair_nmi", - "pair_scagnostics"), - oo = c("pair_polychor", "pair_tau", "pair_gkGamma", "pair_gkTau"), - ff = c("pair_cancor", "pair_ace", "pair_nmi", "pair_uncertainty", "pair_chi"), - fn = c("pair_cancor", "pair_nmi", "pair_ace"), + nn = "pair_cor", + oo = "pair_polychor", + ff = "pair_cancor", + fn = "pair_cancor", nnargs = NULL, ooargs = NULL, ffargs = NULL, diff --git a/man/pair_gkGamma.Rd b/man/pair_gkGamma.Rd index 1c86ba8..e7e0333 100644 --- a/man/pair_gkGamma.Rd +++ b/man/pair_gkGamma.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/pair_methods.R \name{pair_gkGamma} \alias{pair_gkGamma} -\title{Goodman Kruskal's Gamma} +\title{Goodman Kruskal's Gamma for association between ordinal factors.} \usage{ pair_gkGamma(d, handle.na = TRUE, ...) } \arguments{ \item{d}{A dataframe} -\item{handle.na}{If TRUE uses pairwise complete observations, otherwise NAs not handled.} +\item{handle.na}{ignored. Pairwise complete observations are used automatically.} \item{...}{other arguments} } @@ -22,7 +22,8 @@ Calculates Goodman Kruskal's Gamma coefficient for every factor variable pair in } \details{ The Goodman Kruskal's Gamma coefficient is calculated using \code{\link[DescTools]{GoodmanKruskalGamma}} -function from the \code{DescTools} package,and assumes factor levels are in the given order. +function from the \code{DescTools} package. Assumes factor levels are in the given order. +NAs are automatically handled by pairwise omit. } \examples{ pair_gkGamma(iris) diff --git a/man/pair_gkTau.Rd b/man/pair_gkTau.Rd index 6d56544..a55a608 100644 --- a/man/pair_gkTau.Rd +++ b/man/pair_gkTau.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/pair_methods.R \name{pair_gkTau} \alias{pair_gkTau} -\title{Goodman Kruskal's Tau} +\title{Goodman Kruskal's Tau for association between ordinal factors.} \usage{ pair_gkTau(d, handle.na = TRUE, ...) } \arguments{ \item{d}{A dataframe} -\item{handle.na}{If TRUE uses pairwise complete observations, otherwise NAs not handled.} +\item{handle.na}{ignored. Pairwise complete observations are used automatically.} \item{...}{other arguments} } @@ -22,7 +22,8 @@ Calculates Goodman Kruskal's Tau coefficient for every factor variable pair in a } \details{ The Goodman Kruskal's Tau coefficient is calculated using \code{\link[DescTools]{GoodmanKruskalTau}} -function from the \code{DescTools} package. +function from the \code{DescTools} package. Assumes factor levels are in the given order. +NAs are automatically handled by pairwise omit. } \examples{ pair_gkTau(iris) diff --git a/man/pair_methods.Rd b/man/pair_methods.Rd index d72a4f7..90263a1 100644 --- a/man/pair_methods.Rd +++ b/man/pair_methods.Rd @@ -5,7 +5,7 @@ \alias{pair_methods} \title{Pairwise score functions available in the package} \format{ -An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 14 rows and 7 columns. +An object of class \code{tbl_df} (inherits from \code{tbl}, \code{data.frame}) with 17 rows and 7 columns. } \usage{ pair_methods diff --git a/man/pair_mine.Rd b/man/pair_mine.Rd index c0ba5d6..d8bba5e 100644 --- a/man/pair_mine.Rd +++ b/man/pair_mine.Rd @@ -4,13 +4,12 @@ \alias{pair_mine} \title{MINE family values} \usage{ -pair_mine(d, method = "mic", handle.na = TRUE, ...) +pair_mine(d, method = "MIC", handle.na = TRUE, ...) } \arguments{ \item{d}{A dataframe} -\item{method}{character string for the MINE value to be calculated. Either "mic" (default), "mas", "mev", -"mcn", or "mic-r2"} +\item{method}{character vector for the MINE value to be calculated. Subset of "MIC","MAS","MEV","MCN","MICR2", "GMIC", "TIC"} \item{handle.na}{If TRUE uses pairwise complete observations to calculate score, otherwise NAs not handled.} @@ -28,7 +27,7 @@ The values are calculated using \code{\link[minerva]{mine}} from \code{minerva} } \examples{ pair_mine(iris) - pair_mine(iris, method="mas") + pair_mine(iris, method="MAS") } \references{ Reshef, David N., et al. "Detecting novel associations in large data sets." diff --git a/man/pair_scagnostics.Rd b/man/pair_scagnostics.Rd index 7bbbcc0..dcd1308 100644 --- a/man/pair_scagnostics.Rd +++ b/man/pair_scagnostics.Rd @@ -8,7 +8,7 @@ pair_scagnostics( d, scagnostic = c("Outlying", "Skewed", "Clumpy", "Sparse", "Striated", "Convex", "Skinny", "Stringy", "Monotonic"), - handle.na = T, + handle.na = TRUE, ... ) } diff --git a/man/pair_tau.Rd b/man/pair_tau.Rd deleted file mode 100644 index ccf9ddf..0000000 --- a/man/pair_tau.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/pair_methods.R -\name{pair_tau} -\alias{pair_tau} -\title{Kendall's tau A, B, C and Kendall's W} -\usage{ -pair_tau(d, method = c("B", "A", "C", "W"), ...) -} -\arguments{ -\item{d}{A dataframe} - -\item{method}{A character string for the correlation coefficient to be calculated, one of "B" (default), -"A", "C" or "W". If the value is "all", then all four correlations are calculated.} - -\item{...}{other arguments} -} -\value{ -A tibble of class \code{pairwise} with factor pairs along with one of either Kendall's tau A, B, C or -Kendall's W value, or NULL if there are not at least two factor variables -} -\description{ -Calculates one of either Kendall's tau A, B, C or Kendall's W for every factor variable pair in a dataset. -} -\details{ -The association values Kendall's tau A, B, C or Kendall's W are calculated using \code{\link[DescTools]{KendallTauA}}, -\code{\link[DescTools]{KendallTauB}}, \code{\link[DescTools]{StuartTauC}} or \code{\link[DescTools]{KendallW}} respectively,from the -\code{DescTools} package, and assumes factor levels are in the given order. NAs are automatically handled by pairwise omit. -} -\examples{ - pair_tau(iris) - pair_tau(iris, method="A") - pair_tau(iris, method="C") - pair_tau(iris, method="W") -} diff --git a/man/pair_tauA.Rd b/man/pair_tauA.Rd new file mode 100644 index 0000000..53d4682 --- /dev/null +++ b/man/pair_tauA.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pair_methods.R +\name{pair_tauA} +\alias{pair_tauA} +\title{Kendall's tau A for association between ordinal factors.} +\usage{ +pair_tauA(d, handle.na = TRUE, ...) +} +\arguments{ +\item{d}{A dataframe} + +\item{handle.na}{ignored. Pairwise complete observations are used automatically.} + +\item{...}{other arguments} +} +\value{ +A tibble of class \code{pairwise} with factor pairs, or NULL if there are not at least two factor variables +} +\description{ +Calculates Kendall's tau A for every factor variable pair in a dataset. +} +\details{ +Calculated using \code{\link[DescTools]{KendallTauA}}. Assumes factor levels are in the given order. +NAs are automatically handled by pairwise omit. +} +\examples{ + d <- data.frame(x=rnorm(20), + y=factor(sample(3,20, replace=TRUE)), + z=factor(sample(2,20, replace=TRUE))) + pair_tauA(d) +} diff --git a/man/pair_tauB.Rd b/man/pair_tauB.Rd new file mode 100644 index 0000000..505d98e --- /dev/null +++ b/man/pair_tauB.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pair_methods.R +\name{pair_tauB} +\alias{pair_tauB} +\title{Kendall's tau B for association between ordinal factors.} +\usage{ +pair_tauB(d, handle.na = TRUE, ...) +} +\arguments{ +\item{d}{A dataframe} + +\item{handle.na}{ignored. Pairwise complete observations are used automatically.} + +\item{...}{other arguments} +} +\value{ +A tibble of class \code{pairwise} with factor pairs, or NULL if there are not at least two factor variables +} +\description{ +Calculates Kendall's tau B every factor variable pair in a dataset. +} +\details{ +Calculated using \code{\link[DescTools]{KendallTauB}}. Assumes factor levels are in the given order. +NAs are automatically handled by pairwise omit. +} +\examples{ + d <- data.frame(x=rnorm(20), + y=factor(sample(3,20, replace=TRUE)), + z=factor(sample(2,20, replace=TRUE))) + pair_tauB(d) +} diff --git a/man/pair_tauC.Rd b/man/pair_tauC.Rd new file mode 100644 index 0000000..1df5c34 --- /dev/null +++ b/man/pair_tauC.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pair_methods.R +\name{pair_tauC} +\alias{pair_tauC} +\title{Stuarts's tau C for association between ordinal factors.} +\usage{ +pair_tauC(d, handle.na = TRUE, ...) +} +\arguments{ +\item{d}{A dataframe} + +\item{handle.na}{ignored. Pairwise complete observations are used automatically.} + +\item{...}{other arguments} +} +\value{ +A tibble of class \code{pairwise} with factor pairs, or NULL if there are not at least two factor variables +} +\description{ +Calculates Stuarts's tau C every factor variable pair in a dataset. +} +\details{ +Calculated using \code{\link[DescTools]{StuartTauC}}. Assumes factor levels are in the given order. +NAs are automatically handled by pairwise omit. +} +\examples{ + d <- data.frame(x=rnorm(20), + y=factor(sample(3,20, replace=TRUE)), + z=factor(sample(2,20, replace=TRUE))) + pair_tauC(d) +} diff --git a/man/pair_tauW.Rd b/man/pair_tauW.Rd new file mode 100644 index 0000000..c7240bc --- /dev/null +++ b/man/pair_tauW.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pair_methods.R +\name{pair_tauW} +\alias{pair_tauW} +\title{Kendall's W for association between ordinal factors.} +\usage{ +pair_tauW(d, handle.na = TRUE, ...) +} +\arguments{ +\item{d}{A dataframe} + +\item{handle.na}{ignored. Pairwise complete observations are used automatically.} + +\item{...}{other arguments} +} +\value{ +A tibble of class \code{pairwise} with factor pairs, or NULL if there are not at least two factor variables +} +\description{ +Calculates Kendall's tau W every factor variable pair in a dataset. +} +\details{ +Calculated using \code{\link[DescTools]{KendallW}}. Assumes factor levels are in the given order. +NAs are automatically handled by pairwise omit. +} +\examples{ + d <- data.frame(x=rnorm(20), + y=factor(sample(3,20, replace=TRUE)), + z=factor(sample(2,20, replace=TRUE))) + pair_tauW(d) +} diff --git a/man/pair_uncertainty.Rd b/man/pair_uncertainty.Rd index 26a9332..252a32e 100644 --- a/man/pair_uncertainty.Rd +++ b/man/pair_uncertainty.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/pair_methods.R \name{pair_uncertainty} \alias{pair_uncertainty} -\title{Uncertainty coefficient} +\title{Uncertainty coefficient for association between factors.} \usage{ pair_uncertainty(d, handle.na = TRUE, ...) } \arguments{ \item{d}{A dataframe} -\item{handle.na}{If TRUE uses pairwise complete observations to calculate uncertainty coefficient, otherwise NAs not handled.} +\item{handle.na}{ignored. Pairwise complete observations are used automatically.} \item{...}{other arguments} } diff --git a/man/pairwise_by.Rd b/man/pairwise_by.Rd index b97c0c3..36eb7a8 100644 --- a/man/pairwise_by.Rd +++ b/man/pairwise_by.Rd @@ -11,7 +11,7 @@ pairwise_by(d, by, pair_fun, ungrouped = TRUE) \item{by}{a character string for the name of the conditioning variable.} -\item{pair_fun}{One of the \code{pair_} functions} +\item{pair_fun}{A function returning a \code{pairwise} from a dataset.} \item{ungrouped}{If TRUE calculates the ungrouped score in addition to grouped scores.} } diff --git a/man/pairwise_multi.Rd b/man/pairwise_multi.Rd index d61f312..adc6b66 100644 --- a/man/pairwise_multi.Rd +++ b/man/pairwise_multi.Rd @@ -14,7 +14,7 @@ pairwise_multi( \arguments{ \item{d}{dataframe} -\item{scores}{a vector naming pairwise functions.} +\item{scores}{a vector naming functions returning a \code{pairwise} from a dataset.} \item{handle.na}{If TRUE uses pairwise complete observations to calculate pairwise score, otherwise NAs not handled.} } diff --git a/man/plot_pairwise.Rd b/man/plot_pairwise.Rd index af7cd20..459d9a1 100644 --- a/man/plot_pairwise.Rd +++ b/man/plot_pairwise.Rd @@ -34,7 +34,8 @@ variables are re-ordered to emphasize pairs with maximum score differences. Othe \value{ A \code{girafe} object if interactive==TRUE, otherwise a \code{ggplot2}. -If scores has one value for x,y pair, then a filled circle is drawn with fill representing the score value. If there are multiple values for each x,y pair then the filled circle is split into wedges, with the wedge fill representing the values. If some rows have \code{group=center_level}, then the glyph is drawn as a bullseye. +If scores has one value for x,y pair, then a filled circle is drawn with fill representing the score value. If there are multiple values for each x,y pair then the filled circle is split into wedges, with the wedge fill representing the values. +If some rows have \code{group=center_level}, then the glyph is drawn as a bullseye. } \description{ Plots multiple pairwise variable scores in a matrix layout. diff --git a/tests/testthat/test-pair_methods.R b/tests/testthat/test-pair_methods.R index 6305ca6..04fd7f3 100644 --- a/tests/testthat/test-pair_methods.R +++ b/tests/testthat/test-pair_methods.R @@ -50,7 +50,16 @@ test_that("pair polyserial", { test_that("pair tau", { iris1 <- iris[c(1,2,53,55),] iris1$Sepal.Length <- cut(iris1$Sepal.Length,2) - p <- pair_tau(droplevels(iris1)) + p <- pair_tauB(droplevels(iris1)) + expect_s3_class(p, "pairwise") + expect_identical(dim(p), c(1L,6L)) + p <- pair_tauA(droplevels(iris1)) + expect_s3_class(p, "pairwise") + expect_identical(dim(p), c(1L,6L)) + p <- pair_tauC(droplevels(iris1)) + expect_s3_class(p, "pairwise") + expect_identical(dim(p), c(1L,6L)) + p <- pair_tauW(droplevels(iris1)) expect_s3_class(p, "pairwise") expect_identical(dim(p), c(1L,6L)) }) diff --git a/vignettes/calc_pairwise.Rmd b/vignettes/calc_pairwise.Rmd index 9aa2f2c..91db389 100644 --- a/vignettes/calc_pairwise.Rmd +++ b/vignettes/calc_pairwise.Rmd @@ -94,7 +94,7 @@ dcor_nmi <- pairwise_multi(penguins, c("pair_dcor", "pair_nmi")) ## Calculating grouped measures -For each of the `pair_` functions, they can be wrapped using `pairwise_by` to build a score calculation for each level of a grouping variable. Of course, grouped scores could be calculated using `dplyr` machinery, but it is a bit more work. +For each of the pairwise calculation functions, they can be wrapped using `pairwise_by` to build a score calculation for each level of a grouping variable. Of course, grouped scores could be calculated using `dplyr` machinery, but it is a bit more work. ```{r} pairwise_by(penguins, by="species", pair_cor) diff --git a/vignettes/integrating.Rmd b/vignettes/integrating.Rmd index 258d101..31a7e2b 100644 --- a/vignettes/integrating.Rmd +++ b/vignettes/integrating.Rmd @@ -90,4 +90,16 @@ sc_multi<- bind_rows( as.pairwise(correlation::correlation(peng, method = "blomqvist")), as.pairwise(correlation::correlation(peng, method = "biweight"))) plot(sc_multi) -``` \ No newline at end of file +``` + +## Using other visualisations with `bullseye` results. + +In this example we compare ace and nmi measures for the penguin data + +```{r} +pm <- pairwise_multi(peng) +tidyr::pivot_wider(pm, names_from=score, values_from = value) |> + ggplot(aes(x=nmi, y=ace))+ geom_point() + +``` + diff --git a/vignettes/vis_pairwise.Rmd b/vignettes/vis_pairwise.Rmd index 6af9340..07e8159 100644 --- a/vignettes/vis_pairwise.Rmd +++ b/vignettes/vis_pairwise.Rmd @@ -65,7 +65,6 @@ Alternatively, if you wish to show different association measures for correlatio the result of `pairwise_scores`: ```{r} -scores <- pairwise_scores(peng) plot(pairwise_scores(peng), interactive=TRUE) ``` @@ -97,7 +96,7 @@ plot(scores, var_order="seriate_max_diff", interactive=TRUE) Pairs of numeric variables exhibit Simpsons paradox if the ungrouped correlation is negative and the grouped corelations are positive (or vice-versa). This is present for the pairs (body_mass_mm, bill_depth_mm) and (bill_depth_mm, bill_length_mm). The island variable is also associated with the penguin dimension variables. -However, this is mostly because two of the species (Gentoo and Chinstrap) are located on one island only. For these species, the score values for island and the the other variables is NA, shown in grey. +However, this is mostly because two of the species (Gentoo and Chinstrap) are located on one island only. For these species, the score values for island and the other variables is NA, shown in grey. @@ -197,11 +196,11 @@ a <- ace_cor(acs12$income, acs12$employment) plot(a$x, a$tx) ``` -Similarly `age:income` has a high ace score, and a plot of these two variables shows income goes up with aage until about age 40 and then drops off. +Similarly `age:income` has a high ace score, and a plot of these two variables shows income goes up with age until about age 40 and then drops off. -Next, we calculate scores by race and filter those x,y pairs with high differences: +Next, we calculate scores by race and filter those x,y pairs with high values and high differences: ```{r} group_scores <- pairwise_scores(acs12, by = "race") @@ -223,17 +222,17 @@ mutate(group_scores, valrange = rng(value),valmax = max(abs(value)), .by=c(x,y)) Asians have much higher association than other groups for many of the variables. Employed Asians report much higher hours worked: -```{r} +```{r, fig.width=8} ggplot(data=acs12, aes(x=employment, y=hrs_work))+ geom_boxplot()+ - facet_grid(rows=vars(race)) # NA col is because for employed=NA, there hrs_work also NA + facet_grid(cols=vars(race)) +scale_x_discrete(na.translate = FALSE) ``` For Asians, there is a big difference in travel time to work for genders compared to other races. ```{r, fig.height=3} ggplot(data=acs12, aes(x=gender, y=time_to_work))+ geom_boxplot()+ - facet_grid(cols=vars(race)) + facet_grid(cols=vars(race)) ``` For Asians, there is a big difference in income across genders compared to other races.