diff --git a/DESCRIPTION b/DESCRIPTION index bee83a6..9cef286 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rPHG2 Title: R interface to PHGv2 -Version: 0.6 +Version: 0.6.1 Authors@R: person( given = "Brandon", @@ -41,6 +41,7 @@ Imports: patchwork, rJava, rlang, + S4Vectors, scales, tibble, tools, diff --git a/NEWS.md b/NEWS.md index 37fbe19..6a50813 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +## rPHG2 0.7 +* Added new parameter to `plotHaploCounts()`: + + `geom` + + allows user to specify a given geometry (line, bar, or point) + + ## rPHG2 0.6 * Added new parameters to `plogtGvcf()`: + `mData` and `mVar` diff --git a/R/class_phg_dataset.R b/R/class_phg_dataset.R index 632719b..6651998 100644 --- a/R/class_phg_dataset.R +++ b/R/class_phg_dataset.R @@ -228,75 +228,42 @@ setMethod( ## ---- #' @param gr -#' A \code{GenomicRanges} object for subsetting based on chromosome ID and -#' start/stop positions (bp). If \code{NULL}, all haplotype counts will be -#' plotted similar to a Manhattan plot. Defaults to \code{NULL}. +#' A \code{GRanges} object specifying a genomic range for filtering the +#' haplotype data. If \code{NULL}, the function will plot the data across the +#' entire dataset. +#' @param geom +#' A character string specifying the type of geometric representation for the +#' plot. Accepted values are: +#' \itemize{ +#' \item \code{"l"} for lines (default), +#' \item \code{"b"} for bars, +#' \item \code{"p"} for points. +#' } +#' If an invalid value is provided, the function will raise an error. +#' +#' @details +#' When no genomic range is provided (i.e., \code{gr = NULL}), the function +#' plots the number of unique haplotypes across the entire reference genome or +#' dataset. This will default to point geometry regardless of the value +#' provided within the \code{geom} parameter. If a genomic range is provided, +#' it filters the data based on overlaps between the reference ranges in the +#' dataset and the query range. In both cases, the resulting plot uses +#' \code{ggplot2} for visualization, and different geometries can be selected +#' via the \code{geom} parameter. +#' +#' @return A \code{ggplot} object visualizing the haplotype counts. When +#' \code{gr} is \code{NULL}, the plot shows the number of unique haplotypes +#' across reference positions. When \code{gr} is provided, the plot is filtered +#' to display haplotype counts within the specified range. +#' #' #' @rdname plotHaploCounts #' @export setMethod( f = "plotHaploCounts", signature = signature(object = "PHGDataSet"), - definition = function(object, gr = NULL) { - nHaplo <- numberOfHaplotypes(object, byRefRange = TRUE) - p <- NULL - if (is.null(gr)) { - p <- ggplot2::ggplot(nHaplo) + - ggplot2::aes(x = !!rlang::sym("start"), y = !!rlang::sym("n_haplo")) + - ggplot2::geom_point() + - ggplot2::scale_y_continuous( - breaks = seq_len(max(nHaplo$n_haplo)), - limits = c(1, max(nHaplo$n_haplo)) - ) + - ggplot2::scale_x_continuous( - labels = scales::label_number( - scale_cut = scales::cut_short_scale() - ) - ) + - ggplot2::xlab("Position (bp)") + - ggplot2::ylab("Number of unique haplotypes") + - ggplot2::facet_wrap(~ seqnames, scales = "free_x") + - ggplot2::theme_bw() - } else { - refRanges <- readRefRanges(object) - if (!is(gr, "GRanges")) { - rlang::abort("'gr' object is not of type 'GRanges'") - } - - gr$sub_id <- paste0("QR ", GenomeInfoDb::seqnames(gr), ":", IRanges::ranges(gr)) - - # Find overlaps - overlaps <- suppressWarnings(GenomicRanges::findOverlaps(refRanges, gr)) - if (length(overlaps) == 0) { - rlang::abort("No reference ranges identified with given query") - } - - # Filter based on overlaps - filtGr <- refRanges[S4Vectors::queryHits(overlaps)] - - # Add sub_id metadata - filtGr$sub_id <- gr$sub_id[S4Vectors::subjectHits(overlaps)] - filtGrDf <- as.data.frame(filtGr) - filtGrDf <- merge(x = filtGrDf, y = nHaplo) - - p <- ggplot2::ggplot(filtGrDf) + - ggplot2::aes(x = !!rlang::sym("rr_id"), y = !!rlang::sym("n_haplo")) + - ggplot2::geom_bar(stat = "identity") + - ggplot2::scale_y_continuous(breaks = seq(0, max(nHaplo$n_haplo), by = 1)) + - ggplot2::xlab("Reference range ID") + - ggplot2::ylab("Number of unique haplotypes") + - ggplot2::facet_grid(~ sub_id, scales = "free_x", space = "free") + - ggplot2::theme_bw() + - ggplot2::theme( - axis.text.x = ggplot2::element_text( - angle = 90, - vjust = 0.5, - hjust = 1 - ) - ) - } - - return(p) + definition = function(object, gr = NULL, geom = "l") { + plotHaploFromPhgDataSet(object, gr, geom) } ) diff --git a/R/vis_plot_dot.R b/R/vis_plot_dot.R index 17617b1..4194b76 100644 --- a/R/vis_plot_dot.R +++ b/R/vis_plot_dot.R @@ -1,3 +1,4 @@ +## ---- # Plot a Dot Plot from AnchorWave metrics # # @param df diff --git a/R/vis_plot_haplo.R b/R/vis_plot_haplo.R new file mode 100644 index 0000000..80c7e65 --- /dev/null +++ b/R/vis_plot_haplo.R @@ -0,0 +1,131 @@ +## ---- +# Helper function to construct common ggplot theme +custHapTheme <- function() { + ggplot2::theme_bw() + + ggplot2::theme( + axis.text.x = ggplot2::element_text( + angle = 90, + vjust = 0.5, + hjust = 1 + ) + ) +} + + +## ---- +# Helper function to construct the geometry based on the 'geom' parameter +selectGeom <- function(geom) { + switch(geom, + "l" = ggplot2::geom_line(ggplot2::aes(group = 1)), + "b" = ggplot2::geom_bar(stat = "identity"), + "p" = ggplot2::geom_point() + ) +} + + +## ---- +# Plot Haplotype Data from PHG Dataset +# +# @description +# This function creates a ggplot visualization of haplotype data from a PHG +# (Practical Haplotype Graph) dataset, allowing for multiple geometric +# representations (lines, bars, points). The plot can either cover the entire +# dataset or a specific genomic range provided by the user. +# +# @param object +# A PHG dataset object containing haplotype information. This is typically the +# output of a function like `loadPhgDataSet()` or similar. +# @param gr +# A \code{GRanges} object specifying a genomic range for filtering the +# haplotype data. If \code{NULL}, the function will plot the data across the +# entire dataset. +# @param geom +# A character string specifying the type of geometric representation for the +# plot. Accepted values are: +# \itemize{ +# \item \code{"l"} for lines (default), +# \item \code{"b"} for bars, +# \item \code{"p"} for points. +# } +# If an invalid value is provided, the function will raise an error. +# +# @details +# When no genomic range is provided (i.e., \code{gr = NULL}), the function +# plots the number of unique haplotypes across the entire reference genome or +# dataset. If a genomic range is provided, it filters the data based on +# overlaps between the reference ranges in the dataset and the query range. In +# both cases, the resulting plot uses \code{ggplot2} for visualization, and +# different geometries can be selected via the \code{geom} parameter. +# +# @return A \code{ggplot} object visualizing the haplotype counts. When +# \code{gr} is \code{NULL}, the plot shows the number of unique haplotypes +# across reference positions. When \code{gr} is provided, the plot is filtered +# to display haplotype counts within the specified range. +plotHaploFromPhgDataSet <- function(object, gr = NULL, geom = "l") { + # Validate 'geom' parameter + if (!geom %in% c("l", "b", "p")) { + rlang::abort("'geom' must be one of 'l' (line), 'b' (bar), or 'p' (point).") + } + + nHaplo <- numberOfHaplotypes(object, byRefRange = TRUE) + + # Plot when 'gr' is NULL + if (is.null(gr)) { + return( + ggplot2::ggplot(nHaplo) + + ggplot2::aes( + x = !!rlang::sym("start"), + y = !!rlang::sym("n_haplo"), + alpha = 0.01 + ) + + selectGeom("p") + + ggplot2::scale_y_continuous( + breaks = seq_len(max(nHaplo$n_haplo)), + limits = c(1, max(nHaplo$n_haplo)) + ) + + ggplot2::scale_x_continuous( + labels = scales::label_number( + scale_cut = scales::cut_short_scale() + ) + ) + + ggplot2::xlab("Position (bp)") + + ggplot2::ylab("Number of unique haplotypes") + + ggplot2::facet_wrap(~ seqnames, scales = "free_x") + + ggplot2::guides(alpha = "none") + + custHapTheme() + ) + } + + # Validate 'gr' type + if (!is(gr, "GRanges")) { + rlang::abort("'gr' object is not of type 'GRanges'") + } + + # Get ref ranges GRanges object and add 'sub_id" parameters + refRanges <- readRefRanges(object) + gr$sub_id <- paste0("QR ", GenomeInfoDb::seqnames(gr), ":", IRanges::ranges(gr)) + + # Find overlaps + overlaps <- suppressWarnings(GenomicRanges::findOverlaps(refRanges, gr)) + + if (length(overlaps) == 0) { + rlang::abort("No reference ranges identified with given query") + } + + # Filter and merge data + filtGr <- refRanges[S4Vectors::queryHits(overlaps)] + filtGr$sub_id <- gr$sub_id[S4Vectors::subjectHits(overlaps)] + filtGrDf <- merge(as.data.frame(filtGr), nHaplo) + + # Plot with filtered data + ggplot2::ggplot(filtGrDf) + + ggplot2::aes(x = !!rlang::sym("rr_id"), y = !!rlang::sym("n_haplo")) + + selectGeom(geom) + + ggplot2::scale_y_continuous(breaks = seq(0, max(nHaplo$n_haplo), by = 1)) + + ggplot2::xlab("Reference range ID") + + ggplot2::ylab("Number of unique haplotypes") + + ggplot2::facet_grid(~ sub_id, scales = "free_x", space = "free") + + custHapTheme() +} + + diff --git a/man/plotHaploCounts.Rd b/man/plotHaploCounts.Rd index 32bffeb..e8c49bb 100644 --- a/man/plotHaploCounts.Rd +++ b/man/plotHaploCounts.Rd @@ -7,17 +7,42 @@ \usage{ plotHaploCounts(object, ...) -\S4method{plotHaploCounts}{PHGDataSet}(object, gr = NULL) +\S4method{plotHaploCounts}{PHGDataSet}(object, gr = NULL, geom = "l") } \arguments{ \item{object}{a \code{\linkS4class{PHGDataSet}} object} \item{...}{Additional arguments, for use in specific methods} -\item{gr}{A \code{GenomicRanges} object for subsetting based on chromosome ID and -start/stop positions (bp). If \code{NULL}, all haplotype counts will be -plotted similar to a Manhattan plot. Defaults to \code{NULL}.} +\item{gr}{A \code{GRanges} object specifying a genomic range for filtering the +haplotype data. If \code{NULL}, the function will plot the data across the +entire dataset.} + +\item{geom}{A character string specifying the type of geometric representation for the +plot. Accepted values are: +\itemize{ +\item \code{"l"} for lines (default), +\item \code{"b"} for bars, +\item \code{"p"} for points. +} +If an invalid value is provided, the function will raise an error.} +} +\value{ +A \code{ggplot} object visualizing the haplotype counts. When +\code{gr} is \code{NULL}, the plot shows the number of unique haplotypes +across reference positions. When \code{gr} is provided, the plot is filtered +to display haplotype counts within the specified range. } \description{ Plots the counts of unique haplotype IDs found in each reference range. } +\details{ +When no genomic range is provided (i.e., \code{gr = NULL}), the function +plots the number of unique haplotypes across the entire reference genome or +dataset. This will default to point geometry regardless of the value +provided within the \code{geom} parameter. If a genomic range is provided, +it filters the data based on overlaps between the reference ranges in the +dataset and the query range. In both cases, the resulting plot uses +\code{ggplot2} for visualization, and different geometries can be selected +via the \code{geom} parameter. +} diff --git a/tests/testthat/test_class_phg_dataset.R b/tests/testthat/test_class_phg_dataset.R index 691651e..cbf586c 100644 --- a/tests/testthat/test_class_phg_dataset.R +++ b/tests/testthat/test_class_phg_dataset.R @@ -69,9 +69,31 @@ test_that("PHGDataSet class construction tests", { ### expect 4 observations to overlap with successful query expect_equal(length(testPlot$data$rr_id), 4) + ## Geometry ('geom = ') tests + containsGeom <- function(plot, geomType) { + any( + sapply(plot$layers, function(layer) { + inherits(layer$geom, geomType) + }) + ) + } + expect_true(containsGeom(plotHaploCounts(pds, geom = "p"), "GeomPoint")) + expect_true(containsGeom(plotHaploCounts(pds, geom = "l"), "GeomPoint")) + expect_true(containsGeom(plotHaploCounts(pds, geom = "b"), "GeomPoint")) + expect_true(containsGeom(plotHaploCounts(pds, gr = grQuery, geom = "l"), "GeomLine")) + expect_false(containsGeom(plotHaploCounts(pds, gr = grQuery, geom = "l"), "GeomBar")) + expect_false(containsGeom(plotHaploCounts(pds, gr = grQuery, geom = "l"), "GeomPoint")) + expect_true(containsGeom(plotHaploCounts(pds, gr = grQuery, geom = "b"), "GeomBar")) + expect_false(containsGeom(plotHaploCounts(pds, gr = grQuery, geom = "b"), "GeomLine")) + expect_false(containsGeom(plotHaploCounts(pds, gr = grQuery, geom = "b"), "GeomPoint")) + expect_true(containsGeom(plotHaploCounts(pds, gr = grQuery, geom = "p"), "GeomPoint")) + expect_false(containsGeom(plotHaploCounts(pds, gr = grQuery, geom = "p"), "GeomBar")) + expect_false(containsGeom(plotHaploCounts(pds, gr = grQuery, geom = "p"), "GeomLine")) + ## General errors expect_error(plotHaploCounts(pds, gr = mtcars)) expect_error(plotHaploCounts(pds, gr = grQueryError)) + expect_error(plotHaploCounts(pds, geom = "x")) }) diff --git a/tests/testthat/test_utils_metrics.R b/tests/testthat/test_utils_metrics.R index 6593ec9..26f65c5 100644 --- a/tests/testthat/test_utils_metrics.R +++ b/tests/testthat/test_utils_metrics.R @@ -2,10 +2,13 @@ test_that("General metrics utility tests", { # validateHeaders() x <- "some\tline\tto\ttest" expect_true( - rPHG2:::validateHeaders(x, c("some", "line", "to", "test")) + validateHeaders(x, c("some", "line", "to", "test")) ) expect_false( - rPHG2:::validateHeaders(NULL, c("some", "line", "to", "test")) + validateHeaders(NULL, c("some", "line", "to", "test")) + ) + expect_false( + validateHeaders(NULL, c("some", "line", "to", "ttest")) ) }) diff --git a/vignettes/metrics.Rmd b/vignettes/metrics.Rmd index 95adb0f..be5052d 100644 --- a/vignettes/metrics.Rmd +++ b/vignettes/metrics.Rmd @@ -397,9 +397,9 @@ metData |> plotGvcf(f = CORE ~ ALL, tag = "i") #### Adding metadata -If you want to color bars based additional categorical besides sample ID, you -can pass a `data.frame` object to the parameter `mData`. This will need the -following prerequisites: +If you want to color bars based on additional categorical data besides sample +ID, you can pass a `data.frame` object to the parameter `mData`. This will need +the following prerequisites: 1. A column named either `sample`, `taxa`, **or** `line` 2. At least one more column containing a new categorical variable where each @@ -411,7 +411,7 @@ In the prior examples, I have 3 samples in my gVCF metrics data: 2. `Xb02` 3. `Xb03` -I will make a `data.frame` object with following columns, `sample` and +I will make a `data.frame` object with following columns: `sample`, `technology`, and `treatment`: ```{r eval=TRUE, echo=TRUE} @@ -444,14 +444,3 @@ metData |> plotGvcf( ``` - - - - - - - - - - - diff --git a/vignettes/rPHG2.Rmd b/vignettes/rPHG2.Rmd index 68a1fc5..4e082c0 100644 --- a/vignettes/rPHG2.Rmd +++ b/vignettes/rPHG2.Rmd @@ -425,7 +425,7 @@ query <- GRanges( phgDs |> plotHaploCounts(gr = query) ``` -We can also query mutliple regions of interest simultaneously by adding more +We can also query mutltiple regions of interest simultaneously by adding more observations to the `query` object: ```{r, eval=TRUE, echo=TRUE} @@ -439,6 +439,19 @@ query <- GRanges( phgDs |> plotHaploCounts(gr = query) ``` +Finally, we provide 3 separate "geometry" options for plotting using the `geom` +parameter: +* `"l"` - line plotting (e.g., `geom_line()`) (_default_) +* `"b"` - bar plotting (e.g., `geom_bar()`) +* `"p"` - point plotting (e.g., `geom_point()`) + +For example, if we want to switch this from a line plot to a bar plot, we can +specify `geom = "b"`: + +```{r, eval=TRUE, echo=TRUE} +phgDs |> plotHaploCounts(gr = query, geom = "b") +``` + ### Visualize distributions of unique haplotype IDs To get a more "global" estimate of unique haplotype ID distribution across the