Skip to content

Commit

Permalink
refactor ezvolcano
Browse files Browse the repository at this point in the history
Had overlapping labels in metabolomics volcano, so refactored to use df columns instead of indices and use ggrepel for plotting all (except rasterized) points & labels, to minimize overlaps
  • Loading branch information
jdreyf committed Dec 18, 2023
1 parent 3a4ff11 commit b3e30fe
Showing 18 changed files with 3,460 additions and 3,885 deletions.
153 changes: 59 additions & 94 deletions R/ezvolcano.R
Original file line number Diff line number Diff line change
@@ -21,14 +21,15 @@
#' @param x.bound x-axis limits are set to \code{c(-x.bound, x.bound)}. If \code{NULL, x.bound=max(abs(tab[,lfc.col]))}.
#' @param y.bound y-axis limits are set to \code{c(0, y.bound)}. If \code{NULL, y.bound=max(tab[,'nlg10sig'])}.
#' @param type.sig Type of significance y-axis should use, either "p" or "FDR".
#' @param cut.color Color of points that meet both \code{cut.lfc} and \code{cut.sig}. If \code{NULL}, cutoffs are ignored.
#' @param cut.color Color of points that meet both \code{cut.lfc} and \code{cut.sig} but are not labeled.
#' @param cut.lfc Points need to have \code{|logFC| >= cut.lfc} to have \code{cut.color}.
#' @param cut.sig Points need to have significance \code{tab[,sig.col] <= cut.sig} to have \code{cut.color}.
#' @param lines.sig Numeric vector of values of \code{sig.type} at which to draw lines. For example, if
#' \code{type.sig="p"}, you may want to set \code{lines.sig = 0.05}, which will draw a line at \code{y = -log10(0.05)}.
#' @param raster Rasterize points using \code{ggrastr} so plot is lighter.
#' @param sep Separator string between contrast names and suffix such as \code{logFC}.
#' @param na.lab Character vector of labels in \code{lab.col} to treat as missing, in addition to \code{NA}.
#' @param seed Numeric seed for reproducibility since \code{ggrepel} uses a random algorithm.
#' @inheritParams ezheat
#' @inheritParams ezvenn
#' @details If \code{ntop.sig>0} or \code{ntop.lfc>0}, then \code{lab.col} must be in \code{colnames(tab)}.
@@ -38,26 +39,18 @@
#' @export

ezvolcano <- function(tab, lfc.col=NA, sig.col=NA, lab.col='Gene.Symbol', ntop.sig=0, ntop.lfc=0, comparison=NULL, alpha=0.4,
name='volcano', ann.rnames=NULL, up.ann.color='black', down.ann.color='black', shape = 16,
x.bound=NULL, y.bound=NULL, type.sig=c('p', 'FDR'), cut.color=NULL, cut.lfc=1, cut.sig=0.05, lines.sig=NA,
raster = FALSE, sep='.', na.lab=c('---', ''), plot=TRUE){
name='volcano', ann.rnames=NULL, up.ann.color='black', down.ann.color='black', shape = 19,
x.bound=NULL, y.bound=NULL, type.sig=c('p', 'FDR'), cut.color="black", cut.lfc=1, cut.sig=0.05, lines.sig=NA,
raster = FALSE, sep='.', na.lab=c('---', ''), seed = 0, plot=TRUE){
set.seed(seed = seed) # ggrepel is random
type.sig <- match.arg(type.sig)
# can't annot if no lab.col
if (is.null(lab.col)){
if (ntop.lfc > 0){
ntop.lfc <- 0
warning("ntop.lfc > 0 but lab.col is null.")
}
if (ntop.sig > 0){
ntop.sig <- 0
warning("ntop.sig > 0 but lab.col is null.")
}
if (!is.null(ann.rnames)){
ann.rnames <- NULL
warning("ann.rnames is not null, but lab.col is null.")
}
}# end if lab.col null
# if null, need to halt and return FALSE
have.labs <- !(is.null(lab.col) || !(lab.col %in% colnames(tab)))
if (!have.labs & (ntop.lfc > 0 | ntop.sig > 0 | !is.null(ann.rnames))){
stop("ntop.lfc > 0 | ntop.sig > 0 | !is.null(ann.rnames) but lab.col is null or not in the column names of tab.")
}

type.sig <- match.arg(type.sig)
if (type.sig=="p"){
y.lab <- expression("-"*log[10]~p*"-"*value)
} else {
@@ -75,102 +68,74 @@ ezvolcano <- function(tab, lfc.col=NA, sig.col=NA, lab.col='Gene.Symbol', ntop.s
if (!is.na(name)) name <- paste(comparison, name, sep='_')
}

stopifnot((ntop.sig==0 & ntop.lfc==0) | lab.col %in% colnames(tab), ntop.sig==as.integer(ntop.sig),
ntop.lfc==as.integer(ntop.lfc), is.null(ann.rnames)|ann.rnames %in% rownames(tab),
lfc.col %in% colnames(tab), sig.col %in% colnames(tab), any(tab[,lfc.col]<0), any(tab[,lfc.col]>=0),
length(x.bound)<=1, length(y.bound)<=1, all(is.na(lines.sig)) || (is.numeric(lines.sig) && length(lines.sig)<=5),
is.logical(plot))
stopifnot(ntop.sig==as.integer(ntop.sig), lfc.col %in% colnames(tab), sig.col %in% colnames(tab), any(tab[,lfc.col]<0), any(tab[,lfc.col]>=0),
length(x.bound)<=1, length(y.bound)<=1, all(is.na(lines.sig)) || (is.numeric(lines.sig) && length(lines.sig)<=5), is.logical(plot))

# There is no need for prefixing !!! or !! or {{ with rlang::. These operators are not function calls, they are specially interpreted by rlang in data-masked arguments.
tab <- data.frame(tab, nlg10sig = -log10(tab[,sig.col]), check.names = FALSE) |>
dplyr::mutate(rnm = rownames(tab), drctn.color = ifelse(!!rlang::sym(lfc.col) > 0, up.ann.color, down.ann.color), annot=FALSE,
na.lab = dplyr::case_when(!have.labs ~ TRUE,
is.na(!!rlang::sym(lab.col)) | !!rlang::sym(lab.col) %in% na.lab ~ TRUE,
.default = FALSE))

rnms.top.lfc <- rnms.top.sig <- NULL
if (ntop.lfc > 0) rnms.top.lfc <- tab |> dplyr::arrange(-abs(!!rlang::sym(lfc.col))) |> dplyr::slice(1:ntop.lfc) |> dplyr::pull(rnm)
if (ntop.sig > 0) rnms.top.sig <- tab |> dplyr::arrange(!!rlang::sym(sig.col)) |> dplyr::slice(1:ntop.sig) |> dplyr::pull(rnm)

# ?geom_text_repel has example to hide many labels but repel from all points where the labels they don't want are ""
# default shape=19; point size = 1.5
tab <- tab |>
dplyr::mutate(map.grp = dplyr::case_when(rnm %in% c(ann.rnames, rnms.top.lfc, rnms.top.sig) ~ "annot",
abs(!!rlang::sym(lfc.col)) > cut.lfc & !!rlang::sym(sig.col) <= cut.sig ~ "cut",
.default = "rest"),
color.point = dplyr::case_match(map.grp, "annot" ~ drctn.color, "cut" ~ cut.color, .default = "black"),
label.point = dplyr::case_when(map.grp == "annot" & !na.lab ~ !!rlang::sym(lab.col), .default = ""),
alpha.point = dplyr::case_when(label.point == "" ~ alpha, .default = 1),
size.point = dplyr::case_when(map.grp == "annot" ~ 2, .default = 1.5),
shape.point = dplyr::case_when(map.grp == "annot" ~ 19, .default = shape))

tab <- data.frame(tab, nlg10sig = -log10(tab[,sig.col]), check.names = FALSE)
# want symmetric x-axis
if (is.null(x.bound)) x.bound <- max(abs(tab[,lfc.col]))
if (is.null(y.bound)) y.bound <- max(tab[,'nlg10sig'])

## need to order tab st annotated points are plotted last, so they are seen even if in dense areas
## however, not concerned about cut points, since they are in outer region
## https://stackoverflow.com/questions/15706281/controlling-order-of-points-in-ggplot2-in-r

# ann.rnames to plot with symbol
ind.annot <- NULL
if (!is.null(ann.rnames)){
ind.ann.rnames <- which(rownames(tab) %in% ann.rnames)
tab <- tab[c(setdiff(1:nrow(tab), ind.ann.rnames), ind.ann.rnames),, drop=FALSE]
ind.annot <- (nrow(tab) - length(ind.ann.rnames) + 1):nrow(tab)
}
# construct ggplot object
vol <- ggplot2::ggplot(data=tab, mapping=ggplot2::aes(x=!!rlang::sym(lfc.col), y = nlg10sig, color = color.point, alpha=alpha.point, shape=shape.point)) +
ggplot2::theme_bw() + ggplot2::theme(axis.text=ggplot2::element_text(size=12, face="bold")) + ggplot2::xlab(expression(log[2]~fold~change)) + ggplot2::ylab(y.lab) +
ggplot2::xlim(c(-x.bound, x.bound)) + ggplot2::ylim(c(0, y.bound))

# ntop indices to plot with symbol
if (ntop.sig > 0 | ntop.lfc > 0){
na.lab.ind <- which(is.na(tab[,lab.col]) | tab[,lab.col] %in% na.lab)
if (ntop.lfc > 0) top.lfc.ind <- order(-abs(tab[,lfc.col]))[1:ntop.lfc] else top.lfc.ind <- NULL
if (ntop.sig > 0) top.sig.ind <- order(tab[,sig.col])[1:ntop.sig] else top.sig.ind <- NULL
ind.ntop <- setdiff(union(top.sig.ind, top.lfc.ind), na.lab.ind)
ind.annot <- union(ind.annot, ind.ntop)
if (all(!is.na(lines.sig))){
# y is already -log10(sig)
# linetype values that we want = 2-6
# 0 = blank, 1 = solid, 2 = dashed, 3 = dotted, 4 = dotdash, 5 = longdash, 6 = twodash
lty.v <- c("dashed", "dotted", "dotdash", "longdash", "twodash")
lty.leg <- lty.v[order(as.character(lines.sig))]
names(lty.leg) <- sort(as.character(lines.sig))
vol <- vol + ggplot2::geom_hline(data=data.frame(lines.sig),
mapping = ggplot2::aes(yintercept = -log10(lines.sig), linetype=as.character(lines.sig)),
show.legend = TRUE) + ggplot2::scale_linetype_manual(values=lty.leg) +
ggplot2::guides(linetype=ggplot2::guide_legend(title=type.sig))
}

# construct ggplot object
vol <- ggplot2::ggplot(data=tab, mapping=ggplot2::aes_string(x=lfc.col, y='nlg10sig')) + ggplot2::theme_bw() +
ggplot2::theme(axis.text=ggplot2::element_text(size=12, face="bold")) + ggplot2::xlab(expression(log[2]~fold~change)) +
ggplot2::xlim(c(-x.bound, x.bound)) + ggplot2::ylim(c(0, y.bound)) + ggplot2::ylab(y.lab)

if (!is.null(comparison)){
first.grp <- unlist(strsplit(x=gsub("_", "", comparison), split = "vs|VS|Vs|In|IN|in|OF|Of|of"))[1]
# label left & right
# label left & right sides
vol <- vol + ggplot2::ggtitle(comparison) +
ggplot2::geom_text(mapping=ggplot2::aes(x=2*x.bound/3, y = -Inf, label = paste0("Up in ", first.grp)), color="darkgrey",
vjust = -0.5, show.legend=FALSE) +
ggplot2::geom_text(mapping=ggplot2::aes(x=-2*x.bound/3, y = -Inf, label = paste0("Down in ", first.grp)), color="darkgrey",
vjust = -0.5, show.legend=FALSE)
}

# cut points
if (!is.null(cut.color)){
ind.cut <- which(abs(tab[,lfc.col]) > cut.lfc & tab[,sig.col] <= cut.sig)
ind.cut <- setdiff(ind.cut, ind.annot)
if (length(ind.cut) > 0){
vol <- vol + ggplot2::geom_point(data=tab[ind.cut,], alpha=alpha, size=2, color = cut.color, shape=shape)
}
} else {
ind.cut <- NULL
}

# plot rest
ind.rest <- setdiff(1:nrow(tab), union(ind.annot, ind.cut))
# finalize plot
if (raster){
vol <- vol + ggrastr::rasterize(ggplot2::geom_point(data=tab[ind.rest,], alpha=alpha, size=2, shape=shape))
vol <- vol + ggrastr::rasterize(ggplot2::geom_point(data=tab |> dplyr::filter(map.grp == "rest"), size=2)) +
ggplot2::geom_point(data = tab |> dplyr::filter(map.grp != "rest"), mapping = ggplot2::aes(size = size.point), show.legend = FALSE)
} else {
vol <- vol + ggplot2::geom_point(data=tab[ind.rest,], alpha=alpha, size=2, shape=shape)
}

# plot annotated with color (do last, so labels on top of points)
if (!is.null(ind.annot)){
ind.annot.up <- ind.annot[which(tab[ind.annot, lfc.col] >= 0)]
# xlim=c(0, NA) allows labels on RHS until end of x-axis
if (length(ind.annot.up) > 0){
vol <- vol + ggplot2::geom_point(data=tab[ind.annot.up,], size=2, color = up.ann.color) +
ggrepel::geom_text_repel(data=tab[ind.annot.up,], mapping=ggplot2::aes_string(x=lfc.col, y='nlg10sig', label=lab.col),
size=3, color = up.ann.color, xlim=c(0, NA))
}

ind.annot.down <- ind.annot[which(tab[ind.annot, lfc.col] < 0)]
if (length(ind.annot.down) > 0){
vol <- vol + ggplot2::geom_point(data=tab[ind.annot.down,], size=2, color = down.ann.color) +
ggrepel::geom_text_repel(data=tab[ind.annot.down,], mapping=ggplot2::aes_string(x=lfc.col, y='nlg10sig', label=lab.col),
size=3, color = down.ann.color, xlim=c(NA, 0))
}
vol <- vol + ggplot2::geom_point(mapping = ggplot2::aes(size = size.point), show.legend = FALSE)
}

if (all(!is.na(lines.sig))){
# y is already -log10(sig)
# linetype values that we want = 2-6
# 0 = blank, 1 = solid, 2 = dashed, 3 = dotted, 4 = dotdash, 5 = longdash, 6 = twodash
lty.v <- c("dashed", "dotted", "dotdash", "longdash", "twodash")
lty.leg <- lty.v[order(as.character(lines.sig))]
names(lty.leg) <- sort(as.character(lines.sig))
vol <- vol + ggplot2::geom_hline(data=data.frame(lines.sig),
mapping = ggplot2::aes(yintercept = -log10(lines.sig), linetype=as.character(lines.sig)),
show.legend = TRUE) + ggplot2::scale_linetype_manual(values=lty.leg) +
ggplot2::guides(linetype=ggplot2::guide_legend(title=type.sig))
}
vol <- vol + ggplot2::scale_color_identity() + ggplot2::scale_alpha_identity() + ggplot2::scale_shape_identity() + ggplot2::scale_size_identity() +
ggrepel::geom_text_repel(mapping=ggplot2::aes(label=label.point), show.legend = FALSE, size=3)

if (plot){
if (!is.na(name)) ggplot2::ggsave(filename=paste0(name, ".png"), plot=vol) else graphics::plot(vol)
12 changes: 6 additions & 6 deletions R/multi_volcano.R
Original file line number Diff line number Diff line change
@@ -12,10 +12,10 @@
#' @seealso ezvolcano
#' @export

multi_volcano <- function(tab, lab.col=NULL, ntop.sig=0, ntop.lfc=0, alpha=0.4, name="volcanoes", ann.rnames=NULL,
multi_volcano <- function(tab, lab.col='Gene.Symbol', ntop.sig=0, ntop.lfc=0, alpha=0.4, name="volcanoes", ann.rnames=NULL,
up.ann.color="black", down.ann.color="black", same.scale=FALSE, type.sig=c("p", "FDR"),
cut.color=NULL, cut.lfc=1, cut.sig=0.05, lines.sig=ifelse(type.sig[1]=="p", yes = 0.05, no=NA),
raster=FALSE, sep=".", na.lab=c("---", ""), plot=TRUE){
cut.color="black", cut.lfc=1, cut.sig=0.05, lines.sig=ifelse(type.sig[1]=="p", yes = 0.05, no=NA),
raster=FALSE, sep=".", na.lab=c("---", ""), seed=0, plot=TRUE){
type.sig <- match.arg(type.sig)
lfc.cols <- grep(paste0("\\", sep, "logFC$"), colnames(tab))
if (length(lfc.cols)==0) stop("No logFC columns detected.")
@@ -42,9 +42,9 @@ multi_volcano <- function(tab, lab.col=NULL, ntop.sig=0, ntop.lfc=0, alpha=0.4,
ret.lst <- list()
for (contr in contr.names){
ret.lst[[contr]] <- ezvolcano(tab=tab, lab.col=lab.col, ntop.sig=ntop.sig, ntop.lfc=ntop.lfc, alpha=alpha, comparison=contr,
name=NA, ann.rnames=ann.rnames, up.ann.color=up.ann.color, down.ann.color=down.ann.color,
x.bound=x.bound, y.bound=y.bound, type.sig=type.sig, cut.color=cut.color,
cut.lfc=cut.lfc, cut.sig=cut.sig, lines.sig=lines.sig, raster=raster, sep=sep, na.lab=na.lab, plot=plot)
name=NA, ann.rnames=ann.rnames, x.bound=x.bound, y.bound=y.bound, type.sig=type.sig, cut.color=cut.color,
cut.lfc=cut.lfc, cut.sig=cut.sig, up.ann.color=up.ann.color, down.ann.color=down.ann.color,
lines.sig=lines.sig, raster=raster, sep=sep, na.lab=na.lab, seed=seed, plot=plot)
}
return(invisible(ret.lst))
}
9 changes: 6 additions & 3 deletions man/ezvolcano.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 6 additions & 3 deletions man/multi_volcano.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit b3e30fe

Please sign in to comment.