Merge pull request #4 from tlesluyes/dev

Update from dev
tlesluyes · Oct 10, 2023 · 02b0daa · 02b0daa
2 parents 3d2ee1f + 0bc2f9b
commit 02b0daa
Show file tree

Hide file tree

Showing 16 changed files with 451 additions and 117 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: myFun
 Type: Package
 Title: myFun is a collection of my favorite R functions, packaged for simplicity
-Version: 1.0.4
-Date: 2023-10-03
+Version: 1.0.5
+Date: 2023-10-10
 Authors@R: person("Tom", "Lesluyes", email = "[email protected]", role = c("aut", "cre"))
 Author: Tom Lesluyes [aut, cre]
 Maintainer: Tom Lesluyes <[email protected]>

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,14 +2,19 @@
 
 export(adjustPositions)
 export(checkGRlist)
+export(computeBAF)
+export(computeFit)
 export(computeISA)
 export(computeISA_batch)
+export(computeLogR)
 export(computeMD)
 export(computeMD_batch)
 export(generate_cytoband_and_CHRsize)
 export(harmonizeGRanges)
 export(load_CHRsize)
 export(load_cytoband)
 export(occurrenceGRanges)
+export(reestimate_ploidy)
+export(reestimate_purity)
 export(splitDF)
 importFrom(foreach,"%dopar%")
diff --git a/R/computeISA.R b/R/computeISA.R
@@ -26,4 +26,57 @@ computeISA=function(GR1, GR2, CNstatus="CNstatus") {
   profiles=harmonizeGRanges(list(GR1, GR2))
   sameCN=which(GenomicRanges::mcols(profiles[[1]])[, CNstatus]==GenomicRanges::mcols(profiles[[2]])[, CNstatus])
   return(sum(IRanges::width(profiles[[1]][sameCN, ]))/sum(IRanges::width(profiles[[1]]))*100)
+}
+
+#' @title computeISA_batch
+#' @description Compute the inter-sample agreement (ISA) for a batch of samples
+#' @details This function computes the inter-sample agreement (ISA) between multiple profiles (as a list of GRanges objects).
+#' @param myGRList a list of GRanges objects, each object should correspond to one CNA profile
+#' @param cores a numeric, the number of cores to use (default: 1)
+#' @param min_seg_size a numeric, the minimum segment size (in bp) to consider (default: 0)
+#' @param CNstatus a metadata column name for the copy-number status (default: "CNstatus"). Can be total (e.g. "3") or allele-specific (e.g. "2+1")
+#' @return A matrix of ISA values
+#' @examples
+#' require("GenomicRanges")
+#' GR1=GRanges(seqnames=rep("1", 3),
+#'             ranges=IRanges(start=c(1, 1001, 10001), end=c(1000, 10000, 20000)),
+#'             CNstatus=c("1+1", "2+1", "1+1"))
+#' GR2=GRanges(seqnames=rep("1", 2),
+#'             ranges=IRanges(start=c(500, 10001), end=c(10000, 25000)),
+#'             CNstatus=c("2+1", "1+1"))
+#' GR3=GRanges(seqnames="1",
+#'             ranges=IRanges(start=500, end=25000),
+#'             CNstatus="1+1")
+#' myGRList=list(GR1, GR2, GR3)
+#' names(myGRList)=c("GR1", "GR2", "GR3")
+#' computeISA_batch(myGRList)
+#' @author tlesluyes
+#' @export
+computeISA_batch=function(myGRList, cores=1, min_seg_size=0, CNstatus="CNstatus") {
+  if (cores>1) doParallel::registerDoParallel(cores=cores)
+  checkGRlist(myGRList)
+  stopifnot(all(sapply(myGRList, function(x) all("CNstatus" %in% names(GenomicRanges::mcols(x))))))
+  if (is.null(names(myGRList))) {
+    NAMES=as.character(1:length(myGRList))
+  } else {
+    NAMES=names(myGRList)
+  }
+  myGRList=harmonizeGRanges(myGRList, cores=cores)
+  if (min_seg_size>0) {
+    TO_KEEP=IRanges::width(myGRList[[1]])>min_seg_size
+    myGRList=lapply(myGRList, function(x) x[TO_KEEP, ])
+    rm(TO_KEEP)
+  }
+  CNstatus_matrix=do.call(cbind, lapply(myGRList, function(x) { return(GenomicRanges::mcols(x)[, CNstatus]) }))
+  WIDTHS=IRanges::width(myGRList[[1]])
+  SUM_WIDTH=sum(WIDTHS)
+  ISA=foreach::foreach(i=1:length(myGRList), .combine=cbind) %dopar% {
+    if (i %% 50==0) print(paste0(i, "/", length(myGRList)))
+    SAME=CNstatus_matrix==CNstatus_matrix[, i]
+    return(sapply(1:length(myGRList), function(x) sum(WIDTHS[SAME[, x]])/SUM_WIDTH))
+  }
+  ISA=ISA*100
+  rownames(ISA)=NAMES
+  colnames(ISA)=NAMES
+  return(ISA)
 }
diff --git a/R/computeISA_batch.R b/R/computeISA_batch.R
diff --git a/R/computeMD.R b/R/computeMD.R
@@ -34,4 +34,64 @@ computeMD=function(GR1, GR2, nMajor="nMajor", nMinor="nMinor", convertMb=FALSE)
   MD=sum((abs(GenomicRanges::mcols(profiles[[1]])[, nMajor]-GenomicRanges::mcols(profiles[[2]])[, nMajor])+abs(GenomicRanges::mcols(profiles[[1]])[, nMinor]-GenomicRanges::mcols(profiles[[2]])[, nMinor]))*IRanges::width(profiles[[1]]))
   if (convertMb) MD=MD/1e6
   return(MD)
+}
+
+#' @title computeMD_batch
+#' @description Compute the Manhattan distance (MD) for a batch of samples
+#' @details This function computes the Manhattan distance (MD) between multiple profiles (as a list of GRanges objects).
+#' @param myGRList a list of GRanges objects, each object should correspond to one CNA profile
+#' @param cores a numeric, the number of cores to use (default: 1)
+#' @param min_seg_size a numeric, the minimum segment size (in bp) to consider (default: 0)
+#' @param nMajor a metadata column name for the major allele (default: "nMajor")
+#' @param nMinor a metadata column name for the minor allele (default: "nMinor")
+#' @param convertMb a boolean, the MD will be converted to megabases if set to TRUE (default: FALSE)
+#' @return A matrix of MD values
+#' @examples
+#' require("GenomicRanges")
+#' GR1=GRanges(seqnames=rep("1", 3),
+#'             ranges=IRanges(start=c(1, 1001, 10001), end=c(1000, 10000, 20000)),
+#'             nMajor=c(1, 2, 1),
+#'             nMinor=c(1, 1, 1))
+#' GR2=GRanges(seqnames=rep("1", 2),
+#'             ranges=IRanges(start=c(500, 10001), end=c(10000, 25000)),
+#'             nMajor=c(2, 1),
+#'             nMinor=c(1, 1))
+#' GR3=GRanges(seqnames="1",
+#'             ranges=IRanges(start=500, end=25000),
+#'             nMajor=1,
+#'             nMinor=1)
+#' myGRList=list(GR1, GR2, GR3)
+#' names(myGRList)=c("GR1", "GR2", "GR3")
+#' computeMD_batch(myGRList)
+#' @author tlesluyes
+#' @export
+computeMD_batch=function(myGRList, cores=1, min_seg_size=0, nMajor="nMajor", nMinor="nMinor", convertMb=FALSE) {
+  if (cores>1) doParallel::registerDoParallel(cores=cores)
+  checkGRlist(myGRList)
+  stopifnot(length(convertMb)==1 && is.logical(convertMb))
+  stopifnot(all(sapply(myGRList, function(x) all(c(nMajor, nMinor) %in% names(GenomicRanges::mcols(x))))))
+  stopifnot(all(sapply(myGRList, function(x) is.numeric(GenomicRanges::mcols(x)[, nMajor]))))
+  stopifnot(all(sapply(myGRList, function(x) is.numeric(GenomicRanges::mcols(x)[, nMinor]))))
+  if (is.null(names(myGRList))) {
+    NAMES=as.character(1:length(myGRList))
+  } else {
+    NAMES=names(myGRList)
+  }
+  myGRList=harmonizeGRanges(myGRList, cores=cores)
+  if (min_seg_size>0) {
+    TO_KEEP=IRanges::width(myGRList[[1]])>min_seg_size
+    myGRList=lapply(myGRList, function(x) x[TO_KEEP, ])
+    rm(TO_KEEP)
+  }
+  nMajor_matrix=do.call(cbind, lapply(myGRList, function(x) { return(GenomicRanges::mcols(x)[, nMajor]) }))
+  nMinor_matrix=do.call(cbind, lapply(myGRList, function(x) { return(GenomicRanges::mcols(x)[, nMinor]) }))
+  WIDTHS=IRanges::width(myGRList[[1]])
+  MD=foreach::foreach(i=1:length(myGRList), .combine=cbind) %dopar% {
+    if (i %% 50==0) print(paste0(i, "/", length(myGRList)))
+    return(apply((abs(nMajor_matrix-nMajor_matrix[, i])+abs(nMinor_matrix-nMinor_matrix[, i]))*WIDTHS, 2, sum))
+  }
+  rownames(MD)=NAMES
+  colnames(MD)=NAMES
+  if (convertMb) MD=MD/1e6
+  return(MD)
 }
diff --git a/R/computeMD_batch.R b/R/computeMD_batch.R
diff --git a/R/logR_BAF_purity_ploidy.R b/R/logR_BAF_purity_ploidy.R
@@ -0,0 +1,114 @@
+#' @title computeLogR
+#' @description Compute the theoretical logR value for a given segment
+#' @details This function computes the theoretical logR value for a given segment (from nMajor, nMinor, purity and ploidy values). Since logR isn't allele-specific, ntot can be used instead of nMajor (and nMinor should set to 0).
+#' @param nMajor the number of copies of the major allele
+#' @param nMinor the number of copies of the minor allele
+#' @param purity the purity estimate of the tumour
+#' @param ploidy the ploidy estimate of the tumour
+#' @param digits a numeric, the number of digits to round to (default: 4)
+#' @return A number representing the logR value
+#' @seealso https://doi.org/10.1038/s41592-020-01013-2
+#' @examples
+#' # A 2+1 state in a diploid tumour with 90% purity
+#' computeLogR(2, 1, 0.9, 2)
+#' # A loss of 1 copy (2+1) in a pseudo-tetraploid tumour with 60% purity
+#' computeLogR(2, 1, 0.6, 3.5)
+#' @author tlesluyes
+#' @export
+computeLogR=function(nMajor, nMinor, purity, ploidy, digits=4) {
+  stopifnot(all(is.numeric(c(nMajor, nMinor, purity, ploidy, digits))))
+  round(log2((purity*(nMajor+nMinor)+2*(1-purity))/(purity*ploidy+2*(1-purity))), digits)
+}
+
+#' @title computeBAF
+#' @description Compute the theoretical BAF values for a given segment
+#' @details This function computes the theoretical BAF values for a given segment (from nMajor, nMinor and purity values).
+#' @param nMajor the number of copies of the major allele
+#' @param nMinor the number of copies of the minor allele
+#' @param purity the purity estimate of the tumour
+#' @param digits a numeric, the number of digits to round to (default: 4)
+#' @return A vector of two numbers representing the BAF values
+#' @seealso https://doi.org/10.1038/s41592-020-01013-2
+#' @examples
+#' # A 2+1 state in a tumour with 90% purity
+#' computeBAF(2, 1, 0.9)
+#' # A 1+0 state in a tumour with 60% purity
+#' computeBAF(1, 0, 0.6)
+#' @author tlesluyes
+#' @export
+computeBAF=function(nMajor, nMinor, purity, digits=4) {
+  stopifnot(all(is.numeric(c(nMajor, nMinor, purity, digits))))
+  BAF1=(purity*nMinor+(1-purity))/(purity*(nMajor+nMinor)+2*(1-purity))
+  BAF2=(purity*nMajor+(1-purity))/(purity*(nMajor+nMinor)+2*(1-purity))
+  return(round(c(BAF1, BAF2), digits))
+}
+
+#' @title computeFit
+#' @description Compute the purity/ploidy fit for a given segment
+#' @details This function computes the purity/ploidy fit (rho, psi and psit) for a given segment (from logR, BAF, proposed nMajor and proposed nMinor).
+#' @param logR the logR value of the segment
+#' @param BAF the BAF value of the segment (upper band only so the value should be in the 0.5-1 space)
+#' @param nMajor the number of copies of the major allele
+#' @param nMinor the number of copies of the minor allele
+#' @param gamma the gamma parameter is platform-dependent and represents the expected logR decrease in a diploid sample where one copy is lost (should be 1 for HTS data and 0.55 for SNP arrays)
+#' @param digits a numeric, the number of digits to round to (default: 4)
+#' @return A list with the rho (=purity), psi (=total ploidy) and psit (=tumour ploidy) values
+#' @examples
+#' # A segment has logR=0.5361 and BAF=0.3448/0.6552
+#' # What is the purity/ploidy fit if I believe that the segment is 2+1?
+#' computeFit(0.5361, 0.6552, 2, 1, 1) # purity=90%; ploidy=2
+#' @author tlesluyes
+#' @export
+#' @seealso https://doi.org/10.1038/s41592-020-01013-2
+computeFit=function(logR, BAF, nMajor, nMinor, gamma, digits=4) {
+  stopifnot(all(is.numeric(c(logR, BAF, nMajor, nMinor, gamma, digits))))
+  rho=(2*BAF-1)/(2*BAF-BAF*(nMajor+nMinor)-1+nMajor)
+  psi=(rho*(nMajor+nMinor)+2-2*rho)/(2^(logR/gamma))
+  psit=(psi-2*(1-rho))/rho
+  return(list(rho=round(rho, digits), psi=round(psi, digits), psit=round(psit, digits)))
+}
+
+#' @title reestimate_ploidy
+#' @description Compute the re-estimated ploidy for a given sample
+#' @details This function computes the re-estimated ploidy for a given sample (from its old purity/ploidy fit and the re-estimated purity).
+#' @param rho.old old purity estimate
+#' @param psit.old old ploidy estimate
+#' @param rho.new new purity estimate
+#' @param WGD number of WGD events (0 if there is no WGD)
+#' @param digits a numeric, the number of digits to round to (default: 4)
+#' @return A number representing the re-estimated ploidy
+#' @examples
+#' # A pseudo-diploid sample has purity=74% and ploidy=2.4
+#' # What is the re-estimated ploidy if I believe that the sample has purity=61%?
+#' reestimate_ploidy(0.74, 2.4, 0.61, 0)
+#' @author tlesluyes
+#' @export
+reestimate_ploidy=function(rho.old, psit.old, rho.new, WGD, digits=4) {
+  stopifnot(all(is.numeric(c(rho.old, psit.old, rho.new, digits))))
+  COEF=2*(WGD+1)
+  return(round(((rho.old*psit.old)+COEF*(rho.new-rho.old))/rho.new, digits))
+}
+
+#' @title reestimate_purity
+#' @description Compute the re-estimated purity for a given sample
+#' @details This function computes the re-estimated purity for a given sample in the context of a jump in ploidy (so the matched ploidy needs to be doubled or halved).
+#' @param rho.old old purity estimate
+#' @param psit.old old ploidy estimate
+#' @param switch a character ("double" or "halve") indicating whether the ploidy should be doubled or halved
+#' @param digits a numeric, the number of digits to round to (default: 4)
+#' @return A number representing the re-estimated purity
+#' @examples
+#' # A sample has purity=74% and ploidy=2.4 but the CNA profile needs to be doubled
+#' # What is the re-estimated purity?
+#' reestimate_purity(0.74, 2.4, "double")
+#' @author tlesluyes
+#' @export
+reestimate_purity=function(rho.old, psit.old, switch, digits=4) {
+  stopifnot(all(is.numeric(c(rho.old, psit.old, digits))))
+  stopifnot(switch %in% c("double", "halve"))
+  if (switch=="double") {
+    return(round(rho.old/(2-rho.old), digits))
+  } else {
+    return(round(2*rho.old/(rho.old+1), digits))
+  }
+}