From 6ff5f46448446f9c9559ed25eb4eb7541d54c463 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 5 Apr 2017 10:37:29 -0400 Subject: [PATCH 01/21] Update .travis.yml Trying to avoid Travis error by using release version of devtools --- .travis.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1845df51..0b26eaee 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,12 +30,13 @@ notifications: on_failure: change ## Use patched devtools -r_github_packages: - - hadley/devtools +# r_github_packages: +# - hadley/devtools ## Code coverage r_packages: - covr + - devtools ## BiocCheck bioc_packages: From c752f8383b0106b4037f56a0dc261d06b479546c Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 5 Apr 2017 10:54:20 -0400 Subject: [PATCH 02/21] Update .travis.yml Going back to GitHub version of devtools because of an issue with the CRAN version --- .travis.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0b26eaee..1845df51 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,13 +30,12 @@ notifications: on_failure: change ## Use patched devtools -# r_github_packages: -# - hadley/devtools +r_github_packages: + - hadley/devtools ## Code coverage r_packages: - covr - - devtools ## BiocCheck bioc_packages: From 1cd394ba578e3c02a2d7e5538e6a37e2365e9b2b Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 5 Apr 2017 11:35:19 -0400 Subject: [PATCH 03/21] Update .travis.yml Trying by removing the cached packages... --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1845df51..5d086d0a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ # Validate your .travis.yml file at http://lint.travis-ci.org/ #---------------------------------------------------------------- language: r -cache: packages +#cache: packages # R versions to be tested on r: From f74168d0cf3540a9fe18d26ce0608f5380bbf46e Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 5 Apr 2017 13:48:47 -0400 Subject: [PATCH 04/21] Update .travis.yml Workaround to avoid issues with installing dependencies in Travis CI --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 5d086d0a..d043a276 100644 --- a/.travis.yml +++ b/.travis.yml @@ -31,7 +31,7 @@ notifications: ## Use patched devtools r_github_packages: - - hadley/devtools + - hadley/devtools@efa894ffa ## Code coverage r_packages: From 4f8f0a9884ac53f6c7f0498ff3a6f3b7e5a3d613 Mon Sep 17 00:00:00 2001 From: Davide Risso Date: Wed, 5 Apr 2017 13:49:27 -0400 Subject: [PATCH 05/21] Update .travis.yml Use cached packages again --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d043a276..8dce1816 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ # Validate your .travis.yml file at http://lint.travis-ci.org/ #---------------------------------------------------------------- language: r -#cache: packages +cache: packages # R versions to be tested on r: From da3d6d0a65f53c273be790af4aa629fee182b178 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Fri, 7 Apr 2017 11:18:44 -0700 Subject: [PATCH 06/21] update version on develop to -9001 --- NEWS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS b/NEWS index d2e4008a..ed5372ae 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Changes in version 1.1.2 ( Release date: 2017-04-04 ) +Changes in version 1.2.0-9001 ( Release date: 2017-04-04 ) ============== Changes: From f7a5d60bc7c0e206af77d3f7c08c45628542bbb8 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Fri, 7 Apr 2017 11:20:44 -0700 Subject: [PATCH 07/21] update version on develop to -9001 --- NEWS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index ed5372ae..a4cc6851 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,8 @@ -Changes in version 1.2.0-9001 ( Release date: 2017-04-04 ) +Changes in version 1.2.0-9001 ( Release date: ) ============== +Changes in version 1.2.0 ( Release date: 2017-04-04 ) +============== Changes: * RSEC now has option `rerunClusterMany`, which if FALSE will not rerun the clusterMany step if RSEC is called on an existing clusterExperiment object (assuming of course, clusterMany has been run already on the object) * setBreaks now has option `makeSymmetric` to force symmetric breaks around zero when using the quantile option. From a07043e217e213ffc4831bc799816b26ff212ab0 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Wed, 3 May 2017 13:32:28 -0700 Subject: [PATCH 08/21] fix plotHeatmap to deal with extra legend colors --- NEWS | 5 +++++ R/plotHeatmap.R | 27 ++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index a4cc6851..b959893c 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,10 @@ Changes in version 1.2.0-9001 ( Release date: ) ============== +Changes: +* plotHeatmap accepts data.frame values for the data argument (calls `data.matrix` on object and sends to matrix version) + +Bug fixes: +* plotHeatmap now goes through the clusterLegend input and removes levels that do not exist in the sampleData; this was causing incorrect coloring when the clusterLegend had more (or less) levels that it assigned color to than the sampleData did (e.g. if sampleData was subset of larger dataset upon which the original colors were assigned.) Changes in version 1.2.0 ( Release date: 2017-04-04 ) ============== diff --git a/R/plotHeatmap.R b/R/plotHeatmap.R index 55927ddf..3d212a06 100644 --- a/R/plotHeatmap.R +++ b/R/plotHeatmap.R @@ -467,6 +467,15 @@ setMethod( }) + +#' @rdname plotHeatmap +setMethod( + f = "plotHeatmap", + signature = signature(data = "data.frame"), + definition = function(data,...){ + plotHeatmap(data.matrix(data),...) + } +) #' @rdname plotHeatmap setMethod( f = "plotHeatmap", @@ -592,7 +601,10 @@ setMethod( #browser() tmpDf<-do.call("data.frame",lapply(1:ncol(sampleData),function(ii){factor(sampleData[,ii])})) names(tmpDf)<-colnames(sampleData) - if(!is.null(whSampleDataCont)) tmpDf[,whSampleDataCont]<-sampleData[,whSampleDataCont] + if(!is.null(whSampleDataCont)){ + if(logical(whSampleDataCont)) whSampleDataCont<-which(whSampleDataCont) + if(length(whSampleDataCont)>0) tmpDf[,whSampleDataCont]<-sampleData[,whSampleDataCont] + } annCol<-tmpDf #browser() convertNames <- TRUE @@ -643,6 +655,19 @@ setMethod( } else { annColors<-clusterLegend #in case give in format wanted by aheatmap to begin with } + #remove any unused level colors to clean up legend and make them in same order as in annCol factor + whInAnnColors<-which(names(annColors)%in% colnames(annCol)) + if(!is.null(whSampleDataCont) & length(whSampleDataCont)>0){ + whInAnnColors<-setdiff(whInAnnColors,whSampleDataCont) + } + prunedList<-lapply(whInAnnColors,function(ii){ + nam<-names(annColors)[[ii]] + x<-annColors[[ii]] + levs<-levels(annCol[,nam]) + x<-x[levs] + }) + names(prunedList)<-names(annColors)[whInAnnColors] + annColors[whInAnnColors]<-prunedList } else{ annCol<-NA From c56d6a38d86ff8f2d22f8a62c5d577a0c33ba7e0 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Wed, 3 May 2017 16:18:59 -0700 Subject: [PATCH 09/21] fix bug in checking dimensions in plotHeatmap --- NEWS | 3 ++- R/plotHeatmap.R | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index b959893c..49413348 100644 --- a/NEWS +++ b/NEWS @@ -4,7 +4,8 @@ Changes: * plotHeatmap accepts data.frame values for the data argument (calls `data.matrix` on object and sends to matrix version) Bug fixes: -* plotHeatmap now goes through the clusterLegend input and removes levels that do not exist in the sampleData; this was causing incorrect coloring when the clusterLegend had more (or less) levels that it assigned color to than the sampleData did (e.g. if sampleData was subset of larger dataset upon which the original colors were assigned.) +* plotHeatmap now goes through the clusterLegend input and removes levels that do not exist in the sampleData; this was causing incorrect coloring when the clusterLegend had more (or less) levels that it assigned color to than the sampleData did (e.g. if sampleData was subset of larger dataset upon which the original colors were assigned.) Note that this now has the effect of NOT plotting all values in the clusterLegend, thus changing the previous behavior of plotHeatmap legend. +* fixed bug in how plotHeatmap checked the dimensions of dendrogram match that of data. Changes in version 1.2.0 ( Release date: 2017-04-04 ) ============== diff --git a/R/plotHeatmap.R b/R/plotHeatmap.R index 3d212a06..367ed06f 100644 --- a/R/plotHeatmap.R +++ b/R/plotHeatmap.R @@ -551,7 +551,7 @@ setMethod( else{ if(clusterFeatures){ if(inherits(clusterFeaturesData, "dendrogram")){ - if(nobs(clusterFeaturesData)!=ncol(heatData)) stop("clusterFeaturesData dendrogram is not on same number of observations as heatData") + if(nobs(clusterFeaturesData)!=nrow(heatData)) stop("clusterFeaturesData dendrogram is not on same number of observations as heatData") dendroFeatures<-clusterFeaturesData } else{ From 0a5c61ddf233cdc6c8d13e985fe1fdbc698f1874 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Thu, 4 May 2017 11:04:03 -0700 Subject: [PATCH 10/21] add ExpressionSet for plotHeatmap --- NEWS | 2 +- R/plotHeatmap.R | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 49413348..7fb0646f 100644 --- a/NEWS +++ b/NEWS @@ -1,7 +1,7 @@ Changes in version 1.2.0-9001 ( Release date: ) ============== Changes: -* plotHeatmap accepts data.frame values for the data argument (calls `data.matrix` on object and sends to matrix version) +* plotHeatmap accepts data.frame or ExpressionSet objects for the data argument (calls `data.matrix` or `exprs` on object and sends to matrix version) Bug fixes: * plotHeatmap now goes through the clusterLegend input and removes levels that do not exist in the sampleData; this was causing incorrect coloring when the clusterLegend had more (or less) levels that it assigned color to than the sampleData did (e.g. if sampleData was subset of larger dataset upon which the original colors were assigned.) Note that this now has the effect of NOT plotting all values in the clusterLegend, thus changing the previous behavior of plotHeatmap legend. diff --git a/R/plotHeatmap.R b/R/plotHeatmap.R index 367ed06f..b2c7f832 100644 --- a/R/plotHeatmap.R +++ b/R/plotHeatmap.R @@ -476,6 +476,15 @@ setMethod( plotHeatmap(data.matrix(data),...) } ) +#' @rdname plotHeatmap +setMethod( + f = "plotHeatmap", + signature = signature(data = "ExpressionSet"), + definition = function(data,...){ + plotHeatmap(exprs(data),...) + } +) + #' @rdname plotHeatmap setMethod( f = "plotHeatmap", From 1a97922de14cb47cd221c19095f57e5d6ec03349 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Thu, 11 May 2017 11:40:12 -0700 Subject: [PATCH 11/21] add plotBarplot and fix bugs discovered during addition --- NAMESPACE | 1 + NEWS | 9 +- R/AllGenerics.R | 10 +- R/AllHelper.R | 28 +++- R/plotBarplot.R | 239 ++++++++++++++++++++++++++++++ R/plotClusters.R | 2 + R/plottingHelpers.R | 1 + R/subsampleClustering.R | 155 +++++++++++++------ man/ClusterExperiment-methods.Rd | 16 +- man/plotBarplot.Rd | 109 ++++++++++++++ man/plotHeatmap.Rd | 6 + man/subsampleClustering.Rd | 4 +- tests/testthat/test_constructor.R | 8 +- tests/testthat/test_plotting.R | 32 ++++ 14 files changed, 566 insertions(+), 54 deletions(-) create mode 100644 R/plotBarplot.R create mode 100644 man/plotBarplot.Rd diff --git a/NAMESPACE b/NAMESPACE index 50f1873d..dae52cab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,6 +46,7 @@ exportMethods(nClusters) exportMethods(nFeatures) exportMethods(nSamples) exportMethods(orderSamples) +exportMethods(plotBarplot) exportMethods(plotClusters) exportMethods(plotCoClustering) exportMethods(plotDendrogram) diff --git a/NEWS b/NEWS index 7fb0646f..2460b029 100644 --- a/NEWS +++ b/NEWS @@ -1,11 +1,14 @@ Changes in version 1.2.0-9001 ( Release date: ) ============== Changes: -* plotHeatmap accepts data.frame or ExpressionSet objects for the data argument (calls `data.matrix` or `exprs` on object and sends to matrix version) +* `plotHeatmap` accepts `data.frame` or `ExpressionSet` objects for the data argument (calls `data.matrix` or `exprs` on object and sends to matrix version) +* Added `plotBarplot` to plot a barplot for 1 cluster or comparison of 2 clusters along with tests. +* Added `whichClusters` argument to `clusterMatrix` to return only clusters corresponding to certain clusters. Mainly relevant for using arguments like `workflow` that are used by other commands (otherwise could just index the complete matrix manually...) Bug fixes: -* plotHeatmap now goes through the clusterLegend input and removes levels that do not exist in the sampleData; this was causing incorrect coloring when the clusterLegend had more (or less) levels that it assigned color to than the sampleData did (e.g. if sampleData was subset of larger dataset upon which the original colors were assigned.) Note that this now has the effect of NOT plotting all values in the clusterLegend, thus changing the previous behavior of plotHeatmap legend. -* fixed bug in how plotHeatmap checked the dimensions of dendrogram match that of data. +* `plotHeatmap` now goes through the `clusterLegend` input and removes levels that do not exist in the sampleData; this was causing incorrect coloring when the `clusterLegend` had more (or less) levels that it assigned color to than the `sampleData` did (e.g. if `sampleData` was a subset of larger dataset upon which the original colors were assigned.) NOTE: that this now has the effect of NOT plotting all values in the clusterLegend if they are not represented in the data, thus changing the previous behavior of `plotHeatmap` legend. +* fixed bug in how `plotHeatmap` checked that the dimensions of user-supplied dendrogram match that of data (matrix version). +* fixed `convertClusterLegend` so when `output` is `matrixNames` or `matrixColors`, the resulting matrix has the `colnames` equal to cluster labels, like `clusterMatrix`. Changes in version 1.2.0 ( Release date: 2017-04-04 ) ============== diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 8d7611f5..ff0c03e0 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -170,6 +170,14 @@ setGeneric( } ) +setGeneric( + name="plotBarplot", + def=function(clusters, whichClusters,...) + { + standardGeneric("plotBarplot") + } +) + setGeneric( name="plotHeatmap", def=function(data,...) @@ -194,7 +202,7 @@ setGeneric( setGeneric( name = "clusterMatrix", - def = function(x) { + def = function(x,whichClusters) { standardGeneric("clusterMatrix") } ) diff --git a/R/AllHelper.R b/R/AllHelper.R index 06ba52a3..8998e5fb 100644 --- a/R/AllHelper.R +++ b/R/AllHelper.R @@ -188,11 +188,35 @@ setMethod( #' @aliases clusterMatrix setMethod( f = "clusterMatrix", - signature = "ClusterExperiment", - definition = function(x) { + signature = c("ClusterExperiment","missing"), + definition = function(x,whichClusters) { return(x@clusterMatrix) } ) +#' @rdname ClusterExperiment-methods +#' @return \code{clusterMatrix} returns the matrix with all the clusterings. +#' @export +#' @aliases clusterMatrix +setMethod( + f = "clusterMatrix", + signature = c("ClusterExperiment","numeric"), + definition = function(x,whichClusters) { + return(x@clusterMatrix[,whichClusters,drop=FALSE]) + } +) +#' @rdname ClusterExperiment-methods +#' @return \code{clusterMatrix} returns the matrix with all the clusterings. +#' @export +#' @aliases clusterMatrix +setMethod( + f = "clusterMatrix", + signature = c("ClusterExperiment","character"), + definition = function(x,whichClusters) { + wh<-.TypeIntoIndices(x,whClusters=whichClusters) + return(clusterMatrix(x,whichClusters=wh)) + } +) + #' @rdname ClusterExperiment-methods #' @return \code{primaryCluster} returns the primary clustering (as numeric). diff --git a/R/plotBarplot.R b/R/plotBarplot.R new file mode 100644 index 00000000..117df38b --- /dev/null +++ b/R/plotBarplot.R @@ -0,0 +1,239 @@ +#' Barplot of 1 or 2 clusterings +#' +#' Make a barplot of sample's assignments to clusters for single clustering, or +#' cross comparison for two clusterings. +#' +#' @aliases plotBarplot +#' @docType methods +#' @param clusters A matrix of with each column corresponding to a clustering +#' and each row a sample or a \code{\link{ClusterExperiment}} object. If a +#' matrix, the function will plot the clusterings in order of this matrix, and +#' their order influences the plot greatly. +#' @param whichClusters If numeric, a predefined order for the clusterings in +#' the plot. If x is a \code{\link{ClusterExperiment}} object, +#' \code{whichClusters} can be a character value identifying the +#' \code{clusterTypess} to be used; alternatively \code{whichClusters} +#' can be either 'all' or 'workflow' to indicate choosing all clusters or +#' choosing all \code{\link{workflowClusters}}. +#' @param unassignedColor If ``-1'' in \code{clusters}, will be given this color +#' (meant for samples not assigned to cluster). +#' @param missingColor If ``-2'' in clusters, will be given this color (meant +#' for samples that were missing from the clustering, mainly when comparing +#' clusterings run on different sets of samples) +#' @param colPalette a vector of colors used for the different clusters. Must be +#' as long as the maximum number of clusters found in any single +#' clustering/column given in \code{clusters} or will otherwise return an +#' error. +#' @param xNames names for the first clusters (on x-axis). By default uses +#' values in 1st cluster of clusters matrix +#' @param legNames names for the first clusters (in legend). By default uses +#' values in 2nd cluster of clusters matrix +#' @param legend whether to plot the legend +#' @param xlab label for x-axis. By default or if equal NULL the column name of +#' the 1st cluster of clusters matrix +#' @param legend.title label for legend. By default or if equal NULL the column +#' name of the 2st cluster of clusters matrix +#' @param labels if clusters is a clusterExperiment object, then labels defines +#' whether the clusters will be identified by their names values in +#' clusterLegend (labels="names", the default) or by their clusterIds value in +#' clusterLegend (labels="ids"). +#' @param ... for \code{plotBarplot} arguments passed either to the method +#' of \code{plotBarplot} for matrices or ultimately to \code{\link{barplot}}. +#' @details The first column of the cluster matrix will be on the x-axis and the +#' second column will separate the groups of the first column. +#' @details All arguments of the matrix version can be passed to the +#' \code{ClusterExperiment} version. As noted above, however, some arguments +#' have different interpretations. +#' @details If \code{whichClusters = "workflow"}, then the most recent two +#' clusters of the workflow will be chosen where recent is based on the +#' following order (most recent first): final, mergeClusters, combineMany, +#' clusterMany. +#' +#' @author Elizabeth Purdom + +#' @export +#' +#' @examples +#' #clustering using pam: try using different dimensions of pca and different k +#' data(simData) +#' +#' cl <- clusterMany(simData, nPCADims=c(5, 10, 50), dimReduce="PCA", +#' clusterFunction="pam", ks=2:4, findBestK=c(TRUE,FALSE), +#' removeSil=c(TRUE,FALSE)) +#' +#' plotBarplot(cl) +#' plotBarplot(cl,whichClusters=1:2) +#' +#' @rdname plotBarplot +setMethod( + f = "plotBarplot", + signature = signature(clusters = "ClusterExperiment",whichClusters="character"), + definition = function(clusters, whichClusters,...) + { + wh<-head(.TypeIntoIndices(clusters,whClusters=whichClusters),2) + return(plotBarplot(clusters,whichClusters=wh,...)) + + }) + +#' @rdname plotBarplot +#' @export +setMethod( + f = "plotBarplot", + signature = signature(clusters = "ClusterExperiment",whichClusters="missing"), + definition = function(clusters, whichClusters,...) + { + plotBarplot(clusters,whichClusters="primaryCluster") + + }) + +#' @rdname plotBarplot +#' @export +setMethod( + f = "plotBarplot", + signature = signature(clusters = "ClusterExperiment",whichClusters="numeric"), + definition = function(clusters, whichClusters,labels=c("names","ids"),...) + { + labels<-match.arg(labels) + legend<-clusterLegend(clusters)[[tail(whichClusters,1)]] + colPalette<-legend[,"color"] + numClusterMat<-clusterMatrix(clusters,whichClusters=whichClusters) + if(labels=="names"){ + clusterMat<-convertClusterLegend(clusters,output="matrixNames")[,whichClusters] + names(colPalette)<-legend[,"name"] + #make sure "-1" stays "-1" + clusterMat[numClusterMat== -1]<- "-1" + clusterMat[numClusterMat== -2]<- "-2" + if(any(legend[,"clusterIds"]== "-1")){ + names(colPalette)[which(legend[,"clusterIds"]== "-1")]<-"-1" + } + if(any(legend[,"clusterIds"]== "-2")){ + names(colPalette)[which(legend[,"clusterIds"]== "-2")]<-"-2" + } + } + else{ + clusterMat<-numClusterMat + names(colPalette)<-legend[,"clusterIds"] + } + args<-list(...) + if(!"unassignedColor" %in% names(args) & any(legend[,"clusterIds"]== "-1")){ + args$unassignedColor<-legend[legend[,"clusterIds"]== "-1","color"] + } + if(!"missingColor" %in% names(args) & any(legend[,"clusterIds"]== "-2")){ + args$missingColor<-legend[legend[,"clusterIds"]== "-2","color"] + } + #browser() + do.call("plotBarplot",c(list(clusters=clusterMat,colPalette=colPalette),args)) + + }) + +#' @rdname plotBarplot +setMethod( + f = "plotBarplot", + signature = signature(clusters = "ClusterExperiment",whichClusters="missing"), + definition = function(clusters, whichClusters,...) + { + plotBarplot(clusters,whichClusters="primaryCluster",...) + }) + + + +#' @rdname plotBarplot +setMethod( + f = "plotBarplot", + signature = signature(clusters = "vector",whichClusters="missing"), + definition = function(clusters, whichClusters, ...){ + plotBarplot(matrix(clusters,ncol=1),...) + }) + +#' @rdname plotBarplot +setMethod( + f = "plotBarplot", + signature = signature(clusters = "matrix",whichClusters="missing"), + definition = function(clusters, whichClusters, xNames=NULL, legNames=NULL, legend=TRUE, xlab=NULL, legend.title=NULL, unassignedColor="white", missingColor="grey", colPalette=bigPalette,...){ + if(ncol(clusters)>2) stop("clusters must at most 2 clusters (i.e. 2 columns)") + clLeg<-clusters[,1] + if(is.null(xlab)) xlab<-colnames(clusters)[1] + if(ncol(clusters)==2){ + pair<-TRUE + clX<-clusters[,2] + x<-t(table(clLeg,clX)) #references is on the columns, alt on rows + if(is.null(legend.title)) legend.title<-colnames(clusters)[2] + #browser() + + if(is.null(names(colPalette))) colPalette<-rep(colPalette,length=nrow(x)) + else colPalette<-colPalette[rownames(x)] + #change name and color of missing/unassigned + whAltNotAssigned<-which(row.names(x)=="-1") + whAltMissing<-which(row.names(x)=="-2") + whRefNotAssigned<-which(colnames(x)=="-1") + whRefMissing<-which(colnames(x)=="-2") + if(length(whAltNotAssigned)>0){ + row.names(x)[whAltNotAssigned]<-"Not Assigned" + colPalette[whRefNotAssigned]<-unassignedColor + } + if(length(whAltMissing)>0){ + row.names(x)[whAltMissing]<-"Not Included in Clustering" + colPalette[whRefMissing]<-missingColor + } + if(length(whRefNotAssigned)>0){ + colnames(x)[whRefNotAssigned]<-"Not Assigned" + } + if(length(whRefMissing)>0){ + colnames(x)[whRefMissing]<-"Not Included in Clustering" + } + #change order so those are last + if(any(length(whAltNotAssigned)>0 | length(whAltMissing)>0)){ + nm<-row.names(x) + wh<-c(whAltNotAssigned,whAltMissing) + x<-rbind(x[-wh,,drop=FALSE],x[wh,,drop=FALSE]) + rownames(x)<-c(nm[-wh],nm[wh]) #annoying, but otherwise still loose the names + } + if(any(length(whRefNotAssigned)>0 | length(whRefMissing)>0)){ + nm<-colnames(x) + wh<-c(whRefNotAssigned,whRefMissing) + x<-cbind(x[,-wh,drop=FALSE],x[,wh,drop=FALSE]) + colPalette<-c(colPalette[-wh],colPalette[wh]) + colnames(x)<-c(nm[-wh],nm[wh]) #annoying, but otherwise still loose the names + } + if(is.null(legNames)){ + legNames<-colnames(x) + names(legNames)<-colnames(x) + labs<-legNames + } + else{ + if(is.null(names(legNames))) stop("must give names to legNames that match values of reference cluster") + if(length(legNames)!=ncol(x)) stop("Invalid reference cluster names -- not same length as number of reference clusters") + if(!all(sort(names(legNames))==sort(colnames(x)))) stop("Invalid names for reference cluster names -- not match names of reference clusters") + #put in same order + legNames<-legNames[colnames(x)] + labs<-paste(legNames," (",colnames(x),")",sep="") + } + } + else{ + x<-table(clLeg) + if(is.null(names(colPalette))) colPalette<-rep(colPalette,length=length(x)) + else colPalette<-colPalette[names(x)] + if(is.null(legNames)){ + legNames<-names(x) + names(legNames)<-names(x) + labs<-legNames + } + else{ + if(is.null(names(legNames))) stop("must give names to legNames that match values of reference cluster") + if(length(legNames)!=ncol(x)) stop("Invalid reference cluster names -- not same length as number of reference clusters") + if(!all(sort(names(legNames))==sort(names(x)))) stop("Invalid names for reference cluster names -- not match names of reference clusters") + #put in same order + legNames<-legNames[names(x)] + labs<-paste(legNames," (",names(x),")",sep="") + } + + } + par(mar=c(9.1,4.1,4.1,1.1),las=2) + bp<-barplot(x,col=colPalette,legend=legend,args.legend=list(title=legend.title), names.arg=rep("",length(labs)),xlab="",...) + xsize<-diff(par("usr")[3:4]) + text(bp, par("usr")[3]+.0*xsize, labels=labs, srt=45, adj=c(1,2), xpd=TRUE) + title(xlab=xlab,line=7) + +}) + + diff --git a/R/plotClusters.R b/R/plotClusters.R index 478f1e0b..5357a501 100644 --- a/R/plotClusters.R +++ b/R/plotClusters.R @@ -321,6 +321,8 @@ setMethod( axisLine=0,box=FALSE,...) { if(!is.matrix(clusters)) stop("clusters must be a matrix") + + if(!is.null(orderSamples) && !all(orderSamples %in% 1:nrow(clusters))) stop("invalid values for orderSamples") index<-orderSamples #match to old arguments diff --git a/R/plottingHelpers.R b/R/plottingHelpers.R index b42169b6..7ff81dae 100644 --- a/R/plottingHelpers.R +++ b/R/plottingHelpers.R @@ -46,6 +46,7 @@ setMethod( colReturn<-if(output=="matrixNames") "name" else "color" return(colMat[m,colReturn]) })) + colnames(outval)<-clusterLabels(object) } if(output=="plotAndLegend"){ diff --git a/R/subsampleClustering.R b/R/subsampleClustering.R index df22acc2..76b65793 100644 --- a/R/subsampleClustering.R +++ b/R/subsampleClustering.R @@ -58,7 +58,7 @@ #' heatmap(subD) #' @export subsampleClustering<-function(x,k,clusterFunction="pam", clusterArgs=NULL, - classifyMethod=c("All","InSample","OutOfSample"),classifyFunction=NULL, + classifyMethod=c("All","InSample","OutOfSample"),classifyFunction=NULL, largeDataset=FALSE, resamp.num = 100, samp.p = 0.7,ncores=1,... ) { #input<-.checkXDissInput(x,diss) @@ -82,50 +82,119 @@ subsampleClustering<-function(x,k,clusterFunction="pam", clusterArgs=NULL, #if(input %in% c("X","both")) N <- dim(x)[2] else N<-dim(diss)[2] N <- dim(x)[2] subSize <- round(samp.p * N) + + ###Large Data: rather than create this big sample, perhaps should do on the fly? idx<-replicate(resamp.num,sample(1:N,size=subSize)) #each column a set of indices for the subsample. - perSample<-function(ids){ -# xWithIds<-switch(input,"X"=x[,ids,drop=FALSE],"diss"=x,"both"=x[,ids,drop=FALSE]) -# dissWithIds<-switch(input,"X"=diss,"diss"=diss[ids,ids,drop=FALSE],"both"=diss[ids,ids,drop=FALSE]) - xWithIds<-x[,ids,drop=FALSE] - #result<-do.call(clusterFunction,c(list(x=xWithIds,diss=dissWithIds,k=k),clusterArgs)) - result<-do.call(clusterFunction,c(list(x=xWithIds,k=k),clusterArgs)) - #if(classifyMethod=="All") classX<-classifyFunction(x=x,diss=diss,result) - if(classifyMethod=="All") classX<-classifyFunction(x=x,result) - if(classifyMethod=="OutOfSample"){ - # xWithoutIds<-switch(input,"X"=x[,-ids,drop=FALSE],"diss"=x,"both"=x[,-ids,drop=FALSE]) - # dissWithoutIds<-switch(input,"X"=diss,"diss"=diss[-ids,-ids,drop=FALSE],"both"=diss[-ids,-ids,drop=FALSE]) - xWithoutIds<-x[,-ids,drop=FALSE] - #classElse<-classifyFunction(x=xWithoutIds, diss=dissWithoutIds,result) - classElse<-classifyFunction(x=xWithoutIds, result) - classX<-rep(NA,N) - classX[-ids]<-classElse - } - if(classifyMethod=="InSample"){ - classX<-rep(NA,N) - classX[ids]<-result$clustering - } - D <- outer(classX, classX, function(a, b) a == b) - Dinclude<-matrix(1,N,N) - whNA<-which(is.na(classX)) - if(length(whNA)>0){ - Dinclude[whNA,]<-0 #don't add them to the denominator either - Dinclude[,whNA]<-0 - D[whNA,]<-0 #don't add to sum - D[,whNA]<-0 - } - return(list(D=D,Dinclude=Dinclude)) - } - if(ncores==1){ - DList<-apply(idx,2,perSample) - } - else{ - DList<-parallel::mclapply(1:ncol(idx),function(nc){perSample(idx[,nc])},mc.cores=ncores,...) - } - DDenom<-Reduce("+",lapply(DList,function(y){y$Dinclude})) - DNum<-Reduce("+",lapply(DList,function(y){y$D})) - Dbar = DNum/DDenom + perSample<-function(ids){ + # xWithIds<-switch(input,"X"=x[,ids,drop=FALSE],"diss"=x,"both"=x[,ids,drop=FALSE]) + # dissWithIds<-switch(input,"X"=diss,"diss"=diss[ids,ids,drop=FALSE],"both"=diss[ids,ids,drop=FALSE]) + xWithIds<-x[,ids,drop=FALSE] + #result<-do.call(clusterFunction,c(list(x=xWithIds,diss=dissWithIds,k=k),clusterArgs)) + result<-do.call(clusterFunction,c(list(x=xWithIds,k=k),clusterArgs)) + #if(classifyMethod=="All") classX<-classifyFunction(x=x,diss=diss,result) + if(classifyMethod=="All") classX<-classifyFunction(x=x,result) + if(classifyMethod=="OutOfSample"){ + # xWithoutIds<-switch(input,"X"=x[,-ids,drop=FALSE],"diss"=x,"both"=x[,-ids,drop=FALSE]) + # dissWithoutIds<-switch(input,"X"=diss,"diss"=diss[-ids,-ids,drop=FALSE],"both"=diss[-ids,-ids,drop=FALSE]) + xWithoutIds<-x[,-ids,drop=FALSE] + #classElse<-classifyFunction(x=xWithoutIds, diss=dissWithoutIds,result) + classElse<-classifyFunction(x=xWithoutIds, result) + classX<-rep(NA,N) + classX[-ids]<-classElse + } + if(classifyMethod=="InSample"){ + classX<-rep(NA,N) + classX[ids]<-result$clustering + } + if(!largeDataset){ #current implementation + + D <- outer(classX, classX, function(a, b) a == b) + Dinclude<-matrix(1,N,N) + whNA<-which(is.na(classX)) + if(length(whNA)>0){ + Dinclude[whNA,]<-0 #don't add them to the denominator either + Dinclude[,whNA]<-0 + D[whNA,]<-0 #don't add to sum + D[,whNA]<-0 + } + return(list(D=D,Dinclude=Dinclude)) + } + else{ + #instead return one vector of indices and another indicating length of each cluster + #vector, where ids in clusters are adjacent + clusterIds<-unlist(tapply(1:N,classX,function(x){x},simplify=FALSE)) + clusterLengths<-tapply(1:N,classX,length) + return(list(clusterIds=clusterIds,clusterLengths=clusterLengths)) + } + } + + if(ncores==1){ + DList<-apply(idx,2,perSample) + } + else{ + DList<-parallel::mclapply(1:ncol(idx),function(nc){perSample(idx[,nc])},mc.cores=ncores,...) + } + #N large: get rid of these big matrices from memory + rm(idx) + rm(x) + gc() + if(!largeDataset){ + DDenom<-Reduce("+",lapply(DList,function(y){y$Dinclude})) + DNum<-Reduce("+",lapply(DList,function(y){y$D})) + Dbar = DNum/DDenom + } + else{ + otherIds<-function(idx,clustVec,clustLeng){ + m<-which(clustVec==idx) + if(length(m)>1) stop("ids clustered in more than one cluster") + if(length(m)==0) return(NA) #sample not ever clustered + if(length(m)==1){ + ends<-cumsum(clustLeng) + begins<-cumsum(c(1,head(clustLeng,-1))) + whCluster<-which(m<=ends & m>=begins) + if(length(whCluster)>1 | length(whCluster)==0) stop("error in coding: finding range of clusterids") + return(clustVec[seq(begins[whCluster],ends[whCluster],by=1)]) + } + } + #Test: otherIds(5,DList[[1]][[1]],DList[[1]][[2]]) + searchForPairs<-function(ii,clusterList){ + #ii is an index. + # clusterList is a list of all the subsampled returns from the perSample + ## only search for pairs with index greater than ii + + whHave<-which(sapply(clusterList,function(ll){ii%in%ll$clusterIds})) + clusterWith<-lapply(clusterList[whHave],function(ll){ + otherIds(idx=ii,clustVec=ll$clusterIds,clustLeng=ll$clusterLengths) + }) + clusterWithTab<-table(unlist(clusterWith)) + sampledWithTab<-table(unlist(sapply(clusterList[whHave],.subset2,"clusterIds"))) + #jointNames<-names(sampledWithTab) #if manage to not save NxN matrix, could use this to return only those that actually present + jointNames<-as.character(1:N) + out<-cbind(idx=as.integer(as.numeric(jointNames)),together=as.integer(clusterWithTab[jointNames]),total=as.integer(sampledWithTab[jointNames])) + out<-out[out[,"idx"] Date: Thu, 11 May 2017 12:58:35 -0700 Subject: [PATCH 12/21] remove feature subsample added by mistake --- R/subsampleClustering.R | 155 ++++++++++--------------------------- man/subsampleClustering.Rd | 143 +++++++++++++++++++++++++++++++++- 2 files changed, 185 insertions(+), 113 deletions(-) diff --git a/R/subsampleClustering.R b/R/subsampleClustering.R index 76b65793..df22acc2 100644 --- a/R/subsampleClustering.R +++ b/R/subsampleClustering.R @@ -58,7 +58,7 @@ #' heatmap(subD) #' @export subsampleClustering<-function(x,k,clusterFunction="pam", clusterArgs=NULL, - classifyMethod=c("All","InSample","OutOfSample"),classifyFunction=NULL, largeDataset=FALSE, + classifyMethod=c("All","InSample","OutOfSample"),classifyFunction=NULL, resamp.num = 100, samp.p = 0.7,ncores=1,... ) { #input<-.checkXDissInput(x,diss) @@ -82,119 +82,50 @@ subsampleClustering<-function(x,k,clusterFunction="pam", clusterArgs=NULL, #if(input %in% c("X","both")) N <- dim(x)[2] else N<-dim(diss)[2] N <- dim(x)[2] subSize <- round(samp.p * N) - - ###Large Data: rather than create this big sample, perhaps should do on the fly? idx<-replicate(resamp.num,sample(1:N,size=subSize)) #each column a set of indices for the subsample. - perSample<-function(ids){ - # xWithIds<-switch(input,"X"=x[,ids,drop=FALSE],"diss"=x,"both"=x[,ids,drop=FALSE]) - # dissWithIds<-switch(input,"X"=diss,"diss"=diss[ids,ids,drop=FALSE],"both"=diss[ids,ids,drop=FALSE]) - xWithIds<-x[,ids,drop=FALSE] - #result<-do.call(clusterFunction,c(list(x=xWithIds,diss=dissWithIds,k=k),clusterArgs)) - result<-do.call(clusterFunction,c(list(x=xWithIds,k=k),clusterArgs)) - #if(classifyMethod=="All") classX<-classifyFunction(x=x,diss=diss,result) - if(classifyMethod=="All") classX<-classifyFunction(x=x,result) - if(classifyMethod=="OutOfSample"){ - # xWithoutIds<-switch(input,"X"=x[,-ids,drop=FALSE],"diss"=x,"both"=x[,-ids,drop=FALSE]) - # dissWithoutIds<-switch(input,"X"=diss,"diss"=diss[-ids,-ids,drop=FALSE],"both"=diss[-ids,-ids,drop=FALSE]) - xWithoutIds<-x[,-ids,drop=FALSE] - #classElse<-classifyFunction(x=xWithoutIds, diss=dissWithoutIds,result) - classElse<-classifyFunction(x=xWithoutIds, result) - classX<-rep(NA,N) - classX[-ids]<-classElse - } - if(classifyMethod=="InSample"){ - classX<-rep(NA,N) - classX[ids]<-result$clustering - } - if(!largeDataset){ #current implementation - - D <- outer(classX, classX, function(a, b) a == b) - Dinclude<-matrix(1,N,N) - whNA<-which(is.na(classX)) - if(length(whNA)>0){ - Dinclude[whNA,]<-0 #don't add them to the denominator either - Dinclude[,whNA]<-0 - D[whNA,]<-0 #don't add to sum - D[,whNA]<-0 - } - return(list(D=D,Dinclude=Dinclude)) - } - else{ - #instead return one vector of indices and another indicating length of each cluster - #vector, where ids in clusters are adjacent - clusterIds<-unlist(tapply(1:N,classX,function(x){x},simplify=FALSE)) - clusterLengths<-tapply(1:N,classX,length) - return(list(clusterIds=clusterIds,clusterLengths=clusterLengths)) - } - } - - if(ncores==1){ - DList<-apply(idx,2,perSample) - } - else{ - DList<-parallel::mclapply(1:ncol(idx),function(nc){perSample(idx[,nc])},mc.cores=ncores,...) - } - #N large: get rid of these big matrices from memory - rm(idx) - rm(x) - gc() - if(!largeDataset){ - DDenom<-Reduce("+",lapply(DList,function(y){y$Dinclude})) - DNum<-Reduce("+",lapply(DList,function(y){y$D})) - Dbar = DNum/DDenom - } - else{ - otherIds<-function(idx,clustVec,clustLeng){ - m<-which(clustVec==idx) - if(length(m)>1) stop("ids clustered in more than one cluster") - if(length(m)==0) return(NA) #sample not ever clustered - if(length(m)==1){ - ends<-cumsum(clustLeng) - begins<-cumsum(c(1,head(clustLeng,-1))) - whCluster<-which(m<=ends & m>=begins) - if(length(whCluster)>1 | length(whCluster)==0) stop("error in coding: finding range of clusterids") - return(clustVec[seq(begins[whCluster],ends[whCluster],by=1)]) - } - } - #Test: otherIds(5,DList[[1]][[1]],DList[[1]][[2]]) - searchForPairs<-function(ii,clusterList){ - #ii is an index. - # clusterList is a list of all the subsampled returns from the perSample - ## only search for pairs with index greater than ii - - whHave<-which(sapply(clusterList,function(ll){ii%in%ll$clusterIds})) - clusterWith<-lapply(clusterList[whHave],function(ll){ - otherIds(idx=ii,clustVec=ll$clusterIds,clustLeng=ll$clusterLengths) - }) - clusterWithTab<-table(unlist(clusterWith)) - sampledWithTab<-table(unlist(sapply(clusterList[whHave],.subset2,"clusterIds"))) - #jointNames<-names(sampledWithTab) #if manage to not save NxN matrix, could use this to return only those that actually present - jointNames<-as.character(1:N) - out<-cbind(idx=as.integer(as.numeric(jointNames)),together=as.integer(clusterWithTab[jointNames]),total=as.integer(sampledWithTab[jointNames])) - out<-out[out[,"idx"]0){ + Dinclude[whNA,]<-0 #don't add them to the denominator either + Dinclude[,whNA]<-0 + D[whNA,]<-0 #don't add to sum + D[,whNA]<-0 + } + return(list(D=D,Dinclude=Dinclude)) + } + if(ncores==1){ + DList<-apply(idx,2,perSample) + } + else{ + DList<-parallel::mclapply(1:ncol(idx),function(nc){perSample(idx[,nc])},mc.cores=ncores,...) + } + DDenom<-Reduce("+",lapply(DList,function(y){y$Dinclude})) + DNum<-Reduce("+",lapply(DList,function(y){y$D})) + Dbar = DNum/DDenom # if(input %in% c("X","both")) rownames(Dbar)<-colnames(Dbar)<-colnames(x) # else rownames(Dbar)<-colnames(Dbar)<-colnames(diss) - rownames(Dbar)<-colnames(Dbar)<-colnames(x) + rownames(Dbar)<-colnames(Dbar)<-colnames(x) return(Dbar) } diff --git a/man/subsampleClustering.Rd b/man/subsampleClustering.Rd index 5c4effbf..01b12625 100644 --- a/man/subsampleClustering.Rd +++ b/man/subsampleClustering.Rd @@ -1,9 +1,22 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/subsampleClustering.R +% Please edit documentation in R/featureSubsample.R, R/subsampleClustering.R, +% R/tempsubsample.R \name{subsampleClustering} \alias{subsampleClustering} +\alias{subsampleClustering} +\alias{subsampleClustering} \title{Cluster subsamples of the data} \usage{ +subsampleClustering(x, k, clusterFunction = "pam", clusterArgs = NULL, + classifyMethod = c("All", "InSample", "OutOfSample"), + classifyFunction = NULL, largeDataset = FALSE, resamp.num = 100, + samp.p = 0.7, ncores = 1, ...) + +subsampleClustering(x, k, clusterFunction = "pam", clusterArgs = NULL, + classifyMethod = c("All", "InSample", "OutOfSample"), + classifyFunction = NULL, largeDataset = FALSE, resamp.num = 100, + samp.p = 0.7, ncores = 1, ...) + subsampleClustering(x, k, clusterFunction = "pam", clusterArgs = NULL, classifyMethod = c("All", "InSample", "OutOfSample"), classifyFunction = NULL, largeDataset = FALSE, resamp.num = 100, @@ -42,12 +55,92 @@ and new data points, will classify the new data points into a cluster.} \item{ncores}{integer giving the number of cores. If ncores>1, mclapply will be called.} +\item{...}{arguments passed to mclapply (if ncores>1).} + +\item{x}{the data on which to run the clustering (samples in columns).} + +\item{k}{number of clusters to find for each clustering of a subsample +(passed to clusterFunction).} + +\item{clusterFunction}{a function that clusters a \code{p x n} matrix of +data. Can also be given character values 'pam' or 'kmeans' to indicate use +of internal wrapper functions. Must accept arguments 'x' and 'k' (whether +uses them or not). See Details for format of what must return.} + +\item{clusterArgs}{a list of parameter arguments to be passed to +clusterFunction.} + +\item{resamp.num}{the number of subsamples to draw.} + +\item{samp.p}{the proportion of samples to sample for each subsample.} + +\item{classifyMethod}{method for determining which samples should be used in +the co-occurance matrix. "All"= all samples, "OutOfSample"= those not +subsampled, and "InSample"=those in the subsample. "All" and "OutOfSample" +require that you provide classifyFunction to define how to classify those +samples not in the subsample into a cluster. If "All" is chosen, all +samples will be classified into clusters via the classifyFunctions, not +just those that are out-of-sample. Note if not choose 'All' possible to get +NAs in resulting D matrix (particularly if not enough subsamples taken).} + +\item{classifyFunction}{a function which, given the output of clusterFunction +and new data points, will classify the new data points into a cluster.} + +\item{ncores}{integer giving the number of cores. If ncores>1, mclapply will +be called.} + +\item{...}{arguments passed to mclapply (if ncores>1).} + +\item{x}{the data on which to run the clustering (samples in columns).} + +\item{k}{number of clusters to find for each clustering of a subsample +(passed to clusterFunction).} + +\item{clusterFunction}{a function that clusters a \code{p x n} matrix of +data. Can also be given character values 'pam' or 'kmeans' to indicate use +of internal wrapper functions. Must accept arguments 'x' and 'k' (whether +uses them or not). See Details for format of what must return.} + +\item{clusterArgs}{a list of parameter arguments to be passed to +clusterFunction.} + +\item{resamp.num}{the number of subsamples to draw.} + +\item{samp.p}{the proportion of samples to sample for each subsample.} + +\item{classifyMethod}{method for determining which samples should be used in +the co-occurance matrix. "All"= all samples, "OutOfSample"= those not +subsampled, and "InSample"=those in the subsample. "All" and "OutOfSample" +require that you provide classifyFunction to define how to classify those +samples not in the subsample into a cluster. If "All" is chosen, all +samples will be classified into clusters via the classifyFunctions, not +just those that are out-of-sample. Note if not choose 'All' possible to get +NAs in resulting D matrix (particularly if not enough subsamples taken).} + +\item{classifyFunction}{a function which, given the output of clusterFunction +and new data points, will classify the new data points into a cluster.} + +\item{ncores}{integer giving the number of cores. If ncores>1, mclapply will +be called.} + \item{...}{arguments passed to mclapply (if ncores>1).} } \value{ +A \code{n x n} matrix of co-occurances. + +A \code{n x n} matrix of co-occurances. + A \code{n x n} matrix of co-occurances. } \description{ +Given a data matrix, this function will subsample the rows +(samples), cluster the subsamples, and return a \code{n x n} matrix with the +probability of co-occurance. + +Given a data matrix, this function will subsample the rows +(samples), cluster the subsamples, and return a \code{n x n} matrix with the +probability of co-occurance. + Given a data matrix, this function will subsample the rows (samples), cluster the subsamples, and return a \code{n x n} matrix with the probability of co-occurance. @@ -65,6 +158,42 @@ The \code{clusterFunction} must be a function that takes as an classifyFunction arguments. Additional arguments should be supplied via clusterArgs. +The classifyFunction should take as an object a data matrix 'x' with + samples on the columns, and the output of the clusterFunction. Note that the + function should assume that the input 'x' is not the same samples that were + input to the clusterFunction (but can assume that it is the same number of + features/columns). + +The \code{clusterFunction} must be a function that takes as an + argument 'x' which is a \code{p x n} matrix of data and integer 'k'. It + minimally must return a list with element named 'clustering' giving the + vector of cluster ids. To be incorporated with the larger hierarchy, it + should be list with elements of a partition object, just as is returned by + \code{\link[cluster]{pam}}. Generally, the user will need to write a + wrapper function to do this. In the case of pam or kmeans, the user can + identify clusterFunction as "pam" or "kmeans", and the package functions + will use internally written wrappers for the clusterFunction and + classifyFunction arguments. Additional arguments should be supplied via + clusterArgs. + +The classifyFunction should take as an object a data matrix 'x' with + samples on the columns, and the output of the clusterFunction. Note that the + function should assume that the input 'x' is not the same samples that were + input to the clusterFunction (but can assume that it is the same number of + features/columns). + +The \code{clusterFunction} must be a function that takes as an + argument 'x' which is a \code{p x n} matrix of data and integer 'k'. It + minimally must return a list with element named 'clustering' giving the + vector of cluster ids. To be incorporated with the larger hierarchy, it + should be list with elements of a partition object, just as is returned by + \code{\link[cluster]{pam}}. Generally, the user will need to write a + wrapper function to do this. In the case of pam or kmeans, the user can + identify clusterFunction as "pam" or "kmeans", and the package functions + will use internally written wrappers for the clusterFunction and + classifyFunction arguments. Additional arguments should be supplied via + clusterArgs. + The classifyFunction should take as an object a data matrix 'x' with samples on the columns, and the output of the clusterFunction. Note that the function should assume that the input 'x' is not the same samples that were @@ -77,5 +206,17 @@ data(simData) subD <- subsampleClustering(t(simData), k=3, clusterFunction="kmeans", clusterArgs=list(nstart=10), resamp.n=100, samp.p=0.7) +heatmap(subD) +data(simData) + +subD <- subsampleClustering(t(simData), k=3, clusterFunction="kmeans", +clusterArgs=list(nstart=10), resamp.n=100, samp.p=0.7) + +heatmap(subD) +data(simData) + +subD <- subsampleClustering(t(simData), k=3, clusterFunction="kmeans", +clusterArgs=list(nstart=10), resamp.n=100, samp.p=0.7) + heatmap(subD) } From b897af53845ebc34c871d207ffad137b5053174b Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Thu, 11 May 2017 14:25:00 -0700 Subject: [PATCH 13/21] start some notes on how to change plot for merge clusters --- R/mergeClusters.R | 13 ++++++++++++- R/plotHeatmap.R | 4 ++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/R/mergeClusters.R b/R/mergeClusters.R index 596d7eff..9f7b6f79 100644 --- a/R/mergeClusters.R +++ b/R/mergeClusters.R @@ -198,7 +198,7 @@ setMethod(f = "mergeClusters", } ) -.plotMerge<-function(dendro,mergeOutput,plotType,mergeMethod,clusterLegendMat=NULL,...){ +.plotMerge<-function(dendro,mergeOutput,plotType,mergeMethod,clusterLegendMat=NULL,dendroSamples=NULL,...){ sigInfo<-mergeOutput$propDE whToMerge<-which(sigInfo$Merged) nodesToMerge<-sigInfo$Node[whToMerge] @@ -247,6 +247,17 @@ setMethod(f = "mergeClusters", ape::plot.phylo(phyloObj, show.node=TRUE, edge.lty=edgeLty, tip.color=tip.color,...) } } +## If want to try to add plotCluster information, from example of phydataplot in ape package: +# > ## change the aspect: +# > plot(tr, x.lim = 35, align.tip = TRUE, adj = 1) +# > phydataplot(x, tr, "m", 2, width = 2, border = "white", lwd = 3, legend = "side") +# > ## user-defined colour: +# > f <- function(n) c("yellow", "blue", "red") +# > phydataplot(x, tr, "m", 18, width = 2, border = "white", lwd = 3, +# + legend = "side", funcol = f) +## Would need to add the individual samples to the tree for it to work. +## Note that clusterExperiemnt version does a separate call to .plotMerge, so can easily pull the sample dendrogram from the object... + #' @rdname mergeClusters #' @export diff --git a/R/plotHeatmap.R b/R/plotHeatmap.R index b2c7f832..db6ee3b6 100644 --- a/R/plotHeatmap.R +++ b/R/plotHeatmap.R @@ -608,6 +608,10 @@ setMethod( ###Make sampleData explicitly factors, except for whSampleDataCont ###(not sure why this simpler code doesn't give back data.frame with factors: annCol<-apply(annCol,2,function(x){factor(x)})) #browser() + #check that no ordered factors... + anyOrdered<-sapply(1:ncol(sampleData),function(ii){is.ordered(sampleData[,ii])}) + if(any(anyOrdered)) stop("The function aheatmap in the NMF package that is called to create the heatmap does not currently accept ordered factors (https://github.com/renozao/NMF/issues/83)") + tmpDf<-do.call("data.frame",lapply(1:ncol(sampleData),function(ii){factor(sampleData[,ii])})) names(tmpDf)<-colnames(sampleData) if(!is.null(whSampleDataCont)){ From 435ec72e120dbbc7efdada7c0d77eb0e9303c97d Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Sat, 13 May 2017 10:30:36 -0700 Subject: [PATCH 14/21] add -9001 to version --- DESCRIPTION | 2 +- R/mergeClusters.R | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6593d098..4f7f39e5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: clusterExperiment Title: Compare Clusterings for Single-Cell Sequencing -Version: 1.2.0 +Version: 1.2.0-9001 Description: Provides functionality for running and comparing many different clusterings of single-cell sequencing data or other large mRNA Expression data sets. Authors@R: c(person("Elizabeth", "Purdom", email = "epurdom@stat.berkeley.edu", diff --git a/R/mergeClusters.R b/R/mergeClusters.R index 596d7eff..f70af283 100644 --- a/R/mergeClusters.R +++ b/R/mergeClusters.R @@ -247,6 +247,21 @@ setMethod(f = "mergeClusters", ape::plot.phylo(phyloObj, show.node=TRUE, edge.lty=edgeLty, tip.color=tip.color,...) } } +# from ape package. +# ## use type = "mosaic" on a 30x5 matrix: +# tr <- rtree(n <- 30) +# p <- 5 +# x <- matrix(sample(3, size = n*p, replace = TRUE), n, p) +# dimnames(x) <- list(paste0("t", 1:n), LETTERS[1:p]) +# plot(tr, x.lim = 35, align.tip = TRUE, adj = 1) +# phydataplot(x, tr, "m", 2) +# ## change the aspect: +# plot(tr, x.lim = 35, align.tip = TRUE, adj = 1) +# phydataplot(x, tr, "m", 2, width = 2, border = "white", lwd = 3, legend = "side") +# ## user-defined colour: +# f <- function(n) c("yellow", "blue", "red") +# phydataplot(x, tr, "m", 18, width = 2, border = "white", lwd = 3, +# legend = "side", funcol = f) #' @rdname mergeClusters #' @export From ae8f215205426ac8215476d32cc05e415598d053 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Sat, 13 May 2017 10:35:49 -0700 Subject: [PATCH 15/21] fix version to fix devel in bioconductor --- DESCRIPTION | 2 +- NEWS | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4f7f39e5..aaacf277 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: clusterExperiment Title: Compare Clusterings for Single-Cell Sequencing -Version: 1.2.0-9001 +Version: 1.3.0-9001 Description: Provides functionality for running and comparing many different clusterings of single-cell sequencing data or other large mRNA Expression data sets. Authors@R: c(person("Elizabeth", "Purdom", email = "epurdom@stat.berkeley.edu", diff --git a/NEWS b/NEWS index 2460b029..c29a4b47 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Changes in version 1.2.0-9001 ( Release date: ) +Changes in version 1.3.0-9001 ( Release date: ) ============== Changes: * `plotHeatmap` accepts `data.frame` or `ExpressionSet` objects for the data argument (calls `data.matrix` or `exprs` on object and sends to matrix version) From 648ef8c6daf92c021cdc61c3d211fbef913aeab7 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Wed, 24 May 2017 12:04:10 -0700 Subject: [PATCH 16/21] fix wrong subsampling .Rd file --- man/subsampleClustering.Rd | 81 +++----------------------------------- 1 file changed, 5 insertions(+), 76 deletions(-) diff --git a/man/subsampleClustering.Rd b/man/subsampleClustering.Rd index 01b12625..b7804267 100644 --- a/man/subsampleClustering.Rd +++ b/man/subsampleClustering.Rd @@ -1,26 +1,19 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/featureSubsample.R, R/subsampleClustering.R, -% R/tempsubsample.R +% Please edit documentation in R/featureSubsample.R, R/subsampleClustering.R \name{subsampleClustering} \alias{subsampleClustering} \alias{subsampleClustering} -\alias{subsampleClustering} \title{Cluster subsamples of the data} \usage{ subsampleClustering(x, k, clusterFunction = "pam", clusterArgs = NULL, classifyMethod = c("All", "InSample", "OutOfSample"), - classifyFunction = NULL, largeDataset = FALSE, resamp.num = 100, - samp.p = 0.7, ncores = 1, ...) + classifyFunction = NULL, resamp.num = 100, samp.p = 0.7, ncores = 1, + ...) subsampleClustering(x, k, clusterFunction = "pam", clusterArgs = NULL, classifyMethod = c("All", "InSample", "OutOfSample"), - classifyFunction = NULL, largeDataset = FALSE, resamp.num = 100, - samp.p = 0.7, ncores = 1, ...) - -subsampleClustering(x, k, clusterFunction = "pam", clusterArgs = NULL, - classifyMethod = c("All", "InSample", "OutOfSample"), - classifyFunction = NULL, largeDataset = FALSE, resamp.num = 100, - samp.p = 0.7, ncores = 1, ...) + classifyFunction = NULL, resamp.num = 100, samp.p = 0.7, ncores = 1, + ...) } \arguments{ \item{x}{the data on which to run the clustering (samples in columns).} @@ -89,47 +82,11 @@ and new data points, will classify the new data points into a cluster.} \item{ncores}{integer giving the number of cores. If ncores>1, mclapply will be called.} -\item{...}{arguments passed to mclapply (if ncores>1).} - -\item{x}{the data on which to run the clustering (samples in columns).} - -\item{k}{number of clusters to find for each clustering of a subsample -(passed to clusterFunction).} - -\item{clusterFunction}{a function that clusters a \code{p x n} matrix of -data. Can also be given character values 'pam' or 'kmeans' to indicate use -of internal wrapper functions. Must accept arguments 'x' and 'k' (whether -uses them or not). See Details for format of what must return.} - -\item{clusterArgs}{a list of parameter arguments to be passed to -clusterFunction.} - -\item{resamp.num}{the number of subsamples to draw.} - -\item{samp.p}{the proportion of samples to sample for each subsample.} - -\item{classifyMethod}{method for determining which samples should be used in -the co-occurance matrix. "All"= all samples, "OutOfSample"= those not -subsampled, and "InSample"=those in the subsample. "All" and "OutOfSample" -require that you provide classifyFunction to define how to classify those -samples not in the subsample into a cluster. If "All" is chosen, all -samples will be classified into clusters via the classifyFunctions, not -just those that are out-of-sample. Note if not choose 'All' possible to get -NAs in resulting D matrix (particularly if not enough subsamples taken).} - -\item{classifyFunction}{a function which, given the output of clusterFunction -and new data points, will classify the new data points into a cluster.} - -\item{ncores}{integer giving the number of cores. If ncores>1, mclapply will -be called.} - \item{...}{arguments passed to mclapply (if ncores>1).} } \value{ A \code{n x n} matrix of co-occurances. -A \code{n x n} matrix of co-occurances. - A \code{n x n} matrix of co-occurances. } \description{ @@ -137,10 +94,6 @@ Given a data matrix, this function will subsample the rows (samples), cluster the subsamples, and return a \code{n x n} matrix with the probability of co-occurance. -Given a data matrix, this function will subsample the rows -(samples), cluster the subsamples, and return a \code{n x n} matrix with the -probability of co-occurance. - Given a data matrix, this function will subsample the rows (samples), cluster the subsamples, and return a \code{n x n} matrix with the probability of co-occurance. @@ -176,24 +129,6 @@ The \code{clusterFunction} must be a function that takes as an classifyFunction arguments. Additional arguments should be supplied via clusterArgs. -The classifyFunction should take as an object a data matrix 'x' with - samples on the columns, and the output of the clusterFunction. Note that the - function should assume that the input 'x' is not the same samples that were - input to the clusterFunction (but can assume that it is the same number of - features/columns). - -The \code{clusterFunction} must be a function that takes as an - argument 'x' which is a \code{p x n} matrix of data and integer 'k'. It - minimally must return a list with element named 'clustering' giving the - vector of cluster ids. To be incorporated with the larger hierarchy, it - should be list with elements of a partition object, just as is returned by - \code{\link[cluster]{pam}}. Generally, the user will need to write a - wrapper function to do this. In the case of pam or kmeans, the user can - identify clusterFunction as "pam" or "kmeans", and the package functions - will use internally written wrappers for the clusterFunction and - classifyFunction arguments. Additional arguments should be supplied via - clusterArgs. - The classifyFunction should take as an object a data matrix 'x' with samples on the columns, and the output of the clusterFunction. Note that the function should assume that the input 'x' is not the same samples that were @@ -212,11 +147,5 @@ data(simData) subD <- subsampleClustering(t(simData), k=3, clusterFunction="kmeans", clusterArgs=list(nstart=10), resamp.n=100, samp.p=0.7) -heatmap(subD) -data(simData) - -subD <- subsampleClustering(t(simData), k=3, clusterFunction="kmeans", -clusterArgs=list(nstart=10), resamp.n=100, samp.p=0.7) - heatmap(subD) } From fd098edef1db1e1a2bcba3c6125ab1cc56c0135c Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Wed, 24 May 2017 12:33:02 -0700 Subject: [PATCH 17/21] update documentation files --- R/AllHelper.R | 9 +++++++++ R/plotBarplot.R | 16 ++-------------- R/plotClusters.R | 13 +++++++------ man/ClusterExperiment-methods.Rd | 10 ++++++++++ man/plotBarplot.Rd | 18 +----------------- man/plotClusters.Rd | 13 +++++++------ 6 files changed, 36 insertions(+), 43 deletions(-) diff --git a/R/AllHelper.R b/R/AllHelper.R index 8998e5fb..65f56e84 100644 --- a/R/AllHelper.R +++ b/R/AllHelper.R @@ -183,6 +183,15 @@ setMethod( ) #' @rdname ClusterExperiment-methods +#' @param whichClusters optional argument that can be either numeric or +#' character value. If numeric, gives the indices of the \code{clusterMatrix} +#' to return; this can also be used to defined an ordering for the +#' clusterings. \code{whichClusters} can be a character value identifying the +#' \code{clusterTypes} to be used, or if not matching \code{clusterTypes} then +#' \code{clusterLabels}; alternatively \code{whichClusters} can be either +#' 'all' or 'workflow' to indicate choosing all clusters or choosing all +#' \code{\link{workflowClusters}}. If missing, the entire matrix of all +#' clusterings is returned. #' @return \code{clusterMatrix} returns the matrix with all the clusterings. #' @export #' @aliases clusterMatrix diff --git a/R/plotBarplot.R b/R/plotBarplot.R index 117df38b..33957854 100644 --- a/R/plotBarplot.R +++ b/R/plotBarplot.R @@ -5,21 +5,9 @@ #' #' @aliases plotBarplot #' @docType methods +#' @inheritParams plotClusters #' @param clusters A matrix of with each column corresponding to a clustering -#' and each row a sample or a \code{\link{ClusterExperiment}} object. If a -#' matrix, the function will plot the clusterings in order of this matrix, and -#' their order influences the plot greatly. -#' @param whichClusters If numeric, a predefined order for the clusterings in -#' the plot. If x is a \code{\link{ClusterExperiment}} object, -#' \code{whichClusters} can be a character value identifying the -#' \code{clusterTypess} to be used; alternatively \code{whichClusters} -#' can be either 'all' or 'workflow' to indicate choosing all clusters or -#' choosing all \code{\link{workflowClusters}}. -#' @param unassignedColor If ``-1'' in \code{clusters}, will be given this color -#' (meant for samples not assigned to cluster). -#' @param missingColor If ``-2'' in clusters, will be given this color (meant -#' for samples that were missing from the clustering, mainly when comparing -#' clusterings run on different sets of samples) +#' and each row a sample or a \code{\link{ClusterExperiment}} object. #' @param colPalette a vector of colors used for the different clusters. Must be #' as long as the maximum number of clusters found in any single #' clustering/column given in \code{clusters} or will otherwise return an diff --git a/R/plotClusters.R b/R/plotClusters.R index 5357a501..18aabb41 100644 --- a/R/plotClusters.R +++ b/R/plotClusters.R @@ -9,12 +9,13 @@ #' and each row a sample or a \code{\link{ClusterExperiment}} object. If a #' matrix, the function will plot the clusterings in order of this matrix, and #' their order influences the plot greatly. -#' @param whichClusters If numeric, a predefined order for the clusterings in -#' the plot. If x is a \code{\link{ClusterExperiment}} object, -#' \code{whichClusters} can be a character value identifying the -#' \code{clusterTypess} to be used; alternatively \code{whichClusters} -#' can be either 'all' or 'workflow' to indicate choosing all clusters or -#' choosing all \code{\link{workflowClusters}}. +#' @param whichClusters If numeric, a predefined order for the clusterings in +#' the plot. If x is a \code{\link{ClusterExperiment}} object, +#' \code{whichClusters} can be a character value identifying the +#' \code{clusterTypes} to be used, or if not matching \code{clusterTypes} then +#' \code{clusterLabels}; alternatively \code{whichClusters} can be either +#' 'all' or 'workflow' to indicate choosing all clusters or choosing all +#' \code{\link{workflowClusters}}. #' @param orderSamples A predefined order in which the samples will be plotted. #' Otherwise the order will be found internally by aligning the clusters #' (assuming \code{input="clusters"}) diff --git a/man/ClusterExperiment-methods.Rd b/man/ClusterExperiment-methods.Rd index 96295e4e..0d56e984 100644 --- a/man/ClusterExperiment-methods.Rd +++ b/man/ClusterExperiment-methods.Rd @@ -114,6 +114,16 @@ \item{..., i, j, drop}{Forwarded to the \code{\link[SummarizedExperiment]{SummarizedExperiment}} method.} +\item{whichClusters}{optional argument that can be either numeric or +character value. If numeric, gives the indices of the \code{clusterMatrix} +to return; this can also be used to defined an ordering for the +clusterings. \code{whichClusters} can be a character value identifying the +\code{clusterTypes} to be used, or if not matching \code{clusterTypes} then +\code{clusterLabels}; alternatively \code{whichClusters} can be either +'all' or 'workflow' to indicate choosing all clusters or choosing all +\code{\link{workflowClusters}}. If missing, the entire matrix of all +clusterings is returned.} + \item{value}{The value to be substituted in the corresponding slot. See the slot descriptions in \code{\link{ClusterExperiment}} for details on what objects may be passed to these functions.} diff --git a/man/plotBarplot.Rd b/man/plotBarplot.Rd index 587e5e48..b592e1a3 100644 --- a/man/plotBarplot.Rd +++ b/man/plotBarplot.Rd @@ -30,16 +30,7 @@ } \arguments{ \item{clusters}{A matrix of with each column corresponding to a clustering -and each row a sample or a \code{\link{ClusterExperiment}} object. If a -matrix, the function will plot the clusterings in order of this matrix, and -their order influences the plot greatly.} - -\item{whichClusters}{If numeric, a predefined order for the clusterings in -the plot. If x is a \code{\link{ClusterExperiment}} object, -\code{whichClusters} can be a character value identifying the -\code{clusterTypess} to be used; alternatively \code{whichClusters} -can be either 'all' or 'workflow' to indicate choosing all clusters or -choosing all \code{\link{workflowClusters}}.} +and each row a sample or a \code{\link{ClusterExperiment}} object.} \item{...}{for \code{plotBarplot} arguments passed either to the method of \code{plotBarplot} for matrices or ultimately to \code{\link{barplot}}.} @@ -63,13 +54,6 @@ the 1st cluster of clusters matrix} \item{legend.title}{label for legend. By default or if equal NULL the column name of the 2st cluster of clusters matrix} -\item{unassignedColor}{If ``-1'' in \code{clusters}, will be given this color -(meant for samples not assigned to cluster).} - -\item{missingColor}{If ``-2'' in clusters, will be given this color (meant -for samples that were missing from the clustering, mainly when comparing -clusterings run on different sets of samples)} - \item{colPalette}{a vector of colors used for the different clusters. Must be as long as the maximum number of clusters found in any single clustering/column given in \code{clusters} or will otherwise return an diff --git a/man/plotClusters.Rd b/man/plotClusters.Rd index f1e00541..585a2c64 100644 --- a/man/plotClusters.Rd +++ b/man/plotClusters.Rd @@ -32,12 +32,13 @@ and each row a sample or a \code{\link{ClusterExperiment}} object. If a matrix, the function will plot the clusterings in order of this matrix, and their order influences the plot greatly.} -\item{whichClusters}{If numeric, a predefined order for the clusterings in -the plot. If x is a \code{\link{ClusterExperiment}} object, -\code{whichClusters} can be a character value identifying the -\code{clusterTypess} to be used; alternatively \code{whichClusters} -can be either 'all' or 'workflow' to indicate choosing all clusters or -choosing all \code{\link{workflowClusters}}.} +\item{whichClusters}{If numeric, a predefined order for the clusterings in +the plot. If x is a \code{\link{ClusterExperiment}} object, +\code{whichClusters} can be a character value identifying the +\code{clusterTypes} to be used, or if not matching \code{clusterTypes} then +\code{clusterLabels}; alternatively \code{whichClusters} can be either +'all' or 'workflow' to indicate choosing all clusters or choosing all +\code{\link{workflowClusters}}.} \item{...}{for \code{plotClusters} arguments passed either to the method of \code{plotClusters} for matrices, or ultimately to \code{\link{plot}} From 255514f940136ab86f98caf8a4b211fd9bf73a41 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Wed, 24 May 2017 12:35:52 -0700 Subject: [PATCH 18/21] update subsampling documentation again --- man/subsampleClustering.Rd | 72 +------------------------------------- 1 file changed, 1 insertion(+), 71 deletions(-) diff --git a/man/subsampleClustering.Rd b/man/subsampleClustering.Rd index b7804267..14ec3a8a 100644 --- a/man/subsampleClustering.Rd +++ b/man/subsampleClustering.Rd @@ -1,15 +1,9 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/featureSubsample.R, R/subsampleClustering.R +% Please edit documentation in R/subsampleClustering.R \name{subsampleClustering} \alias{subsampleClustering} -\alias{subsampleClustering} \title{Cluster subsamples of the data} \usage{ -subsampleClustering(x, k, clusterFunction = "pam", clusterArgs = NULL, - classifyMethod = c("All", "InSample", "OutOfSample"), - classifyFunction = NULL, resamp.num = 100, samp.p = 0.7, ncores = 1, - ...) - subsampleClustering(x, k, clusterFunction = "pam", clusterArgs = NULL, classifyMethod = c("All", "InSample", "OutOfSample"), classifyFunction = NULL, resamp.num = 100, samp.p = 0.7, ncores = 1, @@ -48,52 +42,12 @@ and new data points, will classify the new data points into a cluster.} \item{ncores}{integer giving the number of cores. If ncores>1, mclapply will be called.} -\item{...}{arguments passed to mclapply (if ncores>1).} - -\item{x}{the data on which to run the clustering (samples in columns).} - -\item{k}{number of clusters to find for each clustering of a subsample -(passed to clusterFunction).} - -\item{clusterFunction}{a function that clusters a \code{p x n} matrix of -data. Can also be given character values 'pam' or 'kmeans' to indicate use -of internal wrapper functions. Must accept arguments 'x' and 'k' (whether -uses them or not). See Details for format of what must return.} - -\item{clusterArgs}{a list of parameter arguments to be passed to -clusterFunction.} - -\item{resamp.num}{the number of subsamples to draw.} - -\item{samp.p}{the proportion of samples to sample for each subsample.} - -\item{classifyMethod}{method for determining which samples should be used in -the co-occurance matrix. "All"= all samples, "OutOfSample"= those not -subsampled, and "InSample"=those in the subsample. "All" and "OutOfSample" -require that you provide classifyFunction to define how to classify those -samples not in the subsample into a cluster. If "All" is chosen, all -samples will be classified into clusters via the classifyFunctions, not -just those that are out-of-sample. Note if not choose 'All' possible to get -NAs in resulting D matrix (particularly if not enough subsamples taken).} - -\item{classifyFunction}{a function which, given the output of clusterFunction -and new data points, will classify the new data points into a cluster.} - -\item{ncores}{integer giving the number of cores. If ncores>1, mclapply will -be called.} - \item{...}{arguments passed to mclapply (if ncores>1).} } \value{ -A \code{n x n} matrix of co-occurances. - A \code{n x n} matrix of co-occurances. } \description{ -Given a data matrix, this function will subsample the rows -(samples), cluster the subsamples, and return a \code{n x n} matrix with the -probability of co-occurance. - Given a data matrix, this function will subsample the rows (samples), cluster the subsamples, and return a \code{n x n} matrix with the probability of co-occurance. @@ -111,24 +65,6 @@ The \code{clusterFunction} must be a function that takes as an classifyFunction arguments. Additional arguments should be supplied via clusterArgs. -The classifyFunction should take as an object a data matrix 'x' with - samples on the columns, and the output of the clusterFunction. Note that the - function should assume that the input 'x' is not the same samples that were - input to the clusterFunction (but can assume that it is the same number of - features/columns). - -The \code{clusterFunction} must be a function that takes as an - argument 'x' which is a \code{p x n} matrix of data and integer 'k'. It - minimally must return a list with element named 'clustering' giving the - vector of cluster ids. To be incorporated with the larger hierarchy, it - should be list with elements of a partition object, just as is returned by - \code{\link[cluster]{pam}}. Generally, the user will need to write a - wrapper function to do this. In the case of pam or kmeans, the user can - identify clusterFunction as "pam" or "kmeans", and the package functions - will use internally written wrappers for the clusterFunction and - classifyFunction arguments. Additional arguments should be supplied via - clusterArgs. - The classifyFunction should take as an object a data matrix 'x' with samples on the columns, and the output of the clusterFunction. Note that the function should assume that the input 'x' is not the same samples that were @@ -141,11 +77,5 @@ data(simData) subD <- subsampleClustering(t(simData), k=3, clusterFunction="kmeans", clusterArgs=list(nstart=10), resamp.n=100, samp.p=0.7) -heatmap(subD) -data(simData) - -subD <- subsampleClustering(t(simData), k=3, clusterFunction="kmeans", -clusterArgs=list(nstart=10), resamp.n=100, samp.p=0.7) - heatmap(subD) } From 88d86cb29002fc7b2a3280b762be3df818c92876 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Wed, 24 May 2017 12:49:28 -0700 Subject: [PATCH 19/21] make version 1.3.0 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index aaacf277..f2107bd8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: clusterExperiment Title: Compare Clusterings for Single-Cell Sequencing -Version: 1.3.0-9001 +Version: 1.3.0 Description: Provides functionality for running and comparing many different clusterings of single-cell sequencing data or other large mRNA Expression data sets. Authors@R: c(person("Elizabeth", "Purdom", email = "epurdom@stat.berkeley.edu", From ee8222f3e17c93d823b48787cd03c2e7e3ccafc2 Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Wed, 24 May 2017 12:49:56 -0700 Subject: [PATCH 20/21] fix NEWS date --- NEWS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS b/NEWS index c29a4b47..f585e371 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Changes in version 1.3.0-9001 ( Release date: ) +Changes in version 1.3.0 ( Release date: 2017-05-24 ) ============== Changes: * `plotHeatmap` accepts `data.frame` or `ExpressionSet` objects for the data argument (calls `data.matrix` or `exprs` on object and sends to matrix version) From 453d95b549f313f7c24148126ca22c1401f3f99b Mon Sep 17 00:00:00 2001 From: Elizabeth Purdom Date: Wed, 24 May 2017 12:54:05 -0700 Subject: [PATCH 21/21] fix documentation to plotBarplot --- R/plotBarplot.R | 2 +- man/plotBarplot.Rd | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/R/plotBarplot.R b/R/plotBarplot.R index 33957854..0ac35b83 100644 --- a/R/plotBarplot.R +++ b/R/plotBarplot.R @@ -5,7 +5,6 @@ #' #' @aliases plotBarplot #' @docType methods -#' @inheritParams plotClusters #' @param clusters A matrix of with each column corresponding to a clustering #' and each row a sample or a \code{\link{ClusterExperiment}} object. #' @param colPalette a vector of colors used for the different clusters. Must be @@ -38,6 +37,7 @@ #' clusterMany. #' #' @author Elizabeth Purdom +#' @inheritParams plotClusters,ClusterExperiment,character-method #' @export #' diff --git a/man/plotBarplot.Rd b/man/plotBarplot.Rd index b592e1a3..f85b68a5 100644 --- a/man/plotBarplot.Rd +++ b/man/plotBarplot.Rd @@ -32,6 +32,14 @@ \item{clusters}{A matrix of with each column corresponding to a clustering and each row a sample or a \code{\link{ClusterExperiment}} object.} +\item{whichClusters}{If numeric, a predefined order for the clusterings in +the plot. If x is a \code{\link{ClusterExperiment}} object, +\code{whichClusters} can be a character value identifying the +\code{clusterTypes} to be used, or if not matching \code{clusterTypes} then +\code{clusterLabels}; alternatively \code{whichClusters} can be either +'all' or 'workflow' to indicate choosing all clusters or choosing all +\code{\link{workflowClusters}}.} + \item{...}{for \code{plotBarplot} arguments passed either to the method of \code{plotBarplot} for matrices or ultimately to \code{\link{barplot}}.} @@ -54,6 +62,13 @@ the 1st cluster of clusters matrix} \item{legend.title}{label for legend. By default or if equal NULL the column name of the 2st cluster of clusters matrix} +\item{unassignedColor}{If ``-1'' in \code{clusters}, will be given this color +(meant for samples not assigned to cluster).} + +\item{missingColor}{If ``-2'' in clusters, will be given this color (meant +for samples that were missing from the clustering, mainly when comparing +clusterings run on different sets of samples)} + \item{colPalette}{a vector of colors used for the different clusters. Must be as long as the maximum number of clusters found in any single clustering/column given in \code{clusters} or will otherwise return an