# analysisScript2.R
# Script steps:
# incorporate hierarchical clustering into milsomSet
# plot PCA and t-SNE with it
# find the ideal number of clusters
# ... there's so much more you can do!
#
# Libraries used below (plyr provides revalue(); scater / Biobase provide exprs(), pData(), plotPCA() and plotTSNE();
# ggplot2 and reshape2 are only needed if you run the commented-out NbClust section):
library(scater)
library(plyr)
library(ggplot2)
#
# After filtering genes in the previous script, save the object out. Then load it here:
milsomSet <- readRDS("....rds")
#' ## For further analysis: normalized, transformed matrices
#' This is useful for computing clusters, PCA, etc., independently of the scater / scran packages.
milsomexprs <- milsomSet[rownames(hvg.out.all), ]  # subset to the HVGs; values are normalized with scran's deconvolution (Lun et al., Genome Biol 2016)
milsomexprs <- t(exprs(milsomexprs))               # transpose to a cells x genes matrix
# Create a treatment vector for the matrix above, to plot with color = treatment
# (this does not apply to your data, Charles, just leave it out...)
treatment <- revalue(substr(rownames(milsomexprs), 18, 18),
                     c("3" = "pbs",     "4" = "pbs",     "5" = "pbs",
                       "6" = "treated", "7" = "treated", "8" = "treated"))
#' ## Compute clusters of cells:
HC.dist <- dist(milsomexprs, method = "euclidean")        # cell-cell euclidean distances
HC.euclideantree <- hclust(HC.dist, method = "ward.D2")   # Ward's hierarchical clustering
library(dynamicTreeCut)
# cut the tree with the dynamic hybrid method; the result is one cluster label per cell
HC.euclidean <- unname(cutreeDynamic(HC.euclideantree, distM = as.matrix(HC.dist), verbose = 0))
HC.euclidean <- as.factor(HC.euclidean)
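# Optional sanity check: cluster sizes and a quick look at the dendrogram
table(HC.euclidean)
plot(HC.euclideantree, labels = FALSE, main = "Ward.D2 on euclidean distances")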
# Note: below there is a section on how to find the 'ideal number of clusters' with NbClust - cool, but perhaps for later timepoints.
#############################################################################
#' ## Add new information to the milsomSet
#' Here I add the cluster definitions from above (and, if you computed them, the NbClust ones, see below) to the milsomSet.
#' I also add the HVG library size, etc., so they can later be displayed in PCA / DiffusionMaps or correlated with the first components.
# It's good to check library size as a potential confounder interfering with PCA. To be rigorous,
# I also want the library size computed over the 1764 HVGs only:
hvg1764_counts <- rowSums(milsomexprs)  # summed normalized expression over the HVGs, per cell
# number of detected HVGs in each cell:
hvg1764_features <- apply(milsomexprs, 1, function(cell) sum(cell > 0))
# Add the information to milsomSet (NOTE: you can use the 'mutate' function instead of the following complicated code, see https://rdrr.io/bioc/scater/man/mutate.html):
milsomPheno <- pData(milsomSet)
milsomPheno <- cbind(
# add the clustering from above:
HC.euclidean,
# add library sizes, in different versions:
hvg1764_counts, hvg1764_features,
# do the following commented lines only if you followed section 'ideal number of clusters' below...
# # add NbClust's ideal cluster definitions, according to Silhouette index:
# clusters.complete.eucl.s, clusters.ward.eucl.s, clusters.kmeans.eucl.s,
# # add NbClust's ideal cluster definitions, according to Dunn index:
# clusters.complete.eucl.d, clusters.ward.eucl.d, clusters.kmeans.eucl.d, clusters.kmeans.spear.d,clusters.kmeans.spear.s,
# append the previous milsomPheno information as well, of course:
milsomPheno)
pd <- new("AnnotatedDataFrame", data = milsomPheno)
phenoData(milsomSet) <- pd
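# Alternative sketch: the scater 'mutate' helper mentioned above should do the same in one call,
# assuming it accepts name = value pairs like dplyr::mutate (see the link above; untested here):
# milsomSet <- mutate(milsomSet,
#                     HC.euclidean     = HC.euclidean,
#                     hvg1764_counts   = hvg1764_counts,
#                     hvg1764_features = hvg1764_features)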
# confirm it worked:
head(varLabels(milsomSet), n=10)  # should now show "HC.euclidean", "hvg1764_counts", etc.
# Now you can colour by anything inside varLabels(milsomSet) in PCA and t-SNE plots:
plotPCA(milsomSet, feature_set = rownames(hvg.out.all), colour_by = "HC.euclidean")
# plotTSNE( ...) # see ? plotTSNE, it's awesome!
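# For example (a sketch, mirroring the plotTSNE call used further below):
# plotTSNE(milsomSet, feature_set = rownames(hvg.out.all), colour_by = "HC.euclidean")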
# Now Felix is curious - once you're here, send him a screenshot ;)
#############################################################################
#' # Ideal number of clusters with NbClust
# This is perhaps for later analysis steps, not for the first quick look at the data. That's why I commented it out for now.
# library(NbClust)
#
# nbwrapper <- function( dataset=milsomexprs, index, min.nc=2, max.nc=8){
# # nbwrapper computes ideal number of clusters with euclidean matrix (4 different methods) and spearman distance
# # (one method, as the other two do not work for me in this package implementation). The 4 methods are the following:
# # average: "UPGMA"
# # ward.D2: "Ward's method"
# # kmeans
# # complete: "complete-linkage clustering"
# # computation based on euclidean distance, 4 methods:
# c<- NbClust(dataset, distance = "euclidean", min.nc=min.nc, max.nc=max.nc, method = "complete", index = index)
# av<- NbClust(dataset, distance = "euclidean", min.nc=min.nc, max.nc=max.nc, method = "average", index = index)
# wa<- NbClust(dataset, distance = "euclidean", min.nc=min.nc, max.nc=max.nc, method = "ward.D2", index = index)
# k<- NbClust(dataset, distance = "euclidean", min.nc=min.nc, max.nc=max.nc, method = "kmeans", index = index)
# # computation based on spearman distance, 1 method:
# HC.spearman <-cor(t(dataset), method='spearman')
# HC.spearman <- (1-HC.spearman)/2
# ## the following 2 options (s.av, s.wa) do not work with datasets this large. I have tried to do it only on the 217 treated cells,
# # but still get this error:
# # Error in if (is.na(n) || n > 65536L) stop("size cannot be NA nor exceed 65536") :
# # missing value where TRUE/FALSE needed
# # s.av<- NbClust(dataset, diss=HC.spearman, distance = NULL, min.nc=min.nc, max.nc=max.nc, method = "average", index = index)
# # s.wa<- NbClust(dataset, diss=HC.spearman, distance = NULL, min.nc=min.nc, max.nc=max.nc, method = "ward.D2", index = index)
# s.k<- NbClust(dataset, diss=HC.spearman, distance = NULL, min.nc=min.nc, max.nc=max.nc, method = "kmeans", index = index)
# # extract information you want to return: index holds score for each cluster size, clusters holds ideal cluster assignments for each cell
# index.all <- rbind(c$All.index ,av$All.index,wa$All.index,k$All.index,s.k$All.index)# s.av$All.index,s.wa$All.index,
# rownames(index.all) <- c("eucli.complete" ,"eucli.average","eucli.ward","eucli.kmeans", "spear.kmeans")#"spear.av","spear.wa",
# clusters.all <- rbind(c$Best.partition, av$Best.partition,wa$Best.partition,k$Best.partition,s.k$Best.partition)#s.av$Best.partition,s.wa$Best.partition,
# rownames(clusters.all) <- c( "eucli.complete", "eucli.average","eucli.ward","eucli.kmeans", "spear.kmeans")# "spear.av","spear.wa",
# return(list(index.all, clusters.all))
# }
#
#
# # Compute ideal clusters with Dunn and Silhouette indices, or load the pre-computed ones:
# # compute and save:
# # dunn.list <- nbwrapper(milsomexprs, "dunn", max.nc=12)
# # silhouette.list <- nbwrapper(milsomexprs, "silhouette", max.nc=12)
# # saveRDS(dunn.list, file=paste(DIR, "nbclust_dunn-list",format(Sys.time(), "_%Y%m%d-%H:%M."), "rds", sep='') )
# # saveRDS(silhouette.list, file=paste(DIR, "nbclust_silhouette-list",format(Sys.time(), "_%Y%m%d-%H:%M."), "rds", sep='') )
# # load the above:
# dunn.list <- readRDS('/home/felix/PhDother/milsomData/stressedCells/octobersprint/resultsanddata/20161217_Draft1_firstFigs/nbclust_dunn-list_20161217-16:18.rds')
# silhouette.list <- readRDS('/home/felix/PhDother/milsomData/stressedCells/octobersprint/resultsanddata/20161217_Draft1_firstFigs/nbclust_silhouette-list_20161217-16:18.rds')
#
# # plot index scores for each number of clusters:
# # massage score data for dunn index:
# dunn.index <- dunn.list[[1]]
# dunn.index <- scale( t(dunn.index), scale=F)
# dunn.index <- dunn.index[,-2] # exclude average linkage, as all cells except 1 in same cluster (not shown here)
# dunn.index <- melt(dunn.index)
# dunn.index <- cbind(dunn.index, rep("dunn",dim(dunn.index)[1]))
# colnames(dunn.index) <- c("Number_of_Clusters","distance_and_method", "IndexScore", "Index")
# # massage score data for silhouette index:
# silh.index<- silhouette.list[[1]]
# silh.index <- scale( t(silh.index), scale=F)
# silh.index <- silh.index[,-2]# exclude average linkage, as all cells except 1 in same cluster (not shown here)
# silh.index<- melt(silh.index)
# silh.index <- cbind(silh.index, rep("Silhouette", dim(silh.index)[1]))
# colnames(silh.index) <- c("Number_of_Clusters","distance_and_method", "IndexScore", "Index")
# # bring both into same dataframe:
# indices <- rbind(dunn.index, silh.index)
# # plot ideal scores of all:
# indices <- ggplot(indices, aes(x=Number_of_Clusters, y=IndexScore, color=distance_and_method, linetype=Index)) +
#  geom_line() + scale_x_continuous(breaks=as.numeric(2:12)) + ylab("Index Score (centered)")
# #plotit(indices, DIR, "NBclust_1764HVG", knit= knitall)
#
# dunn.clusters <- as.data.frame(t(dunn.list[[2]]))
# silhouette.clusters <- as.data.frame(t(silhouette.list[[2]]))
#
#
# #' I do not show plots for 'average linkage', as the ideal clusters were not meaningful (all cells except 1 in the same cluster)
# #' for both the Dunn and Silhouette indices. They are therefore also left out of all following analyses. The following clusters,
# #' however, are interesting and will be investigated further. To this end, we extract the ideal clusterings and, further below,
# #' add them to the milsomSet.
#
# # Silhouette (column positions in silhouette.clusters):
# #   1 eucli.complete
# #   3 eucli.ward
# #   4 eucli.kmeans
# #   5 spear.kmeans
# clusters.complete.eucl.s <- as.factor(silhouette.clusters[,1])
# clusters.ward.eucl.s <- as.factor(silhouette.clusters[,3])
# clusters.kmeans.eucl.s <- as.factor(silhouette.clusters[,4])
# clusters.kmeans.spear.s <- as.factor(silhouette.clusters[,5])
# # Dunn (column positions in dunn.clusters):
# #   1 eucli.complete
# #   3 eucli.ward
# #   4 eucli.kmeans
# #   5 spear.kmeans
# clusters.complete.eucl.d <- as.factor(dunn.clusters[,1])
# clusters.ward.eucl.d <- as.factor(dunn.clusters[,3])
# clusters.kmeans.eucl.d <- as.factor(dunn.clusters[,4])
# clusters.kmeans.spear.d <- as.factor(dunn.clusters[,5])
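# # A quick way to compare two of these cluster definitions is to cross-tabulate the assignments,
# # e.g. ward vs. kmeans under the Silhouette index (a sketch):
# table(ward = clusters.ward.eucl.s, kmeans = clusters.kmeans.eucl.s)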
#######################################################
#' ## Compare different cluster definitions
#' Visualise the cells in PCA and t-SNE space; colouring by the different cluster definitions lets you compare them.
pc.1764x424 <- prcomp(milsomexprs, center = TRUE, scale. = FALSE)
# extract scores (each cell's coordinates on PC1/2/...) and loadings (genes with large absolute loadings drive a component)
scores.1764x424 <- as.data.frame(pc.1764x424$x)
loads.1764x424 <- as.data.frame(pc.1764x424$rotation)  # coefficients of the linear combinations of variables
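# As mentioned above, check whether library size is a confounder by correlating it with the first PC
# (a quick sketch using the pData columns added earlier):
cor(scores.1764x424$PC1, pData(milsomSet)$hvg1764_counts, method = "spearman")
cor(scores.1764x424$PC1, pData(milsomSet)$hvg1764_features, method = "spearman")
# and a quick look at the cells in PC space, coloured by the hierarchical clusters:
qplot(scores.1764x424$PC1, scores.1764x424$PC2, colour = HC.euclidean, xlab = "PC1", ylab = "PC2")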
#' Compute t-SNE
set.seed(100)  # t-SNE is stochastic; fix the seed so the map is reproducible
tsne.hvg <- plotTSNE(milsomSet, feature_set = rownames(hvg.out.all), colour_by = "hvg1764_features")
tsne.scores <- tsne.hvg$data
stopifnot(all(rownames(tsne.scores) == colnames(milsomSet)))  # make sure the t-SNE rows match the cells in milsomSet
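# The t-SNE coordinates in tsne.scores can now be re-plotted with any colouring, e.g. the
# hierarchical clusters. Sketch below: it assumes the first two columns of tsne.scores hold
# the t-SNE coordinates - check colnames(tsne.scores) first.
# tsne.df <- data.frame(dim1 = tsne.scores[, 1], dim2 = tsne.scores[, 2], cluster = HC.euclidean)
# ggplot(tsne.df, aes(x = dim1, y = dim2, colour = cluster)) + geom_point()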
# ...