-
Notifications
You must be signed in to change notification settings - Fork 1
/
DRUID.Rmd
293 lines (213 loc) · 11.1 KB
/
DRUID.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
---
title: "DRUID analysis of JHU Biobank data"
author:
- affiliation: Sage Bionetworks
affiliation_url: https://sagebionetworks.org/
name: Jineta Banerjee
url: null
date: "`r Sys.Date()`"
output: distill::distill_article
description: |
DRUID analysis of JHU Biobank data
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r lib_synapser, echo=FALSE, eval=TRUE, results='hide', message=FALSE, warning=FALSE}
library(synapser)
library(BiocManager)
library(gProfileR)
library(GOsummaries)
library(tidyverse)
library(ggfortify)
library(GSVA)
library(GSVAdata)
library(biomartr)
library(pheatmap)
library(biomaRt)
library(glue)
library(edgeR)
library(limma)
library(sagethemes)
import_lato()
library(DRUID)
library(cauldron)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(DT)
library(synapser)
synapser::synLogin()
```
## Publicly available data from JHU Biobank
We used the publicly available JHU Biobank RNASeq data stored in project : syn20812185 for this analysis.
```{r download data, eval=TRUE, echo=FALSE, results='hide', message=FALSE, warning=FALSE}
NTAP_published_files <- synapser::synTableQuery(glue::glue("SELECT * FROM syn21221980"))$asDataFrame()
NTAP_pub_rnaseq_data <- synapser::synTableQuery(glue::glue("SELECT * FROM syn20812185"))$asDataFrame() %>%
dplyr::select(totalCounts, Symbol, zScore, specimenID, individualID, sex, tumorType, studyName)
NTAP_pub_rnaseq_data$sex[NTAP_pub_rnaseq_data$sex == "female"] <- "Female"
NTAP_pub_rnaseq_data$sex[NTAP_pub_rnaseq_data$sex == "male"] <- "Male"
NTAP_pub_rnaseq_data$tumorType[NTAP_pub_rnaseq_data$tumorType == "Malignant peripheral nerve sheath tumor"] <- "Malignant Peripheral Nerve Sheath Tumor"
NTAP_pub_rnaseq_data <- NTAP_pub_rnaseq_data %>%
filter(tumorType %in% c("Cutaneous Neurofibroma", "Plexiform Neurofibroma", "Neurofibroma", "Malignant Peripheral Nerve Sheath Tumor"),
!grepl('xenograft', specimenID, ignore.case = T),
!grepl('Cell Line', specimenID, ignore.case = T),
!specimenID %in% c("BI386-004","CW225-001","DW356-002",
"JK368-003", "SK436-005"))
```
We then used TMM normalization to prepare the dataset for differential gene expression analysis.
```{r TMM-LCPM normalization, eval=TRUE, echo=FALSE, results='hide', message=FALSE, warning=FALSE}
gene_mat<-reshape2::acast(unique(NTAP_pub_rnaseq_data),
Symbol~specimenID,
value.var='totalCounts',
fun.aggregate = mean)
# missing<-which(apply(gene_mat,1,function(x) any(is.na(x))))
# gene_mat<-gene_mat[-missing,]
DGE.all <-DGEList(counts=gene_mat) # function from edgeR
```
## Differential gene expression analysis in samples:
We used limma-edgeR based analysis to find differentially expressed genes in the TMM normalized dataset.
A point to note:
* All MPNST samples except one are males while all PNF samples except one are femmales. So there may be a sex related effect on the analysis that we dont have a good way to mitigate.
* The number of MPNST samples are extremely limited (n=4), so these results should be evaluated with more samples when possible
The plot below shows genes that are significantly overexpressed in MPNST compared to PNF (logFC > 4) in red. The dots in blue refer to genes that are significantly underexpressed in MPNST compared to PNF (logFC < -4). The thresholds of fold change have been arbitrarily chosen for ease of visualization in the volcano plot shown below.
```{r DEG, echo=F, eval= TRUE, results='hide', message=FALSE, warning=FALSE, fig.height=20, fig.width=20, fig.cap="Differential expression of genes in MPNST and pNF tumor types. Above shows a volcano plot highlighting significantly upregulated genes in red and downregulated genes in blue"}
#Select annotations
annotes=NTAP_pub_rnaseq_data %>%
dplyr::select(specimenID,sex,tumorType,studyName) %>%
unique()
annotes$tumorType <- as.factor(annotes$tumorType)
annotes$sex <-as.factor(annotes$sex)
rownames(annotes)<-annotes$specimenID
## DEG analysis using limma, Glimma and EdgeR
limma_object <- DGE.all # make new DGElist object so that we dont change the original object
#make design matrix
annotes <- annotes %>%
dplyr::filter(specimenID %in% colnames(gene_mat))
ordered_annotes <- annotes[match(colnames(gene_mat), annotes$specimenID),]
group_object <- ordered_annotes$tumorType
batch <- ordered_annotes$studyName
sex <- ordered_annotes$sex
design_new <- model.matrix(~0+group_object)
colnames(design_new) <- gsub("group_object", "", colnames(design_new)) # modify the colnames to remove "group_object" addition
colnames(design_new) <- gsub(" ", "_", colnames(design_new))
colnames(design_new) <- gsub("-", "_", colnames(design_new))
contr.matrix <- makeContrasts(
MPNSTvsPNF = Malignant_Peripheral_Nerve_Sheath_Tumor - Plexiform_Neurofibroma,
levels = colnames(design_new))
#contr.matrix
## Using voom to make Elist object
#print("Visualizing the mean variance trend of the RNASeq dataset")
voom_object <- voomWithQualityWeights(limma_object, design_new, plot=FALSE) # removing heteroscadisticity
#voom_object$genes <- genes_unique
## Run DGE analysis using lmFit
fit_new <- lmFit(voom_object, design_new)
contrast_fit_new <- contrasts.fit(fit_new, contrasts=contr.matrix)
fit <- eBayes(contrast_fit_new, trend=TRUE)
results_fit_new <- decideTests(fit)
#summary(results_fit_new)
#vennDiagram(results_fit_new[,c(1)], circle.col=c("turquoise", "salmon"))
MPNSTvsPNF <- topTreat(fit, coef=1, n=Inf)
MPNSTvsPNF$Genes <- rownames(MPNSTvsPNF)
MPNSTvsPNF_ordered <- MPNSTvsPNF[order(MPNSTvsPNF$adj.P.Val),]
## Volcano Plots:
MPNSTvsPNF_ordered <-MPNSTvsPNF_ordered %>%
mutate(threshold = ifelse(logFC >= 4 & adj.P.Val <= 0.05,"A",
ifelse(logFC<=-4 & adj.P.Val <= 0.05,
"B", "C")))
ggplot(data=MPNSTvsPNF_ordered, aes(x=logFC, y=-log10(adj.P.Val))) +
geom_point(aes(colour = threshold), alpha=0.4, size=5) +
xlim(c(-10, 10)) + ylim(c(0, 2)) +
xlab("log2 fold change") + ylab("-log10 adjusted Pvalue") +
scale_colour_manual(values = c("A"= "red", "B"="blue", "C"= "black")) +
geom_text(aes(label = ifelse(threshold != "C",
as.character(Genes), ""),
colour = as.factor(threshold)), size = 5, angle = 45, hjust = -0.25) +
theme_bw()+
theme(legend.text = element_text(size=10), #element_text(size=8),
axis.text.x = element_text(size=30, angle = 45),
axis.text.y = element_text(size=30),
text = element_text(size=40),
strip.text.x = element_text(size = 30),
legend.position="none",
panel.grid = element_blank(),
panel.background = element_rect(fill = "white"))
```
## Predicting drug candidates based on significant DEGs using DRUID:
We then took the subset of genes that were significantly differentially expressed between MPNST and pNF tumor types and used DRUID to enrich candidate drugs to revert the MPNST phenotype to pNF phenotype. We did this using the following steps:
```{r DRUID, echo=T, eval= TRUE, results='hide', message=FALSE, warning=FALSE, fig.height=15, fig.width=20, fig.cap="DRUID predictions for significant DEGs", layout="l-screen-inset shaded"}
# Select the significantly differentially expressed genes from DEG analysis
druid_dge <- MPNSTvsPNF %>%
dplyr::select(c("logFC","adj.P.Val")) %>%
dplyr::filter(MPNSTvsPNF$adj.P.Val < 0.05)
# Make query matrix for DRUID
query_matrix <- as.matrix(druid_dge)
# Convert the gene names to entrez ids
geneSymbols <- AnnotationDbi::mapIds(org.Hs.eg.db,
keys=rownames(query_matrix),
column=c("ENTREZID"),
keytype="SYMBOL",
multiVals = "first") %>% as.data.frame()
#head(geneSymbols)
entrez_ids <- geneSymbols$.
# # Run Druid with only significant genes (adj_p_val < 0.05) and save output
# sig_genes_druid <- concoct(dge_matrix = query_matrix,
# num_random = 10000,
# druid_direction = "neg",
# fold_thr = 0.5,
# pvalue_thr = 0.05,
# entrez = entrez_ids)
# sig_genes_druid_ordered <- sig_genes_druid[order(sig_genes_druid$druid_score, decreasing = TRUE),]
# #head(sig_genes_druid_ordered)
#
# save(sig_genes_druid_ordered, file = "./sig_genes_druid_mpnst_pnf.RData")
load("./sig_genes_druid_mpnst_pnf.RData")
sig_genes_druid_ordered %>%
dplyr::filter(., druid_score > 5) %>%
ggplot() +
geom_point(aes(x = drug_name, y = cosine_similarity, color = cell_line), alpha = 0.5) +
facet_grid(. ~ cell_line, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45))
```
The table below lists all the drugs that were recommended as promising candidates using the DRUID analysis. A higher DRUID score (scale from 0-6) would mean a higher recommendation for the drug compound.
```{r DRUID_table, echo=F, eval= TRUE, results='show', message=FALSE, warning=FALSE, layout="l-body-outset"}
#library(rmarkdown)
#paged_table(mtcars, options = list(rows.print = 15))
DT::datatable(sig_genes_druid_ordered[1:50,c("drug_name", "matched_genes", "druid_score", "probability_random","cell_line")])
```
```{r DRUID allgenes, echo=F, eval=F, results='hide', message=FALSE, warning=FALSE, fig.height=20, fig.width=20}
druid_dge_all <- MPNSTvsPNF %>%
dplyr::select(c("logFC","adj.P.Val"))
query_matrix <- as.matrix(druid_dge_all)
# Convert the gene names to entrez ids
geneSymbols <- AnnotationDbi::mapIds(org.Hs.eg.db,
keys=rownames(query_matrix),
column=c("ENTREZID"),
keytype="SYMBOL",
multiVals = "first") %>% as.data.frame()
head(geneSymbols)
entrez_ids <- geneSymbols$.
# #Run Druid with all genes
# all_genes_druid <- concoct(dge_matrix = query_matrix,
# num_random = 10000,
# druid_direction = "neg",
# fold_thr = 0.5,
# pvalue_thr = 0.05,
# entrez = entrez_ids)
# all_genes_druid_ordered <- all_genes_druid[order(all_genes_druid$druid_score, decreasing = TRUE),]
# #head(sig_genes_druid_ordered)
#
# save(all_genes_druid_ordered, file = "/Users/jineta/git/gitrepo/GSVA/all_genes_druid_mpnst_pnf.RData")
load("/Users/jineta/git/gitrepo/GSVA/all_genes_druid_mpnst_pnf.RData")
DT::datatable(all_genes_druid_ordered[1:50,c("drug_name", "matched_genes", "druid_score", "probability_random","cell_line")])
all_genes_druid_ordered %>%
dplyr::filter(., druid_score > 5) %>%
ggplot() +
geom_point(aes(x = drug_name, y = cosine_similarity, color = cell_line), alpha = 0.5) +
facet_grid(. ~ cell_line, scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45))
```
Distill is a publication format for scientific and technical writing, native to the web.
Learn more about using Distill for R Markdown at <https://rstudio.github.io/distill>.