-
Notifications
You must be signed in to change notification settings - Fork 0
/
LiveExprCNMut.R
308 lines (221 loc) · 12.4 KB
/
LiveExprCNMut.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
################################################################################
#
# File name: Expr_CN_mut_Profile.R
#
# Authors: Jacek Marzec ( [email protected] )
#
# Barts Cancer Institute,
# Queen Mary, University of London
# Charterhouse Square, London EC1M 6BQ
#
################################################################################
################################################################################
#
# Description: Script generating box-plots and bar-plots to visualise expression measurements across samples and groups (as indicated in target file) from normalised expression data for a user-defined gene. NOTE: the script allows processing of gene matrices with duplicated gene IDs.
#
# Command line use example: R --file=./Expr_CN_mut_Profile.R --args --exp_file "CCLE_BC_processed_mRNA.txt" --cn_file "CCLE_BC_processed_CN.txt" --mut_file "CCLE_BC_processed_mut.txt" --target "CCLE_target.txt" --gene "ADAM28" --dir "Example_FINAL/BC/" --hexcode "abc123"
#
# --exp_file (-e): Full path with name of the normalised expression matrix
# --cn_file (-n): Full path with name of the relative linear copy-number matrix
# --mut_file (-m): Full path with name of the file with mutation data. This file is expected to contain the following information: (1) sample name, (2) gene name and (3) variant classification
# --target (-t): Full path with name of the text file with samples annotation. The file is expected to include the following columns: sample name (1st column) and annotation (3rd column)
# --gene (-p): ID of gene/probe of interest
# --dir (-d): Full path with name of the output folder
# --hexcode (-x): Unique ID used as the file-name prefix of the generated plots
#
################################################################################
##### Silence all warnings for the rest of the run (a negative 'warn' value makes R ignore them)
options(warn=-1)
##### Clear workspace: remove every object from the current environment
rm(list=ls())
##### Close any open graphics devices
graphics.off()
##### Setting environment for pandoc: blank the HOME environment variable for this R process
##### NOTE(review): presumably required by the pandoc step behind htmlwidgets::saveWidget() on the deployment host - confirm
Sys.setenv(HOME = "")
#===============================================================================
# Functions
#===============================================================================
##### 'Not in' operator: TRUE for every element of x that has no match in table
`%!in%` <- function(x, table) {
    !(x %in% table)
}
##### Assign colours to analysed groups
#####
##### targets: vector of group labels, one element per sample
#####
##### Returns an unnamed list of two character vectors:
#####   [[1]] one colour per group, in factor-level order
#####   [[2]] the colour of the group each element of 'targets' belongs to
getTargetsColours <- function(targets) {

    ##### Predefined selection of colours for groups
    targets.colours <- c("red","blue","green","darkgoldenrod","darkred","deepskyblue", "coral", "cornflowerblue", "chartreuse4", "bisque4", "chocolate3", "cadetblue3", "darkslategrey", "lightgoldenrod4", "mediumpurple4", "orangered3")

    f.targets <- factor(targets)

    ##### One colour per group; the palette is recycled when there are more
    ##### groups than predefined colours, so no group ends up with an NA colour
    vec.targets <- rep_len(targets.colours, nlevels(f.targets))

    ##### The integer codes of the factor index straight into the per-group
    ##### colours; this also copes with an empty 'targets' vector
    targets.colour <- vec.targets[as.integer(f.targets)]

    return( list(vec.targets, targets.colour) )
}
##### Deal with the duplicated genes
#####
##### expData: table whose first column holds the gene/probe IDs and whose
#####          remaining columns hold the per-sample values
#####
##### Returns 'expData' without the ID column and with the IDs used as row
##### names. The 2nd, 3rd, ... occurrence of an ID is renamed to "<ID>-2",
##### "<ID>-3", ... so that row names are unique.
duplGenes <- function(expData) {

    geneIds <- as.character(expData[, 1])

    if ( anyNA(geneIds) ) {
        stop("The gene ID column contains missing values", call. = FALSE)
    }

    ##### Running occurrence number of each ID (1 for the first time it is seen)
    occurrence <- if ( length(geneIds) > 0 ) {
        ave(seq_along(geneIds), geneIds, FUN = seq_along)
    } else {
        integer(0)
    }

    ##### Distinguish duplicated genes by adding the duplicate number
    genesList <- geneIds
    isDupl <- occurrence > 1L
    genesList[isDupl] <- paste0(geneIds[isDupl], "-", occurrence[isDupl])

    ##### Guard against a generated name clashing with an ID that is literally
    ##### present in the data (e.g. a real "GENE-2"); no-op when already unique
    genesList <- make.unique(genesList, sep = "-")

    rownames(expData) <- genesList

    ##### Remove the first column with gene names, which now are used as row names.
    ##### drop = FALSE keeps the table shape when there is only one sample column
    expData[, -1, drop = FALSE]
}
#===============================================================================
# Load libraries
#===============================================================================
suppressMessages(library(plotly))
suppressMessages(library(optparse))
#===============================================================================
# Catching the arguments
#===============================================================================
##### Command-line options understood by the script (parsed with optparse)
option_list <- list(
    make_option(c("-e", "--exp_file"), action="store", default=NA, type='character',
                help="File containing experimental data"),
    make_option(c("-n", "--cn_file"), action="store", default=NA, type='character',
                help="File containing CN data"),
    make_option(c("-m", "--mut_file"), action="store", default=NA, type='character',
                help="File containing mutation data"),
    make_option(c("-t", "--target"), action="store", default=NA, type='character',
                help="Clinical data saved in tab-delimited format"),
    make_option(c("-p", "--gene"), action="store", default=NA, type='character',
                help="ID of gene/probe of interest"),
    make_option(c("-d", "--dir"), action="store", default=NA, type='character',
                help="Default directory"),
    make_option(c("-x", "--hexcode"), action="store", default=NA, type='character',
                help="unique_id to save temporary plots")
)

opt <- parse_args(OptionParser(option_list=option_list))

##### Fail early and name the missing options, instead of letting an NA reach
##### read.table()/setwd() further down and fail with an unrelated message.
##### --hexcode is left optional: it is only used as an output file-name prefix
required.opts <- c("exp_file", "cn_file", "mut_file", "target", "gene", "dir")
missing.opts <- required.opts[vapply(required.opts, function(o) is.null(opt[[o]]) || is.na(opt[[o]]), logical(1))]

if ( length(missing.opts) > 0 ) {
    stop("Missing required option(s): ", paste0("--", missing.opts, collapse = ", "), call. = FALSE)
}

expFile <- opt$exp_file
cnFile <- opt$cn_file
mutFile <- opt$mut_file
annFile <- opt$target
gene <- opt$gene
outFolder <- opt$dir
hexcode <- opt$hexcode
#===============================================================================
# Main
#===============================================================================
# GENE EXPRESSION
##### Read file with expression data
expData <- read.table(expFile,sep="\t",as.is=TRUE,header=TRUE,row.names=NULL)

##### Deal with the duplicated genes
expData <- duplGenes(expData)

##### Read file with CN data
cnData <- read.table(cnFile,sep="\t",as.is=TRUE,header=TRUE,row.names=NULL)

##### Deal with the duplicated genes
cnData <- duplGenes(cnData)

##### Read maf file with mutation data
mutData <- read.table(mutFile,sep="\t",as.is=TRUE,header=TRUE,row.names=NULL)

##### Sample names in the expression/CN headers (read.table) and in the annotation
##### row names (below) are converted with make.names(); do the same for the
##### mutation sample names so that they can be matched against them
mutData[,1] <- make.names(mutData[,1])

##### Keep only samples present in both the expression and CN datasets
absentSamples.cnData <- colnames(expData)[colnames(expData) %!in% colnames(cnData)]
absentSamples.expData <- colnames(cnData)[colnames(cnData) %!in% colnames(expData)]

##### Selecting the shared samples by name (in expression-matrix order) also makes
##### sure that the samples order in the expression and CN matrices are the same.
##### drop = FALSE keeps the tables two-dimensional when a single sample is left
sharedSamples <- colnames(expData)[colnames(expData) %in% colnames(cnData)]
expData <- expData[, sharedSamples, drop = FALSE]
cnData <- cnData[, sharedSamples, drop = FALSE]

##### Read sample annotation file
annData <- read.table(annFile,sep="\t",as.is=TRUE,header=TRUE,row.names=NULL)
rownames(annData) <- make.names(annData[,1], unique = TRUE)
annData <- annData[, -1, drop = FALSE]

##### Keep only samples with annotation info
annotatedSamples <- colnames(expData)[colnames(expData) %in% rownames(annData)]
expData <- expData[, annotatedSamples, drop = FALSE]
cnData <- cnData[, annotatedSamples, drop = FALSE]
mutData <- mutData[mutData[,1] %in% rownames(annData), ]

##### Make sure that the samples order in the data matrix and annotation file is the same
annData <- annData[annotatedSamples, , drop = FALSE]

##### Check if the queried gene is present in the expression AND the CN data
##### (a gene missing from the CN matrix would otherwise yield a row of NAs)
if ( gene %!in% rownames(expData) || gene %!in% rownames(cnData) ) {
    #cat("The gene/probe", gene, "is not present in the data!", sep=" ")
    q()

##### ... and extract the expression of the gene of interest
} else {

    gene.expr <- data.matrix(expData[gene, , drop = FALSE])
    gene.cn <- data.matrix(cnData[gene, , drop = FALSE])

    ##### %in% (unlike ==) never returns NA, so records with a missing gene name
    ##### do not turn into all-NA rows
    gene.mut <- mutData[mutData[,2] %in% gene, , drop = FALSE]
}

# Change working directory to the project workspace
setwd(outFolder)
# # Report samples not present in the the expression or CN matrices
# if ( length(absentSamples.expData) > 0 ) {
# write(absentSamples.expData, file = paste(coreName, gene, "absent_in_mRNA_data.txt", sep = "_"), append = FALSE, sep="\t")
# }
#
# if ( length(absentSamples.cnData) > 0 ) {
# write(absentSamples.cnData, file = paste(coreName, gene, "absent_in_CN_data.txt", sep = "_"), append = FALSE, sep="\t")
# }
#===============================================================================
# Prepare the mutation file
#===============================================================================
##### Initiate variable for the gene mutation status for each sample:
##### one-column character matrix ("Mutation") with one row per sample
gene.mut.sample <- matrix("Not mutated", nrow = ncol(gene.expr), ncol = 1, dimnames = list(colnames(gene.expr), "Mutation"))

##### Go through every mutation record of the queried gene (rows of gene.mut:
##### column 1 = sample name, column 3 = variant classification).
##### NOTE: the iteration is over the mutation records, not over the samples,
##### so that no record is skipped when there are more records than samples
for ( i in seq_len(nrow(gene.mut)) ) {

    mut.sample <- gene.mut[i,1]

    if ( mut.sample %in% colnames(gene.expr) ) {

        ##### If for a specific sample more than one mutation in the queried gene is provided then the additional mutation categories will be also provided
        if ( gene.mut.sample[mut.sample,"Mutation"] != "Not mutated" ) {

            gene.mut.sample[mut.sample,"Mutation"] <- paste(gene.mut.sample[mut.sample,"Mutation"], gene.mut[i,3], sep=" & ")

        } else {
            gene.mut.sample[mut.sample,"Mutation"] <- gene.mut[i,3]
        }
    }
}
#===============================================================================
# Generate mRNA expression vs DNA copy-number scatterplot
#===============================================================================
##### Samples are grouped (and coloured) by their mutation status
targets <- gene.mut.sample[,"Mutation"]
targets.colour <- getTargetsColours(targets)

##### Calculate Pearson correlation coefficient.
##### cor.test() stops with an error when there are too few complete
##### observations; report NA in the plot title in that case instead of
##### aborting before any plot is written
expr_cn.corr <- tryCatch(
    round(cor.test( as.numeric(gene.expr), as.numeric(gene.cn), method = "pearson" )$estimate, digits=2),
    error = function(e) NA_real_
)

##### Generate scatter plot (PLOTLY)
##### Prepare data frame
gene.df <- data.frame(targets, as.numeric(gene.cn), as.numeric(gene.expr))
colnames(gene.df) <- c("Target", "CN", "mRNA")

p <- plot_ly(gene.df, x = ~CN, y = ~mRNA, color = ~Target, text=colnames(gene.expr), colors = targets.colour[[1]], type='scatter', mode = "markers", marker = list(size=10, symbol="circle"), width = 800, height = 600) %>%
    layout(title = paste0("Pearson's r = ", expr_cn.corr), xaxis = list(title = paste0(gene, " relative linear copy-number values")), yaxis = list(title = paste0(gene, " mRNA expression")), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE, legend = list(orientation = 'h', y = 1))

##### Save the scatter plot as html (PLOTLY)
htmlwidgets::saveWidget(as_widget(p), paste0(hexcode, "_mRNA_vs_CN_mut_plot.html"))
#===============================================================================
# Calculate putative copy-number alterations
#===============================================================================
##### Draw histogram of the relative linear copy-number values of the queried gene (PLOTLY)
p <- plot_ly(x = ~as.numeric(gene.cn), type = 'histogram', width = 800, height = 500) %>%
    layout(xaxis = list( title = paste0(gene, " relative linear copy-number values")), yaxis = list( title = "Frequency"), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE)

##### Save the histogram as html (PLOTLY)
htmlwidgets::saveWidget(as_widget(p), paste0(hexcode, "_corr_hist.html"))

##### Assign gain (1) for linear CN values above 0.5, loss (-1) for linear CN values below -0.5
##### and no change (0) for everything in between. The order matters: the values
##### 1 and -1 written by the first two steps fall outside [-0.5, 0.5] and are
##### therefore left untouched by the third one
gene.cn[ gene.cn > 0.5 ] <- 1
gene.cn[ gene.cn < -0.5 ] <- -1
gene.cn[ gene.cn <= 0.5 & gene.cn >= -0.5 ] <- 0
#===============================================================================
# Generate mRNA expression vs putative DNA copy-number alterations box-plot
#===============================================================================
##### Prepare data frame
##### Translate the putative CN calls (-1, 0, 1) into category labels through an
##### explicit lookup rather than relying on implicit number/character coercion
cn.labels <- c("-1" = "(-1) Loss", "0" = "(0) Diploid", "1" = "(1) Gain")
gene.cn.status <- unname(cn.labels[as.character(as.numeric(gene.cn))])

##### "Box" holds the same value for every sample, so that add_boxplot() below
##### draws one box per CN category rather than one per mutation group
gene.df <- data.frame(targets, rep(unique(targets)[1],length(targets)), gene.cn.status, as.numeric(gene.expr))
colnames(gene.df) <- c("Target", "Box", "CN", "mRNA")

##### Generate box-plot (PLOTLY): samples as markers coloured by mutation status, overlaid with grey boxes
p <- plot_ly(gene.df, x = ~CN, y = ~mRNA, color = ~Target, colors = targets.colour[[1]], type='scatter', mode = "markers", marker = list(size=10, symbol="circle"), width = 800, height = 600, text=colnames(gene.expr) ) %>%
    add_boxplot(gene.df, x= ~CN, y= ~mRNA, color = ~Box, key=FALSE, line = list(color = "grey"), showlegend=FALSE ) %>%
    layout(title = "", xaxis = list(title = paste0(gene, " relative linear copy-number values")), yaxis = list(title = paste0(gene, " mRNA expression")), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE, legend = list(orientation = 'h', y = 1))

##### Save the box-plot as html (PLOTLY)
htmlwidgets::saveWidget(as_widget(p), paste0(hexcode, "_mRNA_vs_CN_mut_boxplot.html"))

##### Close any open graphics devices
graphics.off()