-
Notifications
You must be signed in to change notification settings - Fork 0
/
LiveCoExpression.R
210 lines (163 loc) · 8.17 KB
/
LiveCoExpression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
################################################################################
#
# File name: GeneCoExprMultiGenes.R
#
# Authors: Jacek Marzec ( [email protected] )
#
# Barts Cancer Institute,
# Queen Mary, University of London
# Charterhouse Square, London EC1M 6BQ
#
################################################################################
################################################################################
#
# Description: Script for calculating coexpression between user-defined genes/probes. Person correlation coefficient is used to measure correaltion between genes' expression. The pairwise correlation between expression of user-defined genes is depicted in a form of a correlation matrix heat map. NOTE: the script allowes to process gene matrix with duplicated gene IDs. It allows to process up to 50 genes
#
# Command line use example: R --file=./GeneCoExprMultiGenes.R --args "TCGA_PAAD_normalized.txt" "TCGA_PAAD_target.txt" "Genes_of_interest.txt" "Example_results/PC_GeneCoExprMultiGenes" "PDAC"
#
# First arg: Full path with name of the normalised expression matrix
# Second arg: Full path with name of the text file with samples annotation. The file is expected to include the following columns: sample name (1st column) and annotation (3rd column)
# Third arg: ID of gene/probe of interest
# Fourth arg: Full path with name of the output folder
# Fifth arg (OPTIONAL): Samples group to use for the analysis
#
################################################################################
# silent warnings
options(warn=-1)
##### Clear workspace
rm(list=ls())
##### Close any open graphics devices
graphics.off()
### Setting environment for pandoc
Sys.setenv(HOME = "")
#===============================================================================
# Functions
#===============================================================================
##### Prepare object to write into a file
prepare2write <- function (x) {
x2write <- cbind(rownames(x), x)
colnames(x2write) <- c("",colnames(x))
return(x2write)
}
##### Create 'not in' operator
"%!in%" <- function(x,table) match(x,table, nomatch = 0) == 0
##### Deal with the duplicated genes
duplGenes <- function(expData) {
genesList <- NULL
genesRepl <- NULL
for ( i in 1:nrow(expData) ) {
geneName <- expData[i,1]
##### Distingish duplicated genes by adding duplicate number
if ( geneName %in% genesList ) {
##### Report genes with more than one duplicates
if ( geneName %in% names(genesRepl) ) {
genesRepl[[ geneName ]] = genesRepl[[ geneName ]]+1
geneName <- paste(geneName, "-", genesRepl[[ geneName ]], sep="")
} else {
genesRepl[[ geneName ]] <- 2
geneName <- paste(geneName, "-2", sep="")
}
}
genesList <- c(genesList,geneName)
}
rownames(expData) <- genesList
##### Remove the first column with gene names, which now are used as row names
expData <- expData[, -1]
return(expData)
}
#===============================================================================
# Load libraries
#===============================================================================
suppressMessages(library(gplots))
suppressMessages(library(plotly))
suppressMessages(library(heatmaply))
suppressMessages(library(optparse))
suppressMessages(library(data.table))
#===============================================================================
# Catching the arguments
#===============================================================================
option_list = list(
make_option(c("-e", "--exp_file"), action="store", default=NA, type='character',
help="File containing experimental data"),
make_option(c("-t", "--target"), action="store", default=NA, type='character',
help="Clinical data saved in tab-delimited format"),
make_option(c("-p", "--genes"), action="store", default=NA, type='character',
help="ID of genes/probe of interest"),
make_option(c("-d", "--dir"), action="store", default=NA, type='character',
help="Default directory"),
make_option(c("-x", "--hexcode"), action="store", default=NA, type='character',
help="unique_id to save temporary plots"),
make_option(c("-y", "--type"), action="store", default=NA, type='character',
help="type of analysis (dr, tcga, ccle)")
)
opt = parse_args(OptionParser(option_list=option_list))
expFile <- opt$exp_file
annFile <- opt$target
gene_list <- opt$genes
outFolder <- opt$dir
hexcode <- opt$hexcode
type_an = opt$type
#===============================================================================
# Main
#===============================================================================
##### Read sample annotation file
annData <- read.table(annFile,sep="\t",as.is=TRUE,header=TRUE)
##### Read file with expression data
expData <- as.data.frame(fread(expFile,sep="\t",header=TRUE, stringsAsFactors = FALSE))
### assigning gene ids to rownames
rownames(expData) <- expData[,1]
expData <- expData[,-1]
if (type_an == "dr_co_expression") {
rnaseq_samples = annData[annData$TECHNOLOGY=="rnaseq","FILE_NAME"]
selected_samples <- intersect(rnaseq_samples,colnames(expData))
expData <- expData[,selected_samples]
annData <- annData[which(annData$FILE_NAME %in% selected_samples),]
}
genes = unlist(strsplit(gene_list, ","))
gene.expr <- expData[genes, ]
##### Identify genes of interest not present in the expression matrix
absentGenes <- genes[genes %!in% rownames(expData)]
##### Change working directory to the project workspace
setwd(outFolder)
#===============================================================================
# Calculate Pearson correlation coefficients
#===============================================================================
##### Remove gense with standard deviation = 0 (otherwise the cor.test complains)
##### Keep only genes/probes with variance > 0 across all samples (otherwise the cor.test complains)
rsd<-apply(gene.expr,1,sd)
expData <- gene.expr[rsd>0,]
##### Check if used-defined genes are present in the data
for ( i in 1:length(genes) ) {
if ( genes[i] %!in% rownames(expData) ) {
#cat("The genes/probes", genes[i], "are not present in the data!", sep=" ")
genes <- genes[-i]
}
}
##### Calculate Pearson correlation coefficient for user-defined genes
corr.res <- matrix(data = 0, nrow = nrow(expData), ncol = (2*length(genes))+1, dimnames = list( rownames(expData), c( "Gene", paste(rep(genes, each = 2), c("Correlation", "P-value")) ) ))
for ( i in 1:length(genes) ) {
for ( x in 1:nrow(expData) ) {
#### Pearson correlation coefficient and test P
corr.res[x, paste0(genes[i], " Correlation")] <- cor.test( as.numeric(expData[ genes[i] ,]), as.numeric(expData[ x, ]), method = "pearson" )$estimate;
corr.res[x, paste0(genes[i], " P-value")] <- cor.test( as.numeric(expData[ genes[i] ,]), as.numeric(expData[ x, ]), method = "pearson" )$p.value;
}
}
corr.res <- data.frame(corr.res)
corr.res[,"Gene"] <- rownames(expData)
colnames(corr.res) <- c( "Gene", paste(rep(genes, each = 2), c("Correlation", "P-value")) )
#===============================================================================
# Pairwise correlation heat map for defined genes
#===============================================================================
##### Extract the the correlation results for the user-defined genes
corr.res.genes <- corr.res[genes, paste(genes, "Correlation")]
colnames(corr.res.genes) <- rownames(corr.res.genes)
corr.res.genes[upper.tri(corr.res.genes)] <- NA
##### Generate heatmap including the top correlated genes (PLOTLY)
p <- heatmaply(data.frame(corr.res.genes), dendrogram="none", colors = colorRampPalette(c('dark blue','white','dark red'))(100), limits=c(-1,1), scale="none", trace="none", hide_colorbar = FALSE, fontsize_row = 8, fontsize_col = 8) %>%
layout(autosize = TRUE, width = 800, margin = list(l=150, r=50, b=150, t=50, pad=4), showlegend = FALSE)
##### Save the heatmap as html (PLOTLY)
htmlwidgets::saveWidget(as_widget(p), paste0(hexcode,".live.corr_hm.html"))
##### Clear workspace
rm(list=ls())
##### Close any open graphics devices
graphics.off()