-
Notifications
You must be signed in to change notification settings - Fork 6
/
R_functional_script_w-md5s.Rmd
270 lines (224 loc) · 12.1 KB
/
R_functional_script_w-md5s.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
---
title: "R_functional_script"
author: "Sam Westreich, [email protected], github.com/transcript"
date: "June 24, 2015"
output: html_document
---
This is a script for processing output files from MG-RAST, converting functional output data into a graph. This program creates a barplot of activity within a sample grouped by protein function, both by relative and by absolute activity.
Necessary packages to include:
```{r packages, results='hide'}
library(DESeq2)
library(ggplot2)
library(gridExtra)
library(scales)
library(reshape)
library(knitr)
library(plyr)
```
Before starting this script, please adjust the following parameters. The working directory should contain both the experimental sample files (designated by "exp_<filename>") and the control sample files (designated by "control_<filename>").
```{r working_directory}
setwd("~/Desktop/Projects/Lab Stuff/SAMSA_pipeline_v1/public_output_files/public_files_ready_for_R/function/")
```
Now, we read in the files from the working directory as specified above. The file should be of a format such as:
"control_RefSeq_function_identifier-name.tab.output".
Note that if the identifier name or ID of the file is not specified as such, the script will not select the correct section of the file name to use as an ID.
```{r file_listing}
control_files <- list.files(
pattern = "control_*", full.names = T, recursive = FALSE)
control_names = ""
for (name in control_files) {
control_names <- c(control_names, unlist(strsplit(name, split='_', fixed=TRUE))[4])} #change the "2" to a different part of the name if not following standard naming conventions as above
control_names <- control_names[-1]
control_names_trimmed = ""
for (name in control_names) {
control_names_trimmed <- c(control_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])}
control_names_trimmed <- control_names_trimmed[-1]
rm (control_names)
exp_files <- list.files(
pattern = "experiment_*", full.names = T, recursive = FALSE)
exp_names = ""
for (name in exp_files) {
exp_names <- c(exp_names, unlist(strsplit(name, split='_', fixed=TRUE))[4])} #again, change 2 if non-standard file names are used
exp_names <- exp_names[-1]
exp_names_trimmed = ""
for (name in exp_names) {
exp_names_trimmed <- c(exp_names_trimmed, unlist(strsplit(name, split='.', fixed=TRUE))[1])}
exp_names_trimmed <- exp_names_trimmed[-1]
rm (exp_names, name)
```
Next, we will load these files in as two tables, one for the experimental data and one for the control data.
```{r open_tables}
# loading the control table
y <- 0
for (x in control_files) {
y <- y + 1
if (y == 1) {
control_table <- read.table(file = x, header=F, quote = "", sep = "\t", fill = TRUE)
if (ncol(control_table) == 4) {
colnames(control_table) = c("DELETE", x, "V3", "md5")
control_table <- control_table[,c(4,2,3)]
} else {
colnames(control_table) = c("DELETE", x, "V3")
control_table <- control_table[,c(3,2)] }
control_table <- control_table[c(1:1000),] } # can be deleted, restricts to top 1k hits
if (y > 1) {
temp_table <- read.table(file = x, header = F, quote = "", sep = "\t", fill = TRUE)
temp_table <- temp_table[c(1:1000),] # can be deleted, restricts to top 1k hits
print (x)
if (ncol(temp_table) == 4) {
colnames(temp_table) = c("DELETE", x, "V3", "md5")
} else {
colnames(temp_table) = c("DELETE", x, "V3") }
control_table <- merge(control_table, temp_table[,c(2,3)], by = "V3", all.x = T) }
}
control_table[is.na(control_table)] <- 0
rownames(control_table) = control_table$V3
control_table_trimmed <- control_table[,-1, drop = FALSE]
# loading the exp table
y <- 0
for (x in exp_files) {
y <- y + 1
if (y == 1) {
exp_table <- read.table(file = x, header=F, quote = "", sep = "\t", fill = TRUE)
if (ncol(exp_table) == 4) {
colnames(exp_table) = c("DELETE", x, "V3", "md5")
exp_table <- exp_table[,c(4,2,3)]
} else {
colnames(exp_table) = c("DELETE", x, "V3")
exp_table <- exp_table[,c(3,2)] }
exp_table <- exp_table[c(1:1000),] } # can be deleted, restricts to top 1k hits
if (y > 1) {
temp_table <- read.table(file = x, header = F, quote = "", sep = "\t", fill = TRUE)
temp_table <- temp_table[c(1:1000),] # can be deleted, restricts to top 1k hits
print (x)
if (ncol(temp_table) == 4) {
colnames(temp_table) = c("DELETE", x, "V3", "md5")
} else {
colnames(temp_table) = c("DELETE", x, "V3") }
exp_table <- merge(exp_table, temp_table[,c(2,3)], by = "V3", all.x = T) }
}
exp_table[is.na(exp_table)] <- 0
rownames(exp_table) = exp_table$V3
exp_table_trimmed <- exp_table[,-1, drop = FALSE]
rm(control_table, exp_table, x, y)
```
Now, we simplify the names of the columns of these two tables, using the ID names scrubbed from the filenames above in chunk 3, file_listing. If this step is failing, please note the proper naming of the files as mentioned above chunk 3.
```{r column_names}
if (colnames(control_table_trimmed[1]) == "md5") {
colnames(control_table_trimmed) = c("md5", control_names_trimmed)
colnames(exp_table_trimmed) = c("md5", exp_names_trimmed)
} else {
colnames(control_table_trimmed) = c(control_names_trimmed)
colnames(exp_table_trimmed) = c(exp_names_trimmed) }
```
******** GRAPH FORMATION ********
NOTE: If skipping directly to DESeq analysis of the results, proceed instead to line 233.
The next step is to remove the hypothetical or predicted proteins. Note that this significantly reduces the total amount of data, but provides much more clarity when examining end results.
```{r hypothetical_removal}
control_to_be_removed <- control_table_trimmed[grep("hypothetical", rownames(control_table_trimmed)),]
control_table_filtered <- control_table_trimmed[ !(rownames(control_table_trimmed) %in% rownames(control_to_be_removed)),]
control_to_be_removed <- control_table_trimmed[grep("predicted", rownames(control_table_trimmed)),]
control_table_filtered <- control_table_filtered[ !(rownames(control_table_filtered) %in% rownames(control_to_be_removed)),]
exp_to_be_removed <- exp_table_trimmed[grep("hypothetical", rownames(exp_table_trimmed)),]
exp_table_filtered <- exp_table_trimmed[ !(rownames(exp_table_trimmed) %in% rownames(exp_to_be_removed)),]
exp_to_be_removed <- exp_table_trimmed[grep("predicted", rownames(exp_table_trimmed)),]
exp_table_filtered <- exp_table_filtered[ !(rownames(exp_table_filtered) %in% rownames(exp_to_be_removed)),]
rm(control_to_be_removed, exp_to_be_removed)
```
We now want to combine the two tables (experimental and control) into a single table, designated as all_table.
```{r all_table_creation}
all_table <- control_table_filtered
all_table[, " "] <- 0
if (colnames(control_table_filtered[1]) == "md5") {
all_table <- merge(all_table, exp_table_filtered, by = c(0,1), all = TRUE) # this step may take a while, depending on total number of annotations
rownames(all_table) <- all_table[,2]
all_table <- all_table[,-2]
} else {
all_table <- merge(all_table, exp_table_filtered, by = 0, all = TRUE)
rownames(all_table) <- all_table[,1]
}
all_table[is.na(all_table)] <- 0
all_table[, "Total"] <- rowSums(all_table[,c(2:length(all_table))])
all_table <- all_table[ with(all_table, order(-Total)), ]
names(all_table)[names(all_table) == 'Row.names'] <- 'Function'
# let's save everything up to this point:
all_table_all <- all_table
# continue here to include only the top 30 most abundant functions.
all_table <- rbind(all_table[1:29,], all_table[nrow(all_table),])
all_table <- all_table[, -ncol(all_table)]
```
We now alter the format of the all_table using melt:
```{r merge_tables}
all_table_m <- melt(cbind(all_table, Function = rownames(all_table)), id.vars = c('Function'))
```
One difficulty with function exports from RefSeq is that there are often several different vernacular forms of each function, split by semicolon. To simplify remaining functions, we here split by semicolon and merge functions with identical vernacular.
```{r function_simplifying}
all_table_m$Simplified_Function <- sapply(strsplit(as.character(all_table_m$Function), ";"), "[", 1)
all_table_m$Simplified_Function <- as.factor(all_table_m$Simplified_Function)
all_table_m2 <- all_table_m[c(4,2,3)]
all_table_m2 <- ddply(all_table_m2, c("Simplified_Function", "variable"), numcolwise(sum))
names(all_table_m2)[1] <- "Function"
all_table_m2$Function <- as.factor(all_table_m2$Function)
all_table_m3 <- all_table_m2[order(all_table_m2$variable, all_table_m2$Function),]
rm(all_table_m2, all_table_m)
```
Because normal ggplot colors tend to blend together too much when required to select 30 colors, a custom palette is specified here:
```{r palette}
CbPalette <- c("#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a", "#ffff99", "#b15928", "#8dd3c7", "#ffffb3", "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5", "#d9d9d9", "#bc80bd", "#ccebc5", "#ffed6f", "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33", "#a65628", "#f781bf", "#999999", "#000000")
```
Now, we create the graphs! First, we create a stacked bar by percentage, allowing us to see relative activity composition by sample.
```{r percentage_plot}
relative_ggplot <- ggplot(all_table_m3, aes(x = variable, y = value, fill = Function)) +
geom_bar(position = "fill", stat = "identity") +
scale_fill_manual(values = CbPalette) +
scale_y_continuous(labels = percent_format()) +
theme(legend.position = "bottom") +
guides(fill = guide_legend(ncol=4)) +
ggtitle("Top 30 protein functions by relative abundance") +
xlab("Sample ID") + ylab("Relative activity of total sample")
relative_ggplot
```
Next, we create a stacked bar graph by raw counts, showing how total sample counts compare to each other.
```{r absolute_plot}
absolute_ggplot <- ggplot(all_table_m3, aes(x = variable, y = value, fill = Function)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = CbPalette) +
theme(legend.position = "bottom") +
guides(fill = guide_legend(ncol=5)) +
ggtitle("Top 30 protein functions by total abundance") +
xlab("Sample ID") + ylab("Activity reads per sample")
absolute_ggplot
```
Finally, we display both of these graphs on a single display!
```{r graph_output}
grid.arrange(relative_ggplot, absolute_ggplot, ncol = 1)
```
******** DESEQ ANALYSIS ********
Now, let's run some DESeq analysis on the results:
```{r DESeq_analysis}
library(DESeq2)
complete_table <- merge(control_table_trimmed, exp_table_trimmed, by=c(0,1), all = TRUE)
complete_table[is.na(complete_table)] <- 1
rownames(complete_table) <- complete_table$md5
complete_table <- complete_table[,-2]
complete_table$Simplified_Function <- sapply(strsplit(as.character(complete_table$Row.names), ";"), "[", 1)
complete_table$Simplified_Function <- as.factor(complete_table$Simplified_Function)
complete_table <- complete_table[,c(ncol(complete_table), 1:ncol(complete_table))]
complete_table <- complete_table[,c(-2, -ncol(complete_table))]
complete_table2 <- complete_table[,-1]
completeCondition <- data.frame(condition=factor(c(rep("control", length(control_files)), rep("experimental", length(exp_files)))))
dds <- DESeqDataSetFromMatrix(complete_table2, completeCondition, ~ condition)
dds <- DESeq(dds)
baseMeanPerLvl <- sapply( levels(dds$condition), function(lvl) rowMeans( counts(dds,normalized=TRUE)[,dds$condition == lvl] ) )
res <- results(dds, contrast = c("condition", "experimental", "control"))
func_results <- data.frame(res)
func_results$Function <- complete_table$Simplified_Function
func_results <- merge(func_results, baseMeanPerLvl, by="row.names")
func_results <- func_results[,c(1,8,2,9,10,3,4,5,6,7)]
colnames(func_results)[c(4,5)] <- c("controlMean", "experimentalMean")
plotMA(dds, ylim=c(-10,10), main="DESeq2 comparison of functional annotations")
sorted_func_results <- func_results[order(-func_results$baseMean),]
colnames(sorted_func_results)[1] <- "Function Name"
write.table(func_results, file = "DESeq_functional_output_with_md5s.tab", append = FALSE, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
```
The DESeq results file should now be saved with the file name of the last command just above, in a tab-delimited format.