-
Notifications
You must be signed in to change notification settings - Fork 1
/
moss_figure_2_3_S2A.Rmd
153 lines (136 loc) · 4.33 KB
/
moss_figure_2_3_S2A.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
---
title: "moss_figures"
author: "Elena"
date: "3/1/2021"
output:
pdf_document: default
html_document: default
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(viridis)
```
```{r}
# read data-- mean f1 for 5-fold CV all RF models using MOSS labels
aggf1RF <- read.csv('MIP_classify_RF_f1_score_agg.csv') %>%
mutate(response = factor(response, levels = c("difcoMB", "HMBcmpt", "HMBpep", "HMBaa",
"HMBlips", "HMBoligo", "HMBorg", "HMBntrl",
"HMBamisug", "HMBacdsug", "HMB--"))) %>%
arrange(response)
colnames(aggf1RF) <- c("num_pred", "response", "f1_mean", "f1_std")
# read data-- RF control without MOSS
rfCntrl <- read.csv('compare_classify_RF_f1_score_agg.csv') %>%
mutate(response = factor(response, levels = c("difcoMB", "HMBcmpt", "HMBpep", "HMBaa",
"HMBlips", "HMBoligo", "HMBorg", "HMBntrl",
"HMBamisug", "HMBacdsug", "HMB--"))) %>%
arrange(response) %>%
mutate(num_pred = 'RFcntrl') %>%
select(num_pred, response, mean, std)
colnames(rfCntrl) <- c("num_pred", "response", "f1_mean", "f1_std")
# combine mean f1 and control
comboF1 <- aggf1RF %>%
mutate(num_pred = as.character(num_pred)) %>%
bind_rows(rfCntrl)
# binary growth profiles
gp <- readxl::read_xlsx('growth_profiles_binary.xlsx')
```
```{r}
# compute Shannon entropy
entropy <- function(target) {
freq <- table(target)/length(target)
# vectorize
vec <- as.data.frame(freq)[,2]
#drop 0 to avoid NaN resulting from log2
vec<-vec[vec>0]
#compute entropy
-sum(vec * log2(vec))
}
gp_ent <- lapply(gp[,-1], entropy)
ent <- data.frame(medium = names(gp_ent),
entropy = unlist(gp_ent))
```
```{r}
### Figure 2 ###
dat <- select(gp, -strain)
rownames(dat) <- gp$strain
pheatmap::pheatmap(t(dat),
cluster_rows = FALSE,
color = viridis(2))
```
```{r warning=FALSE}
### Figure 3A ###
# heatmap selected predictors
# format matrix
predWide <- aggf1RF %>%
mutate(pred = 0) %>%
pivot_wider(num_pred, names_from = "response", values_from = "pred")
predWide[is.na(predWide)] <- 1
ord <- sort(colSums(predWide)[-1], decreasing = FALSE)
# heatmap
x <- predWide[,-1]
x <- x[,names(ord)]
rownames(x) <- predWide$num_pred
pheatmap::pheatmap(t(x),
cluster_cols = FALSE,
cluster_rows = FALSE,
color = c("black", "gray"),
angle_col = 0,
fontsize = 14)
```
```{r}
### Figure 3C ###
# dot plots
a <- comboF1 %>%
mutate(response = factor(response, levels = names(ord))) %>%
mutate(num_pred = ifelse(num_pred == "RFcntrl", "C", num_pred)) %>%
mutate(num_pred = factor(num_pred, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, "C"))) %>%
ggplot(aes(x = as.factor(num_pred), y = f1_mean, color = response)) +
geom_point(aes(size = f1_std), alpha = .75) +
theme_classic() +
theme(text = element_text(size = 14),
legend.justification=c(1,0),
legend.position=c(1,0)) +
scale_color_viridis(discrete = TRUE) +
facet_wrap(~response) +
guides(color = "none") +
labs(y = "Mean Accuracy Score",
x = "Number of Predictors",
color = "Medium",
size = "Standard Deviation")
a
### Figure 3B ###
# bar graphs
b <- ent %>%
mutate(medium = factor(medium, levels = names(ord))) %>%
ggplot(aes(x = reorder(medium, desc(medium)), y = entropy, fill = medium)) +
geom_col(width = 1) +
theme_classic() +
theme(text = element_text(size = 14, color = "black"),
axis.text.y = element_text(hjust = 0.5, colour = "black")) +
scale_fill_viridis(discrete = TRUE) +
guides(fill = "none") +
labs(x = "",
y = "Shannon Entropy") +
coord_flip()
b
```
```{r}
### Figure S2A ###
x <- gp %>%
select(-strain) %>%
colSums()
# times selected vs entropy w/ and w/o difco
p1 <- data.frame(medium = names(x),
ngrow = x,
nsel = c(0, 5, 6, 8, 7, 5, 6, 9, 1, 4, 4)) %>%
inner_join(ent, by = 'medium') %>%
ggplot(aes(x = entropy, y = nsel, label = medium)) +
geom_point() +
scale_y_continuous(breaks = scales::pretty_breaks(10)) +
theme_classic() +
theme(text = element_text(size = 14)) +
labs(y = "Times Selected as Predictor",
x = "Shannon Entropy")
p1
```