moss_figure_2_3_S2A.Rmd

---
title: "moss_figures"
author: "Elena"
date: "3/1/2021"
output:
  pdf_document: default
  html_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(viridis)
```

```{r}
# read data-- mean f1 for 5-fold CV all RF models using MOSS labels
aggf1RF <- read.csv('MIP_classify_RF_f1_score_agg.csv') %>%
  mutate(response = factor(response, levels = c("difcoMB", "HMBcmpt", "HMBpep", "HMBaa", 
                                                "HMBlips", "HMBoligo", "HMBorg", "HMBntrl", 
                                                "HMBamisug", "HMBacdsug", "HMB--"))) %>%
  arrange(response)
colnames(aggf1RF) <- c("num_pred", "response", "f1_mean", "f1_std")

# read data-- RF control without MOSS
rfCntrl <- read.csv('compare_classify_RF_f1_score_agg.csv') %>%
  mutate(response = factor(response, levels = c("difcoMB", "HMBcmpt", "HMBpep", "HMBaa", 
                                                "HMBlips", "HMBoligo", "HMBorg", "HMBntrl", 
                                                "HMBamisug", "HMBacdsug", "HMB--"))) %>%
  arrange(response) %>%
  mutate(num_pred = 'RFcntrl') %>%
  select(num_pred, response, mean, std)
colnames(rfCntrl) <- c("num_pred", "response", "f1_mean", "f1_std")

# combine mean f1 and control
comboF1 <- aggf1RF %>%
  mutate(num_pred = as.character(num_pred)) %>%
  bind_rows(rfCntrl)

# binary growth profiles
gp <- readxl::read_xlsx('growth_profiles_binary.xlsx')
```

```{r}
# compute Shannon entropy
entropy <- function(target) {
  freq <- table(target)/length(target)
  # vectorize
  vec <- as.data.frame(freq)[,2]
  #drop 0 to avoid NaN resulting from log2
  vec<-vec[vec>0]
  #compute entropy
  -sum(vec * log2(vec))
}

gp_ent <- lapply(gp[,-1], entropy) 
ent <- data.frame(medium = names(gp_ent),
                  entropy = unlist(gp_ent))
```

```{r}
### Figure 2 ###
dat <- select(gp, -strain)
rownames(dat) <- gp$strain

pheatmap::pheatmap(t(dat),
                   cluster_rows = FALSE,
                   color = viridis(2))
```

```{r warning=FALSE}
### Figure 3A ###
# heatmap selected predictors
# format matrix
predWide <- aggf1RF %>%
  mutate(pred = 0) %>%
  pivot_wider(num_pred, names_from = "response", values_from = "pred")
predWide[is.na(predWide)] <- 1

ord <- sort(colSums(predWide)[-1], decreasing = FALSE)

# heatmap
x <- predWide[,-1]
x <- x[,names(ord)]
rownames(x) <- predWide$num_pred

pheatmap::pheatmap(t(x),
                   cluster_cols = FALSE,
                   cluster_rows = FALSE,
                   color = c("black", "gray"),
                   angle_col = 0,
                   fontsize = 14)
```

```{r}
### Figure 3C ###
# dot plots
a <- comboF1 %>%
  mutate(response = factor(response, levels = names(ord))) %>%
  mutate(num_pred = ifelse(num_pred == "RFcntrl", "C", num_pred)) %>%
  mutate(num_pred = factor(num_pred, levels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, "C"))) %>%
  ggplot(aes(x = as.factor(num_pred), y = f1_mean, color = response)) +
  geom_point(aes(size = f1_std), alpha = .75) +
  theme_classic() +
  theme(text = element_text(size = 14),
        legend.justification=c(1,0), 
        legend.position=c(1,0)) +
  scale_color_viridis(discrete = TRUE) +
  facet_wrap(~response) +
  guides(color = "none") +
  labs(y = "Mean Accuracy Score",
       x = "Number of Predictors",
       color = "Medium",
       size = "Standard Deviation")
a

### Figure 3B ###
# bar graphs
b <- ent %>%
  mutate(medium = factor(medium, levels = names(ord))) %>%
  ggplot(aes(x = reorder(medium, desc(medium)), y = entropy, fill = medium)) +
  geom_col(width = 1) +
  theme_classic() +
  theme(text = element_text(size = 14, color = "black"),
        axis.text.y = element_text(hjust = 0.5, colour = "black")) +
  scale_fill_viridis(discrete = TRUE) +
  guides(fill = "none") +
  labs(x = "",
       y = "Shannon Entropy") +
  coord_flip()
b
```

```{r}
### Figure S2A ###
x <- gp %>%
  select(-strain) %>%
  colSums()

# times selected vs entropy w/ and w/o difco
p1 <- data.frame(medium = names(x), 
           ngrow = x,
           nsel = c(0, 5, 6, 8, 7, 5, 6, 9, 1, 4, 4)) %>%
  inner_join(ent, by = 'medium') %>%
  ggplot(aes(x = entropy, y = nsel, label = medium)) +
  geom_point() +
  scale_y_continuous(breaks = scales::pretty_breaks(10)) +
  theme_classic() +
  theme(text = element_text(size = 14)) +
  labs(y = "Times Selected as Predictor",
       x = "Shannon Entropy")
p1
```