HbF_linear_model_cumulative_distributions

#### HbF linear model cumulative distributions
# Load dependencies
library(dplyr)
library(Seurat)
library(reshape2)
library(stringr)
library(ggplot2)

`%notin%` = Negate(`%in%`)

#### Compute the log1p, scaled HBG1/HBG2 counts for single cells, split by sgRNA

# Import a Seurat object with the dominant sgRNA stored as a column in the meta.data
seurat_obj_merged_dominant_perturbation=readRDS('path_to_seurat_object.RDS')

list_hbg1_hbg2_levels=list()

for (guides in unique(seurat_object@meta.data$perturbation)){

coi=row.names(seurat_object@meta.data %>%
filter(perturbation %in% guides))

list_hbg1_hbg2_levels[[guides]]=log1p(as.numeric((seurat_object@assays$RNA@counts["HBG1",coi]+seurat_object@assays$RNA@counts["HBG2",coi])
                                                )/
    seurat_object@meta.data[coi,"nCount_RNA"]*10000)
}

#### Compute cumulative distributions of HBG1+HBG2 for Perturb(BE)-seq
set.seed(1)
dataframe_hbg1_hbg2_levels=plyr::ldply(list_hbg1_hbg2_levels, rbind)
melted_dataframe_hbg1_hbg2_levels=melt(dataframe_hbg1_hbg2_levels)

observed_melted_dataframe_hbg1_hbg2_levels=melted_dataframe_hbg1_hbg2_levels
dataframe_temp_integration=melted_dataframe_hbg1_hbg2_levels %>%
  group_by(.id) %>%
  mutate(integration_value= integrate(ecdf(value),lower=0,upper=8, subdivisions=2000)[1])
    dataframe_temp_integration$integration_value=as.numeric(dataframe_temp_integration$integration_value)

dataframe_temp_integration_final=dataframe_temp_integration %>% group_by(.id) %>% slice_sample(n = 1) %>%
arrange(integration_value)

dataframe_temp_integration_final=dataframe_temp_integration_final %>%
mutate(zscore_integration_value=(integration_value-mean(dataframe_temp_integration_final$integration_value))/sd(dataframe_temp_integration_final$integration_value))

ecdf_plot=ggplot(melted_dataframe_hbg1_hbg2_levels)+stat_ecdf(aes(value,group=.id))+ 
scale_fill_manual(values=c("grey80"))+
theme(axis.line = element_line(colour = 'black', size = 0.75),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),                                        
axis.title.y = element_text(size = rel(1.8)),
axis.text.y = element_text(size=24),
axis.title.x = element_text(size = rel(1.8)),
axis.text.x = element_text(size=24),
legend.text=element_text(size=22),
      legend.title=element_text(size=22),

legend.key = element_rect(fill = "white"),
axis.ticks.length=unit(.25, "cm"))  + 
ylab("Proportion of distribution") + 
xlab("Log1p(HBG1+HBG2 single-cell-scaled counts)") +theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14)) 

print(ecdf_plot)

#### Run the linear model
# Compute the log1p, scaled HBG1/HBG2 counts for single cells
seurat_object@meta.data$hbg1_hbg2_levels=log1p(as.numeric((seurat_object@assays$RNA@counts["HBG1",]+seurat_object@assays$RNA@counts["HBG2",])
                                                )/
    seurat_object@meta.data[,"nCount_RNA"]*10000)

linear_model_results=lm(hbg1_hbg2_levels~perturbation,data=seurat_object@meta.data)

linear_model_dataframe=data.frame(summary(linear_model_results)$coefficients[-1,1],
summary(linear_model_results)$coefficients[-1,3],
summary(linear_model_results)$coefficients[-1,4])
colnames(linear_model_dataframe)=c("estimate","t","p")