-
Notifications
You must be signed in to change notification settings - Fork 0
/
HbF_linear_model_cumulative_distributions
78 lines (61 loc) · 3.25 KB
/
HbF_linear_model_cumulative_distributions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#### HbF linear model cumulative distributions
# Load dependencies
library(dplyr)
library(Seurat)
library(reshape2)
library(stringr)
library(ggplot2)
`%notin%` = Negate(`%in%`)
#### Compute the log1p, scaled HBG1/HBG2 counts for single cells, split by sgRNA
# Import a Seurat object with the dominant sgRNA stored as a column in the meta.data
seurat_obj_merged_dominant_perturbation=readRDS('path_to_seurat_object.RDS')
list_hbg1_hbg2_levels=list()
for (guides in unique([email protected]$perturbation)){
coi=row.names([email protected] %>%
filter(perturbation %in% guides))
list_hbg1_hbg2_levels[[guides]]=log1p(as.numeric((seurat_object@assays$RNA@counts["HBG1",coi]+seurat_object@assays$RNA@counts["HBG2",coi])
)/
[email protected][coi,"nCount_RNA"]*10000)
}
#### Compute cumulative distributions of HBG1+HBG2 for Perturb(BE)-seq
set.seed(1)
dataframe_hbg1_hbg2_levels=plyr::ldply(list_hbg1_hbg2_levels, rbind)
melted_dataframe_hbg1_hbg2_levels=melt(dataframe_hbg1_hbg2_levels)
observed_melted_dataframe_hbg1_hbg2_levels=melted_dataframe_hbg1_hbg2_levels
dataframe_temp_integration=melted_dataframe_hbg1_hbg2_levels %>%
group_by(.id) %>%
mutate(integration_value= integrate(ecdf(value),lower=0,upper=8, subdivisions=2000)[1])
dataframe_temp_integration$integration_value=as.numeric(dataframe_temp_integration$integration_value)
dataframe_temp_integration_final=dataframe_temp_integration %>% group_by(.id) %>% slice_sample(n = 1) %>%
arrange(integration_value)
dataframe_temp_integration_final=dataframe_temp_integration_final %>%
mutate(zscore_integration_value=(integration_value-mean(dataframe_temp_integration_final$integration_value))/sd(dataframe_temp_integration_final$integration_value))
ecdf_plot=ggplot(melted_dataframe_hbg1_hbg2_levels)+stat_ecdf(aes(value,group=.id))+
scale_fill_manual(values=c("grey80"))+
theme(axis.line = element_line(colour = 'black', size = 0.75),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.title.y = element_text(size = rel(1.8)),
axis.text.y = element_text(size=24),
axis.title.x = element_text(size = rel(1.8)),
axis.text.x = element_text(size=24),
legend.text=element_text(size=22),
legend.title=element_text(size=22),
legend.key = element_rect(fill = "white"),
axis.ticks.length=unit(.25, "cm")) +
ylab("Proportion of distribution") +
xlab("Log1p(HBG1+HBG2 single-cell-scaled counts)") +theme(axis.text=element_text(size=12),
axis.title=element_text(size=14))
print(ecdf_plot)
#### Run the linear model
# Compute the log1p, scaled HBG1/HBG2 counts for single cells
[email protected]$hbg1_hbg2_levels=log1p(as.numeric((seurat_object@assays$RNA@counts["HBG1",]+seurat_object@assays$RNA@counts["HBG2",])
)/
[email protected][,"nCount_RNA"]*10000)
linear_model_results=lm(hbg1_hbg2_levels~perturbation,[email protected])
linear_model_dataframe=data.frame(summary(linear_model_results)$coefficients[-1,1],
summary(linear_model_results)$coefficients[-1,3],
summary(linear_model_results)$coefficients[-1,4])
colnames(linear_model_dataframe)=c("estimate","t","p")