-
Notifications
You must be signed in to change notification settings - Fork 1
/
enrichment_vs_n_anton_data_exacycle_criteria.R
150 lines (125 loc) · 5.72 KB
/
enrichment_vs_n_anton_data_exacycle_criteria.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
library(ggplot2)
library(reshape2)
make_roc_plot <- function(data, title) {
data <- data.frame(data)[1:1000,]
final.data.melted <- melt(as.data.frame(data), variable.name = "class", value.name = "TPR", id.vars = colnames(data)[1])
print(final.data.melted[1:10,])
p <- ggplot(final.data.melted, aes(x = FPR, y = TPR, colour= class)) + geom_line() #+ geom_hline(yintercept = 1.0, color = "blue", linetype = "longdash") #+ geom_vline(xintercept=50, color = "blue", linetype = "longdash")
p <- p + ylab("True Positive Rate") + xlab("False Positive Rate") + ggtitle(title)
p <- p + geom_abline(intercept = 0.0, slope = 1)
#p <- p + geom_line(data=final.data.melted,thickness=3) #+ ylab(colnames(data)[2]) + xlab(colnames(data)[1])
#p <- p + theme(axis.title = element_text(size = 12,face="bold"))
#p <- p + theme(plot.title = element_text(size=12,face="bold"))
#p <- p + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
# panel.background = element_blank(), axis.line = element_line(colour = "black"))
#p <- p + scale_x_continuous(expand=c(.002,0))
#p <- p + theme(aspect.ratio=.5)
p
#pdf(paste(title,".pdf"),width=12,height=6)
#plot(p)
#dev.off()
}
pnas_coords_csv <- "/Users/evan/biox3/b2ar_analysis_sherlock_all/b2ar_analysis/tICA_t10_n_components5_switches_npxx_tm6_bp/analysis_n_clusters1000/pnas_coords_new.csv"
pnas_coords <- data.frame(read.csv(pnas_coords_csv, stringsAsFactors = F, row.names=1))
pnas_coords["tm6_tm3_dist"] <- 7.14 * pnas_coords["tm6_tm3_dist"]
pnas_coords[1:10,]
pnas_coords_all_csv <- "/Users/evan/biox3/b2ar_analysis_sherlock_all/b2ar_analysis/all_pnas_features/pnas_all_coords.csv"
pnas_coords_all <- data.frame(read.csv(pnas_coords_all_csv, stringsAsFactors = F, row.names=1))
colnames(pnas_coords_all) <- colnames(pnas_coords)
pnas_coords_all["tm6_tm3_dist"] <- 7.14 * pnas_coords_all["tm6_tm3_dist"]
is_active <- function(row) {
#print(row)
#print(rownames(row))
if(row["tm6_tm3_dist"] > 12.0 & row["npxxy_rmsd_active"] < row["npxxy_rmsd_inactive"] & row["connector_rmsd_inactive"] > row["connector_rmsd_active"] & row["connector_rmsd_active"] < 1.0 & row["npxxy_rmsd_active"] < 1.0) {
return(TRUE)
} else {
return(FALSE)
}
}
find_active <- function(df) {
active_rows <- apply(df, 1, is_active)
return(active_rows)
}
is_active(pnas_coords[1,])
pnas_rows <- find_active(pnas_coords)
active_rows <- pnas_rows[pnas_rows == T]
inactive_rows <- pnas_rows[pnas_rows == F]
print(length(active_rows))
print(length(active_rows)/dim(pnas_coords)[1])
all_rows <- find_active(pnas_coords_all)
all_active_rows <- all_rows[all_rows == T]
print(length(all_active_rows))
print(length(all_active_rows)/length(all_rows))
cluster.names = rep("",1000)
for (i in 1:1000) {
cluster.names[i] <- paste("cluster", (i-1), sep= "")
}
is_cluster_active <- function(cluster_name, df) {
#print(cluster_name)
cluster_rows <- grep(paste(cluster_name, "_", sep=""), rownames(df))
# print(cluster_rows)
cluster_df <- df[cluster_rows,]
rows <- find_active(cluster_df)
if(length(rows[rows == T]) > length(rows[rows == F])) {
return(T)
} else {
return(F)
}
}
find_active_clusters <- function(df, cluster.names) {
cluster_active <- t(data.frame(lapply(cluster.names, is_cluster_active, df)))
rownames(cluster_active) <- cluster.names
return(cluster_active)
}
cluster_active <- find_active_clusters(pnas_coords, cluster.names)
docking_csv <- "/Users/evan/biox3/b2ar_analysis_sherlock_all/b2ar_analysis/tICA_t10_n_components5_switches_npxx_tm6_bp/analysis_n_clusters1000/aggregate_docking_joined.csv"
docking <- data.frame(read.csv(docking_csv, stringsAsFactors = F, row.names=1))
cluster.active.docking <- merge(docking, cluster_active, by = "row.names", all = TRUE)
rownames(cluster.active.docking) <- cluster.active.docking[,1]
cluster.active.docking <- cluster.active.docking[,1:dim(cluster.active.docking)[2]]
colnames(cluster.active.docking) <-c("aggregate_docking_score", "is_active")
cluster.active.docking.ordered <- cluster.active.docking[order(-1.0 * cluster.active.docking$is_active),]
total_active <- function(df, col) {
num_active = length(df[df[,col] == T, col])
return(num_active)
}
active_under_index <- function(index, df, col) {
under_index = df[1:index,]
num_active = length(under_index[under_index[,col] == T, col])
return(num_active)
}
enrichment <- function(percent_active, percent_all) {
return(percent_active/percent_all)
}
calc_tpr <- function(index, df, col) {
TP <- active_under_index(index, df, col)
TP_FN <- total_active(df, col)
sensitivity <- TP / TP_FN
return(sensitivity)
}
calc_fpr <- function(index, df, col) {
under_index = df[1:index,]
TP_FP <- dim(under_index)[1]
TP <- active_under_index(index, df, col)
FP <- TP_FP - TP
over_index <- df[index:dim(df)[1],]
over_index_over_cutoff <- over_index[over_index[col] == F,]
TN <- dim(over_index_over_cutoff)[1]
fpr <- FP / (TN + FP)
return(fpr)
}
N <- dim(cluster.active.docking.ordered)[1]
cutoff <- 0.20
index.fpr.tpr <- cbind(seq(1, N, 1), rep(0, N), rep(0, N))
colnames(index.fpr.tpr) <- c("Index", "FPR", "TPR")
index.fpr.tpr[,2] <- sapply(index.fpr.tpr[,1], calc_fpr, df = cluster.active.docking.ordered, col=3)
index.fpr.tpr[,3] <- sapply(index.fpr.tpr[,1], calc_tpr, df = cluster.active.docking.ordered, col=3)
fpr.tpr <- index.fpr.tpr[,c(2,3)]
colnames(fpr.tpr) <- colnames(index.fpr.tpr)[c(2,3)]
make_roc_plot(fpr.tpr, paste("ROC Curve Plot"))
pos.scores <- cluster.active.docking.ordered[cluster.active.docking.ordered[,3] == T, 2]
neg.scores <- cluster.active.docking.ordered[cluster.active.docking.ordered[,3] == F, 2]
auc <- mean(sample(pos.scores,1000,replace=T) > sample(neg.scores,1000,replace=T))
#0.779 for cutoff of 0.3
#0.818 for cutoff of 0.25
#0.948 for cutoff of 0.2