-
Notifications
You must be signed in to change notification settings - Fork 0
/
Figure_3b,c_unbiased_vs_amplicon_correlation_plots.R
131 lines (109 loc) · 7.19 KB
/
Figure_3b,c_unbiased_vs_amplicon_correlation_plots.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
###############################################################################
# This includes the code for generating figures for the CA04 illumina manuscript
###############################################################################
require(ggplot2)
require(grid)
require(stringr)
require(reshape2)
require(format)
require(plyr)
# blue/red pallette
color1 <- rgb(0,148,194, maxColorValue=255)
color2 <- rgb(74,204,245, maxColorValue=255)
color3 <- rgb(0,90,117, maxColorValue=255)
color4 <- rgb(117,11,0, maxColorValue=255)
color5 <- rgb(207,19,0, maxColorValue=255)
color6 <- rgb(245,89,74, maxColorValue=255)
color7 <- rgb(255,255,255, maxColorValue=255)
###############################################################################
# Figure X: Correlation between the mean frequency of SNPs as detected in unbiased vs. amplicon sequence data
###############################################################################
# read in the combined dataframe
data=read.table("~/Documents/CA04_SNP_detection/data/unbiased_data/amplicon_vs_unbiased.txt", header=T, stringsAsFactors=F, na.string=c(NA,"NA","na"), sep="\t", col.names=c("reference_position","type","length","reference","allele","linkage","zygosity","count","coverage","frequency","hyper-allelic","fr_balance","average_quality","input","replicate","minimum_coverage_required","method","dups_removed"))
# remove indels:
data <- data[data$type != "Deletion",]
data <- data[data$type != "Insertion",]
# remove SNPs called for minimum coverage required = 1 and dups removed
data <- data[data$minimum_coverage_required == "100",]
#data <- data[data$minimum_coverage_required == "1" & data$method == "unbiased" | data$minimum_coverage_required == "100" & data$method == "amplicon",]
data <- data[data$dups_removed == "no",]
# for this, need to convert the input to a character
data$input = as.character(as.numeric(data$input))
data$input <- gsub("10000","1e+04",data$input)
data$input <- gsub("1000","1e+03",data$input)
data$input <- gsub("100","1e+02",data$input)
# I would like to answer the following question: if we consider every single SNP detected in this study, including those detected by both methods, what is the mean frequency of that SNP for each dilution in each method? I will then plot the correlation.
method_list = unique(data$method)
dilution_list = unique(data$input)
site_list = unique(data$reference_position)
count = 0
data$frequency = as.numeric(as.character(data$frequency))
for (d in dilution_list)
{
for (m in method_list)
{
for (i in site_list)
{
df <- data[data$reference_position == i & data$input == d & data$method == m,]
if (nrow(df) == 1){
all_values = list(df$frequency, 0, 0)
times_detected = 1
non_mean_frequency = df$frequency
} else if (nrow(df) == 2) {
all_values = list(df$frequency[1], df$frequency[2], 0)
times_detected=2
non_mean_frequency = NA
} else if (nrow(df) == 3) {
all_values = list(df$frequency[1], df$frequency[2], df$frequency[3])
times_detected = 3
non_mean_frequency = NA
} else if (nrow(df) == 0) {
all_values = list(0, 0, 0)
times_detected = 0
non_mean_frequency = 0
}
# now calculate the mean and variance
mean = mean(as.numeric(all_values))
st_dev = sd(as.numeric(all_values))
replicate1 = all_values[[1]]
replicate2 = all_values[[2]]
replicate3 = all_values[[3]]
df2 = data.frame(i, replicate1, replicate2, replicate3, mean, st_dev,times_detected, non_mean_frequency, d, m)
if (count == 0){
mean_variance_df = df2
count = count + 1
} else {
mean_variance_df = rbind(mean_variance_df, df2)
}
}
}
}
# reformat dataframe so that SNP frequencies can be compared between amplicon and unbiased methods
# select only the columns: reference_position, frequency, input, replicate and method
df <- mean_variance_df[c(1,5:10)]
# separate out unbiased and amplicon into 2 separate dataframes, then merge back together and convert all NAs to 0s
# method.x is unbiased, method.y is ampilcon
df1 = df[df$m =="unbiased",]
df2 = df[df$m =="amplicon",]
df = merge(df1, df2, by=c("i","d"), all=TRUE)
df$mean.x[is.na(df$mean.x)] <- 0
df$mean.y[is.na(df$mean.y)] <- 0
# remove all dilutions 10^4 and less
df = df[df$d == "1e+07" | df$d == "1e+06" | df$d == "1e+05",]
# compare amplicon vs. unbiased as a scatter plot
ggplot(df, aes(x=mean.x,y=mean.y, alpha=0.8, colour=d))+geom_point(size=2)+theme(panel.grid.major=element_line(colour=NA,size=NA))+theme(panel.grid.minor=element_line(colour=NA,size=NA))+theme(plot.title=element_text(size=13))+theme(strip.background = element_rect(colour=NA, fill=NA))+theme(plot.margin=unit(c(1,1,1,1),"cm"))+theme(axis.line.x=element_line(colour="black"))+theme(axis.line.y=element_line(colour="black"))+theme(strip.text.x=element_text(size=11))+theme(axis.title=element_text(size=13, vjust=0.2))+theme(axis.text=element_text(size=11, colour="black"))+theme(axis.text.x=element_text(hjust=0.5))+theme(legend.text=element_text(size=11))+theme(legend.title=element_text(size=13, face="plain"))+theme(panel.margin=unit(1, "lines"))+theme(legend.key.size=unit(0.5, "cm"))+theme(panel.background=element_rect(fill=NA))+theme(legend.key=element_rect(fill=NA))+labs(x="SNP frequency (%) (unbiased)",y="SNP frequency (%) (amplicon)")+scale_x_continuous(breaks=seq(0,100,10), limits=c(0,100))+scale_y_continuous(breaks=seq(0,100,10), limits=c(0,100))+scale_colour_manual(values=c(color4,color5,color6))
# calculate regression between amplicon and unbiased
r2.lm = lm(mean.x~mean.y, data=df)
r2.lm$residuals #get residuals
summary(r2.lm)
# perform a paired t-test to compare the mean frequency of a SNP in unbiased vs. amplicon for SNPs that were detected in both methods
df_nozeroes <- df[df$mean.x != 0 & df$mean.y != 0,]
t.test(df_nozeroes$mean.x, df_nozeroes$mean.y, paired=T)
# plot only those SNPs between 1-10% frequency
df2 = df[df$mean.x <= 10 | df$mean.y <= 10,]
# compare amplicon vs. unbiased as a scatter plot
ggplot(df2, aes(x=mean.x,y=mean.y, alpha=0.8, colour=d))+geom_point(size=2)+theme(panel.grid.major=element_line(colour=NA,size=NA))+theme(panel.grid.minor=element_line(colour=NA,size=NA))+theme(plot.title=element_text(size=13))+theme(strip.background = element_rect(colour=NA, fill=NA))+theme(plot.margin=unit(c(1,1,1,1),"cm"))+theme(axis.line.x=element_line(colour="black"))+theme(axis.line.y=element_line(colour="black"))+theme(strip.text.x=element_text(size=11))+theme(axis.title=element_text(size=13, vjust=0.2))+theme(axis.text=element_text(size=11, colour="black"))+theme(axis.text.x=element_text(hjust=0.5))+theme(legend.text=element_text(size=11))+theme(legend.title=element_text(size=13, face="plain"))+theme(panel.margin=unit(1, "lines"))+theme(legend.key.size=unit(0.5, "cm"))+theme(panel.background=element_rect(fill=NA))+theme(legend.key=element_rect(fill=NA))+labs(x="SNP frequency (%) (unbiased)",y="SNP frequency (%) (amplicon)")+scale_x_continuous(breaks=seq(0,10,1), limits=c(0,10))+scale_y_continuous(breaks=seq(0,10,1), limits=c(0,10))+scale_colour_manual(values=c(color4,color5,color6))
# calculate regression between amplicon and unbiased
r2.lm = lm(mean.y~mean.x, data=df2)
r2.lm$residuals #get residuals
summary(r2.lm)