forked from ErasmusMC-Bioinformatics/shm_csr
-
Notifications
You must be signed in to change notification settings - Fork 1
/
nt_overview.r
74 lines (52 loc) · 3.03 KB
/
nt_overview.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
args <- commandArgs(trailingOnly = TRUE)
merged.file = args[1]
outputdir = args[2]
gene.classes = unlist(strsplit(args[3], ","))
hotspot.analysis.sum.file = args[4]
NToverview.file = paste(outputdir, "ntoverview.txt", sep="/")
empty.region.filter = args[5]
setwd(outputdir)
merged = read.table(merged.file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="")
hotspot.analysis.sum = read.table(hotspot.analysis.sum.file, header=F, sep=",", fill=T, stringsAsFactors=F, quote="")
#ACGT overview
NToverview = merged
if(empty.region.filter == "leader" || empty.region.filter == "None"){
NToverview$seq = paste(NToverview$FR1.IMGT.seq, NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
} else if(empty.region.filter == "FR1"){
NToverview$seq = paste(NToverview$CDR1.IMGT.seq, NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
} else if(empty.region.filter == "CDR1"){
NToverview$seq = paste(NToverview$FR2.IMGT.seq, NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
} else if(empty.region.filter == "FR2"){
NToverview$seq = paste(NToverview$CDR2.IMGT.seq, NToverview$FR3.IMGT.seq)
}
NToverview$A = nchar(gsub("[^Aa]", "", NToverview$seq))
NToverview$C = nchar(gsub("[^Cc]", "", NToverview$seq))
NToverview$G = nchar(gsub("[^Gg]", "", NToverview$seq))
NToverview$T = nchar(gsub("[^Tt]", "", NToverview$seq))
#Nsum = data.frame(Sequence.ID="-", best_match="Sum", seq="-", A = sum(NToverview$A), C = sum(NToverview$C), G = sum(NToverview$G), T = sum(NToverview$T))
#NToverview = rbind(NToverview, NTsum)
NTresult = data.frame(nt=c("A", "C", "T", "G"))
for(clazz in gene.classes){
print(paste("class:", clazz))
NToverview.sub = NToverview[grepl(paste("^", clazz, sep=""), NToverview$best_match),]
print(paste("nrow:", nrow(NToverview.sub)))
new.col.x = c(sum(NToverview.sub$A), sum(NToverview.sub$C), sum(NToverview.sub$T), sum(NToverview.sub$G))
new.col.y = sum(new.col.x)
new.col.z = round(new.col.x / new.col.y * 100, 2)
tmp = names(NTresult)
NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
names(NTresult) = c(tmp, paste(clazz, c("x", "y", "z"), sep=""))
}
NToverview.tmp = NToverview[,c("Sequence.ID", "best_match", "seq", "A", "C", "G", "T")]
names(NToverview.tmp) = c("Sequence.ID", "best_match", "Sequence of the analysed region", "A", "C", "G", "T")
write.table(NToverview.tmp, NToverview.file, quote=F, sep="\t", row.names=F, col.names=T)
NToverview = NToverview[!grepl("unmatched", NToverview$best_match),]
new.col.x = c(sum(NToverview$A), sum(NToverview$C), sum(NToverview$T), sum(NToverview$G))
new.col.y = sum(new.col.x)
new.col.z = round(new.col.x / new.col.y * 100, 2)
tmp = names(NTresult)
NTresult = cbind(NTresult, data.frame(new.col.x, new.col.y, new.col.z))
names(NTresult) = c(tmp, paste("all", c("x", "y", "z"), sep=""))
names(hotspot.analysis.sum) = names(NTresult)
hotspot.analysis.sum = rbind(hotspot.analysis.sum, NTresult)
write.table(hotspot.analysis.sum, hotspot.analysis.sum.file, quote=F, sep=",", row.names=F, col.names=F, na="0")