forked from robsyme/nextflow-annotate
-
Notifications
You must be signed in to change notification settings - Fork 0
/
annotate
executable file
·217 lines (186 loc) · 5.86 KB
/
annotate
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/env nextflow
params.genomes = 'data/**/tmp.fasta'
genomes = Channel.fromPath(params.genomes)
params.homologyProbabilityCutoff = 70
params.evalueCutoff = 1e-3
params.pvalueCutoff = 1e-6
params.scoreCutoff = 100
params.alignmentLengthCutoff = 50
params.templateLengthCutoff = 30
def toFasta(defLine, sequence, width=80) {
return (">" + defLine + "\n" + wrapString(sequence, width) + "\n")
}
def wrapString(text, width=80) {
def out = []
while(text.length() > width) {
out += text[0..(width-1)]
text = text[width..-1]
}
out += text
return out.join("\n")
}
class HHRHit {
float probability
float evalue
float pvalue
float score
float structureScore
String queryName
String queryStart
String queryEnd
String revString
String strainName
String description
Integer alignmentLength
Integer aaStart
Integer aaStop
Integer hitStart
Integer hitStop
Integer templateSize
HHRHit(String result) {
// Pull out the query information. It will look something like:
// Query Scaffold_1_318 [160485 - 161177] Length:352063 [Ascochyta_fabae_Af1]
(queryName, queryStart, queryEnd, revString, strainName) = (result =~ (/Query\s+(\S+)_\d+ \[(\d+) - (\d+)\] (\(REVERSE SENSE\))?.* \[(.*)\]/))[0][1..-1]
// Find the top hit. The line will look something like:
// 1 GB:CAA29181 ORF 1 (LINE-elemen 99.9 1.4E-28 4.1E-32 243.9 0.0 61 2-69 1216-1281(1650)
def hitData = (result =~ /\s+1\s+(.{30})\s+(\d+\.?\d*)\s+(\d+\.?\d*E?-?\d*)\s+(\d+\.?\d*E?-?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+)\s+(\d+)-(\d+)\s+(\d+)-(\d+)\s*\((\d+)\)/)
if (hitData.size() == 0) {
// It's likely that many of the orfs won't have any hits. In
// this case, just return a 'hit' with score zero
score = 0
structureScore = 0
} else {
// If we do find a hit, return a new HHRHit instance.
description = hitData[0][1]
probability = hitData[0][2].toFloat()
evalue = hitData[0][3].toFloat()
pvalue = hitData[0][4].toFloat()
score = hitData[0][5].toFloat()
structureScore = hitData[0][6].toFloat()
alignmentLength = hitData[0][7].toInteger()
aaStart = hitData[0][8].toInteger()
aaStop = hitData[0][9].toInteger()
hitStart = hitData[0][10].toInteger()
hitStop = hitData[0][11].toInteger()
templateSize = hitData[0][12].toInteger()
}
}
String toString() {
def out = []
out.push "Query: " + queryName + " (" + queryStart + "-" + queryEnd + ") [" + strainName + "]"
out.push " Description: '" + description + "'"
out.push " Probability: " + probability
out.push " E-value: " + evalue
out.push " P-value: " + pvalue
out.push " Score: " + score
return out.join("\n")
}
String toGFF3() {
def hitID = description.split()[0]
def uid = "${hitID}.s${hitStart}.e${hitStop}"
def out = []
out.push queryName
out.push 'hhblits'
out.push 'protein_match'
out.push queryStart + 3 * aaStart - 1
out.push queryStart + 3 * aaStop - 1
out.push score
out.push revString ? "-" : "+"
out.push "."
out.push "ID=${uid};Name=${uid};Target=${hitID} $hitStart $hitStop"
println "DONE: ${out.join('\t')}"
out.join("\t")
}
String toHints() {
def out = []
out.push queryName
out.push 'hhblits'
out.push "nonexonpart"
out.push queryStart + 3 * aaStart - 1
out.push queryStart + 3 * aaStop - 1
out.push score
out.push revString ? "-" : "+"
out.push "."
out.push "source=RM;grp=${description.split()[0]};pri=6"
out.join("\t")
}
String toGeneID() {
def out = []
out.push queryName
out.push 'hhblits'
out.push 'sr'
out.push queryStart + 3 * aaStart - 1
out.push queryStart + 3 * aaStop - 1
out.push score
out.push revString ? "-" : "+"
out.push '.'
out.join("\t")
}
}
process cleanGenome {
input:
val genome from genomes
output:
set name, stdout into cleanGenomes
script:
name = genome.getParent().getBaseName()
"""
awk '/^>/ && !/[.*]/ {print(\$0, "[$name]")} /^>/ && /[.*]/ {print \$0} /^[^>]/ {print(toupper(\$0))}' '$genome'
sed -ie "s/\015//" "$genome"
"""
}
process getorf {
container 'robsyme/emboss'
input:
set name, 'maskedGenome' from cleanGenomes
output:
file 'orfs.aa.fasta' into orfFiles
"""
getorf -sequence $maskedGenome -outseq orfs.aa.fasta -minsize 150 -find 1
"""
}
cleanOrfs = orfFiles.splitFasta(record: [header: true, seqString: true])
.filter { record ->
xCount = record.seqString.count('X')
length = record.seqString.size()
xCount / length < 0.3
}
.map { record ->
record.seqString = record.seqString.replaceAll('X','')
return toFasta(record.header, record.seqString)
}
process hhblit {
container 'robsyme/hhblits'
input:
file 'orf.fasta' from cleanOrfs
output:
stdout into hhblitOutput
"""
hhblits -i orf.fasta -d /databases/transposons -maxmem 5 -cpu 1 -o stdout -e 1E-5 -E 1E-5 -id 80 -p 80 -z 0 -b 0 -B 3 -Z 3 -n 1 -mact 0.5 -v 0
"""
}
transposonGFFLines = Channel.create()
transposonHintLines = Channel.create()
transposonGeneIDLines = Channel.create()
hhblitOutput
.map { String result -> new HHRHit(result) }
.filter { it.score > 0 }
.filter { it.probability > params.homologyProbabilityCutoff }
.filter { it.evalue < params.evalueCutoff }
.filter { it.pvalue < params.pvalueCutoff }
.filter { it.alignmentLength > params.alignmentLengthCutoff }
.filter { it.templateSize > params.templateLengthCutoff }
.separate(transposonGFFLines, transposonHintLines, transposonGeneIDLines) { [ it.toGFF3(), it.toHints(), it.toGeneID() ] }
transposonGFF = transposonGFFLines.collectFile(name: 'transposon_hits.gff3')
process sortTransposonHits {
input:
file 'gff' from transposonGFF
output:
stdout into transposonSortedGFF
"""
sort -nk 4,4 $gff | sort -sk 1,1
"""
}
transposonSortedGFF.subscribe {
it.moveTo('./')
}