#!/usr/bin/env nextflow
params.reference = 'data/genome.fasta'
params.scaffoldmin = 1000 // Minimum scaffold size to consider
params.minsize = 100 // Minimum exon size
params.species = 'fungi' // Name of species passed to RepeatMasker
params.maxintronlength = 500 // Maximum intron length
params.minintronlength = 10 // Minimum intron length
params.bamfiles = 'data/bams/*.bam'
params.pasaconf = 'conf/alignAssembly.conf' // PASA configuration file to set db name etc.
params.reads = 'data/reads/all.fastq'
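// Example invocation, overriding the defaults above on the command line
// (paths are illustrative):
//   nextflow run main.nf --reference data/genome.fasta --bamfiles 'data/bams/*.bam' --species fungi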
reference_raw = file(params.reference)
// Remove small scaffolds from analysis.
process remove_small_scaffolds {
container 'genomicpariscentre/bioperl:1.6.924'
input:
file 'ref.fasta' from reference_raw
output:
file 'ref_trimmed.fasta' into ref_trimmed_for_orfs
file 'ref_trimmed.fasta' into ref_trimmed_for_masking
file 'ref_trimmed.fasta' into ref_trimmed_for_softmasking
file 'ref_trimmed.fasta' into ref_trimmed_for_trinity
file 'ref_trimmed.fasta' into ref_trimmed_for_bamtohints
file 'ref_trimmed.fasta' into ref_trimmed_for_pasa
file 'ref_trimmed.fasta' into ref_trimmed_for_gff2gb
file 'ref_trimmed.fasta' into ref_trimmed_for_busco
file 'ref_trimmed.fasta' into ref_trimmed_for_cufflinks
"trim_fasta_all.pl -i ref.fasta -out ref_trimmed.fasta -length ${params.scaffoldmin}"
}
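// Assess completeness of the trimmed assembly with BUSCO against the
// fungal lineage set bundled in the container. The report is captured on
// stdout and printed via the 'debug' channel subscription at the bottom
// of this script.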
process busco {
container 'robsyme/busco'
input:
file 'ref.fasta' from ref_trimmed_for_busco
output:
stdout into debug
"ln -s /opt/busco/lineages/fungi . && busco -in ref.fasta -o custom --lineage fungi"
}
// Generate a fasta file of open reading frames.
process identify_orfs {
container 'robsyme/emboss'
input:
file 'ref.fasta' from ref_trimmed_for_orfs
output:
file 'ref_exons.aa' into orfs_fasta
"getorf -sequence ref.fasta -outseq ref_exons.aa -minsize 300 -find 0"
}
// We want to remove ORFs with a high percentage of Xs. Note that the
// large ORFs file is split into pieces containing 5000 fasta entries
// each.
process remove_Xs {
container 'robsyme/bioruby'
input:
file 'orfs.fasta' from orfs_fasta.splitFasta( by: 5000 )
output:
stdout into clean_orfs_for_transposons
stdout into clean_orfs_for_fungi
"""
#!/usr/bin/env ruby
require 'bio'
Bio::FlatFile.auto('orfs.fasta').each do |entry|
next if entry.length < (${params.minsize} / 3)
x_percentage = entry.seq.composition['X'] / entry.length.to_f
puts entry if x_percentage < 0.3
end
"""
}
// Run HHblits to identify potential transposons in the (cleaned) open
// reading frames from the 'identify_orfs' step. The ORFs are batched 500
// per task, and hhblits is invoked once per ORF within each task.
process hhblits_transposon {
container 'robsyme/hhblits-transposon'
input:
file 'orfs.fasta' from clean_orfs_for_transposons.splitFasta( by: 500 )
output:
stdout into hhblits_transposon
"""
csplit --elide-empty-files --quiet orfs.fasta '/^>/' '{*}'
for orf in xx*; do
hhblits -i \$orf -o stdout -d /databases/transposons -e 1e-5 -E 1e-5 -id 80 -n 2
done
"""
}
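// As above, but search each ORF against a clustered fungal protein
// database to flag ORFs with homology to known fungal proteins.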
process hhblits_fungi {
container 'robsyme/hhblits-fungi'
input:
file 'orfs.fasta' from clean_orfs_for_fungi.splitFasta( by: 500 )
output:
stdout into hhblits_fungi
"""
csplit --elide-empty-files --quiet orfs.fasta '/^>/' '{*}'
for orf in xx*; do
hhblits -i \$orf -o stdout -d /databases/fungal_50kclus -e 1e-5 -E 1e-5 -id 80 -n 2
done
"""
}
// Parse an HHblits output file and generate a GFF3 file of the matches.
process parse_transposon_hhr {
cache 'deep'
input:
file 'search.hhr' from hhblits_transposon.collectFile()
output:
file 'out.gff3' into hhblits_transposon_gff
"""
parse_hhr.rb \
--homology_cutoff 70 \
--evalue_cutoff 1e-3 \
--pvalue_cutoff 1e-5 \
--score_cutoff 100 \
--align_length_cutoff 50 \
--template_length_cutoff 30 \
--repeat \
search.hhr
"""
}
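// Same parsing step for the fungal-protein search results, without the
// --repeat flag.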
process parse_fungi_hhr {
cache 'deep'
input:
file 'search.hhr' from hhblits_fungi.collectFile()
output:
file 'out.gff3' into hhblits_fungi_gff
"""
parse_hhr.rb \
--homology_cutoff 70 \
--evalue_cutoff 1e-3 \
--pvalue_cutoff 1e-5 \
--score_cutoff 100 \
--align_length_cutoff 50 \
--template_length_cutoff 30 \
search.hhr
"""
}
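// Mask repeats in the trimmed reference with RepeatMasker. Produces a
// GFF of repeat coordinates (used for hints and soft-masking) and a
// hard-masked fasta (used by CodingQuarry).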
process repeatmasker {
container 'repeatmasker'
input:
file 'ref.fasta' from ref_trimmed_for_masking
output:
file 'ref.fasta.out.gff' into repeats_gff_for_hints
file 'ref.fasta.out.gff' into repeats_gff_for_softmasking
file 'ref.fasta.masked' into ref_masked_for_golden
file 'ref.fasta.masked' into ref_masked_for_codingquarry
"RepeatMasker -qq -frag 5000000 -gff -species ${params.species} -no_is ref.fasta"
}
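// Convert the RepeatMasker GFF into Augustus hints: each repeat becomes
// a 'nonexonpart' feature with source RM and priority 6, e.g. (values
// are illustrative):
//   scaffold_1  RepeatMasker  nonexonpart  1001  1450  .  +  .  src=RM;pri=6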
process repeatmasker_gff_to_hints {
container 'robsyme/bioruby'
input:
file 'repeats.gff' from repeats_gff_for_hints
output:
stdout into repeat_hints
'''
#!/usr/bin/env ruby
repeats = File.open("repeats.gff", "r")
while repeats.gets
next if $_ =~ /^#/
split = $_.split("\t")
split[2] = "nonexonpart"
split[8] = "src=RM;pri=6"
puts split.join("\t")
end
'''
}
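// Soft-mask (lowercase) the repeat intervals in the trimmed reference.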
process softMaskReference {
container 'robsyme/bedtools'
input:
file 'ref.fasta' from ref_trimmed_for_softmasking
file 'repeats.gff' from repeats_gff_for_softmasking
output:
file 'ref_softmasked.fasta' into ref_softmasked_for_golden
"maskFastaFromBed -soft -fi ref.fasta -fo ref_softmasked.fasta -bed repeats.gff"
}
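// Merge the per-sample RNA-seq alignments into a single BAM.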
process merge_bams {
input:
file '*.bam' from Channel.fromPath(params.bamfiles).toList()
output:
file 'merged.bam' into mapped_reads
file 'merged.bam' into mapped_reads_for_bamtohints
file 'merged.bam' into mapped_reads_for_cufflinks
"samtools merge merged.bam *.bam"
}
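// Assemble transcripts from the merged alignments with Cufflinks.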
process cufflinks {
container 'robsyme/cufflinks'
input:
file 'merged.bam' from mapped_reads_for_cufflinks
output:
file 'transcripts.gtf' into transcripts_gtf_for_codingquarry
"cufflinks --max-intron-length ${params.maxintronlength} --min-intron-length ${params.minintronlength} merged.bam"
}
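// Predict genes with CodingQuarry, using the Cufflinks transcripts as
// evidence on the hard-masked reference.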
process codingquarry {
container 'robsyme/codingquarry'
input:
file 'ref.fasta' from ref_masked_for_codingquarry
file 'transcripts.gtf' from transcripts_gtf_for_codingquarry
output:
file 'out/PredictedPass.gff3' into codingquarry_gff
'''
CufflinksGTF_to_CodingQuarryGFF3.py transcripts.gtf > transcripts.gff
CodingQuarry -f ref.fasta -t transcripts.gff
'''
}
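// Split the merged BAM into one BAM per scaffold, keeping only scaffolds
// that have mapped reads and exceed the minimum scaffold size, so that
// genome-guided Trinity can be run scaffold-by-scaffold.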
process split_bams_by_scaffold {
input:
file 'merged.bam' from mapped_reads
output:
file '*.bam' into split_bams
"""
samtools index merged.bam && \
samtools idxstats merged.bam \
| awk '\$3 > 0 && \$2 > ${params.scaffoldmin} {print \$1}' \
| xargs -n1 -I{} samtools view -b -o {}.bam merged.bam {}
"""
}
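// Run genome-guided Trinity independently on each per-scaffold BAM.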
process genome_guided_trinity {
container 'robsyme/trinity'
input:
set file('ref.fasta'), file('single.bam') from ref_trimmed_for_trinity.spread(split_bams)
output:
file 'trinity_out_dir/Trinity-GG.fasta' into genome_guided_trinity_split
"Trinity --genome_guided_bam single.bam --genome_guided_max_intron ${params.maxintronlength} --max_memory 2G --jaccard_clip --CPU 1 --full_cleanup"
}
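// Concatenate the per-scaffold Trinity-GG fastas and renumber the GG
// identifiers so that IDs from separate scaffold runs don't collide.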
process collate_genome_guided_transcripts {
input:
stdin from genome_guided_trinity_split.collectFile().map{ it.text }
output:
stdout into genome_guided_trinity
'''
#!/usr/bin/awk -f
/^>/ {
sub(/>GG[0-9]+/, ">GG" count++)
print
}
/^[^>]/ {
print $0
}
'''
}
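// Assemble transcripts de novo from the raw reads as a complement to the
// genome-guided assemblies.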
process denovo_trinity {
container 'robsyme/trinity'
input:
file 'reads.fastq' from file(params.reads)
output:
file 'trinity_out_dir.Trinity.fasta' into denovo_trinity
"Trinity --seqType fq --single reads.fastq --max_memory 2G --CPU 2 --jaccard_clip --full_cleanup"
}
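// Generate Augustus intron hints from the splice junctions in the merged
// alignments.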
process bam_to_hints {
container 'robsyme/bedtools'
input:
file 'ref.fasta' from ref_trimmed_for_bamtohints
file 'all.bam' from mapped_reads_for_bamtohints
output:
file 'all.bam.junctions.hints' into augustus_hints
"augustus_RNAseq_hints.pl --genome ref.fasta --bam all.bam"
}
// Note: PASA requires a running MySQL server. I started a separate MySQL
// docker container first:
//   docker run --name pasadb \
//     -e MYSQL_ROOT_PASSWORD=password \
//     -e MYSQL_DATABASE=pasa \
//     -e MYSQL_USER=pasauser \
//     -e MYSQL_PASSWORD=password \
//     mysql
process pasa {
container 'robsyme/pasa'
input:
file 'GG_raw.fasta' from genome_guided_trinity
file 'DN_raw.fasta' from denovo_trinity
file 'ref.fasta' from ref_trimmed_for_pasa
file 'alignAssembly.config' from file(params.pasaconf)
output:
file '*.assemblies.fasta.transdecoder.pep' into pasa_cds_for_golden
file '*.assemblies.fasta.transdecoder.genome.gff3' into pasa_gff_for_fl
file '*.assemblies.fasta.transdecoder.pep' into pasa_cds_for_fl
file 'ref.fasta' into reference_genome
"""
grep '^>' DN_raw.fasta \
| awk '{print(substr(\$1, 2))}' \
> DN_raw.list
cat DN_raw.fasta GG_raw.fasta > transcripts.fasta
/opt/pasa/scripts/Launch_PASA_pipeline.pl \
-c alignAssembly.config \
--MAX_INTRON_LENGTH ${params.maxintronlength} \
--stringent_alignment_overlap 30.0 \
-C \
-r \
-R \
-g ref.fasta \
-t transcripts.fasta \
--TDN DN_raw.list \
--ALIGNERS blat,gmap \
--TRANSDECODER \
--CPU 2
/opt/pasa/scripts/build_comprehensive_transcriptome.dbi \
-c alignAssembly.config \
-t transcripts.fasta \
--min_per_ID 95 \
--min_per_aligned 30
/opt/pasa/scripts/pasa_asmbls_to_training_set.dbi \
--pasa_transcripts_fasta *.assemblies.fasta \
--pasa_transcripts_gff3 *.pasa_assemblies.gff3
"""
}
// Pull out the full-length (type:complete) proteins identified by PASA
// and TransDecoder.
process find_full_length_proteins {
container 'robsyme/bioruby'
input:
stdin from pasa_cds_for_golden.map{ it.text }
output:
stdout into full_pasa_pep_fasta
"""
#!/usr/bin/env ruby
require 'bio'
Bio::FlatFile.auto(ARGF).each do |entry|
puts entry if entry.definition =~ /type:complete/
end
"""
}
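// Keep only the GFF lines belonging to transcripts that TransDecoder
// flagged as 'type:complete'.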
process exclude_partial_genes_from_gff {
container 'robsyme/bioruby'
input:
file 'hits.gff3' from pasa_gff_for_fl
file 'peptide.fasta' from pasa_cds_for_fl
output:
stdout into full_length_gff
'''
#!/usr/bin/env ruby
require "bio"
require "set"
full_length_ids = Bio::FlatFile
.open("peptide.fasta")
.find_all{ |entry| entry.definition =~ /type:complete/ }
.map{ |entry| entry.entry_id }
.to_set
File.open("hits.gff3").each do |line|
next unless line =~ /ID=(cds.)?([^\\|]+)\\|/
next unless full_length_ids.include?($2)
puts line
end
'''
}
// Augustus training requires the 'golden' annotations in GenBank format,
// but not just any GenBank format: there are some restrictions.
//
// For the best results, we should remove proteins that are too similar
// to one another. Augustus will also assume that all nucleotides not
// annotated as coding sequence are non-coding, so we trim the output to
// the coding sequence plus a small margin (1000 bp) either side. Note
// that this is not simply a conversion of GFF to GenBank.
process gff_to_genbank {
container 'robsyme/augustus'
input:
file 'genome.fasta' from ref_trimmed_for_gff2gb
file 'full_length_genes.gff' from full_length_gff
output:
file 'out.gb' into golden_genbank_for_training
"gff2gbSmallDNA.pl full_length_genes.gff genome.fasta 1000 out.gb"
}
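// Train Augustus on the golden gene set: clone the generic parameter
// templates into a new 'custom' species, run etraining, optimise the
// parameters (slow), and package the trained parameters as a tarball.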
process train_augustus {
container 'robsyme/augustus'
input:
file 'custom.gb' from golden_genbank_for_training
output:
file 'custom.tar.gz' into augustus_trained_parameters
"""
mkdir -p /opt/augustus/config/species/custom/
cp /opt/augustus/config/species/generic/generic_parameters.cfg /opt/augustus/config/species/custom/custom_parameters.cfg
cp /opt/augustus/config/species/generic/generic_weightmatrix.txt /opt/augustus/config/species/custom/
/opt/augustus/bin/etraining --species=custom custom.gb
/opt/augustus/scripts/optimize_augustus.pl --species=custom custom.gb
tar -czvf custom.tar.gz /opt/augustus/config/species/custom
"""
}
debug.subscribe{ println("DEBUG: $it") }
// TODO: Evaluate whether it is at all helpful to supply the Cufflinks
// GTF as 'exonpart' hints to Augustus. The problem with Cufflinks is the
// concatenation of overlapping transcripts: when those transcripts run
// in opposite directions, supplying a stranded hint to Augustus may
// prevent the annotation of one of the genes that form the fused
// transcript.
// TODO: Perhaps I can do ORF detection on the Cufflinks transcripts and
// then run those ORFs through Pfam and SignalP; detected domains could
// be converted into hints for Augustus.