/*
 * Process 1: Validate the target design and build the target table and probe fasta file
 */
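
/*
 * Illustrative target-design row, inferred from the checks below; the
 * delimiter may be a comma or a tab, and column 5 (the target chromosome)
 * is optional:
 *   ID,Sequence,Target.bp,Target.base[,Target.Chr]
 *   SNP001,ACGTTGCANACGT,1234567,A,chr1A
 */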
process BUILD_TARGET {
    tag "${file(targetdesign).baseName}"
    executor 'local'

    input:
    val targetdesign

    output:
    tuple path("target.table"),
          path("target.fasta")

    script:
    mincols = 4
    """
    #!/usr/bin/env bash
    set -euo pipefail # Enable strict error handling

    # Check that the design file exists
    if [[ ! -e "${targetdesign}" ]]; then
        echo "File ${targetdesign} not found"
        exit 1
    fi

    # Detect the delimiter (comma or tab)
    format=""
    if [[ ! -z \$(head -n1 "${targetdesign}" | awk -F, '{if(NF>1) print \$NF}') ]]; then
        format=","
    elif [[ ! -z \$(head -n1 "${targetdesign}" | awk -F'\t' '{if(NF>1) print \$NF}') ]]; then
        format="\t"
    else
        echo "${targetdesign} is not a CSV or tab-separated file."
        exit 1
    fi

    # Check for the minimum number of columns
    ncols=\$(head -n1 "${targetdesign}" | awk -F"\${format}" '{print NF}')
    if [[ "\${ncols}" -lt "${mincols}" ]]; then
        echo "${targetdesign} does not have the correct number of columns; at least ${mincols} are expected."
        exit 1
    fi

    # Check that the second column contains a valid sequence
    seq1=\$(head "${targetdesign}" | tail -n1 | awk -F"\${format}" '{print \$2}')
    nucleotides="ACTGUN${params.markercharacter}"
    if [[ ! "\${seq1}" =~ ^[\${nucleotides}]+\$ ]]; then # Use ^ and \$ for an exact match
        echo "Error! The second column in the target file is supposed to be the probe/marker sequence"
        exit 1
    fi

    # Check for duplicate IDs
    if [[ ! -z \$(awk -F"\${format}" '{print \$1}' "${targetdesign}" | sort | uniq -d) ]]; then
        echo "ID column in ${targetdesign} cannot have duplicates"
        exit 1
    fi

    echo "Build target and fasta files"
    # Create the target table
    if [[ "\${ncols}" -ge 5 ]]; then
        awk -F"\${format}" 'BEGIN{print "ID\tTarget.Chr\tTarget.bp\tTarget.base"} !/Target/{print \$1"\t"\$5"\t"\$3"\t"\$4}' "${targetdesign}" > target.table
    else
        awk -F"\${format}" 'BEGIN{print "ID\tTarget.Chr\tTarget.bp\tTarget.base"} !/Target/{print \$1"\t-\t"\$3"\t"\$4}' "${targetdesign}" > target.table
    fi

    # Create the probe fasta file
    grep -v "Sequence" "${targetdesign}" | awk -F"\${format}" '{print ">"\$1"\\n"\$2}' > target.fasta
    """
}
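
/*
 * Process to build a nucleotide BLAST database from a (possibly split)
 * reference fasta. When the directory holding the reference genome is
 * writable, the database is cached next to it (under BLAST/, or
 * BLAST/brioche-blastdb/ for split chunks) so it can be reused across runs;
 * otherwise it is built in the task work directory.
 */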
process BUILD_BLASTDb {
    tag "${file(fasta).baseName.replaceAll('-split','')}"
    label 'large'

    input:
    val fasta
    val refgenome

    output:
    val "${blastdatabase}"

    script:
    fastadir = new File("${refgenome}").getParent()
    blastdb = fastadir + "/BLAST/"
    blastdatabase = "${file(fasta).getName()}".replaceAll("-split.*", "")
    refgenomename = "${file(refgenome).getName()}"
    if (new File(fastadir).canWrite()) {
        if (blastdatabase != refgenomename) {
            blastdb = new File(blastdb + "/brioche-blastdb/").toString()
        }
        if (!new File(blastdb).exists()) {
            new File(blastdb).mkdirs()
        }
        blastdatabase = blastdb.toString() + "/" + blastdatabase
    }
    """
    #check if blastdb exists
    if [[ ! -e "${blastdatabase}.ndb" ]]
    then
        echo creating database
        makeblastdb -in $fasta -out ${blastdatabase} -dbtype "nucl" -title "${params.genomename}"
    fi
    """
}
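
/*
 * Process to split the reference genome into per-chromosome fasta files
 * (when it has at most 30 sequences after exclusions), or restrict it to a
 * single chromosome, or symlink it whole. It also installs the bundled
 * briocheR package if it is not already available.
 */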
process SPLIT_REFGENOME {
    tag "${file(refgenome).baseName}"

    input:
    val refgenome
    val resultsdir
    val chromstoexclude
    val buildblastdbonly

    output:
    path "*split.fasta"

    script:
    path2briocheR = params.pathtobriocher
    genomefasta = "${file(refgenome).getName()}-split.fasta"
    """
    mkdir -p ${resultsdir}

    # Check if a reference sequence dictionary exists; if not, build one
    dict=\$(echo ${refgenome}|sed -e "s/\\.[^.]*\$/.dict/")
    if [[ ! -e "\${dict}" ]]
    then
        picard -Xms4g -Xmx10g CreateSequenceDictionary -R ${refgenome} -O \${dict}
    else
        echo "Dictionary file found"
    fi

    # Build the exclusion pattern from the comma-separated chromosome list
    exclude_array="DC"
    if [[ ! -z "${chromstoexclude}" ]];
    then
        exclude_array=\$(echo ${chromstoexclude}|sed 's/,/|/g')
    fi
    chromosomes=\$(grep "@SQ" \${dict}|cut -f2|sed -e "s/SN://g"|grep -v -P "^\${exclude_array}")

    # Only split assemblies with at most 30 sequences
    split=\$(echo \${chromosomes}|awk -v maxchrom=30 '{print (NF > maxchrom) ? "no" : "yes"}')
    if [[ "\${split}" == "yes" && "$buildblastdbonly" == "false" ]];
    then
        if [[ ! -z "\${chromosomes}" ]];
        then
            for chr in \${chromosomes};
            do
                samtools faidx $refgenome "\$chr" > "\${chr}-split.fasta"
            done
        else
            echo "No chromosomes found in fasta file"
            exit 1
        fi
    elif [[ ! -z "${params.restrict2chrom}" ]]
    then
        samtools faidx $refgenome "${params.restrict2chrom}" > "${params.restrict2chrom}-split.fasta"
    else
        ln -s ${refgenome} ${genomefasta}
    fi

    # Install briocheR if it is not already installed
    Rscript -e "if(!require('briocheR')){
        install.packages('${path2briocheR}',repos=NULL);
        print('BriocheR installed')}"
    """
}

/*
 * Step 2: Run blastn with the provided probes against the reference genome
 */
process RUN_BLAST {
    tag "${file(blastdb).baseName}"
    label 'large'

    input:
    path probefasta
    val blastdb
    val blastoutformat

    output:
    path "${splitname}*"

    script:
    splitname = file("${blastdb}").baseName
    """
    blastn -db ${blastdb} -query ${probefasta} \\
        -out blastoutput.txt \\
        -num_threads $task.cpus \\
        -outfmt ${blastoutformat} \\
        -evalue ${params.evalue} ${params.otherblastoptions} \\
        -dust ${params.dust}

    # Split the BLAST output into 50000-line chunks for parallel post-processing
    split -l 50000 blastoutput.txt ${splitname}
    """
}
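
/*
 * Note: the merge steps below look up columns named qaccver, saccver,
 * sstart and pident, so blastoutformat is assumed to be a tabular
 * (-outfmt "6 ..." style) specification that includes those fields.
 */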

/*
 * Step 3: Pre-process the results from blastn
 */
process PROCESS_BLAST_RESULTS {
    tag "${blastout.baseName}"
    label 'large'

    input:
    path blastout
    val targetdesign
    val minmismatches
    val extendablebps
    path genomefasta
    val blastoutformat
    val istarget3primeend
    val markerchar

    output:
    path "${rds}"
    path "${snpmapped}"

    script:
    snpmapped = "${blastout.baseName}_samtools_coordinates_mapped.tsv"
    rds = "SNPcoordinates/${blastout.baseName}_blast_results.RDS"
    """
    echo ${blastout}
    snpcoords="\$(pwd)/SNPcoordinates/"
    Rscript -e "briocheR::processBlastResults(
        blast.file='${blastout}',
        min.length=${minmismatches},
        path.2save.coords = '\${snpcoords}',
        extendable.site.bps = ${extendablebps},
        targets='${targetdesign}',
        outfmt=strsplit(${blastoutformat},' ')[[1]][-1],
        istarget3primeend='${istarget3primeend}',
        marker.char='${markerchar}')"
    mv \${snpcoords}/processed_blast_results.RDS ${rds}

    # Extract the reference bases at the marker positions and flatten the
    # fasta output into a two-column (ID, sequence) TSV
    samtools faidx ${genomefasta} -r "\${snpcoords}/samtools_marker_positions.txt"|awk '
        BEGIN {FS = "\\n"}
        /^>/ {
            if (seq != "") {
                print id "\\t" seq
            }
            id = substr(\$1, 2)
            seq = ""
        }
        !/^>/ {
            seq = seq \$1
        }
        END {
            print id "\\t" seq
        }
    ' >"${snpmapped}"
    """
}

/*
 * Step 4: Perform filtering and generate annotations
 */
process ADD_MARKERINFO {
    tag "${blast_results.baseName}"

    input:
    path blast_results
    path coordinates
    val probename
    val genomename
    val keepduplicates
    val coveragefilter
    val pidentfilter

    output:
    tuple path("${allmappings}"), path("${mappings}")

    script:
    // Random suffix to keep the per-chunk output file names unique
    def random_factor = new Random().nextInt()
    allmappings = "${blast_results.baseName}_" + random_factor + "_all_mappings.csv"
    mappings = "${blast_results.baseName}_" + random_factor + "_mapping.tsv"
    """
    R --version
    Rscript -e "briocheR::addSNPdetails(
        blast.path='${blast_results}',
        reference.bases='${coordinates}',
        probe.name='${probename}',
        genome.name='${genomename}',
        output.path = '\$(pwd)',
        pident.filter=${pidentfilter},
        coverage.filter= ${coveragefilter})"
    mv "${probename}_with_${genomename}_all_mappings.csv" ${allmappings}
    mv "${probename}_with_${genomename}_mapping.tsv" ${mappings}
    """
}

/*
 * Step 5: Merge the individual all-mappings CSV results into a single file
 */
process MERGE_MAPPINGS {
    publishDir "${params.resultsdirectory}/", mode: 'move'
    label 'merge'
    tag 'Merge_all_CSVs'

    input:
    path allmappings
    path resultsdir
    val probename
    val genomename

    output:
    tuple path("${filteredmappings}"), path("${allmappings}")

    script:
    allmaps = allmappings[0]
    filteredmappings = "${probename}_with_${genomename}_filtered_mappings.csv"
    allmappings = "${probename}_with_${genomename}_all_mappings.csv"
    """
    # Locate the sort columns in the CSV header:
    # -k\${SNP_ID},\${SNP_ID} -k\${coverage},\${coverage}r -k\${pident},\${pident}r -k\${chrom},\${chrom} -k\${pos},\${pos}
    header=\$(head -n1 $allmaps)
    chrom=\$(echo \$header|awk -F, '{for(i=1;i<=NF;i++){if(\$i=="saccver")print i}}')
    pos=\$(echo \$header|awk -F, '{for(i=1;i<=NF;i++){if(\$i=="sstart")print i}}')
    SNP_ID=\$(echo \$header|awk -F, '{for(i=1;i<=NF;i++){if(\$i=="qaccver")print i}}')
    pident=\$(echo \$header|awk -F, '{for(i=1;i<=NF;i++){if(\$i=="pident")print i}}')
    coverage=\$(echo \$header|awk -F, '{for(i=1;i<=NF;i++){if(\$i=="Coverage")print i}}')

    # Concatenate the per-chunk CSVs in batches of 100 (to stay below the
    # argument-list limit), dropping the per-file header rows
    find -name "*.csv"|split -l 100 - subsetcsvs
    for ss in subsetcsvs*;
    do
        cat \$(awk '{printf("%s ",\$0)}' \$ss)|grep -v "qaccver" >> Mergedmappings.csv
    done

    # Sort the merged rows and prefix each with a per-SNP alternate ID
    sort -t, -k\${SNP_ID},\${SNP_ID} -k\${coverage},\${coverage}r -k\${pident},\${pident}r -k\${chrom},\${chrom} -k\${pos},\${pos} Mergedmappings.csv \\
        |awk -F, -v h="\$header" 'BEGIN{print "Alternate_SNP_ID,"h} {count[\$1]++;print \$1"."count[\$1]","\$0}' \\
        >allmaps.csv

    # Drop probes with more than maximumhits hits; the two-pass awk keeps
    # the sorted row order intact
    awk -F, -v maxhit="${params.maximumhits}" 'NR==FNR{split(\$1,a,".");IDs[a[1]]++;next} {split(\$1,b,".");if(IDs[b[1]]<=maxhit)print}' allmaps.csv allmaps.csv > map.csv

    echo "##brioche parameters" > header.txt
    echo "##evalue = ${params.evalue} #The number of expected hits of similar quality (score) that could be found just by chance." >> header.txt
    echo "##minimumlength = ${params.minlength} #Minimum length of an HSP to be considered fully hybridized" >> header.txt
    echo "##extendablebps = ${params.extendablebps} #Number of matching base pairs from the 3 prime end for a probe to be considered extendable" >> header.txt
    echo "##maximumgaps = ${params.maxgaps} #Maximum number of gaps allowed per HSP" >> header.txt
    echo "##coverage = ${params.coverage} #Filter out any hits with coverage less than the provided percentage" >> header.txt
    echo "##pident = ${params.pident} #Filter out any hits with pident less than the provided percentage" >> header.txt
    echo "##maximumhits = ${params.maximumhits} #Filter out probes with more than maximumhits hits" >> header.txt
    cat header.txt map.csv >"${filteredmappings}"
    cat header.txt allmaps.csv >"${allmappings}"
    """
}

/*
 * Step 5 (continued): Merge the individual mapping TSV results into a single
 * file and generate the Pretzel alignment workbook
 */
process MERGE_MIN_MAPPINGS {
    publishDir "${params.resultsdirectory}/", mode: 'move'
    label 'merge'
    tag 'Merge_all_TSVs'

    input:
    path mappings
    val resultsdir
    val probename
    val genomename

    output:
    tuple path("${outfile}"),
          path("${pretzelfile}")

    script:
    allmaps = mappings[0]
    outfile = "${probename}_with_${genomename}_mappings.tsv"
    pretzelfile = "${probename}_with_${genomename}_mappings-pretzel-alignment.xlsx"
    """
    # Locate the sort columns in the TSV header; Coverage must be looked up
    # before Alternate_SNP_ID is inserted into the header, or its index
    # would be off by one
    header=\$(head -n1 $allmaps)
    chrom=\$(echo \$header|awk '{for(i=1;i<=NF;i++){if(\$i=="Chr")print i}}')
    pos=\$(echo \$header|awk '{for(i=1;i<=NF;i++){if(\$i=="SNP_position")print i}}')
    SNP_ID=\$(echo \$header|awk '{for(i=1;i<=NF;i++){if(\$i=="SNP_ID")print i}}')
    pident=\$(echo \$header|awk '{for(i=1;i<=NF;i++){if(\$i=="pident")print i}}')
    coverage=\$(echo \$header|awk '{for(i=1;i<=NF;i++){if(\$i=="Coverage")print i}}')
    header=\$(echo \$header|awk '{for(i=1;i<=NF;i++){if(i==1){printf \$i"\tAlternate_SNP_ID"}else{printf "\t"\$i}}}')

    # Concatenate the per-chunk TSVs in batches of 100, dropping header rows
    find -name "*.tsv"|split -l 100 - subsettsvs
    for ss in subsettsvs*;
    do
        cat \$(awk '{printf("%s ",\$0)}' \$ss)|grep -v "SNP_ID" >> Mergedmappings.tsv
    done

    # Sort the merged rows and insert a per-SNP alternate ID as column 2
    sort -k\${SNP_ID},\${SNP_ID} -k\${coverage},\${coverage}r -k\${pident},\${pident}r -k\${chrom},\${chrom} -k\${pos},\${pos} Mergedmappings.tsv \\
        |awk '{count[\$1]++;for(i=1;i<=NF;i++){if(i==1){printf \$i"\t"\$i"."count[\$1]}else{printf "\t"\$i}};print""}' \\
        > altmaps.tsv

    # Drop probes with more than maximumhits hits; the two-pass awk keeps
    # the sorted row order intact
    { echo "\$header"; awk -v maxhit=${params.maximumhits} 'NR==FNR{count[\$1]++;next} count[\$1]<=maxhit' altmaps.tsv altmaps.tsv; } > map.tsv

    echo "##brioche parameters" > header.txt
    echo "##evalue = ${params.evalue} #The number of expected hits of similar quality (score) that could be found just by chance." >> header.txt
    echo "##minimumlength = ${params.minlength} #Minimum length of an HSP to be considered fully hybridized" >> header.txt
    echo "##extendablebps = ${params.extendablebps} #Number of matching base pairs from the 3 prime end for a probe to be considered extendable" >> header.txt
    echo "##maximumgaps = ${params.maxgaps} #Maximum number of gaps allowed per HSP" >> header.txt
    echo "##coverage = ${params.coverage} #Filter out any hits with coverage less than the provided percentage" >> header.txt
    echo "##pident = ${params.pident} #Filter out any hits with pident less than the provided percentage" >> header.txt
    echo "##maximumhits = ${params.maximumhits} #Filter out probes with more than maximumhits hits" >> header.txt
    cat header.txt map.tsv >"${outfile}"

    # Generate the Pretzel alignment workbook
    echo ${pretzelfile}
    # Format of the alignment sheet: Name  Chromosome  Start  End  #Comments
    # Rows which start with #, such as this one, are filtered out and not loaded into the database.
    # Columns whose header starts with #, e.g. #Comments in column 5, are filtered out and not loaded into the database.
    # The given column headings are the minimum for an Alignment dataset; other columns may be added and will be loaded into the database if they have a column heading.
    # Their values will be stored in Feature.values.columnName, and will be displayed in the brushed/selected Features table in the right panel of Pretzel.
    displayName="ChangeMe-Genome-${genomename}-${probename}-Markers"
    parent="ChangeMe"
    commonName="ChangeMe"
    Description="${probename.replaceAll('_',' ')} markers mapped to ${genomename} using brioche"
    datasetname=\$(echo "Alignment|${genomename}_${probename}"|awk '{print substr(\$0,1,30)}')
    Rscript -e "
        data<-read.delim('${outfile}',header=T,sep='\\\\t',comment.char = '#',stringsAsFactors=F,strip.white=T);
        # Rearrange the columns so that Name, Chromosome, Start, End, REF and
        # ALT come first; column 5 is used for both Start and End because the
        # markers are single positions
        cols=colnames(data)[c(1,4,5,5,6,7)]
        cols2=colnames(data)
        print(cols)
        cols2=cols2[!cols2 %in% cols]
        data=cbind(data[,cols],data[,cols2])
        colnames(data)[1:6]<-c('Name','Chromosome','Start','End','REF','ALT');
        # Identify duplicate columns
        duplicate_cols <- colnames(data)[duplicated(lapply(data, function(x) as.character(x)))]
        # Keep only unique columns, but always keep End
        keep.cols<-!colnames(data) %in% duplicate_cols
        keep.cols[colnames(data) %in% 'End']=T
        data <- data[, keep.cols]
        if(!require('openxlsx'))
        {
            install.packages('openxlsx',repos = 'https://cloud.r-project.org');
        }
        data <-list('\${datasetname}'=data,Metadata = data.frame(
            Field = c('displayName','parentName','species','Description','citation','Data source','Uploaded by','Contact','Licencing')));
        data[['Metadata']][['\${datasetname}']] <-c('\${displayName}','\${parent}','\${commonName}','\${Description}','Change me','Change me','Change me','Change me','Change me');
        openxlsx::write.xlsx(data, file='${pretzelfile}')
    "
    """
}
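
/*
 * A minimal sketch of how these processes might be wired together in a
 * DSL2 workflow. The workflow name, the channel wiring, and the parameter
 * names targetdesign, refgenome, chromstoexclude and blastoutformat are
 * illustrative assumptions; the real entry workflow lives outside this
 * module file.
 */
workflow BRIOCHE_SKETCH {
    // Validate the design; emits a (target.table, target.fasta) tuple
    BUILD_TARGET(params.targetdesign)
    probes_ch = BUILD_TARGET.out.map { table, fasta -> fasta }

    // Split the reference and build (or reuse) one BLAST database per chunk
    SPLIT_REFGENOME(params.refgenome, params.resultsdirectory,
                    params.chromstoexclude, false)
    BUILD_BLASTDb(SPLIT_REFGENOME.out.flatten(), params.refgenome)

    // BLAST the probes against each database chunk; PROCESS_BLAST_RESULTS,
    // ADD_MARKERINFO and the two MERGE_* processes would follow the same
    // pattern downstream
    RUN_BLAST(probes_ch, BUILD_BLASTDb.out, params.blastoutformat)
}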