#!/usr/bin/env bash
# This script applies the ASV pipeline used in Littlefair et al.
# 2019.
# ---------
# Arguments
# ---------
# 1) File with the list of fileset prefixes (the part before _R1)
# 2) Forward primer
# 3) Reverse primer
# 4) Primer name
# 5) Prefix of output
# 6) Number of threads/cores
# 7) Minimum amplicon length
# 8) Maximum amplicon length
# 9) unoise minimum abundance parameter
# -------------
# Requirements:
# -------------
# 1) cutadapt
# 2) usearch
# 3) vsearch
# 4) pear
# 5) seqkit
# 6) fastqc
# 7) GNU parallel
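# --------------
# Example usage
# --------------
# A hypothetical invocation (primers and file names are placeholders, shown
# only to illustrate the argument order):
# ./ASV_pipeline.sh filesets.txt GGWACWGGWTGAACWGTWTAYCCYCC \
#     TANACYTCNGGRTGNCCRAARAAYCA COI my_run 8 140 190 8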
#set -e
source ${HOME}/.bashrc
# some functions
length_stats(){
# Append the sequence length distribution of FASTA file $1 to
# ${2}_distribution.txt, together with counts of sequences longer and
# shorter than the modal length
python - << EOF
from collections import Counter
lengths = {}
name = None
seq = ''
with open("$1") as F:
    for line in F:
        if line.startswith('>'):
            # store the previous record before starting a new one
            if name is not None:
                lengths[name] = len(seq.strip().replace('\n',''))
            seq = ''
            name = line.strip()
        else:
            seq += line
# store the last record of the file
if name is not None and name not in lengths:
    lengths[name] = len(seq.strip().replace('\n',''))
c = Counter(lengths.values())
mc = c.most_common(1)[0][0]  # modal length
gt = sum([x[1] for x in c.items() if x[0] > mc])
lt = sum([x[1] for x in c.items() if x[0] < mc])
line = '\n%d sequences are longer than the mode\n%d sequences are ' \
       'shorter than the mode' % (gt, lt)
sor = sorted(c.items(), key=lambda x: x[0])
with open('${2}_distribution.txt', 'a+') as F:
    F.write('\n'.join(['%d:%d' % x for x in sor]))
    F.write(line)
EOF
}
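# Usage sketch (hypothetical paths):
# length_stats my_out/sampleA.trimmed.derep.fasta my_out/run1
# appends "length:count" lines and the mode summary to my_out/run1_distribution.txt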
run_QC_single_pair(){
# Arguments:
# 1) Forward primer
# 2) Reverse primer
# 3) Reverse complement of reverse primer
# 4) Reverse complement of forward primer
# 5) Path to output directory
# 6) prefix of output
# 7) Prefix of the fileset
# 8) cpus
# 9) minimum read length
# run first cutadapt
fileset=`basename ${7}`
sample=`echo ${fileset}|rev|cut -d '.' -f1|rev`
echo "Processing sample ${sample}"
echo -e "\tRemoving primers"
if [[ -s ${5}/${sample}.1.fastq.gz ]]
then
echo -e "File ${5}/${sample}.1.fastq.gz exist and is not empty... Skipping"
else
cutadapt -g "${1}" -G "${2}" -a "${3}" -A "${4}" \
-o ${5}/${sample}.1.fastq.gz -p ${5}/${sample}.2.fastq.gz \
-m $9 --match-read-wildcards -q 25 --trim-n -n 2 --untrimmed-output \
${5}/untrimmed.${sample}.fastq --untrimmed-paired-output \
${5}/untrimmed.paired.${sample}.fastq ${7}_R1.fastq.gz \
${7}_R2.fastq.gz > ${5}/${sample}.log1
fi
# Merge reads
#echo "Running pear"
#pear -f ${5}/${sample}.1.fastq.gz -r ${5}/${sample}.2.fastq.gz \
#-o ${5}/${sample}_pear -q 25 -t $9 -s 2 > ${5}/${sample}_pear.log
#echo "Checking if any adapters remain"
if [[ -s ${5}/${sample}_pear.assembled.fastq ]]
then
echo -e "\t\tFile ${5}/${sample}_pear.assembled.fastq exist and is not empty... Skipping"
else
echo -e "\tMerging reads"
pear -f ${5}/${sample}.1.fastq.gz -r ${5}/${sample}.2.fastq.gz \
-o ${5}/${sample}_pear -q 25 -t $9 -s 2 > ${5}/${sample}_pear.log
fi
# check if adapters still there
echo -e "\tIdentifying remnant primers"
seqkit -j ${8} locate -d -p "${1}" ${5}/${sample}_pear.assembled.fastq > ${5}/${sample}.fwd
seqkit -j ${8} locate -d -p "${2}" ${5}/${sample}_pear.assembled.fastq > ${5}/${sample}.rev
# cut the primers if they remain (seqkit locate output includes a header
# line, so two or more lines means at least one hit)
if [[ $(wc -l <${5}/${sample}.fwd) -ge 2 || $(wc -l <${5}/${sample}.rev) -ge 2 ]]
then
echo -e "\t\tRemnant primers found, attempting to remove them"
cutadapt -a "$3" -m $9 -n 2 --too-short-output ${5}/${sample}.short.fastq \
-o ${5}/${sample}.3trimmed.fastq ${5}/${sample}_pear.assembled.fastq > ${5}/${sample}.log2
cutadapt -g "${1}" -n 2 -o ${5}/${sample}.5trimmed.fastq \
${5}/${sample}.3trimmed.fastq > ${outdir}/${sample}.log3
else
echo -e "\t\tNo remnant primers found"
cp ${5}/${sample}_pear.assembled.fastq ${5}/${sample}.3trimmed.fastq
fi
echo "Dereplicating"
usearch -fastx_uniques ${5}/${sample}.3trimmed.fastq -fastaout \
${5}/${sample}.trimmed.derep.fasta -sizeout
echo "Estimating length distributions"
length_stats ${5}/${sample}.trimmed.derep.fasta ${5}/${6}
echo "Estimating stats for all the steps"
seqkit -j ${8} stats ${5}/*${sample}*.fa* >> ${5}/${6}.stats
# dereplicate
echo -e "\tRemoving duplicate sequences"
if [[ -s ${5}/${sample}.trimmed.derep.fasta ]]
then
echo -e "\t\tFile${5}/${sample}.trimmed.derep.fasta exist and is not empty... Skipping"
else
fs=`du -b ${5}/${sample}.3trimmed.fastq|cut -f1`
fs=$(( fs / 1000000000 )) # approximate file size in GB
if [[ "$fs" -gt 3 ]]
then
p=`python -c "from math import ceil; print(int(ceil($fs/3)))"`
seqkit -j ${8} split2 ${5}/${sample}.3trimmed.fastq -p ${p} -f \
-O split_trimmed
for i in split_trimmed/*part_*
do
usearch -fastx_uniques ${i} -fastaout ${i}_uniq \
-sizeout -threads ${8}
done
cat split_trimmed/*_uniq > tmp
usearch -fastx_uniques tmp -fastaout ${5}/${sample}.trimmed.derep.fasta \
-sizein -sizeout -threads ${8}
else
usearch -fastx_uniques ${5}/${sample}.3trimmed.fastq -fastaout \
${5}/${sample}.trimmed.derep.fasta -sizeout -threads ${8}
fi
fi
# get length distributions
length_stats ${5}/${sample}.trimmed.derep.fasta ${5}/${6}
# get stats for all the steps
seqkit -j ${8} stats ${5}/*${sample}*.fa* >> ${5}/${6}.stats
# run a fastqc
echo "Running fastqc"
fastqc ${5}/${sample}.3trimmed.fastq -o ${5}
echo -e "\n\n"
}
prog() {
# Progress bar, courtesy of ilkkachu (stackoverflow)
local w=80 p=$1; shift
printf -v dots "%*s" "$(( $p*$w/100 ))" ""; dots=${dots// /.};
printf "\r\e[K|%-*s| %3d %% %s" "$w" "$dots" "$p" "$*";
}
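# e.g. `prog 50 sampleA` redraws the bar at 50 % with the label sampleA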
fastx_relabel() {
# Shell fallback for usearch -fastx_relabel: rename every FASTQ record of a
# *.lengthfilter.fastq file ($1) to <fileset prefix>_<running number>
i=${1}
prefix=`basename ${i%%.lengthfilter.fastq}`_
awk -v l="$prefix" 'NR % 4 == 1 { cntr++; print "@" l cntr }
NR % 4 != 1 { print $0 }' ${i} > ${i%%.fastq}_relabel.fastq
}
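# e.g. in sampleA.lengthfilter.fastq a header such as
# "@M00123:45:000000000-ABCDE:1:1101:15589:1330 1:N:0:1" (hypothetical read)
# becomes "@sampleA_1"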
# Get the command line arguments
file_list=$1
ADAPTER_FWD=$2
ADAPTER_REV=$3
primer_name=$4
prefix=$5
cpus=$6
min_len=$7
max_len=$8
outdir=${prefix}_output${primer_name}
min_read_len=$(( (min_len / 2) - 20 ))
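# e.g. min_len=140 gives min_read_len = (140 / 2) - 20 = 50, the value passed
# to cutadapt as its minimum read length (-m)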
usearch_min_size=${9}
echo `date`
echo -e "executing $0 $@\n"
# Process the primers and get reverse complements
Adapter1rc=`echo $ADAPTER_FWD | tr 'ACGTYRSWKMBDHV' 'TGCARYSWMKVHDB' | rev`
Adapter2rc=`echo $ADAPTER_REV | tr 'ACGTYRSWKMBDHV' 'TGCARYSWMKVHDB' | rev`
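# The tr pairs complement IUPAC degenerate bases, rev reverses the string,
# e.g. printf 'ACGGY' | tr 'ACGTYRSWKMBDHV' 'TGCARYSWMKVHDB' | rev gives RCCGT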
# create output folder
mkdir -p ${outdir}
echo "" > ${outdir}/${prefix}.stats
echo "" > ${outdir}/${prefix}_distribution.txt
# run QC in all pairs
while read line
do
run_QC_single_pair ${ADAPTER_FWD} ${ADAPTER_REV} ${Adapter2rc} \
${Adapter1rc} ${outdir} ${prefix} ${line} ${cpus} ${min_read_len}
done <${file_list}
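# A hypothetical alternative using GNU parallel (listed in the requirements
# but not used above); the functions would need exporting first:
# export -f run_QC_single_pair length_stats
# parallel -j 2 -a ${file_list} run_QC_single_pair ${ADAPTER_FWD} \
#     ${ADAPTER_REV} ${Adapter2rc} ${Adapter1rc} ${outdir} ${prefix} {} \
#     ${cpus} ${min_read_len}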
# Trim the reads to expected length
echo "Trimming all samples between ${min_len} and ${max_len}"
n=`ls ${outdir}/*3trimmed.fastq| wc -l`
perc=0
for i in ${outdir}/*3trimmed.fastq
do
let perc++
prog $(( (perc * 100) / n ))
cat ${i} | seqkit -j ${cpus} seq -m ${min_len} \
-M ${max_len} >${i%%3trimmed.fastq}lengthfilter.fastq
done
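# seqkit seq -m/-M keeps only reads whose length falls within
# [min_len, max_len], i.e. plausible full-length amplicons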
echo -e "\nConcatenating all resulting files"
# Concatenate all resulting files
cat ${outdir}/*.lengthfilter.fastq > ${outdir}/all.lengthfilter.fastq
# Dereplicate on full set reporting size of identical sequences
echo -e "\nDereplicating full set reporting size of identical sequences"
vsearch --derep_fulllength ${outdir}/all.lengthfilter.fastq -sizein -sizeout \
-relabel Uniq -output ${outdir}/vs_all_lengthfilter.fasta
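# -sizein/-sizeout collapse identical sequences and record their total
# abundance as ;size=N; annotations (unannotated reads count as 1);
# -relabel Uniq names the output records Uniq1, Uniq2, ...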
# execute denoising in full file
echo -e "\nDenoising"
usearch -unoise3 ${outdir}/vs_all_lengthfilter.fasta -zotus \
${outdir}/all_lengthfilter_zotus_${usearch_min_size}.fasta \
-minsize ${usearch_min_size} -tabbedout \
${outdir}/allzotus_lengthfilter_${usearch_min_size}.txt
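# -minsize discards unique sequences below this abundance before denoising
# (usearch defaults to 8); -tabbedout records how each unique sequence was
# classified (e.g. merged into a ZOTU or flagged as chimeric)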
# Rename to Zotu
echo -e "\n Renaming sequences"
sed -i "s/>O/>Zo/g" ${outdir}/all_lengthfilter_zotus_${usearch_min_size}.fasta
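# (some usearch versions label the denoised sequences OtuN even in the -zotus
# file; the sed above turns >OtuN into >ZotuN)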
# relabel reads in every sample so -otutab can assign each read to its sample
echo -e "\nRe-annotating each sample"
n=`ls ${outdir}/*lengthfilter.fastq| wc -l`
perc=0
for i in ${outdir}/*lengthfilter.fastq
do
let perc++
prog $(( (perc * 100) / n ))
usearch -fastx_relabel ${i} -prefix `basename ${i%%.lengthfilter.fastq}`_ \
-fastqout ${i%%.fastq}_relabel.fastq -keep_annots
#fastx_relabel ${i}
done
# Make database with the joined file
echo -e "\n\nCreating usearch database"
usearch -makeudb_usearch ${outdir}/all_lengthfilter_zotus_${usearch_min_size}.fasta \
-output ${outdir}/all_lengthfilter_zotus_${usearch_min_size}.udb
# Concatenating relabeled sequences
echo -e "\nConcatenating relabeled sequences"
cat ${outdir}/*lengthfilter_relabel.fastq > ${outdir}/all.lengthfilter_relabel.fastq
# get file_size
echo -e "\nGenerating ZOTU/ASV table"
file_size=`du -b ${outdir}/all.lengthfilter_relabel.fastq|cut -f1`
file_size=$(( file_size / 1000000000 ))
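# e.g. a 7000000000-byte file gives file_size=7 (GB) and p=ceil(7/3)=3 chunks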
if [[ "$file_size" -gt 3 ]]
then
# split file into 3GB chunks
p=`python -c "from math import ceil; print(int(ceil(${file_size}/3)))"`
seqkit split2 ${outdir}/all.lengthfilter_relabel.fastq -p ${p} -f \
-O split
# execute in smaller files
for i in split/*part_*
do
dir=`dirname ${i}`
part=`basename ${i}|rev| cut -d'.' -f 2|rev`
usearch -otutab ${i} -sample_delim _ \
-zotus ${outdir}/all_lengthfilter_zotus_${usearch_min_size}.udb \
-otutabout ${dir}/${part}_zotutab_${usearch_min_size}.txt \
-mapout ${dir}/${part}zmap_${usearch_min_size}.txt
done
# merge the per-chunk zotu tables
files=`echo split/*zotutab_${usearch_min_size}.txt|tr ' ' ','`
usearch -otutab_merge ${files} -output ${outdir}/all_${primer_name}zotutab_${usearch_min_size}.txt
else
# run on the full file
# Create the Zotu table with the renamed samples
usearch -otutab ${outdir}/all.lengthfilter_relabel.fastq -sample_delim _ \
-zotus ${outdir}/all_lengthfilter_zotus_${usearch_min_size}.udb -otutabout \
${outdir}/all_${primer_name}zotutab_${usearch_min_size}.txt -mapout \
${outdir}/all_${primer_name}zmap_${usearch_min_size}.txt
fi