-
Notifications
You must be signed in to change notification settings - Fork 0
/
assembly_pipeline.sh
193 lines (161 loc) · 8.21 KB
/
assembly_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env bash
#Note: This script requires a main directory containing fastq read files and a file named "species_file.txt" with the genus and species on separate lines.
#It also assumes that the main directory is named after the strain, and uses that name as a label for outputs.
#Software should be available in the folder at the image_path in the form of singularity images.
#Images were sourced from the Staphb repository on dockerhub, or the biocontainers repository if they were not available in the Staphb repository. Software versions are described in the manuscript.
#Reads should be paired end, and contained within separate gzipped fastq files.
#Script should be called using "bash assembly_pipeline.sh /folder/path"
#Setting inputs
#Moving fastq files to trimmomatic folder, if they weren't there already
current_dir=$1
mkdir $current_dir/trimmomatic
mv $current_dir/*.fastq.gz $current_dir/trimmomatic
#Setting path to fastq files
fastq_dir=$current_dir/trimmomatic
pair_1_raw=$(find $fastq_dir -name "*R1*.fastq.gz")
echo "Read 1 is $pair_1_raw"
pair_2_raw=$(find $fastq_dir -name "*R2*.fastq.gz")
echo "Read 2 is $pair_2_raw"
echo "fastq directory is $fastq_dir"
#Path to image files
image_path="/home/amy/Documents/Genomics_Project/scripts"
echo "Images are found are $(ls $image_path/*.img)"
#Finding the name of the sample, assuming the set folder has the sample name
strain="${current_dir%"${current_dir##*[!/]}"}"
strain="${strain##*/}"
echo "Sample is $strain"
#Setting the genus and species names
mapfile -t lines <$1/species_file.txt
echo "Species is ${lines[0]} ${lines[1]}"
#Shovill image
shovill_img=$image_path/shovill.img
#Running first fastQC on raw reads
mkdir $1/trimmomatic/fastqc_run1
singularity run $image_path/fastqc.img fastqc -o $1/trimmomatic/fastqc_run1 \
--extract --nogroup --format fastq --threads 6 --dir $1/trimmomatic \
$pair_1_raw $pair_2_raw
#Running trimmomatic on reads, using the trimmomatic executable from the shovill image
singularity run $shovill_img trimmomatic PE -threads 6 -phred33 \
$pair_1_raw $pair_2_raw $1/trimmomatic/R1.fq.gz /dev/null $1/trimmomatic/R2.fq.gz /dev/null CROP:300 \
ILLUMINACLIP:/shovill/shovill-1.1.0/db/trimmomatic.fa:1:30:11 LEADING:3 TRAILING:3 SLIDINGWINDOW:10:20 MINLEN:30 TOPHRED33 2>&1 | sed 's/^/[trimmomatic] /' | tee -a shovill.log
#Setting reads names
pair_1=$1/trimmomatic/R1.fq.gz
echo "Read 1 is $pair_1"
pair_2=$1/trimmomatic/R2.fq.gz
echo "Read 2 is $pair_2"
#Running first fastQC on raw reads
mkdir $1/trimmomatic/fastqc_run2
singularity run $image_path/fastqc.img fastqc -o $1/trimmomatic/fastqc_run2 \
--extract --nogroup --format fastq --threads 6 --dir $1/trimmomatic \
$pair_1 $pair_2
#Running shovill
singularity run $shovill_img shovill \
--cpus 6 \
--ram 15 \
--outdir $1/shovill \
--R1 $pair_1 --R2 $pair_2
#Running Prokka
#Copying assembly to prokka folder
assembly=$1/shovill/contigs.fa
echo "assembly path is $assembly"
#Running the prokka pipeline
singularity run $image_path/prokka.img \
prokka --outdir $1/prokka \
--cpus 4 \
--genus "${lines[0]}" \
--species "${lines[1]}" \
--prefix "$strain"_annotated \
--centre X \
--compliant \
$assembly
#Running Quast
#Finding the prokka genome feature file output and each paired end read
genome_feat=$(find $current_dir/prokka -name "*annotated.gff")
echo "feature file is $genome_feat"
#Running quast
singularity run $image_path/quast.img python /usr/local/bin/quast.py $assembly \
-g $genome_feat \
--pe1 $pair_1 \
--pe2 $pair_2 \
-o $1/quast \
-t 4 \
-m 0
#Running abricate
#Path to annotation
prokka_dir=$1/prokka
echo "Input directory is $prokka_dir"
echo "File to use: $prokka_dir/"$strain"_annotated.fna"
#Output path
mkdir $1/abricate
abr_dir=$1/abricate
echo "Output will be located at $abr_dir"
#Running the abricate against ncbi, CARD, resfinder and plasmidfinder databases using the contig nucleotide file
singularity run $image_path/abricate.img \
abricate --db card $prokka_dir/"$strain"_annotated.fna > $abr_dir/"$strain"_card.tab
singularity run $image_path/abricate.img \
abricate --db plasmidfinder $prokka_dir/"$strain"_annotated.fna > $abr_dir/"$strain"_plasmidfinder.tab
singularity run $image_path/abricate.img \
abricate --db vfdb $prokka_dir/"$strain"_annotated.fna > $abr_dir/"$strain"_vfdb.tab
singularity run $image_path/abricate.img \
abricate --db ecoli_vf $prokka_dir/"$strain"_annotated.fna > $abr_dir/"$strain"_ecoli_vf.tab
#Creating a summary file
singularity run $image_path/abricate.img \
abricate --summary $abr_dir/"$strain"_card.tab $abr_dir/"$strain"_plasmidfinder.tab $abr_dir/"$strain"_vfdb.tab $abr_dir/"$strain"_ecoli_vf.tab > $abr_dir/"$strain"_summary.tab
#Running amrfinder
#Path to amr_finder directory
mkdir $1/amrfinder
amr_dir=$1/amrfinder
echo "Output directory is $amr_dir"
#Setting genus and species variables
genus=${lines[0]}
species=${lines[1]}
#Reformatting the gff file to make it compatible with amrfinder
perl -pe '/^##FASTA/ && exit; s/(\W)Name=/$1OldName=/i; s/ID=([^;]+)/ID=$1;Name=$1/' $prokka_dir/"$strain"_annotated.gff > $prokka_dir/"$strain"_amrfinder.gff
echo "Created reformatted gff file $prokka_dir/"$strain"_amrfinder.gff"
#Species list and genus list
valid_species="Acinetobacter_baumannii Clostridioides_difficile Enterococcus_faecalis Enterococcus_faecium Pseudomonas_aeruginosa Staphylococcus_aureus Staphylococcus_pseudintermedius Streptococcus_agalactiae Streptococcus_pneumoniae Streptococcus_pyogenes Vibrio_cholerae"
valid_genus="Escherichia Campylobacter Klebsiella Neisseria Salmonella"
#Changing the command based on whether a valid species is present
if [[ $valid_species =~ (^|[[:space:]])""$genus"_"$species""($|[[:space:]]) ]]; then
echo "Valid species "$genus"_"$species" found"
singularity run $image_path/amrfinder.img \
amrfinder --plus \
--protein $prokka_dir/"$strain"_annotated.faa \
--nucleotide $prokka_dir/"$strain"_annotated.fna \
--gff $prokka_dir/"$strain"_amrfinder.gff \
--organism "$genus"_"$species" \
--name $strain \
--output $amr_dir/"$strain"_amrfinder.txt \
--mutation_all $amr_dir/"$strain"_muations.txt \
--protein_output $amr_dir/"$strain"_amr_protein.fasta \
--nucleotide_output $amr_dir/"$strain"_amr_nucleotide.fasta \
--log $amr_dir/"$strain"_amrfinder_log.txt
#Changing command based on if a valid genus is present
elif [[ $valid_genus =~ (^|[[:space:]])"$genus"($|[[:space:]]) ]]; then
echo "Valid genus for "$genus"_"$species" found"
singularity run $image_path/amrfinder.img \
amrfinder --plus \
--protein $prokka_dir/"$strain"_annotated.faa \
--nucleotide $prokka_dir/"$strain"_annotated.fna \
--gff $prokka_dir/"$strain"_amrfinder.gff \
--organism "$genus" \
--name $strain \
--output $amr_dir/"$strain"_amrfinder.txt \
--mutation_all $amr_dir/"$strain"_muations.txt \
--protein_output $amr_dir/"$strain"_amr_protein.fasta \
--nucleotide_output $amr_dir/"$strain"_amr_nucleotide.fasta \
--log $amr_dir/"$strain"_amrfinder_log.txt
#Command if no valid genus or species
else
echo "No valid species or genus for "$genus"_"$species" found"
singularity run $image_path/amrfinder.img \
amrfinder --plus \
--protein $prokka_dir/"$strain"_annotated.faa \
--nucleotide $prokka_dir/"$strain"_annotated.fna \
--gff $prokka_dir/"$strain"_amrfinder.gff \
--name $strain \
--output $amr_dir/"$strain"_amrfinder.txt \
--protein_output $amr_dir/"$strain"_amr_protein.fasta \
--nucleotide_output $amr_dir/"$strain"_amr_nucleotide.fasta \
--log $amr_dir/"$strain"_amrfinder_log.txt
fi