From 455445b88f90d298fe34b3201a9182f6ad93d700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Wed, 28 Nov 2018 07:56:08 -0800 Subject: [PATCH 01/10] new improved master script --- README.md | 22 ++ R_scripts/combining_umerged.R | 61 +++++ bash_scripts/master_samsa2.slurm | 431 +++++++++++++++++++++++++++++++ 3 files changed, 514 insertions(+) create mode 100755 R_scripts/combining_umerged.R create mode 100755 bash_scripts/master_samsa2.slurm diff --git a/README.md b/README.md index 2ae3d5f..189ac85 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,27 @@ # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline +### modification by sebastien.renaut@gmail.com + +### New in my version. +There is a new master script (master_samsa2.slurm) with several modifications: + +* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond) +* The script annomatically creates a checkpoint file. Once a step is finished, it writes the name of that specific step in the file. When you run the script again, it will SKIP the steps labelled in checkpoint. +* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` +* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum. +* Each step contains an exit statement to be printed if the master script dies. +* Trimommatic removes adapter contamination according to a specific file +* All options are to be specified in the first section of the script +* The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed. + + +#### To do: +* reverse the order of the merging and trimming step. +* simplify some syntax / make sure all paths are relative... + + + + Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting! ### New in version 2: diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R new file mode 100755 index 0000000..c04f28c --- /dev/null +++ b/R_scripts/combining_umerged.R @@ -0,0 +1,61 @@ +#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript + +args = commandArgs(TRUE) +start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself. +#start="/Users/jerry/Documents/CSBQ/shapiro/results" + +setwd(start) + +###files to work with +system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = "")) +system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = "")) +system(paste("ls -1 *.assembled.fastq >assembled.files",sep = "")) + +files_f = read.table("umerged.forward.files",stringsAsFactors = F) +files_r = read.table("umerged.reverse.files",stringsAsFactors = F) +files_a = read.table("assembled.files",stringsAsFactors = F) + + +for(i in 1:nrow(files_f)) + { +#grep 1st and 2rd lines + unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="") + system(unassembled.fastq) + +#grep sequences forward... + unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="") + system(unassembled.forward.fastq) + +#grep sequences reverse... + unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="") + system(unassembled.reverse.fastq) + +#N and E quality file + system("wc -l unassembled.seq.reverse.fastq >wc") + wc = read.table("wc") + + write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F) + +#paste the unassembled sequences into a single file + system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN") + +#put everything back into a single fastq + back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "") + system(back) + +#add the merged sequences + all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "") + system(all) + +#remove the clutter (file specific) + remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq") + system(remove) +} + +#remove the clutter (listing of all files) +system("rm assembled.files umerged.forward.files umerged.reverse.files") + + + + + diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm new file mode 100755 index 0000000..cc60bb3 --- /dev/null +++ b/bash_scripts/master_samsa2.slurm @@ -0,0 +1,431 @@ +#!/bin/bash + +#SBATCH --time=0-23:59 +#SBATCH --mem=100000 +#SBATCH --account=def-bruneaua +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=48 + +module load r/3.5.0 + + +# Lines starting with #SBATCH are for SLURM job management systems +# and may be removed if user is not submitting jobs to SLURM +#################################################################### +#set -x #echo +# +# master_script_for_sample_files.bash +# Created April 2017 by Sam Westreich, github.com/transcript +# This version modified by Michelle Treiber on August 9, 2017 +# +#################################################################### +# +# This script sets up and runs through ALL steps in the SAMSA pipeline +# before the analysis (which is done in R, likely in RStudio). Each +# step is set up below. +# +# The steps are: +# 1. Merging with PEAR, if applicable +# 2. Read cleaning with Trimmomatic +# 3. rRNA removal with SortMeRNA +# 4. Annotation using DIAMOND (by default against the RefSeq database) +# 5. Aggregation using analysis_counter.py +# 4.1 Annotation using DIAMOND against the Subsystems database +# 5.1 Aggregation using Subsystems-specific analysis counter.py +# 6. Running R scripts to get DESeq statistical analysis. +# +# NOTE: BEFORE running this script, please run package_installation.bash +# and full_database_download.bash located at: +# https://github.com/transcript/samsa2/tree/master/setup in order to set +# up SAMSA2 dependencies and download full databases. +# +#################################################################### +# +echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" +# +# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download +# +# 0. Set starting location: +starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2 + +# 0.5 Sequences location: +sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2 + +# 1. PEAR +pear_location=/home/renaut/bin + +# 2. Trimmomatic +trimmomatic_location=/home/renaut/bin + +# 3. SortMeRNA +sortmerna_location=/home/renaut/bin + +# 4. DIAMOND +diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" +diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" +diamond_location=/home/renaut/bin + +# 5. Aggregation +python_programs=/home/renaut/samsa2/python_scripts +RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" +Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" + +# 6. R scripts and paths +export R_LIBS="/home/renaut/samsa2/R_scripts/packages" +R_programs=/home/renaut/samsa2/R_scripts + +# threads +threads="48" + +################## +#Step0.1: create/read checkpoint +cd $starting_location + +printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" +if [ ! -f "pipeline/checkpoints" ] + then + printf "\tThe file checkpoints does not exist, we will create it.\n" + touch "pipeline/checkpoints" +else + printf "\tThe file checkpoints already exist\n" +fi + +#################################################################### +# +#STEP 0.1: GUNZIP PROCESS. +# +# +Step=$(grep "GUNZIP" pipeline/checkpoints) +if [ "${Step}" != "GUNZIP" ] + then + for file in $sequence_location/hiseq_raw/*gz + do + file_out=`echo $file | awk -F ".gz" '{print $1}'` + gunzip $file -cd >$file_out + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the gunzip step" + exit 1 + fi + done + mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/. + printf "GUNZIP\n" >>pipeline/checkpoints + echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" + date +else + printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 1: MERGING OF PAIRED-END FILES USING PEAR +# Note: paired-end files are usually named using R1 and R2 in the name. +# Example: control_1.R1.fastq +# control_1.R2.fastq +# +# Note: if using single-end sequencing, skip this step (comment out). +# Note: if performing R analysis (step 6), be sure to name files with +# the appropriate prefix ("control_$file" and "experimental_$file")! + + + +Step=$(grep "MERGING" pipeline/checkpoints) +if [ "${Step}" != "MERGING" ] + then + mkdir $starting_location/step_1_merging + touch $starting_location/step_1_merging/pear_log + for file in $sequence_location/hiseq_unzip/*_R1.fastq + do + file1=$file + file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` + out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` + out_name=`echo ${out_path##*/}` + $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log + #FOR ATRAPP, GIVEN THAT IT APPEARS LIKE 50-60% ARE MERGED, AND I HAVE A LOT OF SEQUENCES, I WILL ONLY KEEP THE MERGED + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the merging step" + exit 1 + fi + done + #FOR ATRAPP, I WILL NOW MERGE:I will merge the umerged leaving 20 N's in between. + + combining_umerged.R $starting_location + printf "MERGING\n" >>pipeline/checkpoints + mkdir $starting_location/step_1_merging/ + mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ + + rm $starting_location/*fastq + rm $sequence_location/hiseq_unzip/*fastq + + echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" + date +else + printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" +fi + +#################################################################### +# +# STEP 2: CLEANING FILES WITH TRIMMOMATIC +# Note: if skipping PEAR, make sure that all starting files are in the +# $starting_location/step_1_output/ folder! +Step=$(grep "TRIMMING" pipeline/checkpoints) +if [ "${Step}" != "TRIMMING" ] + then + mkdir $starting_location/step_2_trimming + touch $starting_location/step_2_trimming/trimmomatic_log + + for file in $starting_location/step_1_merging/*assembled2.fastq + do + shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` + + java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the trimmmomatic step" + exit 1 + fi + + done + + mkdir $starting_location/step_2_trimming/ + mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ + + rm step_1_merging/*assembled2.fastq + + echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" + date + + printf "TRIMMING\n" >>pipeline/checkpoints +else + printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" +fi + +#################################################################### +# +# STEP 2.5: GETTING RAW SEQUENCES COUNTS +# Note: These are used later for statistical analysis. +Step=$(grep "RAW" pipeline/checkpoints) +if [ "${Step}" != "RAW" ] + then + if [ -f $starting_location/step_2_trimming/raw_counts.txt ] + then + rm $starting_location/step_2_trimming/raw_counts.txt + touch $starting_location/step_2_trimming/raw_counts.txt + else + touch $starting_location/step_2_trimming/raw_counts.txt + fi + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the raw sequences step" + exit 1 + fi + done + echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" + date + + printf "RAW\n" >>pipeline/checkpoints +else + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" +fi + +#################################################################### +# +# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA +# Note: this step assumes that the SortMeRNA databases are indexed. If not, +# do that first (see the SortMeRNA user manual for details). + +Step=$(grep "RIBODEPLETION" pipeline/checkpoints) +if [ "${Step}" != "RIBODEPLETION" ] + then + mkdir $starting_location/step_3_ribodepletion + touch $starting_location/step_3_ribodepletion/sortmerna_log + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` + $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --num_alignments 0 --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the sortmerna step" + exit 1 + fi + done + + mkdir $starting_location/step_3_ribodepletion/ + mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ + + rm step_2_trimming/*cleaned.fastq + rm step_2_trimming/*ribosomes.fastq + + echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" + date + + printf "RIBODEPLETION\n" >>pipeline/checkpoints +else + printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" +fi + +#################################################################### +# +# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS +date +Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_SUBSYS" ] + then + mkdir $starting_location/step_4_annotation + touch $starting_location/step_4_annotation/diamond_log + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_4_annotation/daa_binary_files/ + + mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" + date + + printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" +fi +################################################################## +# +# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER +Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_SUBSYS" ] + then + mkdir $starting_location/step_5_aggregation/ + for file in $starting_location/step_4_annotation/*subsys_annotated + do + python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt + + # This quick program reduces down identical hierarchy annotations + python $python_programs/subsys_reducer.py -I $file.hierarchy + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/Subsystems_results/ + mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ + mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ + mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ + rm $starting_location/step_4_annotation/*.hierarchy + + echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" + date + + printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" +fi + + +#################################################################### +# +# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ +# Note: this step assumes that the DIAMOND database is already built. If not, +# do that first before running this step. +Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_REFSEQ" ] + then + touch $starting_location/step_4_annotation/diamond_log + + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond refseq step" + exit 1 + fi + done + + mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" + date + + printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_REFSEQ" ] + then + for file in $starting_location/step_4_annotation/*RefSeq_annotated* + do + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation refseq step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/RefSeq_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ + mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ + + echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" + date + + printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" +fi + + + +################################################################## +# +# At this point, all the results files are ready for analysis using R. +# This next step performs basic DESeq2 analysis of the RefSeq organism, function, +# and Subsystems annotations. +# +# More complex R analyses may be performed using specific .sh analysis scripts. +# +# STEP 6: R ANALYSIS +# Note: For R to properly identify files to compare/contrast, they must include +# the appropriate prefix (either "control_$file" or experimental_$file")! + +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt + +echo "STEP 6 DONE: R ANALYSIS" +echo "Master bash script finished running at: "; date +exit +#################################################################### + From 5d90e0e4392d8210f95b828fa9c30121b9afb3cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Wed, 28 Nov 2018 08:48:36 -0800 Subject: [PATCH 02/10] improved syntax --- bash_scripts/master_samsa2.slurm | 44 +++++++++++++++++--------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm index cc60bb3..36b8297 100755 --- a/bash_scripts/master_samsa2.slurm +++ b/bash_scripts/master_samsa2.slurm @@ -25,14 +25,16 @@ module load r/3.5.0 # step is set up below. # # The steps are: -# 1. Merging with PEAR, if applicable -# 2. Read cleaning with Trimmomatic -# 3. rRNA removal with SortMeRNA -# 4. Annotation using DIAMOND (by default against the RefSeq database) -# 5. Aggregation using analysis_counter.py -# 4.1 Annotation using DIAMOND against the Subsystems database -# 5.1 Aggregation using Subsystems-specific analysis counter.py -# 6. Running R scripts to get DESeq statistical analysis. +# 0.1 Create the checkpoint +# 0.2 Gunzip the raw sequences +# 1. Merging with PEAR, if applicable +# 2. Read cleaning with Trimmomatic +# 3. rRNA removal with SortMeRNA +# 4. Annotation using DIAMOND against Subsys database +# 5. Aggregation using analysis_counter.py (Subsys) +# 4.1 Annotation using DIAMOND against Refseq database +# 5.1 Aggregation using analysis_counter.py (Refseq) +# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) # # NOTE: BEFORE running this script, please run package_installation.bash # and full_database_download.bash located at: @@ -48,14 +50,14 @@ echo -e "NOTE: Before running this script, user must run package_installation.ba # 0. Set starting location: starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2 -# 0.5 Sequences location: +# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/hiseq_raw sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2 # 1. PEAR pear_location=/home/renaut/bin # 2. Trimmomatic -trimmomatic_location=/home/renaut/bin +trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 # 3. SortMeRNA sortmerna_location=/home/renaut/bin @@ -66,19 +68,19 @@ diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" diamond_location=/home/renaut/bin # 5. Aggregation -python_programs=/home/renaut/samsa2/python_scripts +python_programs=/home/renaut/samsa2.master/samsa2/python_scripts RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" # 6. R scripts and paths -export R_LIBS="/home/renaut/samsa2/R_scripts/packages" -R_programs=/home/renaut/samsa2/R_scripts +export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" +R_programs=/home/renaut/samsa2.master/samsa2/R_scripts # threads threads="48" ################## -#Step0.1: create/read checkpoint +#STEP 0.1: create/read checkpoint cd $starting_location printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" @@ -92,7 +94,7 @@ fi #################################################################### # -#STEP 0.1: GUNZIP PROCESS. +#STEP 0.2: GUNZIP PROCESS. # # Step=$(grep "GUNZIP" pipeline/checkpoints) @@ -109,6 +111,8 @@ if [ "${Step}" != "GUNZIP" ] exit 1 fi done + + mkdir $sequence_location/hiseq_unzip mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/. printf "GUNZIP\n" >>pipeline/checkpoints echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" @@ -142,7 +146,6 @@ if [ "${Step}" != "MERGING" ] out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` out_name=`echo ${out_path##*/}` $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log - #FOR ATRAPP, GIVEN THAT IT APPEARS LIKE 50-60% ARE MERGED, AND I HAVE A LOT OF SEQUENCES, I WILL ONLY KEEP THE MERGED if [ $? -ne 0 ] then @@ -150,9 +153,10 @@ if [ "${Step}" != "MERGING" ] exit 1 fi done - #FOR ATRAPP, I WILL NOW MERGE:I will merge the umerged leaving 20 N's in between. - - combining_umerged.R $starting_location + + ###I will concatenate forward and reverse complement with 20Ns in the middle. + $R_programs/combining_umerged.R $starting_location + printf "MERGING\n" >>pipeline/checkpoints mkdir $starting_location/step_1_merging/ mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ @@ -181,7 +185,7 @@ if [ "${Step}" != "TRIMMING" ] do shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` - java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log + java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log if [ $? -ne 0 ] then printf "\t!!! There is a problem in the trimmmomatic step" From 33adea99d76ff7a05c8a8f1b3e244475e9f94df1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Wed, 28 Nov 2018 12:16:07 -0800 Subject: [PATCH 03/10] sortmerna options --- README.md | 4 ++-- bash_scripts/master_samsa2.slurm | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 189ac85..59417b5 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,10 @@ There is a new master script (master_samsa2.slurm) with several modifications: * Trimommatic removes adapter contamination according to a specific file * All options are to be specified in the first section of the script * The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed. - +* I've removed the --num_alignments 0 is sortmrna. This causes problems and slows things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to one or 1,000 rRNA its out anyways... #### To do: -* reverse the order of the merging and trimming step. +* reverse the order of the merging and trimming step (but again, maybe not...) * simplify some syntax / make sure all paths are relative... diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm index 36b8297..6fa3c24 100755 --- a/bash_scripts/master_samsa2.slurm +++ b/bash_scripts/master_samsa2.slurm @@ -254,7 +254,7 @@ if [ "${Step}" != "RIBODEPLETION" ] for file in $starting_location/step_2_trimming/*cleaned.fastq do shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` - $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --num_alignments 0 --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log + $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log if [ $? -ne 0 ] then printf "\t!!! There is a problem in the sortmerna step" From 58513fc8e1403670d54762ea086322a84ee6eccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Mon, 3 Dec 2018 10:01:23 -0800 Subject: [PATCH 04/10] improve flow --- README.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 59417b5..72c0ce0 100644 --- a/README.md +++ b/README.md @@ -3,23 +3,21 @@ ### modification by sebastien.renaut@gmail.com ### New in my version. -There is a new master script (master_samsa2.slurm) with several modifications: +This is a new master script (master_samsa2.slurm) with several modifications: * It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond) -* The script annomatically creates a checkpoint file. Once a step is finished, it writes the name of that specific step in the file. When you run the script again, it will SKIP the steps labelled in checkpoint. +* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary. * In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` * Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum. -* Each step contains an exit statement to be printed if the master script dies. -* Trimommatic removes adapter contamination according to a specific file -* All options are to be specified in the first section of the script -* The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed. -* I've removed the --num_alignments 0 is sortmrna. This causes problems and slows things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to one or 1,000 rRNA its out anyways... +* Each step contains an exit statement to be printed if the master script dies due to an unforseen error. +* Trimmomatic removes adapter contamination according to a specific fasta file. +* All options, read & program location are to be specified in the first section of the script. +* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed. +* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways... #### To do: * reverse the order of the merging and trimming step (but again, maybe not...) -* simplify some syntax / make sure all paths are relative... - - +* simplify some syntax / make sure all paths are relative (mostly done) Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting! From 79e66eede3f488cbc6272c34e10d7fa199e46553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Mon, 18 Feb 2019 11:01:57 -0800 Subject: [PATCH 05/10] minor --- .gitignore | 5 +++++ bash_scripts/master_samsa2.slurm | 18 +++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 9e8b5a7..c11349b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# my files +bash_scripts/master_samsa2.slurm +bash_scripts/cyanotoxin_master_samsa2.slurm +R_scripts/combining_umerged.R + # Swap files *.swp *.swo diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm index 6fa3c24..a16d11b 100755 --- a/bash_scripts/master_samsa2.slurm +++ b/bash_scripts/master_samsa2.slurm @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --time=0-23:59 -#SBATCH --mem=100000 +#SBATCH --mem=100G #SBATCH --account=def-bruneaua #SBATCH --nodes=1 #SBATCH --ntasks-per-node=48 @@ -48,10 +48,10 @@ echo -e "NOTE: Before running this script, user must run package_installation.ba # VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download # # 0. Set starting location: -starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2 +starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1 -# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/hiseq_raw -sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2 +# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped +sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1 # 1. PEAR pear_location=/home/renaut/bin @@ -100,7 +100,7 @@ fi Step=$(grep "GUNZIP" pipeline/checkpoints) if [ "${Step}" != "GUNZIP" ] then - for file in $sequence_location/hiseq_raw/*gz + for file in $sequence_location/zipped/*gz do file_out=`echo $file | awk -F ".gz" '{print $1}'` gunzip $file -cd >$file_out @@ -112,8 +112,8 @@ if [ "${Step}" != "GUNZIP" ] fi done - mkdir $sequence_location/hiseq_unzip - mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/. + mkdir $sequence_location/unzipped + mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. printf "GUNZIP\n" >>pipeline/checkpoints echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" date @@ -139,7 +139,7 @@ if [ "${Step}" != "MERGING" ] then mkdir $starting_location/step_1_merging touch $starting_location/step_1_merging/pear_log - for file in $sequence_location/hiseq_unzip/*_R1.fastq + for file in $sequence_location/unzipped/*_R1.fastq do file1=$file file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` @@ -162,7 +162,7 @@ if [ "${Step}" != "MERGING" ] mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ rm $starting_location/*fastq - rm $sequence_location/hiseq_unzip/*fastq + rm $sequence_location/unzipped/*fastq echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" date From fb2fc99f0e4f369fa783f476b2322f5caad6422b Mon Sep 17 00:00:00 2001 From: Sebastien Renaut Date: Mon, 18 Feb 2019 14:03:14 -0500 Subject: [PATCH 06/10] remove from main --- bash_scripts/master_samsa2.slurm | 435 ------------------------------- 1 file changed, 435 deletions(-) delete mode 100755 bash_scripts/master_samsa2.slurm diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm deleted file mode 100755 index a16d11b..0000000 --- a/bash_scripts/master_samsa2.slurm +++ /dev/null @@ -1,435 +0,0 @@ -#!/bin/bash - -#SBATCH --time=0-23:59 -#SBATCH --mem=100G -#SBATCH --account=def-bruneaua -#SBATCH --nodes=1 -#SBATCH --ntasks-per-node=48 - -module load r/3.5.0 - - -# Lines starting with #SBATCH are for SLURM job management systems -# and may be removed if user is not submitting jobs to SLURM -#################################################################### -#set -x #echo -# -# master_script_for_sample_files.bash -# Created April 2017 by Sam Westreich, github.com/transcript -# This version modified by Michelle Treiber on August 9, 2017 -# -#################################################################### -# -# This script sets up and runs through ALL steps in the SAMSA pipeline -# before the analysis (which is done in R, likely in RStudio). Each -# step is set up below. -# -# The steps are: -# 0.1 Create the checkpoint -# 0.2 Gunzip the raw sequences -# 1. Merging with PEAR, if applicable -# 2. Read cleaning with Trimmomatic -# 3. rRNA removal with SortMeRNA -# 4. Annotation using DIAMOND against Subsys database -# 5. Aggregation using analysis_counter.py (Subsys) -# 4.1 Annotation using DIAMOND against Refseq database -# 5.1 Aggregation using analysis_counter.py (Refseq) -# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) -# -# NOTE: BEFORE running this script, please run package_installation.bash -# and full_database_download.bash located at: -# https://github.com/transcript/samsa2/tree/master/setup in order to set -# up SAMSA2 dependencies and download full databases. -# -#################################################################### -# -echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" -# -# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download -# -# 0. Set starting location: -starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1 - -# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped -sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1 - -# 1. PEAR -pear_location=/home/renaut/bin - -# 2. Trimmomatic -trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 - -# 3. SortMeRNA -sortmerna_location=/home/renaut/bin - -# 4. DIAMOND -diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" -diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" -diamond_location=/home/renaut/bin - -# 5. Aggregation -python_programs=/home/renaut/samsa2.master/samsa2/python_scripts -RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" -Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" - -# 6. R scripts and paths -export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" -R_programs=/home/renaut/samsa2.master/samsa2/R_scripts - -# threads -threads="48" - -################## -#STEP 0.1: create/read checkpoint -cd $starting_location - -printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" -if [ ! -f "pipeline/checkpoints" ] - then - printf "\tThe file checkpoints does not exist, we will create it.\n" - touch "pipeline/checkpoints" -else - printf "\tThe file checkpoints already exist\n" -fi - -#################################################################### -# -#STEP 0.2: GUNZIP PROCESS. -# -# -Step=$(grep "GUNZIP" pipeline/checkpoints) -if [ "${Step}" != "GUNZIP" ] - then - for file in $sequence_location/zipped/*gz - do - file_out=`echo $file | awk -F ".gz" '{print $1}'` - gunzip $file -cd >$file_out - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the gunzip step" - exit 1 - fi - done - - mkdir $sequence_location/unzipped - mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. - printf "GUNZIP\n" >>pipeline/checkpoints - echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" - date -else - printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 1: MERGING OF PAIRED-END FILES USING PEAR -# Note: paired-end files are usually named using R1 and R2 in the name. -# Example: control_1.R1.fastq -# control_1.R2.fastq -# -# Note: if using single-end sequencing, skip this step (comment out). -# Note: if performing R analysis (step 6), be sure to name files with -# the appropriate prefix ("control_$file" and "experimental_$file")! - - - -Step=$(grep "MERGING" pipeline/checkpoints) -if [ "${Step}" != "MERGING" ] - then - mkdir $starting_location/step_1_merging - touch $starting_location/step_1_merging/pear_log - for file in $sequence_location/unzipped/*_R1.fastq - do - file1=$file - file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` - out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` - out_name=`echo ${out_path##*/}` - $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log - - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the merging step" - exit 1 - fi - done - - ###I will concatenate forward and reverse complement with 20Ns in the middle. - $R_programs/combining_umerged.R $starting_location - - printf "MERGING\n" >>pipeline/checkpoints - mkdir $starting_location/step_1_merging/ - mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ - - rm $starting_location/*fastq - rm $sequence_location/unzipped/*fastq - - echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" - date -else - printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" -fi - -#################################################################### -# -# STEP 2: CLEANING FILES WITH TRIMMOMATIC -# Note: if skipping PEAR, make sure that all starting files are in the -# $starting_location/step_1_output/ folder! -Step=$(grep "TRIMMING" pipeline/checkpoints) -if [ "${Step}" != "TRIMMING" ] - then - mkdir $starting_location/step_2_trimming - touch $starting_location/step_2_trimming/trimmomatic_log - - for file in $starting_location/step_1_merging/*assembled2.fastq - do - shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` - - java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the trimmmomatic step" - exit 1 - fi - - done - - mkdir $starting_location/step_2_trimming/ - mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ - - rm step_1_merging/*assembled2.fastq - - echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" - date - - printf "TRIMMING\n" >>pipeline/checkpoints -else - printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" -fi - -#################################################################### -# -# STEP 2.5: GETTING RAW SEQUENCES COUNTS -# Note: These are used later for statistical analysis. -Step=$(grep "RAW" pipeline/checkpoints) -if [ "${Step}" != "RAW" ] - then - if [ -f $starting_location/step_2_trimming/raw_counts.txt ] - then - rm $starting_location/step_2_trimming/raw_counts.txt - touch $starting_location/step_2_trimming/raw_counts.txt - else - touch $starting_location/step_2_trimming/raw_counts.txt - fi - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the raw sequences step" - exit 1 - fi - done - echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" - date - - printf "RAW\n" >>pipeline/checkpoints -else - printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" -fi - -#################################################################### -# -# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA -# Note: this step assumes that the SortMeRNA databases are indexed. If not, -# do that first (see the SortMeRNA user manual for details). - -Step=$(grep "RIBODEPLETION" pipeline/checkpoints) -if [ "${Step}" != "RIBODEPLETION" ] - then - mkdir $starting_location/step_3_ribodepletion - touch $starting_location/step_3_ribodepletion/sortmerna_log - - for file in $starting_location/step_2_trimming/*cleaned.fastq - do - shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` - $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the sortmerna step" - exit 1 - fi - done - - mkdir $starting_location/step_3_ribodepletion/ - mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ - - rm step_2_trimming/*cleaned.fastq - rm step_2_trimming/*ribosomes.fastq - - echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" - date - - printf "RIBODEPLETION\n" >>pipeline/checkpoints -else - printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" -fi - -#################################################################### -# -# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS -date -Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_SUBSYS" ] - then - mkdir $starting_location/step_4_annotation - touch $starting_location/step_4_annotation/diamond_log - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_4_annotation/daa_binary_files/ - - mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" - date - - printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" -fi -################################################################## -# -# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER -Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_SUBSYS" ] - then - mkdir $starting_location/step_5_aggregation/ - for file in $starting_location/step_4_annotation/*subsys_annotated - do - python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt - - # This quick program reduces down identical hierarchy annotations - python $python_programs/subsys_reducer.py -I $file.hierarchy - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation subsys step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/Subsystems_results/ - mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ - mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ - mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ - rm $starting_location/step_4_annotation/*.hierarchy - - echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" - date - - printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" -fi - - -#################################################################### -# -# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ -# Note: this step assumes that the DIAMOND database is already built. If not, -# do that first before running this step. -Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "ANNOTATION_REFSEQ" ] - then - touch $starting_location/step_4_annotation/diamond_log - - for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq - do - shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` - #removed the diamond sensitive option - $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log - $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the diamond refseq step" - exit 1 - fi - done - - mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ - mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ - - echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" - date - - printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" -fi - -#################################################################### -# -# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER -Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) -if [ "${Step}" != "AGGREGATION_REFSEQ" ] - then - for file in $starting_location/step_4_annotation/*RefSeq_annotated* - do - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O - python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F - if [ $? -ne 0 ] - then - printf "\t!!! There is a problem in the aggregation refseq step" - exit 1 - fi - done - - mkdir $starting_location/step_5_aggregation/RefSeq_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ - mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ - mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ - - echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" - date - - printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints -else - printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" -fi - - - -################################################################## -# -# At this point, all the results files are ready for analysis using R. -# This next step performs basic DESeq2 analysis of the RefSeq organism, function, -# and Subsystems annotations. -# -# More complex R analyses may be performed using specific .sh analysis scripts. -# -# STEP 6: R ANALYSIS -# Note: For R to properly identify files to compare/contrast, they must include -# the appropriate prefix (either "control_$file" or experimental_$file")! - -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt -#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt - -echo "STEP 6 DONE: R ANALYSIS" -echo "Master bash script finished running at: "; date -exit -#################################################################### - From f68ba12b7bd1a88d9830c21e1c012cc2b6eec3a1 Mon Sep 17 00:00:00 2001 From: Sebastien Renaut Date: Mon, 18 Feb 2019 14:07:22 -0500 Subject: [PATCH 07/10] delete from main branch --- R_scripts/combining_umerged.R | 61 ----------------------------------- 1 file changed, 61 deletions(-) delete mode 100755 R_scripts/combining_umerged.R diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R deleted file mode 100755 index c04f28c..0000000 --- a/R_scripts/combining_umerged.R +++ /dev/null @@ -1,61 +0,0 @@ -#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript - -args = commandArgs(TRUE) -start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself. -#start="/Users/jerry/Documents/CSBQ/shapiro/results" - -setwd(start) - -###files to work with -system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = "")) -system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = "")) -system(paste("ls -1 *.assembled.fastq >assembled.files",sep = "")) - -files_f = read.table("umerged.forward.files",stringsAsFactors = F) -files_r = read.table("umerged.reverse.files",stringsAsFactors = F) -files_a = read.table("assembled.files",stringsAsFactors = F) - - -for(i in 1:nrow(files_f)) - { -#grep 1st and 2rd lines - unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="") - system(unassembled.fastq) - -#grep sequences forward... - unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="") - system(unassembled.forward.fastq) - -#grep sequences reverse... - unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="") - system(unassembled.reverse.fastq) - -#N and E quality file - system("wc -l unassembled.seq.reverse.fastq >wc") - wc = read.table("wc") - - write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F) - -#paste the unassembled sequences into a single file - system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN") - -#put everything back into a single fastq - back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "") - system(back) - -#add the merged sequences - all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "") - system(all) - -#remove the clutter (file specific) - remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq") - system(remove) -} - -#remove the clutter (listing of all files) -system("rm assembled.files umerged.forward.files umerged.reverse.files") - - - - - From ceba37eea48f45f4bcc7171066d2c43af7aa2a5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Mon, 18 Feb 2019 11:13:35 -0800 Subject: [PATCH 08/10] add a checkpoint file --- README.md | 20 -------------------- bash_scripts/master_script.sh | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 72c0ce0..2ae3d5f 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,5 @@ # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline -### modification by sebastien.renaut@gmail.com - -### New in my version. -This is a new master script (master_samsa2.slurm) with several modifications: - -* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond) -* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary. -* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` -* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum. -* Each step contains an exit statement to be printed if the master script dies due to an unforseen error. -* Trimmomatic removes adapter contamination according to a specific fasta file. -* All options, read & program location are to be specified in the first section of the script. -* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed. -* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways... - -#### To do: -* reverse the order of the merging and trimming step (but again, maybe not...) -* simplify some syntax / make sure all paths are relative (mostly done) - - Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting! ### New in version 2: diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh index 5f8ff5e..cd6ac6c 100644 --- a/bash_scripts/master_script.sh +++ b/bash_scripts/master_script.sh @@ -72,8 +72,26 @@ else fi #################################################################### +#STEP 0.1: create/read checkpoint + +printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" +if [ ! -f "$INPUT_DIR/checkpoints" ] + then + printf "\tThe file checkpoints does not exist, we will create it.\n" + touch "$INPUT_DIR/checkpoints" +else + printf "\tThe file checkpoints already exist\n" +fi + +#################################################################### + + # # STEP 1: CLEANING FILES WITH TRIMMOMATIC +Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints) +if [ "${Step}" != "TRIMMO" ] + then + if ls $INPUT_DIR/*.gz &>/dev/null; then for file in $INPUT_DIR/*.gz @@ -105,6 +123,11 @@ else mv $INPUT_DIR/*".cleaned" $STEP_1 fi +else + printf "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will then be passed\n" +fi + + #################################################################### # # STEP 2: MERGING OF PAIRED-END FILES USING PEAR From bf29e7888a2b08872837c5cbb50e923abd89d711 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Mon, 18 Feb 2019 11:26:35 -0800 Subject: [PATCH 09/10] checkpoint for all steps --- bash_scripts/master_script.sh | 74 +++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh index cd6ac6c..f6ef74e 100644 --- a/bash_scripts/master_script.sh +++ b/bash_scripts/master_script.sh @@ -134,6 +134,10 @@ fi # Note: paired-end files are usually named using R1 and R2 in the name. # Example: control_1.R1.fastq # control_1.R2.fastq +Step=$(grep "MERGING" $INPUT_DIR/checkpoints) +if [ "${Step}" != "MERGING" ] + then + $MKDIR $STEP_2 if $paired; then @@ -151,10 +155,19 @@ else done fi +printf "MERGING\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable MERGING is in the checkpoint file. STEP 2 will then be passed\n" +fi + #################################################################### # # STEP 2.9: GETTING RAW SEQUENCES COUNTS # Note: These are used later for statistical analysis. +Step=$(grep "RAW" $INPUT_DIR/checkpoints) +if [ "${Step}" != "RAW" ] + then if [[ -f $STEP_2/raw_counts.txt ]]; then rm $STEP_2/raw_counts.txt @@ -173,11 +186,20 @@ else done fi +printf "RAW\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" +fi + #################################################################### # # STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA # Note: this step assumes that the SortMeRNA databases are indexed. If not, # do that first (see the SortMeRNA user manual for details). +Step=$(grep "RIBO" $INPUT_DIR/checkpoints) +if [ "${Step}" != "RIBO" ] + then for file in $STEP_2/*.assembled.fastq do @@ -191,11 +213,20 @@ done $MKDIR $STEP_3 mv $STEP_2/*ribodepleted* $STEP_3 +printf "RIBO\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable RIBO is in the checkpoint file. STEP 3 will then be passed\n" +fi + #################################################################### # # STEP 4: ANNOTATING WITH DIAMOND AGAINST REFSEQ # Note: this step assumes that the DIAMOND database is already built. If not, # do that first before running this step. +Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints) +if [ "${Step}" != "REFSEQ_ANNOT" ] + then echo "Now starting on DIAMOND org annotations at: "; date @@ -215,9 +246,18 @@ mv $STEP_3/*.daa $STEP_4/daa_binary_files echo "RefSeq DIAMOND annotations completed at: "; date +printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will then be passed\n" +fi + #################################################################### # # STEP 5: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints) +if [ "${Step}" != "REFSEQ_AGGREG" ] + then for file in $STEP_4/*RefSeq_annotated do @@ -230,9 +270,18 @@ $MKDIR $STEP_5/RefSeq_results/func_results mv $STEP_4/*organism.tsv $STEP_5/RefSeq_results/org_results mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results +printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will then be passed\n" +fi + #################################################################### # # STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS +Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints) +if [ "${Step}" != "SUBSYS_ANNOT" ] + then echo "Now starting on DIAMOND Subsystems annotations at: "; date @@ -249,9 +298,18 @@ mv $STEP_3/*.daa $STEP_4/daa_binary_files echo "DIAMOND Subsystems annotations completed at: "; date +printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will then be passed\n" +fi + ################################################################## # # STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER +Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints) +if [ "${Step}" != "SUBSYS_AGGREG" ] + then for file in $STEP_4/*subsys_annotated do @@ -267,6 +325,12 @@ mv $STEP_4/*.reduced $STEP_5/Subsystems_results mv $STEP_4/*.receipt $STEP_5/Subsystems_results/receipts rm $STEP_4/*.hierarchy +printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will then be passed\n" +fi + ################################################################## # # At this point, all the results files are ready for analysis using R. @@ -278,6 +342,9 @@ rm $STEP_4/*.hierarchy # STEP 6: R ANALYSIS # Note: For R to properly identify files to compare/contrast, they must include # the appropriate prefix (either "control_$file" or experimental_$file")! +Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints) +if [ "${Step}" != "R_ANALYSIS" ] + then checked Rscript $R_DIR/run_DESeq_stats.R \ -I $STEP_5/RefSeq_results/org_results \ @@ -292,6 +359,13 @@ checked Rscript $R_DIR/Subsystems_DESeq_stats.R \ -O Subsystems_level-1_DESeq_results.tab -L 1 \ -R $STEP_2/raw_counts.txt +printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints + +else + printf "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will then be passed\n" +fi + echo "Master bash script finished running at: "; date exit #################################################################### + From c7474db08cc89320ae587fc0c7def606508f2063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Renaut?= Date: Mon, 4 Mar 2019 12:07:28 -0800 Subject: [PATCH 10/10] adding my own scripts to the forked samsa2 pipeline --- .gitignore | 6 +- README.md | 20 + R_scripts/combining_umerged.R | 61 +++ bash_scripts/cyanotoxin_master_samsa2.slurm | 505 ++++++++++++++++++++ bash_scripts/master_samsa2.slurm | 435 +++++++++++++++++ 5 files changed, 1024 insertions(+), 3 deletions(-) create mode 100755 R_scripts/combining_umerged.R create mode 100755 bash_scripts/cyanotoxin_master_samsa2.slurm create mode 100755 bash_scripts/master_samsa2.slurm diff --git a/.gitignore b/.gitignore index c11349b..30b79f6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # my files -bash_scripts/master_samsa2.slurm -bash_scripts/cyanotoxin_master_samsa2.slurm -R_scripts/combining_umerged.R +#bash_scripts/master_samsa2.slurm +#bash_scripts/cyanotoxin_master_samsa2.slurm +#R_scripts/combining_umerged.R # Swap files *.swp diff --git a/README.md b/README.md index 2ae3d5f..72c0ce0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,25 @@ # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline +### modification by sebastien.renaut@gmail.com + +### New in my version. +This is a new master script (master_samsa2.slurm) with several modifications: + +* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond) +* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary. +* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R` +* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum. +* Each step contains an exit statement to be printed if the master script dies due to an unforseen error. +* Trimmomatic removes adapter contamination according to a specific fasta file. +* All options, read & program location are to be specified in the first section of the script. +* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed. +* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways... + +#### To do: +* reverse the order of the merging and trimming step (but again, maybe not...) +* simplify some syntax / make sure all paths are relative (mostly done) + + Version 2 of the SAMSA pipeline - faster! Lighter! More options! Less waiting! ### New in version 2: diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R new file mode 100755 index 0000000..c04f28c --- /dev/null +++ b/R_scripts/combining_umerged.R @@ -0,0 +1,61 @@ +#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript + +args = commandArgs(TRUE) +start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself. +#start="/Users/jerry/Documents/CSBQ/shapiro/results" + +setwd(start) + +###files to work with +system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = "")) +system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = "")) +system(paste("ls -1 *.assembled.fastq >assembled.files",sep = "")) + +files_f = read.table("umerged.forward.files",stringsAsFactors = F) +files_r = read.table("umerged.reverse.files",stringsAsFactors = F) +files_a = read.table("assembled.files",stringsAsFactors = F) + + +for(i in 1:nrow(files_f)) + { +#grep 1st and 2rd lines + unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="") + system(unassembled.fastq) + +#grep sequences forward... + unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="") + system(unassembled.forward.fastq) + +#grep sequences reverse... + unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="") + system(unassembled.reverse.fastq) + +#N and E quality file + system("wc -l unassembled.seq.reverse.fastq >wc") + wc = read.table("wc") + + write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F) + +#paste the unassembled sequences into a single file + system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN") + +#put everything back into a single fastq + back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "") + system(back) + +#add the merged sequences + all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "") + system(all) + +#remove the clutter (file specific) + remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq") + system(remove) +} + +#remove the clutter (listing of all files) +system("rm assembled.files umerged.forward.files umerged.reverse.files") + + + + + diff --git a/bash_scripts/cyanotoxin_master_samsa2.slurm b/bash_scripts/cyanotoxin_master_samsa2.slurm new file mode 100755 index 0000000..2e278da --- /dev/null +++ b/bash_scripts/cyanotoxin_master_samsa2.slurm @@ -0,0 +1,505 @@ +#!/bin/bash + +#SBATCH --time=0-07:59 +#SBATCH --mem=40G +#SBATCH --account=def-bruneaua +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=12 + +module load r/3.5.0 + + +# Lines starting with #SBATCH are for SLURM job management systems +# and may be removed if user is not submitting jobs to SLURM +#################################################################### +#set -x #echo +# +# master_script_for_sample_files.bash +# Created April 2017 by Sam Westreich, github.com/transcript +# This version modified by Michelle Treiber on August 9, 2017 +# +#################################################################### +# +# This script sets up and runs through ALL steps in the SAMSA pipeline +# before the analysis (which is done in R, likely in RStudio). Each +# step is set up below. +# +# The steps are: +# 0.1 Create the checkpoint +# 0.2 Gunzip the raw sequences +# 1. Merging with PEAR, if applicable +# 2. Read cleaning with Trimmomatic +# 3. rRNA removal with SortMeRNA +# 4. Annotation using DIAMOND against Subsys database +# 5. Aggregation using analysis_counter.py (Subsys) +# 4.1 Annotation using DIAMOND against Refseq database +# 5.1 Aggregation using analysis_counter.py (Refseq) +# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) +# +# NOTE: BEFORE running this script, please run package_installation.bash +# and full_database_download.bash located at: +# https://github.com/transcript/samsa2/tree/master/setup in order to set +# up SAMSA2 dependencies and download full databases. +# +#################################################################### +# +echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" +# +# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download +# +# 0. Set starting location: +starting_location=/home/renaut/scratch/ATRAPP/ATRAPP_Champlain_2016_Metat/St1 + +# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/gzipped +sequence_location=/home/renaut/scratch/ATRAPP/sequences/ATRAPP_Champlain_2016_Metat/St1 + +# 1. PEAR +pear_location=/home/renaut/bin + +# 2. Trimmomatic +trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 + +# 3. SortMeRNA +sortmerna_location=/home/renaut/bin + +# 4. DIAMOND +diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" +diamond_cyanotoxin="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1" +diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" +diamond_location=/home/renaut/bin + +# 5. Aggregation +python_programs=/home/renaut/samsa2.master/samsa2/python_scripts +cyano_db="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1.faa2" +RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" +Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" + +# 6. R scripts and paths +export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" +R_programs=/home/renaut/samsa2.master/samsa2/R_scripts + +# threads +threads="12" + +################## +#STEP 0.1: create/read checkpoint +cd $starting_location + +printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" +if [ ! -f "pipeline/checkpoints" ] + then + printf "\tThe file checkpoints does not exist, we will create it.\n" + touch "pipeline/checkpoints" +else + printf "\tThe file checkpoints already exist\n" +fi + +#################################################################### +# +#STEP 0.2: GUNZIP PROCESS. +# +# +Step=$(grep "GUNZIP" pipeline/checkpoints) +if [ "${Step}" != "GUNZIP" ] + then + for file in $sequence_location/zipped/*gz + do + file_out=`echo $file | awk -F ".gz" '{print $1}'` + gunzip $file -cd >$file_out + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the gunzip step" + exit 1 + fi + done + + mkdir $sequence_location/unzipped + mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. + printf "GUNZIP\n" >>pipeline/checkpoints + echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" + date +else + printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 1: MERGING OF PAIRED-END FILES USING PEAR +# Note: paired-end files are usually named using R1 and R2 in the name. +# Example: control_1.R1.fastq +# control_1.R2.fastq +# +# Note: if using single-end sequencing, skip this step (comment out). +# Note: if performing R analysis (step 6), be sure to name files with +# the appropriate prefix ("control_$file" and "experimental_$file")! + + + +Step=$(grep "MERGING" pipeline/checkpoints) +if [ "${Step}" != "MERGING" ] + then + mkdir $starting_location/step_1_merging + touch $starting_location/step_1_merging/pear_log + for file in $sequence_location/unzipped/*_R1.fastq + do + file1=$file + file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` + out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` + out_name=`echo ${out_path##*/}` + $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the merging step" + exit 1 + fi + done + + ###I will concatenate forward and reverse complement with 20Ns in the middle. + $R_programs/combining_umerged.R $starting_location + + printf "MERGING\n" >>pipeline/checkpoints + mkdir $starting_location/step_1_merging/ + mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ + + rm $starting_location/*fastq + rm $sequence_location/unzipped/*fastq + + echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" + date +else + printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" +fi + +#################################################################### +# +# STEP 2: CLEANING FILES WITH TRIMMOMATIC +# Note: if skipping PEAR, make sure that all starting files are in the +# $starting_location/step_1_output/ folder! +Step=$(grep "TRIMMING" pipeline/checkpoints) +if [ "${Step}" != "TRIMMING" ] + then + mkdir $starting_location/step_2_trimming + touch $starting_location/step_2_trimming/trimmomatic_log + + for file in $starting_location/step_1_merging/*assembled2.fastq + do + shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` + + java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the trimmmomatic step" + exit 1 + fi + + done + + mkdir $starting_location/step_2_trimming/ + mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ + + rm step_1_merging/*assembled2.fastq + + echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" + date + + printf "TRIMMING\n" >>pipeline/checkpoints +else + printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" +fi + +#################################################################### +# +# STEP 2.5: GETTING RAW SEQUENCES COUNTS +# Note: These are used later for statistical analysis. +Step=$(grep "RAW" pipeline/checkpoints) +if [ "${Step}" != "RAW" ] + then + if [ -f $starting_location/step_2_trimming/raw_counts.txt ] + then + rm $starting_location/step_2_trimming/raw_counts.txt + touch $starting_location/step_2_trimming/raw_counts.txt + else + touch $starting_location/step_2_trimming/raw_counts.txt + fi + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the raw sequences step" + exit 1 + fi + done + echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" + date + + printf "RAW\n" >>pipeline/checkpoints +else + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" +fi + +#################################################################### +# +# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA +# Note: this step assumes that the SortMeRNA databases are indexed. If not, +# do that first (see the SortMeRNA user manual for details). + +Step=$(grep "RIBODEPLETION" pipeline/checkpoints) +if [ "${Step}" != "RIBODEPLETION" ] + then + mkdir $starting_location/step_3_ribodepletion + touch $starting_location/step_3_ribodepletion/sortmerna_log + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` + $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the sortmerna step" + exit 1 + fi + done + + mkdir $starting_location/step_3_ribodepletion/ + mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ + + rm step_2_trimming/*cleaned.fastq + rm step_2_trimming/*ribosomes.fastq + + echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" + date + + printf "RIBODEPLETION\n" >>pipeline/checkpoints +else + printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" +fi + +#################################################################### +# +# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS +date +Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_SUBSYS" ] + then + mkdir $starting_location/step_4_annotation + touch $starting_location/step_4_annotation/diamond_log + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_4_annotation/daa_binary_files/ + + mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" + date + + printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" +fi +################################################################## +# +# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER +Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_SUBSYS" ] + then + mkdir $starting_location/step_5_aggregation/ + for file in $starting_location/step_4_annotation/*subsys_annotated + do + python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt + + # This quick program reduces down identical hierarchy annotations + python $python_programs/subsys_reducer.py -I $file.hierarchy + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/Subsystems_results/ + mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ + mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ + mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ + rm $starting_location/step_4_annotation/*.hierarchy + + echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" + date + + printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" +fi + + +#################################################################### +# +# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ +# Note: this step assumes that the DIAMOND database is already built. If not, +# do that first before running this step. +Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_REFSEQ" ] + then + touch $starting_location/step_4_annotation/diamond_log + + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond refseq step" + exit 1 + fi + done + + mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" + date + + printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 4.2: ANNOTATING WITH DIAMOND AGAINST CYANO +# Note: this step assumes that the DIAMOND database is already built. If not, +# do that first before running this step. +Step=$(grep "ANNOTATION_CYANO" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_CYANO" ] + then + touch $starting_location/step_4_annotation/diamond_cyano_log + + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "cyano_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_cyanotoxin -q $file -p $threads -a $file.cyano -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_cyano_log + $diamond_location/diamond view --daa $file.cyano.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_cyano_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond cyano step" + exit 1 + fi + done + + mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4.2 DONE:ANNOTATING WITH DIAMOND AGAINST CYANO" + date + + printf "ANNOTATION_CYANO\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_CYANO is in the checkpoint file. STEP 4.1 will then be passed\n" +fi + + +#################################################################### +# +# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_REFSEQ" ] + then + for file in $starting_location/step_4_annotation/*RefSeq_annotated* + do + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation refseq step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/RefSeq_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ + mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ + + echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" + date + + printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" +fi + + +#################################################################### +# +# STEP 5.2: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "AGGREGATION_CYANO" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_CYANO" ] + then + for file in $starting_location/step_4_annotation/*cyano_annotated* + do + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -O + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -F + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation CYANO step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/cyano_results/ + mkdir $starting_location/step_5_aggregation/cyano_results/org_results/ + mkdir $starting_location/step_5_aggregation/cyano_results/func_results/ + mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/cyano_results/org_results/ + mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/cyano_results/func_results/ + + echo "STEP 5.2 DONE: AGGREGATING WITH ANALYSIS_COUNTER" + date + + printf "AGGREGATION_CYANO\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_CYANO is in the checkpoint file. STEP 5.2 will then be passed\n" +fi + + + + +################################################################## +# +# At this point, all the results files are ready for analysis using R. +# This next step performs basic DESeq2 analysis of the RefSeq organism, function, +# and Subsystems annotations. +# +# More complex R analyses may be performed using specific .sh analysis scripts. +# +# STEP 6: R ANALYSIS +# Note: For R to properly identify files to compare/contrast, they must include +# the appropriate prefix (either "control_$file" or experimental_$file")! + +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt + +echo "STEP 6 DONE: R ANALYSIS" +echo "Master bash script finished running at: "; date +exit +#################################################################### + diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm new file mode 100755 index 0000000..a16d11b --- /dev/null +++ b/bash_scripts/master_samsa2.slurm @@ -0,0 +1,435 @@ +#!/bin/bash + +#SBATCH --time=0-23:59 +#SBATCH --mem=100G +#SBATCH --account=def-bruneaua +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=48 + +module load r/3.5.0 + + +# Lines starting with #SBATCH are for SLURM job management systems +# and may be removed if user is not submitting jobs to SLURM +#################################################################### +#set -x #echo +# +# master_script_for_sample_files.bash +# Created April 2017 by Sam Westreich, github.com/transcript +# This version modified by Michelle Treiber on August 9, 2017 +# +#################################################################### +# +# This script sets up and runs through ALL steps in the SAMSA pipeline +# before the analysis (which is done in R, likely in RStudio). Each +# step is set up below. +# +# The steps are: +# 0.1 Create the checkpoint +# 0.2 Gunzip the raw sequences +# 1. Merging with PEAR, if applicable +# 2. Read cleaning with Trimmomatic +# 3. rRNA removal with SortMeRNA +# 4. Annotation using DIAMOND against Subsys database +# 5. Aggregation using analysis_counter.py (Subsys) +# 4.1 Annotation using DIAMOND against Refseq database +# 5.1 Aggregation using analysis_counter.py (Refseq) +# 6. Running R scripts to get DESeq statistical analysis. (NOT RUN) +# +# NOTE: BEFORE running this script, please run package_installation.bash +# and full_database_download.bash located at: +# https://github.com/transcript/samsa2/tree/master/setup in order to set +# up SAMSA2 dependencies and download full databases. +# +#################################################################### +# +echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n" +# +# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download +# +# 0. Set starting location: +starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1 + +# 0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped +sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1 + +# 1. PEAR +pear_location=/home/renaut/bin + +# 2. Trimmomatic +trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36 + +# 3. SortMeRNA +sortmerna_location=/home/renaut/bin + +# 4. DIAMOND +diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac" +diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db" +diamond_location=/home/renaut/bin + +# 5. Aggregation +python_programs=/home/renaut/samsa2.master/samsa2/python_scripts +RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa" +Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa" + +# 6. R scripts and paths +export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages" +R_programs=/home/renaut/samsa2.master/samsa2/R_scripts + +# threads +threads="48" + +################## +#STEP 0.1: create/read checkpoint +cd $starting_location + +printf "\nStep 0.1: Checking for the presence of the checkpoint file\n" +if [ ! -f "pipeline/checkpoints" ] + then + printf "\tThe file checkpoints does not exist, we will create it.\n" + touch "pipeline/checkpoints" +else + printf "\tThe file checkpoints already exist\n" +fi + +#################################################################### +# +#STEP 0.2: GUNZIP PROCESS. +# +# +Step=$(grep "GUNZIP" pipeline/checkpoints) +if [ "${Step}" != "GUNZIP" ] + then + for file in $sequence_location/zipped/*gz + do + file_out=`echo $file | awk -F ".gz" '{print $1}'` + gunzip $file -cd >$file_out + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the gunzip step" + exit 1 + fi + done + + mkdir $sequence_location/unzipped + mv $sequence_location/zipped/*fastq $sequence_location/unzipped/. + printf "GUNZIP\n" >>pipeline/checkpoints + echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP" + date +else + printf "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 1: MERGING OF PAIRED-END FILES USING PEAR +# Note: paired-end files are usually named using R1 and R2 in the name. +# Example: control_1.R1.fastq +# control_1.R2.fastq +# +# Note: if using single-end sequencing, skip this step (comment out). +# Note: if performing R analysis (step 6), be sure to name files with +# the appropriate prefix ("control_$file" and "experimental_$file")! + + + +Step=$(grep "MERGING" pipeline/checkpoints) +if [ "${Step}" != "MERGING" ] + then + mkdir $starting_location/step_1_merging + touch $starting_location/step_1_merging/pear_log + for file in $sequence_location/unzipped/*_R1.fastq + do + file1=$file + file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'` + out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'` + out_name=`echo ${out_path##*/}` + $pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log + + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the merging step" + exit 1 + fi + done + + ###I will concatenate forward and reverse complement with 20Ns in the middle. + $R_programs/combining_umerged.R $starting_location + + printf "MERGING\n" >>pipeline/checkpoints + mkdir $starting_location/step_1_merging/ + mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/ + + rm $starting_location/*fastq + rm $sequence_location/unzipped/*fastq + + echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR" + date +else + printf "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n" +fi + +#################################################################### +# +# STEP 2: CLEANING FILES WITH TRIMMOMATIC +# Note: if skipping PEAR, make sure that all starting files are in the +# $starting_location/step_1_output/ folder! +Step=$(grep "TRIMMING" pipeline/checkpoints) +if [ "${Step}" != "TRIMMING" ] + then + mkdir $starting_location/step_2_trimming + touch $starting_location/step_2_trimming/trimmomatic_log + + for file in $starting_location/step_1_merging/*assembled2.fastq + do + shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'` + + java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the trimmmomatic step" + exit 1 + fi + + done + + mkdir $starting_location/step_2_trimming/ + mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/ + + rm step_1_merging/*assembled2.fastq + + echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC" + date + + printf "TRIMMING\n" >>pipeline/checkpoints +else + printf "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n" +fi + +#################################################################### +# +# STEP 2.5: GETTING RAW SEQUENCES COUNTS +# Note: These are used later for statistical analysis. +Step=$(grep "RAW" pipeline/checkpoints) +if [ "${Step}" != "RAW" ] + then + if [ -f $starting_location/step_2_trimming/raw_counts.txt ] + then + rm $starting_location/step_2_trimming/raw_counts.txt + touch $starting_location/step_2_trimming/raw_counts.txt + else + touch $starting_location/step_2_trimming/raw_counts.txt + fi + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the raw sequences step" + exit 1 + fi + done + echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" + date + + printf "RAW\n" >>pipeline/checkpoints +else + printf "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n" +fi + +#################################################################### +# +# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA +# Note: this step assumes that the SortMeRNA databases are indexed. If not, +# do that first (see the SortMeRNA user manual for details). + +Step=$(grep "RIBODEPLETION" pipeline/checkpoints) +if [ "${Step}" != "RIBODEPLETION" ] + then + mkdir $starting_location/step_3_ribodepletion + touch $starting_location/step_3_ribodepletion/sortmerna_log + + for file in $starting_location/step_2_trimming/*cleaned.fastq + do + shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'` + $sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the sortmerna step" + exit 1 + fi + done + + mkdir $starting_location/step_3_ribodepletion/ + mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/ + + rm step_2_trimming/*cleaned.fastq + rm step_2_trimming/*ribosomes.fastq + + echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA" + date + + printf "RIBODEPLETION\n" >>pipeline/checkpoints +else + printf "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n" +fi + +#################################################################### +# +# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS +date +Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_SUBSYS" ] + then + mkdir $starting_location/step_4_annotation + touch $starting_location/step_4_annotation/diamond_log + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_4_annotation/daa_binary_files/ + + mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS" + date + + printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n" +fi +################################################################## +# +# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER +Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_SUBSYS" ] + then + mkdir $starting_location/step_5_aggregation/ + for file in $starting_location/step_4_annotation/*subsys_annotated + do + python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt + + # This quick program reduces down identical hierarchy annotations + python $python_programs/subsys_reducer.py -I $file.hierarchy + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation subsys step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/Subsystems_results/ + mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/ + mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/ + mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/ + rm $starting_location/step_4_annotation/*.hierarchy + + echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER" + date + + printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n" +fi + + +#################################################################### +# +# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ +# Note: this step assumes that the DIAMOND database is already built. If not, +# do that first before running this step. +Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "ANNOTATION_REFSEQ" ] + then + touch $starting_location/step_4_annotation/diamond_log + + for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq + do + shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'` + #removed the diamond sensitive option + $diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log + $diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the diamond refseq step" + exit 1 + fi + done + + mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/ + mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/ + + echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ" + date + + printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n" +fi + +#################################################################### +# +# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER +Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints) +if [ "${Step}" != "AGGREGATION_REFSEQ" ] + then + for file in $starting_location/step_4_annotation/*RefSeq_annotated* + do + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O + python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F + if [ $? -ne 0 ] + then + printf "\t!!! There is a problem in the aggregation refseq step" + exit 1 + fi + done + + mkdir $starting_location/step_5_aggregation/RefSeq_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/ + mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/ + mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/ + + echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER" + date + + printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints +else + printf "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n" +fi + + + +################################################################## +# +# At this point, all the results files are ready for analysis using R. +# This next step performs basic DESeq2 analysis of the RefSeq organism, function, +# and Subsystems annotations. +# +# More complex R analyses may be performed using specific .sh analysis scripts. +# +# STEP 6: R ANALYSIS +# Note: For R to properly identify files to compare/contrast, they must include +# the appropriate prefix (either "control_$file" or experimental_$file")! + +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt +#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt + +echo "STEP 6 DONE: R ANALYSIS" +echo "Master bash script finished running at: "; date +exit +#################################################################### +