From 455445b88f90d298fe34b3201a9182f6ad93d700 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Wed, 28 Nov 2018 07:56:08 -0800
Subject: [PATCH 01/10] new improved master script

---
 README.md                        |  22 ++
 R_scripts/combining_umerged.R    |  61 +++++
 bash_scripts/master_samsa2.slurm | 431 +++++++++++++++++++++++++++++++
 3 files changed, 514 insertions(+)
 create mode 100755 R_scripts/combining_umerged.R
 create mode 100755 bash_scripts/master_samsa2.slurm

diff --git a/README.md b/README.md
index 2ae3d5f..189ac85 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,27 @@
 # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline
 
+### modification by sebastien.renaut@gmail.com
+
+### New in my version. 
+There is a new master script (master_samsa2.slurm) with several modifications:
+
+* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond)
+* The script annomatically creates a checkpoint file. Once a step is finished, it writes the name of that specific step in the file. When you run the script again, it will SKIP the steps labelled in checkpoint.
+* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
+* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
+* Each step contains an exit statement to be printed if the master script dies.
+* Trimommatic removes adapter contamination according to a specific file
+* All options are to be specified in the first section of the script
+* The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed.
+
+
+#### To do:
+* reverse the order of the merging and trimming step.
+* simplify some syntax / make sure all paths are relative...
+
+
+
+
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  
 
 ### New in version 2:
diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R
new file mode 100755
index 0000000..c04f28c
--- /dev/null
+++ b/R_scripts/combining_umerged.R
@@ -0,0 +1,61 @@
+#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript
+
+args = commandArgs(TRUE)
+start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself.
+#start="/Users/jerry/Documents/CSBQ/shapiro/results"
+
+setwd(start)
+
+###files to work with 
+system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = ""))
+system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = ""))
+system(paste("ls -1 *.assembled.fastq >assembled.files",sep = ""))
+
+files_f = read.table("umerged.forward.files",stringsAsFactors = F)      
+files_r = read.table("umerged.reverse.files",stringsAsFactors = F) 
+files_a = read.table("assembled.files",stringsAsFactors = F) 
+
+
+for(i in 1:nrow(files_f))      
+  {
+#grep 1st and 2rd lines 
+  	unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="")
+  	system(unassembled.fastq)
+
+#grep sequences forward...
+  	unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="")
+  	system(unassembled.forward.fastq)
+
+#grep sequences reverse...
+  	unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="")
+  	system(unassembled.reverse.fastq)
+
+#N and E quality file
+ 	system("wc -l unassembled.seq.reverse.fastq >wc")
+ 	wc = read.table("wc")
+  
+ 	write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F)
+  
+#paste the unassembled sequences into a single file
+	system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN")
+
+#put everything back into a single fastq
+	back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "")
+	system(back)
+
+#add the merged sequences
+	all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "")
+	system(all)
+
+#remove the clutter (file specific)
+	remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq")
+	system(remove)
+}
+
+#remove the clutter (listing of all files)
+system("rm assembled.files umerged.forward.files umerged.reverse.files")
+
+
+
+
+
diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
new file mode 100755
index 0000000..cc60bb3
--- /dev/null
+++ b/bash_scripts/master_samsa2.slurm
@@ -0,0 +1,431 @@
+#!/bin/bash
+
+#SBATCH --time=0-23:59
+#SBATCH --mem=100000
+#SBATCH --account=def-bruneaua
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=48
+
+module load r/3.5.0
+
+
+# Lines starting with #SBATCH are for SLURM job management systems
+# and may be removed if user is not submitting jobs to SLURM
+####################################################################
+#set -x #echo
+#
+# master_script_for_sample_files.bash
+# Created April 2017 by Sam Westreich, github.com/transcript
+# This version modified by Michelle Treiber on August 9, 2017
+#
+####################################################################
+#
+# This script sets up and runs through ALL steps in the SAMSA pipeline
+# before the analysis (which is done in R, likely in RStudio).  Each
+# step is set up below.
+#
+# The steps are:
+#	1.   Merging with PEAR, if applicable
+#	2.   Read cleaning with Trimmomatic
+#	3.   rRNA removal with SortMeRNA
+#	4.   Annotation using DIAMOND (by default against the RefSeq database)
+#	5.   Aggregation using analysis_counter.py
+#   4.1  Annotation using DIAMOND against the Subsystems database
+#   5.1  Aggregation using Subsystems-specific analysis counter.py
+#	6.	 Running R scripts to get DESeq statistical analysis.
+#
+# NOTE: BEFORE running this script, please run package_installation.bash
+# and full_database_download.bash located at:
+# https://github.com/transcript/samsa2/tree/master/setup in order to set 
+# up SAMSA2 dependencies and download full databases.
+#
+####################################################################
+#
+echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
+#
+# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
+#
+# 	0. Set starting location:
+starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2
+
+#	0.5 Sequences location:
+sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2
+
+#	1. PEAR
+pear_location=/home/renaut/bin
+
+# 	2. Trimmomatic
+trimmomatic_location=/home/renaut/bin
+
+#	3. SortMeRNA
+sortmerna_location=/home/renaut/bin
+
+#	4. DIAMOND
+diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac"
+diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
+diamond_location=/home/renaut/bin
+
+#	5. Aggregation
+python_programs=/home/renaut/samsa2/python_scripts
+RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
+Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
+
+#	6. R scripts and paths
+export R_LIBS="/home/renaut/samsa2/R_scripts/packages"
+R_programs=/home/renaut/samsa2/R_scripts
+
+#	threads
+threads="48"
+
+##################
+#Step0.1: create/read checkpoint
+cd $starting_location
+
+printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
+if [ ! -f "pipeline/checkpoints" ]
+        then
+                printf "\tThe file checkpoints does not exist, we will create it.\n"
+        touch "pipeline/checkpoints"
+else
+        printf "\tThe file checkpoints already exist\n"
+fi
+
+####################################################################
+#
+#STEP 0.1: GUNZIP PROCESS. 
+#
+#
+Step=$(grep "GUNZIP" pipeline/checkpoints)
+if [ "${Step}" != "GUNZIP" ]
+	then
+		for file in $sequence_location/hiseq_raw/*gz
+			do
+				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
+        			gunzip $file -cd >$file_out
+
+				if [ $? -ne 0 ]
+                        		then
+                                	printf "\t!!! There is a problem in the gunzip step"
+                                	exit 1
+                		fi
+			done
+		mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/.
+		printf "GUNZIP\n" >>pipeline/checkpoints
+                echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
+                date
+else
+        printf  "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 1: MERGING OF PAIRED-END FILES USING PEAR
+# Note: paired-end files are usually named using R1 and R2 in the name.
+#		Example: control_1.R1.fastq
+#				 control_1.R2.fastq
+#
+# Note: if using single-end sequencing, skip this step (comment out).
+# Note: if performing R analysis (step 6), be sure to name files with 
+# 	the appropriate prefix ("control_$file" and "experimental_$file")!
+
+
+
+Step=$(grep "MERGING" pipeline/checkpoints)
+if [ "${Step}" != "MERGING" ]
+	then
+		mkdir $starting_location/step_1_merging
+		touch $starting_location/step_1_merging/pear_log
+		for file in $sequence_location/hiseq_unzip/*_R1.fastq
+		do
+			file1=$file
+			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
+			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
+			out_name=`echo ${out_path##*/}`
+			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
+			#FOR ATRAPP, GIVEN THAT IT APPEARS LIKE 50-60% ARE MERGED, AND I HAVE A LOT OF SEQUENCES, I WILL ONLY KEEP THE MERGED		
+		
+			if [ $? -ne 0 ]
+                        	then
+                                	printf "\t!!! There is a problem in the merging step"
+                                	exit 1
+                	fi
+		done
+		 #FOR ATRAPP, I WILL NOW MERGE:I will merge the umerged leaving 20 N's in between.
+	
+		combining_umerged.R $starting_location
+		printf "MERGING\n" >>pipeline/checkpoints
+		mkdir $starting_location/step_1_merging/
+		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
+		
+		rm $starting_location/*fastq
+		rm $sequence_location/hiseq_unzip/*fastq
+
+		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
+		date        
+else
+        printf  "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2: CLEANING FILES WITH TRIMMOMATIC
+# Note: if skipping PEAR, make sure that all starting files are in the
+# $starting_location/step_1_output/ folder!
+Step=$(grep "TRIMMING" pipeline/checkpoints)
+if [ "${Step}" != "TRIMMING" ]
+        then
+		mkdir $starting_location/step_2_trimming
+		touch $starting_location/step_2_trimming/trimmomatic_log
+
+	for file in $starting_location/step_1_merging/*assembled2.fastq
+		do
+		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
+
+		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the trimmmomatic step"
+				exit 1
+		fi
+		
+		done
+
+	mkdir $starting_location/step_2_trimming/
+	mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/
+	
+	rm step_1_merging/*assembled2.fastq
+	
+	echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC"
+	date
+	
+	printf "TRIMMING\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2.5: GETTING RAW SEQUENCES COUNTS
+# Note: These are used later for statistical analysis.
+Step=$(grep "RAW" pipeline/checkpoints)
+if [ "${Step}" != "RAW" ]
+        then
+		if [ -f $starting_location/step_2_trimming/raw_counts.txt ]
+			then
+				rm $starting_location/step_2_trimming/raw_counts.txt
+				touch $starting_location/step_2_trimming/raw_counts.txt
+		else
+			touch $starting_location/step_2_trimming/raw_counts.txt
+		fi	
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the raw sequences step"
+				exit 1
+		fi
+		done
+		echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" 
+		date
+
+        printf "RAW\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
+# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
+# do that first (see the SortMeRNA user manual for details).
+
+Step=$(grep "RIBODEPLETION" pipeline/checkpoints)
+if [ "${Step}" != "RIBODEPLETION" ]
+        then
+        mkdir $starting_location/step_3_ribodepletion
+		touch $starting_location/step_3_ribodepletion/sortmerna_log
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
+			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --num_alignments 0 --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the sortmerna step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_3_ribodepletion/
+		mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/
+		
+		rm step_2_trimming/*cleaned.fastq
+		rm step_2_trimming/*ribosomes.fastq
+		
+		echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA"
+		date
+	
+        	printf "RIBODEPLETION\n" >>pipeline/checkpoints
+else
+        	printf  "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
+date
+Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_4_annotation
+		touch $starting_location/step_4_annotation/diamond_log
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+		do
+			shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
+			#removed the diamond sensitive option
+			$diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+			$diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond subsys step"
+				exit 1
+		fi
+		done
+		
+		mkdir $starting_location/step_4_annotation/daa_binary_files/
+	
+		mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS"
+        	date
+
+		printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints
+else
+       	 printf  "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n"
+fi
+##################################################################
+#
+# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER
+Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_5_aggregation/
+		for file in $starting_location/step_4_annotation/*subsys_annotated
+		do
+			python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt
+	
+			# This quick program reduces down identical hierarchy annotations
+			python $python_programs/subsys_reducer.py -I $file.hierarchy
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the aggregation subsys step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/
+		mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		rm $starting_location/step_4_annotation/*.hierarchy
+
+		echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER"
+		date
+
+        	printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n"
+fi
+
+
+####################################################################
+#
+# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ
+# Note: this step assumes that the DIAMOND database is already built.  If not,
+# do that first before running this step.
+Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_REFSEQ" ]
+        then
+		touch $starting_location/step_4_annotation/diamond_log
+
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+			do
+				shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
+				#removed the diamond sensitive option
+				$diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+				$diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+			if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond refseq step"
+				exit 1
+			fi
+			done
+
+		mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ"
+		date
+
+        	printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_REFSEQ" ]
+	then
+		for file in $starting_location/step_4_annotation/*RefSeq_annotated*
+		do
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
+			if [ $? -ne 0 ]
+				then 
+					printf "\t!!! There is a problem in the aggregation refseq step"
+					exit 1
+			fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/
+		mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/
+
+		echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
+		date 
+
+        	printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n"
+fi
+
+
+
+##################################################################
+#
+# At this point, all the results files are ready for analysis using R.
+# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
+# and Subsystems annotations.
+#
+# More complex R analyses may be performed using specific .sh analysis scripts.
+#
+# STEP 6: R ANALYSIS
+# Note: For R to properly identify files to compare/contrast, they must include 
+# the appropriate prefix (either "control_$file" or experimental_$file")!
+
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt
+
+echo "STEP 6 DONE: R ANALYSIS"
+echo "Master bash script finished running at: "; date
+exit
+####################################################################
+

From 5d90e0e4392d8210f95b828fa9c30121b9afb3cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Wed, 28 Nov 2018 08:48:36 -0800
Subject: [PATCH 02/10] improved syntax

---
 bash_scripts/master_samsa2.slurm | 44 +++++++++++++++++---------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
index cc60bb3..36b8297 100755
--- a/bash_scripts/master_samsa2.slurm
+++ b/bash_scripts/master_samsa2.slurm
@@ -25,14 +25,16 @@ module load r/3.5.0
 # step is set up below.
 #
 # The steps are:
-#	1.   Merging with PEAR, if applicable
-#	2.   Read cleaning with Trimmomatic
-#	3.   rRNA removal with SortMeRNA
-#	4.   Annotation using DIAMOND (by default against the RefSeq database)
-#	5.   Aggregation using analysis_counter.py
-#   4.1  Annotation using DIAMOND against the Subsystems database
-#   5.1  Aggregation using Subsystems-specific analysis counter.py
-#	6.	 Running R scripts to get DESeq statistical analysis.
+#	0.1	Create the checkpoint
+#	0.2	Gunzip the raw sequences
+#	1.	Merging with PEAR, if applicable
+#	2.	Read cleaning with Trimmomatic
+#	3.	rRNA removal with SortMeRNA
+#	4.	Annotation using DIAMOND against Subsys database
+#	5.	Aggregation using analysis_counter.py (Subsys)
+#   	4.1  	Annotation using DIAMOND against Refseq database
+#   	5.1	Aggregation using analysis_counter.py (Refseq)
+#	6.	Running R scripts to get DESeq statistical analysis. (NOT RUN)
 #
 # NOTE: BEFORE running this script, please run package_installation.bash
 # and full_database_download.bash located at:
@@ -48,14 +50,14 @@ echo -e "NOTE: Before running this script, user must run package_installation.ba
 # 	0. Set starting location:
 starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2
 
-#	0.5 Sequences location:
+#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/hiseq_raw
 sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2
 
 #	1. PEAR
 pear_location=/home/renaut/bin
 
 # 	2. Trimmomatic
-trimmomatic_location=/home/renaut/bin
+trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36
 
 #	3. SortMeRNA
 sortmerna_location=/home/renaut/bin
@@ -66,19 +68,19 @@ diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
 diamond_location=/home/renaut/bin
 
 #	5. Aggregation
-python_programs=/home/renaut/samsa2/python_scripts
+python_programs=/home/renaut/samsa2.master/samsa2/python_scripts
 RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
 Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
 
 #	6. R scripts and paths
-export R_LIBS="/home/renaut/samsa2/R_scripts/packages"
-R_programs=/home/renaut/samsa2/R_scripts
+export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages"
+R_programs=/home/renaut/samsa2.master/samsa2/R_scripts
 
 #	threads
 threads="48"
 
 ##################
-#Step0.1: create/read checkpoint
+#STEP 0.1: create/read checkpoint
 cd $starting_location
 
 printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
@@ -92,7 +94,7 @@ fi
 
 ####################################################################
 #
-#STEP 0.1: GUNZIP PROCESS. 
+#STEP 0.2: GUNZIP PROCESS. 
 #
 #
 Step=$(grep "GUNZIP" pipeline/checkpoints)
@@ -109,6 +111,8 @@ if [ "${Step}" != "GUNZIP" ]
                                 	exit 1
                 		fi
 			done
+		
+		mkdir $sequence_location/hiseq_unzip
 		mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/.
 		printf "GUNZIP\n" >>pipeline/checkpoints
                 echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
@@ -142,7 +146,6 @@ if [ "${Step}" != "MERGING" ]
 			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
 			out_name=`echo ${out_path##*/}`
 			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
-			#FOR ATRAPP, GIVEN THAT IT APPEARS LIKE 50-60% ARE MERGED, AND I HAVE A LOT OF SEQUENCES, I WILL ONLY KEEP THE MERGED		
 		
 			if [ $? -ne 0 ]
                         	then
@@ -150,9 +153,10 @@ if [ "${Step}" != "MERGING" ]
                                 	exit 1
                 	fi
 		done
-		 #FOR ATRAPP, I WILL NOW MERGE:I will merge the umerged leaving 20 N's in between.
-	
-		combining_umerged.R $starting_location
+		
+		###I will concatenate forward and reverse complement with 20Ns in the middle.
+		$R_programs/combining_umerged.R $starting_location
+		
 		printf "MERGING\n" >>pipeline/checkpoints
 		mkdir $starting_location/step_1_merging/
 		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
@@ -181,7 +185,7 @@ if [ "${Step}" != "TRIMMING" ]
 		do
 		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
 
-		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
+		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
 		if [ $? -ne 0 ]
 			then 
 				printf "\t!!! There is a problem in the trimmmomatic step"

From 33adea99d76ff7a05c8a8f1b3e244475e9f94df1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar5.cedar.computecanada.ca>
Date: Wed, 28 Nov 2018 12:16:07 -0800
Subject: [PATCH 03/10] sortmerna options

---
 README.md                        | 4 ++--
 bash_scripts/master_samsa2.slurm | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 189ac85..59417b5 100644
--- a/README.md
+++ b/README.md
@@ -13,10 +13,10 @@ There is a new master script (master_samsa2.slurm) with several modifications:
 * Trimommatic removes adapter contamination according to a specific file
 * All options are to be specified in the first section of the script
 * The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed.
-
+* I've removed the --num_alignments 0 is sortmrna. This causes problems and slows things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to one or 1,000 rRNA its out anyways...
 
 #### To do:
-* reverse the order of the merging and trimming step.
+* reverse the order of the merging and trimming step (but again, maybe not...)
 * simplify some syntax / make sure all paths are relative...
 
 
diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
index 36b8297..6fa3c24 100755
--- a/bash_scripts/master_samsa2.slurm
+++ b/bash_scripts/master_samsa2.slurm
@@ -254,7 +254,7 @@ if [ "${Step}" != "RIBODEPLETION" ]
 		for file in $starting_location/step_2_trimming/*cleaned.fastq
 		do
 			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
-			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --num_alignments 0 --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
+			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
 		if [ $? -ne 0 ]
 			then 
 				printf "\t!!! There is a problem in the sortmerna step"

From 58513fc8e1403670d54762ea086322a84ee6eccd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Mon, 3 Dec 2018 10:01:23 -0800
Subject: [PATCH 04/10] improve flow

---
 README.md | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 59417b5..72c0ce0 100644
--- a/README.md
+++ b/README.md
@@ -3,23 +3,21 @@
 ### modification by sebastien.renaut@gmail.com
 
 ### New in my version. 
-There is a new master script (master_samsa2.slurm) with several modifications:
+This is a new master script (master_samsa2.slurm) with several modifications:
 
 * It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond)
-* The script annomatically creates a checkpoint file. Once a step is finished, it writes the name of that specific step in the file. When you run the script again, it will SKIP the steps labelled in checkpoint.
+* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary.
 * In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
 * Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
-* Each step contains an exit statement to be printed if the master script dies.
-* Trimommatic removes adapter contamination according to a specific file
-* All options are to be specified in the first section of the script
-* The script is formated to be run into a SLURM job scheduler, but this can be easily changed / removed.
-* I've removed the --num_alignments 0 is sortmrna. This causes problems and slows things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to one or 1,000 rRNA its out anyways...
+* Each step contains an exit statement to be printed if the master script dies due to an unforseen error.
+* Trimmomatic removes adapter contamination according to a specific fasta file.
+* All options, read & program location are to be specified in the first section of the script.
+* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed.
+* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways...
 
 #### To do:
 * reverse the order of the merging and trimming step (but again, maybe not...)
-* simplify some syntax / make sure all paths are relative...
-
-
+* simplify some syntax / make sure all paths are relative (mostly done)
 
 
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  

From 79e66eede3f488cbc6272c34e10d7fa199e46553 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Mon, 18 Feb 2019 11:01:57 -0800
Subject: [PATCH 05/10] minor

---
 .gitignore                       |  5 +++++
 bash_scripts/master_samsa2.slurm | 18 +++++++++---------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9e8b5a7..c11349b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
+# my files
+bash_scripts/master_samsa2.slurm
+bash_scripts/cyanotoxin_master_samsa2.slurm
+R_scripts/combining_umerged.R
+
 # Swap files
 *.swp
 *.swo
diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
index 6fa3c24..a16d11b 100755
--- a/bash_scripts/master_samsa2.slurm
+++ b/bash_scripts/master_samsa2.slurm
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 #SBATCH --time=0-23:59
-#SBATCH --mem=100000
+#SBATCH --mem=100G
 #SBATCH --account=def-bruneaua
 #SBATCH --nodes=1
 #SBATCH --ntasks-per-node=48
@@ -48,10 +48,10 @@ echo -e "NOTE: Before running this script, user must run package_installation.ba
 # VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
 #
 # 	0. Set starting location:
-starting_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/metagenome_St2
+starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1
 
-#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/hiseq_raw
-sequence_location=/home/renaut/scratch/ATRAPP_Champlain_2016_Metat/sequences/metagenome/St2
+#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped
+sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1
 
 #	1. PEAR
 pear_location=/home/renaut/bin
@@ -100,7 +100,7 @@ fi
 Step=$(grep "GUNZIP" pipeline/checkpoints)
 if [ "${Step}" != "GUNZIP" ]
 	then
-		for file in $sequence_location/hiseq_raw/*gz
+		for file in $sequence_location/zipped/*gz
 			do
 				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
         			gunzip $file -cd >$file_out
@@ -112,8 +112,8 @@ if [ "${Step}" != "GUNZIP" ]
                 		fi
 			done
 		
-		mkdir $sequence_location/hiseq_unzip
-		mv $sequence_location/hiseq_raw/*fastq $sequence_location/hiseq_unzip/.
+		mkdir $sequence_location/unzipped
+		mv $sequence_location/zipped/*fastq $sequence_location/unzipped/.
 		printf "GUNZIP\n" >>pipeline/checkpoints
                 echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
                 date
@@ -139,7 +139,7 @@ if [ "${Step}" != "MERGING" ]
 	then
 		mkdir $starting_location/step_1_merging
 		touch $starting_location/step_1_merging/pear_log
-		for file in $sequence_location/hiseq_unzip/*_R1.fastq
+		for file in $sequence_location/unzipped/*_R1.fastq
 		do
 			file1=$file
 			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
@@ -162,7 +162,7 @@ if [ "${Step}" != "MERGING" ]
 		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
 		
 		rm $starting_location/*fastq
-		rm $sequence_location/hiseq_unzip/*fastq
+		rm $sequence_location/unzipped/*fastq
 
 		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
 		date        

From fb2fc99f0e4f369fa783f476b2322f5caad6422b Mon Sep 17 00:00:00 2001
From: Sebastien Renaut <sebastien.renaut@gmail.com>
Date: Mon, 18 Feb 2019 14:03:14 -0500
Subject: [PATCH 06/10] remove from main

---
 bash_scripts/master_samsa2.slurm | 435 -------------------------------
 1 file changed, 435 deletions(-)
 delete mode 100755 bash_scripts/master_samsa2.slurm

diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
deleted file mode 100755
index a16d11b..0000000
--- a/bash_scripts/master_samsa2.slurm
+++ /dev/null
@@ -1,435 +0,0 @@
-#!/bin/bash
-
-#SBATCH --time=0-23:59
-#SBATCH --mem=100G
-#SBATCH --account=def-bruneaua
-#SBATCH --nodes=1
-#SBATCH --ntasks-per-node=48
-
-module load r/3.5.0
-
-
-# Lines starting with #SBATCH are for SLURM job management systems
-# and may be removed if user is not submitting jobs to SLURM
-####################################################################
-#set -x #echo
-#
-# master_script_for_sample_files.bash
-# Created April 2017 by Sam Westreich, github.com/transcript
-# This version modified by Michelle Treiber on August 9, 2017
-#
-####################################################################
-#
-# This script sets up and runs through ALL steps in the SAMSA pipeline
-# before the analysis (which is done in R, likely in RStudio).  Each
-# step is set up below.
-#
-# The steps are:
-#	0.1	Create the checkpoint
-#	0.2	Gunzip the raw sequences
-#	1.	Merging with PEAR, if applicable
-#	2.	Read cleaning with Trimmomatic
-#	3.	rRNA removal with SortMeRNA
-#	4.	Annotation using DIAMOND against Subsys database
-#	5.	Aggregation using analysis_counter.py (Subsys)
-#   	4.1  	Annotation using DIAMOND against Refseq database
-#   	5.1	Aggregation using analysis_counter.py (Refseq)
-#	6.	Running R scripts to get DESeq statistical analysis. (NOT RUN)
-#
-# NOTE: BEFORE running this script, please run package_installation.bash
-# and full_database_download.bash located at:
-# https://github.com/transcript/samsa2/tree/master/setup in order to set 
-# up SAMSA2 dependencies and download full databases.
-#
-####################################################################
-#
-echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
-#
-# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
-#
-# 	0. Set starting location:
-starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1
-
-#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped
-sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1
-
-#	1. PEAR
-pear_location=/home/renaut/bin
-
-# 	2. Trimmomatic
-trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36
-
-#	3. SortMeRNA
-sortmerna_location=/home/renaut/bin
-
-#	4. DIAMOND
-diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac"
-diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
-diamond_location=/home/renaut/bin
-
-#	5. Aggregation
-python_programs=/home/renaut/samsa2.master/samsa2/python_scripts
-RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
-Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
-
-#	6. R scripts and paths
-export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages"
-R_programs=/home/renaut/samsa2.master/samsa2/R_scripts
-
-#	threads
-threads="48"
-
-##################
-#STEP 0.1: create/read checkpoint
-cd $starting_location
-
-printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
-if [ ! -f "pipeline/checkpoints" ]
-        then
-                printf "\tThe file checkpoints does not exist, we will create it.\n"
-        touch "pipeline/checkpoints"
-else
-        printf "\tThe file checkpoints already exist\n"
-fi
-
-####################################################################
-#
-#STEP 0.2: GUNZIP PROCESS. 
-#
-#
-Step=$(grep "GUNZIP" pipeline/checkpoints)
-if [ "${Step}" != "GUNZIP" ]
-	then
-		for file in $sequence_location/zipped/*gz
-			do
-				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
-        			gunzip $file -cd >$file_out
-
-				if [ $? -ne 0 ]
-                        		then
-                                	printf "\t!!! There is a problem in the gunzip step"
-                                	exit 1
-                		fi
-			done
-		
-		mkdir $sequence_location/unzipped
-		mv $sequence_location/zipped/*fastq $sequence_location/unzipped/.
-		printf "GUNZIP\n" >>pipeline/checkpoints
-                echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
-                date
-else
-        printf  "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 1: MERGING OF PAIRED-END FILES USING PEAR
-# Note: paired-end files are usually named using R1 and R2 in the name.
-#		Example: control_1.R1.fastq
-#				 control_1.R2.fastq
-#
-# Note: if using single-end sequencing, skip this step (comment out).
-# Note: if performing R analysis (step 6), be sure to name files with 
-# 	the appropriate prefix ("control_$file" and "experimental_$file")!
-
-
-
-Step=$(grep "MERGING" pipeline/checkpoints)
-if [ "${Step}" != "MERGING" ]
-	then
-		mkdir $starting_location/step_1_merging
-		touch $starting_location/step_1_merging/pear_log
-		for file in $sequence_location/unzipped/*_R1.fastq
-		do
-			file1=$file
-			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
-			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
-			out_name=`echo ${out_path##*/}`
-			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
-		
-			if [ $? -ne 0 ]
-                        	then
-                                	printf "\t!!! There is a problem in the merging step"
-                                	exit 1
-                	fi
-		done
-		
-		###I will concatenate forward and reverse complement with 20Ns in the middle.
-		$R_programs/combining_umerged.R $starting_location
-		
-		printf "MERGING\n" >>pipeline/checkpoints
-		mkdir $starting_location/step_1_merging/
-		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
-		
-		rm $starting_location/*fastq
-		rm $sequence_location/unzipped/*fastq
-
-		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
-		date        
-else
-        printf  "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 2: CLEANING FILES WITH TRIMMOMATIC
-# Note: if skipping PEAR, make sure that all starting files are in the
-# $starting_location/step_1_output/ folder!
-Step=$(grep "TRIMMING" pipeline/checkpoints)
-if [ "${Step}" != "TRIMMING" ]
-        then
-		mkdir $starting_location/step_2_trimming
-		touch $starting_location/step_2_trimming/trimmomatic_log
-
-	for file in $starting_location/step_1_merging/*assembled2.fastq
-		do
-		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
-
-		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the trimmmomatic step"
-				exit 1
-		fi
-		
-		done
-
-	mkdir $starting_location/step_2_trimming/
-	mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/
-	
-	rm step_1_merging/*assembled2.fastq
-	
-	echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC"
-	date
-	
-	printf "TRIMMING\n" >>pipeline/checkpoints
-else
-        printf  "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 2.5: GETTING RAW SEQUENCES COUNTS
-# Note: These are used later for statistical analysis.
-Step=$(grep "RAW" pipeline/checkpoints)
-if [ "${Step}" != "RAW" ]
-        then
-		if [ -f $starting_location/step_2_trimming/raw_counts.txt ]
-			then
-				rm $starting_location/step_2_trimming/raw_counts.txt
-				touch $starting_location/step_2_trimming/raw_counts.txt
-		else
-			touch $starting_location/step_2_trimming/raw_counts.txt
-		fi	
-
-		for file in $starting_location/step_2_trimming/*cleaned.fastq
-		do
-			python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the raw sequences step"
-				exit 1
-		fi
-		done
-		echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" 
-		date
-
-        printf "RAW\n" >>pipeline/checkpoints
-else
-        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
-# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
-# do that first (see the SortMeRNA user manual for details).
-
-Step=$(grep "RIBODEPLETION" pipeline/checkpoints)
-if [ "${Step}" != "RIBODEPLETION" ]
-        then
-        mkdir $starting_location/step_3_ribodepletion
-		touch $starting_location/step_3_ribodepletion/sortmerna_log
-
-		for file in $starting_location/step_2_trimming/*cleaned.fastq
-		do
-			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
-			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the sortmerna step"
-				exit 1
-		fi
-		done
-
-		mkdir $starting_location/step_3_ribodepletion/
-		mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/
-		
-		rm step_2_trimming/*cleaned.fastq
-		rm step_2_trimming/*ribosomes.fastq
-		
-		echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA"
-		date
-	
-        	printf "RIBODEPLETION\n" >>pipeline/checkpoints
-else
-        	printf  "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
-date
-Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints)
-if [ "${Step}" != "ANNOTATION_SUBSYS" ]
-	then
-		mkdir $starting_location/step_4_annotation
-		touch $starting_location/step_4_annotation/diamond_log
-		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
-		do
-			shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
-			#removed the diamond sensitive option
-			$diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
-			$diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the diamond subsys step"
-				exit 1
-		fi
-		done
-		
-		mkdir $starting_location/step_4_annotation/daa_binary_files/
-	
-		mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/
-		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
-
-		echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS"
-        	date
-
-		printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints
-else
-       	 printf  "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n"
-fi
-##################################################################
-#
-# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER
-Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints)
-if [ "${Step}" != "AGGREGATION_SUBSYS" ]
-	then
-		mkdir $starting_location/step_5_aggregation/
-		for file in $starting_location/step_4_annotation/*subsys_annotated
-		do
-			python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt
-	
-			# This quick program reduces down identical hierarchy annotations
-			python $python_programs/subsys_reducer.py -I $file.hierarchy
-		if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the aggregation subsys step"
-				exit 1
-		fi
-		done
-
-		mkdir $starting_location/step_5_aggregation/Subsystems_results/
-		mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/
-		mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/
-		mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/
-		rm $starting_location/step_4_annotation/*.hierarchy
-
-		echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER"
-		date
-
-        	printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n"
-fi
-
-
-####################################################################
-#
-# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ
-# Note: this step assumes that the DIAMOND database is already built.  If not,
-# do that first before running this step.
-Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints)
-if [ "${Step}" != "ANNOTATION_REFSEQ" ]
-        then
-		touch $starting_location/step_4_annotation/diamond_log
-
-		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
-			do
-				shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
-				#removed the diamond sensitive option
-				$diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
-				$diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
-			if [ $? -ne 0 ]
-			then 
-				printf "\t!!! There is a problem in the diamond refseq step"
-				exit 1
-			fi
-			done
-
-		mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
-		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
-
-		echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ"
-		date
-
-        	printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n"
-fi
-
-####################################################################
-#
-# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER
-Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints)
-if [ "${Step}" != "AGGREGATION_REFSEQ" ]
-	then
-		for file in $starting_location/step_4_annotation/*RefSeq_annotated*
-		do
-			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
-			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
-			if [ $? -ne 0 ]
-				then 
-					printf "\t!!! There is a problem in the aggregation refseq step"
-					exit 1
-			fi
-		done
-
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/
-		mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/
-		mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/
-		mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/
-
-		echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
-		date 
-
-        	printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints
-else
-	        printf  "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n"
-fi
-
-
-
-##################################################################
-#
-# At this point, all the results files are ready for analysis using R.
-# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
-# and Subsystems annotations.
-#
-# More complex R analyses may be performed using specific .sh analysis scripts.
-#
-# STEP 6: R ANALYSIS
-# Note: For R to properly identify files to compare/contrast, they must include 
-# the appropriate prefix (either "control_$file" or experimental_$file")!
-
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
-#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt
-
-echo "STEP 6 DONE: R ANALYSIS"
-echo "Master bash script finished running at: "; date
-exit
-####################################################################
-

From f68ba12b7bd1a88d9830c21e1c012cc2b6eec3a1 Mon Sep 17 00:00:00 2001
From: Sebastien Renaut <sebastien.renaut@gmail.com>
Date: Mon, 18 Feb 2019 14:07:22 -0500
Subject: [PATCH 07/10] delete from main branch

---
 R_scripts/combining_umerged.R | 61 -----------------------------------
 1 file changed, 61 deletions(-)
 delete mode 100755 R_scripts/combining_umerged.R

diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R
deleted file mode 100755
index c04f28c..0000000
--- a/R_scripts/combining_umerged.R
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript
-
-args = commandArgs(TRUE)
-start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself.
-#start="/Users/jerry/Documents/CSBQ/shapiro/results"
-
-setwd(start)
-
-###files to work with 
-system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = ""))
-system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = ""))
-system(paste("ls -1 *.assembled.fastq >assembled.files",sep = ""))
-
-files_f = read.table("umerged.forward.files",stringsAsFactors = F)      
-files_r = read.table("umerged.reverse.files",stringsAsFactors = F) 
-files_a = read.table("assembled.files",stringsAsFactors = F) 
-
-
-for(i in 1:nrow(files_f))      
-  {
-#grep 1st and 2rd lines 
-  	unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="")
-  	system(unassembled.fastq)
-
-#grep sequences forward...
-  	unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="")
-  	system(unassembled.forward.fastq)
-
-#grep sequences reverse...
-  	unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="")
-  	system(unassembled.reverse.fastq)
-
-#N and E quality file
- 	system("wc -l unassembled.seq.reverse.fastq >wc")
- 	wc = read.table("wc")
-  
- 	write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F)
-  
-#paste the unassembled sequences into a single file
-	system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN")
-
-#put everything back into a single fastq
-	back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "")
-	system(back)
-
-#add the merged sequences
-	all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "")
-	system(all)
-
-#remove the clutter (file specific)
-	remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq")
-	system(remove)
-}
-
-#remove the clutter (listing of all files)
-system("rm assembled.files umerged.forward.files umerged.reverse.files")
-
-
-
-
-

From ceba37eea48f45f4bcc7171066d2c43af7aa2a5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Mon, 18 Feb 2019 11:13:35 -0800
Subject: [PATCH 08/10] add a checkpoint file

---
 README.md                     | 20 --------------------
 bash_scripts/master_script.sh | 23 +++++++++++++++++++++++
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 72c0ce0..2ae3d5f 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,5 @@
 # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline
 
-### modification by sebastien.renaut@gmail.com
-
-### New in my version. 
-This is a new master script (master_samsa2.slurm) with several modifications:
-
-* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond)
-* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary.
-* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
-* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
-* Each step contains an exit statement to be printed if the master script dies due to an unforseen error.
-* Trimmomatic removes adapter contamination according to a specific fasta file.
-* All options, read & program location are to be specified in the first section of the script.
-* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed.
-* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways...
-
-#### To do:
-* reverse the order of the merging and trimming step (but again, maybe not...)
-* simplify some syntax / make sure all paths are relative (mostly done)
-
-
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  
 
 ### New in version 2:
diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh
index 5f8ff5e..cd6ac6c 100644
--- a/bash_scripts/master_script.sh
+++ b/bash_scripts/master_script.sh
@@ -72,8 +72,26 @@ else
 fi
 
 ####################################################################
+#STEP 0.1: create/read checkpoint
+
+printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
+if [ ! -f "$INPUT_DIR/checkpoints" ]
+        then
+                printf "\tThe file checkpoints does not exist, we will create it.\n"
+        touch "$INPUT_DIR/checkpoints"
+else
+        printf "\tThe file checkpoints already exist\n"
+fi
+
+####################################################################
+
+
 #
 # STEP 1: CLEANING FILES WITH TRIMMOMATIC
+Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "TRIMMO" ]
+        then
+
 
 if ls $INPUT_DIR/*.gz &>/dev/null; then
   for file in $INPUT_DIR/*.gz
@@ -105,6 +123,11 @@ else
   mv $INPUT_DIR/*".cleaned" $STEP_1
 fi
 
+else
+        printf  "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will then be passed\n"
+fi
+
+
 ####################################################################
 #
 # STEP 2: MERGING OF PAIRED-END FILES USING PEAR

From bf29e7888a2b08872837c5cbb50e923abd89d711 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Mon, 18 Feb 2019 11:26:35 -0800
Subject: [PATCH 09/10] checkpoint for all steps

---
 bash_scripts/master_script.sh | 74 +++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh
index cd6ac6c..f6ef74e 100644
--- a/bash_scripts/master_script.sh
+++ b/bash_scripts/master_script.sh
@@ -134,6 +134,10 @@ fi
 # Note: paired-end files are usually named using R1 and R2 in the name.
 #       Example: control_1.R1.fastq
 #                control_1.R2.fastq
+Step=$(grep "MERGING" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "MERGING" ]
+        then
+
 
 $MKDIR $STEP_2
 if $paired; then
@@ -151,10 +155,19 @@ else
   done
 fi
 
+printf "MERGING\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable MERGING is in the checkpoint file. STEP 2 will then be passed\n"
+fi
+
 ####################################################################
 #
 # STEP 2.9: GETTING RAW SEQUENCES COUNTS
 # Note: These are used later for statistical analysis.
+Step=$(grep "RAW" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "RAW" ]
+        then
 
 if [[ -f $STEP_2/raw_counts.txt ]]; then
     rm $STEP_2/raw_counts.txt
@@ -173,11 +186,20 @@ else
   done
 fi
 
+printf "RAW\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
+fi
+
 ####################################################################
 #
 # STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
 # Note: this step assumes that the SortMeRNA databases are indexed.  If not,
 # do that first (see the SortMeRNA user manual for details).
+Step=$(grep "RIBO" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "RIBO" ]
+        then
 
 for file in $STEP_2/*.assembled.fastq
 do
@@ -191,11 +213,20 @@ done
 $MKDIR $STEP_3
 mv $STEP_2/*ribodepleted* $STEP_3
 
+printf "RIBO\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable RIBO is in the checkpoint file. STEP 3 will then be passed\n"
+fi
+
 ####################################################################
 #
 # STEP 4: ANNOTATING WITH DIAMOND AGAINST REFSEQ
 # Note: this step assumes that the DIAMOND database is already built.  If not,
 # do that first before running this step.
+Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "REFSEQ_ANNOT" ]
+        then
 
 echo "Now starting on DIAMOND org annotations at: "; date
 
@@ -215,9 +246,18 @@ mv $STEP_3/*.daa $STEP_4/daa_binary_files
 
 echo "RefSeq DIAMOND annotations completed at: "; date
 
+printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will then be passed\n"
+fi
+
 ####################################################################
 #
 # STEP 5: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "REFSEQ_AGGREG" ]
+        then
 
 for file in $STEP_4/*RefSeq_annotated
 do
@@ -230,9 +270,18 @@ $MKDIR $STEP_5/RefSeq_results/func_results
 mv $STEP_4/*organism.tsv $STEP_5/RefSeq_results/org_results
 mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results
 
+printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will then be passed\n"
+fi
+
 ####################################################################
 #
 # STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
+Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "SUBSYS_ANNOT" ]
+        then
 
 echo "Now starting on DIAMOND Subsystems annotations at: "; date
 
@@ -249,9 +298,18 @@ mv $STEP_3/*.daa $STEP_4/daa_binary_files
 
 echo "DIAMOND Subsystems annotations completed at: "; date
 
+printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will then be passed\n"
+fi
+
 ##################################################################
 #
 # STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER
+Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "SUBSYS_AGGREG" ]
+        then
 
 for file in $STEP_4/*subsys_annotated
 do
@@ -267,6 +325,12 @@ mv $STEP_4/*.reduced $STEP_5/Subsystems_results
 mv $STEP_4/*.receipt $STEP_5/Subsystems_results/receipts
 rm $STEP_4/*.hierarchy
 
+printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will then be passed\n"
+fi
+
 ##################################################################
 #
 # At this point, all the results files are ready for analysis using R.
@@ -278,6 +342,9 @@ rm $STEP_4/*.hierarchy
 # STEP 6: R ANALYSIS
 # Note: For R to properly identify files to compare/contrast, they must include
 # the appropriate prefix (either "control_$file" or experimental_$file")!
+Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "R_ANALYSIS" ]
+        then
 
 checked Rscript $R_DIR/run_DESeq_stats.R \
   -I $STEP_5/RefSeq_results/org_results \
@@ -292,6 +359,13 @@ checked Rscript $R_DIR/Subsystems_DESeq_stats.R \
   -O Subsystems_level-1_DESeq_results.tab -L 1 \
   -R $STEP_2/raw_counts.txt
 
+printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints
+
+else
+        printf  "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will then be passed\n"
+fi
+
 echo "Master bash script finished running at: "; date
 exit
 ####################################################################
+

From c7474db08cc89320ae587fc0c7def606508f2063 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Renaut?=
 <renaut@cedar1.cedar.computecanada.ca>
Date: Mon, 4 Mar 2019 12:07:28 -0800
Subject: [PATCH 10/10] adding my own scripts to the forked samsa2 pipeline

---
 .gitignore                                  |   6 +-
 README.md                                   |  20 +
 R_scripts/combining_umerged.R               |  61 +++
 bash_scripts/cyanotoxin_master_samsa2.slurm | 505 ++++++++++++++++++++
 bash_scripts/master_samsa2.slurm            | 435 +++++++++++++++++
 5 files changed, 1024 insertions(+), 3 deletions(-)
 create mode 100755 R_scripts/combining_umerged.R
 create mode 100755 bash_scripts/cyanotoxin_master_samsa2.slurm
 create mode 100755 bash_scripts/master_samsa2.slurm

diff --git a/.gitignore b/.gitignore
index c11349b..30b79f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 # my files
-bash_scripts/master_samsa2.slurm
-bash_scripts/cyanotoxin_master_samsa2.slurm
-R_scripts/combining_umerged.R
+#bash_scripts/master_samsa2.slurm
+#bash_scripts/cyanotoxin_master_samsa2.slurm
+#R_scripts/combining_umerged.R
 
 # Swap files
 *.swp
diff --git a/README.md b/README.md
index 2ae3d5f..72c0ce0 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,25 @@
 # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline
 
+### modification by sebastien.renaut@gmail.com
+
+### New in my version. 
+This is a new master script (master_samsa2.slurm) with several modifications:
+
+* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond)
+* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary.
+* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
+* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
+* Each step contains an exit statement to be printed if the master script dies due to an unforseen error.
+* Trimmomatic removes adapter contamination according to a specific fasta file.
+* All options, read & program location are to be specified in the first section of the script.
+* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed.
+* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways...
+
+#### To do:
+* reverse the order of the merging and trimming step (but again, maybe not...)
+* simplify some syntax / make sure all paths are relative (mostly done)
+
+
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  
 
 ### New in version 2:
diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R
new file mode 100755
index 0000000..c04f28c
--- /dev/null
+++ b/R_scripts/combining_umerged.R
@@ -0,0 +1,61 @@
+#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript
+
+args = commandArgs(TRUE)
+start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself.
+#start="/Users/jerry/Documents/CSBQ/shapiro/results"
+
+setwd(start)
+
+###files to work with 
+system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = ""))
+system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = ""))
+system(paste("ls -1 *.assembled.fastq >assembled.files",sep = ""))
+
+files_f = read.table("umerged.forward.files",stringsAsFactors = F)      
+files_r = read.table("umerged.reverse.files",stringsAsFactors = F) 
+files_a = read.table("assembled.files",stringsAsFactors = F) 
+
+
+for(i in 1:nrow(files_f))      
+  {
+#grep 1st and 2rd lines 
+  	unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="")
+  	system(unassembled.fastq)
+
+#grep sequences forward...
+  	unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="")
+  	system(unassembled.forward.fastq)
+
+#grep sequences reverse...
+  	unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="")
+  	system(unassembled.reverse.fastq)
+
+#N and E quality file
+ 	system("wc -l unassembled.seq.reverse.fastq >wc")
+ 	wc = read.table("wc")
+  
+ 	write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F)
+  
+#paste the unassembled sequences into a single file
+	system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN")
+
+#put everything back into a single fastq
+	back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "")
+	system(back)
+
+#add the merged sequences
+	all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "")
+	system(all)
+
+#remove the clutter (file specific)
+	remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq")
+	system(remove)
+}
+
+#remove the clutter (listing of all files)
+system("rm assembled.files umerged.forward.files umerged.reverse.files")
+
+
+
+
+
diff --git a/bash_scripts/cyanotoxin_master_samsa2.slurm b/bash_scripts/cyanotoxin_master_samsa2.slurm
new file mode 100755
index 0000000..2e278da
--- /dev/null
+++ b/bash_scripts/cyanotoxin_master_samsa2.slurm
@@ -0,0 +1,505 @@
+#!/bin/bash
+
+#SBATCH --time=0-07:59
+#SBATCH --mem=40G
+#SBATCH --account=def-bruneaua
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=12
+
+module load r/3.5.0
+
+
+# Lines starting with #SBATCH are for SLURM job management systems
+# and may be removed if user is not submitting jobs to SLURM
+####################################################################
+#set -x #echo
+#
+# master_script_for_sample_files.bash
+# Created April 2017 by Sam Westreich, github.com/transcript
+# This version modified by Michelle Treiber on August 9, 2017
+#
+####################################################################
+#
+# This script sets up and runs through ALL steps in the SAMSA pipeline
+# before the analysis (which is done in R, likely in RStudio).  Each
+# step is set up below.
+#
+# The steps are:
+#	0.1	Create the checkpoint
+#	0.2	Gunzip the raw sequences
+#	1.	Merging with PEAR, if applicable
+#	2.	Read cleaning with Trimmomatic
+#	3.	rRNA removal with SortMeRNA
+#	4.	Annotation using DIAMOND against Subsys database
+#	5.	Aggregation using analysis_counter.py (Subsys)
+#   	4.1  	Annotation using DIAMOND against Refseq database
+#   	5.1	Aggregation using analysis_counter.py (Refseq)
+#	6.	Running R scripts to get DESeq statistical analysis. (NOT RUN)
+#
+# NOTE: BEFORE running this script, please run package_installation.bash
+# and full_database_download.bash located at:
+# https://github.com/transcript/samsa2/tree/master/setup in order to set 
+# up SAMSA2 dependencies and download full databases.
+#
+####################################################################
+#
+echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
+#
+# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
+#
+# 	0. Set starting location:
+starting_location=/home/renaut/scratch/ATRAPP/ATRAPP_Champlain_2016_Metat/St1
+
+#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/gzipped
+sequence_location=/home/renaut/scratch/ATRAPP/sequences/ATRAPP_Champlain_2016_Metat/St1
+
+#	1. PEAR
+pear_location=/home/renaut/bin
+
+# 	2. Trimmomatic
+trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36
+
+#	3. SortMeRNA
+sortmerna_location=/home/renaut/bin
+
+#	4. DIAMOND
+diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac"
+diamond_cyanotoxin="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1"
+diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
+diamond_location=/home/renaut/bin
+
+#	5. Aggregation
+python_programs=/home/renaut/samsa2.master/samsa2/python_scripts
+cyano_db="/home/renaut/scratch/blast_database/Cyanotoxin_Database/New_Toxin_Database_OneLine_1.faa2"
+RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
+Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
+
+#	6. R scripts and paths
+export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages"
+R_programs=/home/renaut/samsa2.master/samsa2/R_scripts
+
+#	threads
+threads="12"
+
+##################
+#STEP 0.1: create/read checkpoint
+cd $starting_location
+
+printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
+if [ ! -f "pipeline/checkpoints" ]
+        then
+                printf "\tThe file checkpoints does not exist, we will create it.\n"
+        touch "pipeline/checkpoints"
+else
+        printf "\tThe file checkpoints already exist\n"
+fi
+
+####################################################################
+#
+#STEP 0.2: GUNZIP PROCESS. 
+#
+#
+Step=$(grep "GUNZIP" pipeline/checkpoints)
+if [ "${Step}" != "GUNZIP" ]
+	then
+		for file in $sequence_location/zipped/*gz
+			do
+				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
+        			gunzip $file -cd >$file_out
+
+				if [ $? -ne 0 ]
+                        		then
+                                	printf "\t!!! There is a problem in the gunzip step"
+                                	exit 1
+                		fi
+			done
+		
+		mkdir $sequence_location/unzipped
+		mv $sequence_location/zipped/*fastq $sequence_location/unzipped/.
+		printf "GUNZIP\n" >>pipeline/checkpoints
+                echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
+                date
+else
+        printf  "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 1: MERGING OF PAIRED-END FILES USING PEAR
+# Note: paired-end files are usually named using R1 and R2 in the name.
+#		Example: control_1.R1.fastq
+#				 control_1.R2.fastq
+#
+# Note: if using single-end sequencing, skip this step (comment out).
+# Note: if performing R analysis (step 6), be sure to name files with 
+# 	the appropriate prefix ("control_$file" and "experimental_$file")!
+
+
+
+Step=$(grep "MERGING" pipeline/checkpoints)
+if [ "${Step}" != "MERGING" ]
+	then
+		mkdir $starting_location/step_1_merging
+		touch $starting_location/step_1_merging/pear_log
+		for file in $sequence_location/unzipped/*_R1.fastq
+		do
+			file1=$file
+			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
+			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
+			out_name=`echo ${out_path##*/}`
+			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
+		
+			if [ $? -ne 0 ]
+                        	then
+                                	printf "\t!!! There is a problem in the merging step"
+                                	exit 1
+                	fi
+		done
+		
+		###I will concatenate forward and reverse complement with 20Ns in the middle.
+		$R_programs/combining_umerged.R $starting_location
+		
+		printf "MERGING\n" >>pipeline/checkpoints
+		mkdir $starting_location/step_1_merging/
+		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
+		
+		rm $starting_location/*fastq
+		rm $sequence_location/unzipped/*fastq
+
+		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
+		date        
+else
+        printf  "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2: CLEANING FILES WITH TRIMMOMATIC
+# Note: if skipping PEAR, make sure that all starting files are in the
+# $starting_location/step_1_output/ folder!
+Step=$(grep "TRIMMING" pipeline/checkpoints)
+if [ "${Step}" != "TRIMMING" ]
+        then
+		mkdir $starting_location/step_2_trimming
+		touch $starting_location/step_2_trimming/trimmomatic_log
+
+	for file in $starting_location/step_1_merging/*assembled2.fastq
+		do
+		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
+
+		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the trimmmomatic step"
+				exit 1
+		fi
+		
+		done
+
+	mkdir $starting_location/step_2_trimming/
+	mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/
+	
+	rm step_1_merging/*assembled2.fastq
+	
+	echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC"
+	date
+	
+	printf "TRIMMING\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2.5: GETTING RAW SEQUENCES COUNTS
+# Note: These are used later for statistical analysis.
+Step=$(grep "RAW" pipeline/checkpoints)
+if [ "${Step}" != "RAW" ]
+        then
+		if [ -f $starting_location/step_2_trimming/raw_counts.txt ]
+			then
+				rm $starting_location/step_2_trimming/raw_counts.txt
+				touch $starting_location/step_2_trimming/raw_counts.txt
+		else
+			touch $starting_location/step_2_trimming/raw_counts.txt
+		fi	
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the raw sequences step"
+				exit 1
+		fi
+		done
+		echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" 
+		date
+
+        printf "RAW\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
+# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
+# do that first (see the SortMeRNA user manual for details).
+
+Step=$(grep "RIBODEPLETION" pipeline/checkpoints)
+if [ "${Step}" != "RIBODEPLETION" ]
+        then
+        mkdir $starting_location/step_3_ribodepletion
+		touch $starting_location/step_3_ribodepletion/sortmerna_log
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
+			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the sortmerna step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_3_ribodepletion/
+		mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/
+		
+		rm step_2_trimming/*cleaned.fastq
+		rm step_2_trimming/*ribosomes.fastq
+		
+		echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA"
+		date
+	
+        	printf "RIBODEPLETION\n" >>pipeline/checkpoints
+else
+        	printf  "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
+date
+Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_4_annotation
+		touch $starting_location/step_4_annotation/diamond_log
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+		do
+			shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
+			#removed the diamond sensitive option
+			$diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+			$diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond subsys step"
+				exit 1
+		fi
+		done
+		
+		mkdir $starting_location/step_4_annotation/daa_binary_files/
+	
+		mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS"
+        	date
+
+		printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints
+else
+       	 printf  "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n"
+fi
+##################################################################
+#
+# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER
+Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_5_aggregation/
+		for file in $starting_location/step_4_annotation/*subsys_annotated
+		do
+			python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt
+	
+			# This quick program reduces down identical hierarchy annotations
+			python $python_programs/subsys_reducer.py -I $file.hierarchy
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the aggregation subsys step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/
+		mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		rm $starting_location/step_4_annotation/*.hierarchy
+
+		echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER"
+		date
+
+        	printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n"
+fi
+
+
+####################################################################
+#
+# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ
+# Note: this step assumes that the DIAMOND database is already built.  If not,
+# do that first before running this step.
+Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_REFSEQ" ]
+        then
+		touch $starting_location/step_4_annotation/diamond_log
+
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+			do
+				shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
+				#removed the diamond sensitive option
+				$diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+				$diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+			if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond refseq step"
+				exit 1
+			fi
+			done
+
+		mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ"
+		date
+
+        	printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 4.2: ANNOTATING WITH DIAMOND AGAINST CYANO
+# Note: this step assumes that the DIAMOND database is already built.  If not,
+# do that first before running this step.
+Step=$(grep "ANNOTATION_CYANO" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_CYANO" ]
+        then
+                touch $starting_location/step_4_annotation/diamond_cyano_log
+
+                for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+                        do
+                                shortname=`echo $file | awk -F "ribodepleted" '{print $1 "cyano_annotated"}'`
+                                #removed the diamond sensitive option
+                                $diamond_location/diamond blastx --db $diamond_cyanotoxin -q $file -p $threads -a $file.cyano -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_cyano_log
+                                $diamond_location/diamond view --daa $file.cyano.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_cyano_log
+                        if [ $? -ne 0 ]
+                        then
+                                printf "\t!!! There is a problem in the diamond cyano step"
+                                exit 1
+                        fi
+                        done
+
+                mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
+                mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+                echo "STEP 4.2 DONE:ANNOTATING WITH DIAMOND AGAINST CYANO"
+                date
+
+                printf "ANNOTATION_CYANO\n" >>pipeline/checkpoints
+else
+                printf  "\tThe variable ANNOTATION_CYANO is in the checkpoint file. STEP 4.1 will then be passed\n"
+fi
+
+
+####################################################################
+#
+# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_REFSEQ" ]
+	then
+		for file in $starting_location/step_4_annotation/*RefSeq_annotated*
+		do
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
+			if [ $? -ne 0 ]
+				then 
+					printf "\t!!! There is a problem in the aggregation refseq step"
+					exit 1
+			fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/
+		mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/
+
+		echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
+		date 
+
+        	printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n"
+fi
+
+
+####################################################################
+#
+# STEP 5.2: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "AGGREGATION_CYANO" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_CYANO" ]
+        then
+                for file in $starting_location/step_4_annotation/*cyano_annotated*
+                do
+                        python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -O
+                        python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $cyano_db -F
+                        if [ $? -ne 0 ]
+                                then
+                                        printf "\t!!! There is a problem in the aggregation CYANO step"
+                                        exit 1
+                        fi
+                done
+
+                mkdir $starting_location/step_5_aggregation/cyano_results/
+                mkdir $starting_location/step_5_aggregation/cyano_results/org_results/
+                mkdir $starting_location/step_5_aggregation/cyano_results/func_results/
+                mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/cyano_results/org_results/
+                mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/cyano_results/func_results/
+
+                echo "STEP 5.2 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
+                date
+
+                printf "AGGREGATION_CYANO\n" >>pipeline/checkpoints
+else
+                printf  "\tThe variable AGGREGATION_CYANO is in the checkpoint file. STEP 5.2 will then be passed\n"
+fi
+
+
+
+
+##################################################################
+#
+# At this point, all the results files are ready for analysis using R.
+# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
+# and Subsystems annotations.
+#
+# More complex R analyses may be performed using specific .sh analysis scripts.
+#
+# STEP 6: R ANALYSIS
+# Note: For R to properly identify files to compare/contrast, they must include 
+# the appropriate prefix (either "control_$file" or experimental_$file")!
+
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt
+
+echo "STEP 6 DONE: R ANALYSIS"
+echo "Master bash script finished running at: "; date
+exit
+####################################################################
+
diff --git a/bash_scripts/master_samsa2.slurm b/bash_scripts/master_samsa2.slurm
new file mode 100755
index 0000000..a16d11b
--- /dev/null
+++ b/bash_scripts/master_samsa2.slurm
@@ -0,0 +1,435 @@
+#!/bin/bash
+
+#SBATCH --time=0-23:59
+#SBATCH --mem=100G
+#SBATCH --account=def-bruneaua
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=48
+
+module load r/3.5.0
+
+
+# Lines starting with #SBATCH are for SLURM job management systems
+# and may be removed if user is not submitting jobs to SLURM
+####################################################################
+#set -x #echo
+#
+# master_script_for_sample_files.bash
+# Created April 2017 by Sam Westreich, github.com/transcript
+# This version modified by Michelle Treiber on August 9, 2017
+#
+####################################################################
+#
+# This script sets up and runs through ALL steps in the SAMSA pipeline
+# before the analysis (which is done in R, likely in RStudio).  Each
+# step is set up below.
+#
+# The steps are:
+#	0.1	Create the checkpoint
+#	0.2	Gunzip the raw sequences
+#	1.	Merging with PEAR, if applicable
+#	2.	Read cleaning with Trimmomatic
+#	3.	rRNA removal with SortMeRNA
+#	4.	Annotation using DIAMOND against Subsys database
+#	5.	Aggregation using analysis_counter.py (Subsys)
+#   	4.1  	Annotation using DIAMOND against Refseq database
+#   	5.1	Aggregation using analysis_counter.py (Refseq)
+#	6.	Running R scripts to get DESeq statistical analysis. (NOT RUN)
+#
+# NOTE: BEFORE running this script, please run package_installation.bash
+# and full_database_download.bash located at:
+# https://github.com/transcript/samsa2/tree/master/setup in order to set 
+# up SAMSA2 dependencies and download full databases.
+#
+####################################################################
+#
+echo -e "NOTE: Before running this script, user must run package_installation.bash and full_database_download.bash located at https://github.com/transcript/samsa2/tree/master/setup in order to set up SAMSA2 dependencies.\n\n"
+#
+# VARIABLES - Set pathway for starting_location to location of samsa2 GitHub download
+#
+# 	0. Set starting location:
+starting_location=/home/renaut/scratch/ATRAPP/Metagenomic_Champlain_long_term/St1
+
+#	0.5 Sequences location (note that sequences actually need to be stored in directory called $sequence_location/zipped
+sequence_location=/home/renaut/scratch/ATRAPP/sequences/Metagenomic_Champlain_long_term/St1
+
+#	1. PEAR
+pear_location=/home/renaut/bin
+
+# 	2. Trimmomatic
+trimmomatic_location=/home/renaut/trinityrnaseq-Trinity-v2.8.4/trinity-plugins/Trimmomatic-0.36
+
+#	3. SortMeRNA
+sortmerna_location=/home/renaut/bin
+
+#	4. DIAMOND
+diamond_database="/home/renaut/scratch/blast_database/refseq/RefSeq_bac"
+diamond_subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db"
+diamond_location=/home/renaut/bin
+
+#	5. Aggregation
+python_programs=/home/renaut/samsa2.master/samsa2/python_scripts
+RefSeq_db="/home/renaut/scratch/blast_database/refseq/RefSeq_bac.fa"
+Subsys_db="/home/renaut/scratch/blast_database/subsys/subsys_db.fa"
+
+#	6. R scripts and paths
+export R_LIBS="/home/renaut/samsa2.master/samsa2/R_scripts/packages"
+R_programs=/home/renaut/samsa2.master/samsa2/R_scripts
+
+#	threads
+threads="48"
+
+##################
+#STEP 0.1: create/read checkpoint
+cd $starting_location
+
+printf "\nStep 0.1: Checking for the presence of the checkpoint file\n"
+if [ ! -f "pipeline/checkpoints" ]
+        then
+                printf "\tThe file checkpoints does not exist, we will create it.\n"
+        touch "pipeline/checkpoints"
+else
+        printf "\tThe file checkpoints already exist\n"
+fi
+
+####################################################################
+#
+#STEP 0.2: GUNZIP PROCESS. 
+#
+#
+Step=$(grep "GUNZIP" pipeline/checkpoints)
+if [ "${Step}" != "GUNZIP" ]
+	then
+		for file in $sequence_location/zipped/*gz
+			do
+				file_out=`echo $file | awk -F ".gz" '{print $1}'`	
+        			gunzip $file -cd >$file_out
+
+				if [ $? -ne 0 ]
+                        		then
+                                	printf "\t!!! There is a problem in the gunzip step"
+                                	exit 1
+                		fi
+			done
+		
+		mkdir $sequence_location/unzipped
+		mv $sequence_location/zipped/*fastq $sequence_location/unzipped/.
+		printf "GUNZIP\n" >>pipeline/checkpoints
+                echo "STEP 0.1 DONE:GUNZIP OF PAIRED-END FILES USING GUNZIP"
+                date
+else
+        printf  "\tThe variable GUNZIP is in the checkpoint file. STEP 0.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 1: MERGING OF PAIRED-END FILES USING PEAR
+# Note: paired-end files are usually named using R1 and R2 in the name.
+#		Example: control_1.R1.fastq
+#				 control_1.R2.fastq
+#
+# Note: if using single-end sequencing, skip this step (comment out).
+# Note: if performing R analysis (step 6), be sure to name files with 
+# 	the appropriate prefix ("control_$file" and "experimental_$file")!
+
+
+
+Step=$(grep "MERGING" pipeline/checkpoints)
+if [ "${Step}" != "MERGING" ]
+	then
+		mkdir $starting_location/step_1_merging
+		touch $starting_location/step_1_merging/pear_log
+		for file in $sequence_location/unzipped/*_R1.fastq
+		do
+			file1=$file
+			file2=`echo $file1 | awk -F "R1" '{print $1 "R2" $2}'`
+			out_path=`echo $file | awk -F "R1" '{print $1 "merged"}'`
+			out_name=`echo ${out_path##*/}`
+			$pear_location/pear -f $file1 -j $threads -r $file2 -o $out_name 1>>$starting_location/step_1_merging/pear_log
+		
+			if [ $? -ne 0 ]
+                        	then
+                                	printf "\t!!! There is a problem in the merging step"
+                                	exit 1
+                	fi
+		done
+		
+		###I will concatenate forward and reverse complement with 20Ns in the middle.
+		$R_programs/combining_umerged.R $starting_location
+		
+		printf "MERGING\n" >>pipeline/checkpoints
+		mkdir $starting_location/step_1_merging/
+		mv $starting_location/*assembled2.fastq $starting_location/step_1_merging/
+		
+		rm $starting_location/*fastq
+		rm $sequence_location/unzipped/*fastq
+
+		echo "STEP 1 DONE:MERGING OF PAIRED-END FILES USING PEAR"
+		date        
+else
+        printf  "\tThe variable MERGING is in the checkpoint file. STEP 1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2: CLEANING FILES WITH TRIMMOMATIC
+# Note: if skipping PEAR, make sure that all starting files are in the
+# $starting_location/step_1_output/ folder!
+Step=$(grep "TRIMMING" pipeline/checkpoints)
+if [ "${Step}" != "TRIMMING" ]
+        then
+		mkdir $starting_location/step_2_trimming
+		touch $starting_location/step_2_trimming/trimmomatic_log
+
+	for file in $starting_location/step_1_merging/*assembled2.fastq
+		do
+		shortname=`echo $file | awk -F "assembled2.fastq" '{print $1 "cleaned.fastq"}'`
+
+		java -jar $trimmomatic_location/trimmomatic-0.36.jar SE -threads $threads -phred33 $file $shortname ILLUMINACLIP:$trimmomatic_location/adapters/all_primers.fa:2:30:10 SLIDINGWINDOW:4:15 MINLEN:59 2>>$starting_location/step_2_trimming/trimmomatic_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the trimmmomatic step"
+				exit 1
+		fi
+		
+		done
+
+	mkdir $starting_location/step_2_trimming/
+	mv $starting_location/step_1_merging/*cleaned.fastq $starting_location/step_2_trimming/
+	
+	rm step_1_merging/*assembled2.fastq
+	
+	echo "STEP 2 DONE:CLEANING FILES WITH TRIMMOMATIC"
+	date
+	
+	printf "TRIMMING\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable TRIMMING is in the checkpoint file. STEP 2 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 2.5: GETTING RAW SEQUENCES COUNTS
+# Note: These are used later for statistical analysis.
+Step=$(grep "RAW" pipeline/checkpoints)
+if [ "${Step}" != "RAW" ]
+        then
+		if [ -f $starting_location/step_2_trimming/raw_counts.txt ]
+			then
+				rm $starting_location/step_2_trimming/raw_counts.txt
+				touch $starting_location/step_2_trimming/raw_counts.txt
+		else
+			touch $starting_location/step_2_trimming/raw_counts.txt
+		fi	
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			python $python_programs/raw_read_counter.py -I $file -O $starting_location/step_2_trimming/raw_counts.txt
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the raw sequences step"
+				exit 1
+		fi
+		done
+		echo "STEP 2.5:GETTING RAW SEQUENCES COUNTS" 
+		date
+
+        printf "RAW\n" >>pipeline/checkpoints
+else
+        printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
+# Note: this step assumes that the SortMeRNA databases are indexed.  If not,
+# do that first (see the SortMeRNA user manual for details).
+
+Step=$(grep "RIBODEPLETION" pipeline/checkpoints)
+if [ "${Step}" != "RIBODEPLETION" ]
+        then
+        mkdir $starting_location/step_3_ribodepletion
+		touch $starting_location/step_3_ribodepletion/sortmerna_log
+
+		for file in $starting_location/step_2_trimming/*cleaned.fastq
+		do
+			shortname=`echo $file | awk -F "cleaned" '{print $1 "ribodepleted"}'`
+			$sortmerna_location/sortmerna -a $threads --ref $sortmerna_location/../sortmerna-2.1/rRNA_databases/silva-bac-16s-id90.fasta,$sortmerna_location/../sortmerna-2.1/index/silva-bac-16s --reads $file --aligned $file.ribosomes --other $shortname --fastx --log -v 1>>$starting_location/step_3_ribodepletion/sortmerna_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the sortmerna step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_3_ribodepletion/
+		mv $starting_location/step_2_trimming/*ribodepleted* $starting_location/step_3_ribodepletion/
+		
+		rm step_2_trimming/*cleaned.fastq
+		rm step_2_trimming/*ribosomes.fastq
+		
+		echo "STEP 3 DONE:REMOVING RIBOSOMAL READS WITH SORTMERNA"
+		date
+	
+        	printf "RIBODEPLETION\n" >>pipeline/checkpoints
+else
+        	printf  "\tThe variable RIBODEPLETION is in the checkpoint file. STEP 3 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 4: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
+date
+Step=$(grep "ANNOTATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_4_annotation
+		touch $starting_location/step_4_annotation/diamond_log
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+		do
+			shortname=`echo $file | awk -F "ribodepleted" '{print $1 "subsys_annotated"}'`
+			#removed the diamond sensitive option
+			$diamond_location/diamond blastx --db $diamond_subsys_db -q $file -a $file.Subsys -p $threads -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+			$diamond_location/diamond view --daa $file.Subsys.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond subsys step"
+				exit 1
+		fi
+		done
+		
+		mkdir $starting_location/step_4_annotation/daa_binary_files/
+	
+		mv $starting_location/step_3_ribodepletion/*subsys_annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4 DONE: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS"
+        	date
+
+		printf "ANNOTATION_SUBSYS\n" >>pipeline/checkpoints
+else
+       	 printf  "\tThe variable ANNOTATION_SUBSYS is in the checkpoint file. STEP 4 will then be passed\n"
+fi
+##################################################################
+#
+# STEP 5: PYTHON SUBSYSTEMS ANALYSIS COUNTER
+Step=$(grep "AGGREGATION_SUBSYS" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_SUBSYS" ]
+	then
+		mkdir $starting_location/step_5_aggregation/
+		for file in $starting_location/step_4_annotation/*subsys_annotated
+		do
+			python $python_programs/DIAMOND_subsystems_analysis_counter.py -I $file -D $Subsys_db -O $file.hierarchy -P $file.receipt
+	
+			# This quick program reduces down identical hierarchy annotations
+			python $python_programs/subsys_reducer.py -I $file.hierarchy
+		if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the aggregation subsys step"
+				exit 1
+		fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/
+		mkdir $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		mv $starting_location/step_4_annotation/*.reduced $starting_location/step_5_aggregation/Subsystems_results/
+		mv $starting_location/step_4_annotation/*.receipt $starting_location/step_5_aggregation/Subsystems_results/receipts/
+		rm $starting_location/step_4_annotation/*.hierarchy
+
+		echo "STEP 5 DONE:PYTHON SUBSYSTEMS ANALYSIS COUNTER"
+		date
+
+        	printf "AGGREGATION_SUBSYS\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_SUBSYS is in the checkpoint file. STEP 5. will then be passed\n"
+fi
+
+
+####################################################################
+#
+# STEP 4.1: ANNOTATING WITH DIAMOND AGAINST REFSEQ
+# Note: this step assumes that the DIAMOND database is already built.  If not,
+# do that first before running this step.
+Step=$(grep "ANNOTATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "ANNOTATION_REFSEQ" ]
+        then
+		touch $starting_location/step_4_annotation/diamond_log
+
+		for file in $starting_location/step_3_ribodepletion/*ribodepleted.fastq
+			do
+				shortname=`echo $file | awk -F "ribodepleted" '{print $1 "RefSeq_annotated"}'`
+				#removed the diamond sensitive option
+				$diamond_location/diamond blastx --db $diamond_database -q $file -p $threads -a $file.RefSeq -t ./ -k 1 1>>$starting_location/step_4_annotation/diamond_log
+				$diamond_location/diamond view --daa $file.RefSeq.daa -o $shortname -p $threads -f tab 1>>$starting_location/step_4_annotation/diamond_log
+			if [ $? -ne 0 ]
+			then 
+				printf "\t!!! There is a problem in the diamond refseq step"
+				exit 1
+			fi
+			done
+
+		mv $starting_location/step_3_ribodepletion/*annotated* $starting_location/step_4_annotation/
+		mv $starting_location/step_3_ribodepletion/*.daa $starting_location/step_4_annotation/daa_binary_files/
+
+		echo "STEP 4.1 DONE:ANNOTATING WITH DIAMOND AGAINST REFSEQ"
+		date
+
+        	printf "ANNOTATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable ANNOTATION_REFSEQ is in the checkpoint file. STEP 4.1 will then be passed\n"
+fi
+
+####################################################################
+#
+# STEP 5.1: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "AGGREGATION_REFSEQ" pipeline/checkpoints)
+if [ "${Step}" != "AGGREGATION_REFSEQ" ]
+	then
+		for file in $starting_location/step_4_annotation/*RefSeq_annotated*
+		do
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
+			python $python_programs/standardized_DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
+			if [ $? -ne 0 ]
+				then 
+					printf "\t!!! There is a problem in the aggregation refseq step"
+					exit 1
+			fi
+		done
+
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mkdir $starting_location/step_5_aggregation/RefSeq_results/func_results/
+		mv $starting_location/step_4_annotation/*organism.tsv $starting_location/step_5_aggregation/RefSeq_results/org_results/
+		mv $starting_location/step_4_annotation/*function.tsv $starting_location/step_5_aggregation/RefSeq_results/func_results/
+
+		echo "STEP 5.1 DONE: AGGREGATING WITH ANALYSIS_COUNTER"
+		date 
+
+        	printf "AGGREGATION_REFSEQ\n" >>pipeline/checkpoints
+else
+	        printf  "\tThe variable AGGREGATION_REFSEQ is in the checkpoint file. STEP 5.1 will then be passed\n"
+fi
+
+
+
+##################################################################
+#
+# At this point, all the results files are ready for analysis using R.
+# This next step performs basic DESeq2 analysis of the RefSeq organism, function,
+# and Subsystems annotations.
+#
+# More complex R analyses may be performed using specific .sh analysis scripts.
+#
+# STEP 6: R ANALYSIS
+# Note: For R to properly identify files to compare/contrast, they must include 
+# the appropriate prefix (either "control_$file" or experimental_$file")!
+
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/org_results/ -O RefSeq_org_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/run_DESeq_stats.R -I $starting_location/step_5_aggregation/RefSeq_results/func_results/ -O RefSeq_func_DESeq_results.tab -R $starting_location/step_2_trimming/raw_counts.txt
+#/home/apps/Logiciels/R/3.4.3-gcc/bin/Rscript $R_programs/Subsystems_DESeq_stats.R -I $starting_location/step_5_aggregation/Subsystems_results/ -O Subsystems_level-1_DESeq_results.tab -L 1 -R step_2_trimming/raw_counts.txt
+
+echo "STEP 6 DONE: R ANALYSIS"
+echo "Master bash script finished running at: "; date
+exit
+####################################################################
+