Merge pull request #28 from transcript/checkpoint_update

Checkpoint update
transcript · Mar 12, 2019 · 1d6e8eb · 1d6e8eb
2 parents ba23c12 + 0efac25
commit 1d6e8eb
Show file tree

Hide file tree

Showing 5 changed files with 570 additions and 23 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,8 @@
+# my files
+#bash_scripts/master_samsa2.slurm
+#bash_scripts/cyanotoxin_master_samsa2.slurm
+#R_scripts/combining_umerged.R
+
 # Swap files
 *.swp
 *.swo

diff --git a/README.md b/README.md
@@ -1,4 +1,20 @@
-# SAMSA2 - A fork of the complete metatranscriptome analysis pipeline
+# SAMSA2 - A complete metatranscriptome analysis pipeline
+
+*****
+
+**Version 2.2.0 - Yesod - Modifications added by [email protected]:**
+
+* Multithreading added for PEAR, Trimmomatic, SortMeRNA
+* The script automatically creates a `checkpoints` file in the input directory. Once a step finishes, the name of that step is written to `checkpoints`, and that step is skipped when the master script is rerun. This avoids re-running CPU-intensive steps unnecessarily.
+* A new version of the master script now exists, called "master\_script\_preserving\_unmerged.sh". In this script, during the merging step, unmerged reads are joined and added to a single file: the forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle. This is done by a new R script named `combining_umerged.R`.
+* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
+* Each step contains an exit statement that is printed if the master script dies due to an unforeseen error.
+* Trimmomatic removes adapter contamination according to a specific fasta file.
+* All options, read & program location are to be specified in the first section of the script.
+* The script is formatted to be run on an HPC cluster using the SLURM job scheduler, but this can easily be changed or removed.
+* The flag `--num_alignments 0` in the ribosomal SortMeRNA step has been removed. It caused problems and slowed things down a lot. Besides, we don't care about the rRNA alignments: whether a sequence aligns to 1 or 1,000 rRNAs, it is removed either way.
+
+*****
 
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  
 
@@ -12,7 +28,7 @@ Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting
 
 ### Dependencies
 
-SAMSA2 requires Python2 for aggregation scripts.  Currently, this pipeline does not work with Python3, although an update is planned.
+SAMSA2's aggregation scripts were written for Python2.  Currently, this pipeline mostly works with Python3 as well, although there may be some errors not yet caught.
 
 The following programs can be downloaded OR can be installed from the binaries provided in the programs/ folder.
 

diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R
@@ -0,0 +1,55 @@
+#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript
+
+# combining_umerged.R
+#
+# Usage: combining_umerged.R <directory>
+#
+# Works inside <directory> on three sets of files, paired by their position in
+# the sorted listings:
+#   *unassembled.forward.fastq, *unassembled.reverse.fastq, *.assembled.fastq
+# For each set, every unassembled forward/reverse pair is joined into a single
+# record (forward + 20 'N' + reverse, qualities joined with 20 'E') and written
+# to the forward file's name with "merged.unassembled.forward" replaced by
+# "cat". That file is then appended to the assembled reads and saved under the
+# assembled file's name with "merged.assembled" replaced by "merged.assembled2".
+# Assumes 4-line FASTQ records -- TODO confirm for all inputs.
+
+args = commandArgs(TRUE)
+if (length(args) < 1 || is.na(args[1]) || !dir.exists(args[1]))
+  {
+  stop("Usage: combining_umerged.R <directory containing the merged fastq files>")
+  }
+start = args[1] # Directory holding the fastq files to combine.
+
+setwd(start)
+
+# Run a shell command and abort on a non-zero exit status, so that a failed
+# step cannot silently leave a truncated or corrupt fastq behind.
+run_cmd = function(cmd)
+  {
+  status = system(cmd)
+  if (status != 0) stop("Command failed (exit status ", status, "): ", cmd)
+  }
+
+###files to work with
+files_f = Sys.glob("*unassembled.forward.fastq")
+files_r = Sys.glob("*unassembled.reverse.fastq")
+files_a = Sys.glob("*.assembled.fastq")
+
+# The three listings are paired by index below, so they must line up.
+if (length(files_r) != length(files_f) || length(files_a) != length(files_f))
+  {
+  stop("Mismatched file counts: ", length(files_f), " unassembled forward, ",
+       length(files_r), " unassembled reverse, ", length(files_a), " assembled.")
+  }
+
+for(i in seq_along(files_f))
+  {
+  # Output names; fixed = TRUE so the dots are matched literally.
+  cat_file = gsub("merged.unassembled.forward","cat",files_f[i],fixed = TRUE)
+  out_file = gsub("merged.assembled","merged.assembled2",files_a[i],fixed = TRUE)
+
+#grep 1st and 3rd lines of each record (odd-numbered lines)
+  run_cmd(paste("awk 'NR % 2 == 1' ",shQuote(files_f[i])," >unassembled.names.seq.fastq",sep=""))
+
+#grep sequences forward... (even-numbered lines)
+  run_cmd(paste("awk 'NR % 2 == 0' ",shQuote(files_f[i])," >unassembled.seq.forward.fastq",sep=""))
+
+#grep sequences reverse... (even-numbered lines)
+  run_cmd(paste("awk 'NR % 2 == 0' ",shQuote(files_r[i])," >unassembled.seq.reverse.fastq",sep=""))
+
+#N and E filler file: one N line and one E line per pair of extracted lines
+  n_lines = as.numeric(system("wc -l <unassembled.seq.reverse.fastq", intern = TRUE))
+  writeLines(rep(c('NNNNNNNNNNNNNNNNNNNN','EEEEEEEEEEEEEEEEEEEE'), times = floor(n_lines/2)),"NE_file")
+
+#paste the unassembled sequences into a single file ('\0' = no delimiter)
+  run_cmd("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN")
+
+#put everything back into a single fastq (interleave odd and even lines)
+  run_cmd(paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",shQuote(cat_file),sep= ""))
+
+#add the merged sequences
+  run_cmd(paste("cat ",shQuote(files_a[i])," ",shQuote(cat_file)," >",shQuote(out_file),sep = ""))
+
+#remove the clutter (file specific)
+  unlink(c("NE_file","unassembledN","unassembled.names.seq.fastq","unassembled.seq.forward.fastq","unassembled.seq.reverse.fastq"))
+}
diff --git a/bash_scripts/master_script.sh b/bash_scripts/master_script.sh
@@ -9,7 +9,7 @@
 #
 # master_script.sh
 # Created April 2017 by Sam Westreich, github.com/transcript
-# This version modified February 21, 2018
+# This version modified March 12, 2019
 #
 ####################################################################
 #
@@ -43,6 +43,9 @@ source "${BASH_SOURCE%/*}/../bash_scripts/lib/common.sh"
 INPUT_DIR=$SAMSA/input_files
 OUT_DIR=$SAMSA
 
+# number of threads
+threads=`getconf _NPROCESSORS_ONLN`
+
 STEP_1="$OUT_DIR/step_1_output"
 STEP_2="$OUT_DIR/step_2_output"
 STEP_3="$OUT_DIR/step_3_output"
@@ -71,15 +74,30 @@ else
   Subsys_db="$SAMSA/full_databases/subsys_db.fa"
 fi
 
+####################################################################
+#STEP 0.1: create/read checkpoint
+
+printf "\nStep 0.1: Checking for the presence of the checkpoint file.\n"
+if [ ! -f "$INPUT_DIR/checkpoints" ]
+  then
+    printf "\tThe file 'checkpoints' does not exist in the input directory, creating...\n"
+    touch "$INPUT_DIR/checkpoints"
+else
+    printf "\tThe file 'checkpoints' already exists in the input directory.\n"
+fi
+
 ####################################################################
 #
 # STEP 1: CLEANING FILES WITH TRIMMOMATIC
+Step=$(grep "TRIMMO" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "TRIMMO" ]
+  then
 
 if ls $INPUT_DIR/*.gz &>/dev/null; then
   for file in $INPUT_DIR/*.gz
   do
-      gunzip $file
-    done
+    gunzip $file
+  done
 fi
 
 $MKDIR $STEP_1
@@ -90,11 +108,11 @@ do
     out_path=`echo $f | awk -F "_R1" '{print $1 ".cleaned"}'`
     if [ -f $f2 ]; then
       paired=true
-      checked java -jar $TRIMMOMATIC PE -phred33 $f $f2 \
+      checked java -jar $TRIMMOMATIC PE -phred33 -threads $threads $f $f2 \
         $out_path".forward" $out_path".forward_unpaired" $out_path".reverse" $out_path".reverse_unpaired" \
         SLIDINGWINDOW:4:15 MINLEN:70
     else
-      checked java -jar $TRIMMOMATIC SE -phred33 $f $out_path SLIDINGWINDOW:4:15 MINLEN:70
+      checked java -jar $TRIMMOMATIC SE -phred33 -threads $threads $f $out_path SLIDINGWINDOW:4:15 MINLEN:70
     fi
 done
 
@@ -104,6 +122,12 @@ if $paired; then
 else
   mv $INPUT_DIR/*".cleaned" $STEP_1
 fi
+# Record completion in the same checkpoints file that the STEP 1 guard reads.
+printf "TRIMMO\n" >>"$INPUT_DIR/checkpoints"
+
+else
+  printf  "\tThe variable TRIMMO is in the checkpoint file. STEP 1 will be skipped.\n"
+fi
+
 
 ####################################################################
 #
@@ -112,13 +136,17 @@ fi
 #       Example: control_1.R1.fastq
 #                control_1.R2.fastq
 
+Step=$(grep "MERGING" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "MERGING" ]
+  then
+
 $MKDIR $STEP_2
 if $paired; then
   for file in $STEP_1/*.cleaned.forward
   do
-      f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'`
-      shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'`
-      checked $PEAR -f $file -r $f2 -o $STEP_2/${shortname##*/}
+    f2=`echo $file | awk -F "cleaned.forward" '{print $1 "cleaned.reverse"}'`
+    shortname=`echo $file | awk -F "cleaned.forward" '{print $1 "merged"}'`
+    checked $PEAR -f $file -r $f2 -j $threads -o $STEP_2/${shortname##*/}
   done
 else
   for file in $STEP_1/*.cleaned
@@ -128,10 +156,19 @@ else
   done
 fi
 
+printf "MERGING\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable MERGING is in the checkpoint file. STEP 2 will be skipped.\n"
+fi
+
 ####################################################################
 #
 # STEP 2.9: GETTING RAW SEQUENCES COUNTS
 # Note: These are used later for statistical analysis.
+Step=$(grep "RAW" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "RAW" ]
+  then
 
 if [[ -f $STEP_2/raw_counts.txt ]]; then
     rm $STEP_2/raw_counts.txt
@@ -141,38 +178,56 @@ touch $STEP_2/raw_counts.txt
 if $paired; then
   for file in $STEP_1/*cleaned.forward
   do
-      checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt
+    checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt
   done
 else
   for file in $STEP_1/*cleaned
   do
-      checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt
+    checked python $PY_DIR/raw_read_counter.py -I $file -O $STEP_2/raw_counts.txt
   done
 fi
 
+printf "RAW\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable RAW is in the checkpoint file. STEP 2.9 will be skipped.\n"
+fi
+
 ####################################################################
 #
 # STEP 3: REMOVING RIBOSOMAL READS WITH SORTMERNA
 # Note: this step assumes that the SortMeRNA databases are indexed.  If not,
 # do that first (see the SortMeRNA user manual for details).
+Step=$(grep "RIBO" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "RIBO" ]
+  then
 
 for file in $STEP_2/*.assembled.fastq
 do
-    shortname=`echo $file | awk -F "assembled" '{print $1 "ribodepleted"}'`
-    checked $SORTMERNA \
-      --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \
-      --reads $file --aligned $file.ribosomes --other $shortname --fastx \
-      --num_alignments 0 --log -v
+  shortname=`echo $file | awk -F "assembled" '{print $1 "ribodepleted"}'`
+  checked $SORTMERNA -a $threads \
+    --ref $SORTMERNA_DIR/rRNA_databases/silva-bac-16s-id90.fasta,$SORTMERNA_DIR/index/silva-bac-16s-db \
+    --reads $file --aligned $file.ribosomes --other $shortname --fastx \
+    --log -v
 done
 
 $MKDIR $STEP_3
 mv $STEP_2/*ribodepleted* $STEP_3
 
+printf "RIBO\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable RIBO is in the checkpoint file. STEP 3 will be skipped.\n"
+fi
+
 ####################################################################
 #
 # STEP 4: ANNOTATING WITH DIAMOND AGAINST REFSEQ
 # Note: this step assumes that the DIAMOND database is already built.  If not,
 # do that first before running this step.
+Step=$(grep "REFSEQ_ANNOT" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "REFSEQ_ANNOT" ]
+  then
 
 echo "Now starting on DIAMOND org annotations at: "; date
 
@@ -192,24 +247,42 @@ mv $STEP_3/*.daa $STEP_4/daa_binary_files
 
 echo "RefSeq DIAMOND annotations completed at: "; date
 
+printf "REFSEQ_ANNOT\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable REFSEQ_ANNOT is in the checkpoint file. STEP 4 will be skipped.\n"
+fi
+
 ####################################################################
 #
 # STEP 5: AGGREGATING WITH ANALYSIS_COUNTER
+Step=$(grep "REFSEQ_AGGREG" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "REFSEQ_AGGREG" ]
+  then
 
 for file in $STEP_4/*RefSeq_annotated
 do
-    checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
-    checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
+  checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -O
+  checked python $PY_DIR/DIAMOND_analysis_counter.py -I $file -D $RefSeq_db -F
 done
 
 $MKDIR $STEP_5/RefSeq_results/org_results
 $MKDIR $STEP_5/RefSeq_results/func_results
 mv $STEP_4/*organism.tsv $STEP_5/RefSeq_results/org_results
 mv $STEP_4/*function.tsv $STEP_5/RefSeq_results/func_results
 
+printf "REFSEQ_AGGREG\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable REFSEQ_AGGREG is in the checkpoint file. STEP 5 will be skipped.\n"
+fi
+
 ####################################################################
 #
 # STEP 4.1: ANNOTATING WITH DIAMOND AGAINST SUBSYSTEMS
+Step=$(grep "SUBSYS_ANNOT" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "SUBSYS_ANNOT" ]
+  then
 
 echo "Now starting on DIAMOND Subsystems annotations at: "; date
 
@@ -226,24 +299,39 @@ mv $STEP_3/*.daa $STEP_4/daa_binary_files
 
 echo "DIAMOND Subsystems annotations completed at: "; date
 
+printf "SUBSYS_ANNOT\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable SUBSYS_ANNOT is in the checkpoint file. STEP 4.1 will be skipped.\n"
+fi
+
 ##################################################################
 #
 # STEP 5.1: PYTHON SUBSYSTEMS ANALYSIS COUNTER
+Step=$(grep "SUBSYS_AGGREG" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "SUBSYS_AGGREG" ]
+  then
 
 for file in $STEP_4/*subsys_annotated
 do
-    checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \
-      -D $Subsys_db -O $file.hierarchy -P $file.receipt
+  checked python $PY_DIR/DIAMOND_subsystems_analysis_counter.py -I $file \
+    -D $Subsys_db -O $file.hierarchy -P $file.receipt
 
-    # This quick program reduces down identical hierarchy annotations
-    checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy
+  # This quick program reduces down identical hierarchy annotations
+  checked python $PY_DIR/subsys_reducer.py -I $file.hierarchy
 done
 
 $MKDIR $STEP_5/Subsystems_results/receipts
 mv $STEP_4/*.reduced $STEP_5/Subsystems_results
 mv $STEP_4/*.receipt $STEP_5/Subsystems_results/receipts
 rm $STEP_4/*.hierarchy
 
+printf "SUBSYS_AGGREG\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable SUBSYS_AGGREG is in the checkpoint file. STEP 5.1 will be skipped.\n"
+fi
+
 ##################################################################
 #
 # At this point, all the results files are ready for analysis using R.
@@ -255,6 +343,9 @@ rm $STEP_4/*.hierarchy
 # STEP 6: R ANALYSIS
 # Note: For R to properly identify files to compare/contrast, they must include
 # the appropriate prefix (either "control_$file" or experimental_$file")!
+Step=$(grep "R_ANALYSIS" $INPUT_DIR/checkpoints)
+if [ "${Step}" != "R_ANALYSIS" ]
+  then
 
 checked Rscript $R_DIR/run_DESeq_stats.R \
   -I $STEP_5/RefSeq_results/org_results \
@@ -269,6 +360,12 @@ checked Rscript $R_DIR/Subsystems_DESeq_stats.R \
   -O Subsystems_level-1_DESeq_results.tab -L 1 \
   -R $STEP_2/raw_counts.txt
 
+printf "R_ANALYSIS\n" >>$INPUT_DIR/checkpoints
+
+else
+  printf  "\tThe variable R_ANALYSIS is in the checkpoint file. STEP 6 will be skipped.\n"
+fi
+
 echo "Master bash script finished running at: "; date
 exit
 ####################################################################