Merge pull request #25 from seb951/master

Testing checkpoint option added by seb951
transcript · Mar 8, 2019 · f60d5bf · f60d5bf
2 parents ba23c12 + c7474db
commit f60d5bf
Show file tree

Hide file tree

Showing 6 changed files with 1,123 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,8 @@
+# my files
+#bash_scripts/master_samsa2.slurm
+#bash_scripts/cyanotoxin_master_samsa2.slurm
+#R_scripts/combining_umerged.R
+
 # Swap files
 *.swp
 *.swo

diff --git a/README.md b/README.md
@@ -1,5 +1,25 @@
 # SAMSA2 - A fork of the complete metatranscriptome analysis pipeline
 
+### modification by [email protected]
+
+### New in my version. 
+This is a new master script (master_samsa2.slurm) with several modifications:
+
+* It specifies multithreading for all programs for which it is available (trimmomatic, pear, diamond)
+* The script automatically creates a `checkpoint` file. Once a step is finished, it writes the name of that specific step in `checkpoint`. Everytime you run the script, it looks if that file exist. If so, it reads it and SKIP the steps written in checkpoint. This is done to avoid re-run CPU intense steps if not necessary.
+* In the merging step, unmerged reads are concatenated and added to a single file. The forward read and the reverse (complement) read are concatenated with a string of 20 Ns in the middle: This is done through a new R script entitled: `combining_umerged.R`
+* Extra care is taken to remove unnecessary files once a step is performed to keep disk usage at a minimum.
+* Each step contains an exit statement to be printed if the master script dies due to an unforseen error.
+* Trimmomatic removes adapter contamination according to a specific fasta file.
+* All options, read & program location are to be specified in the first section of the script.
+* The script is formated to be run on a HPC using a SLURM job scheduler, but this can be easily changed / removed.
+* I've removed the --num_alignments 0 in the ribosomal `sortmrna` step. This caused problems and slowed things down a lot. Plus we don't care about the rRNA alignments. If a sequence aligns to 1 or 1,000 rRNA its out anyways...
+
+#### To do:
+* reverse the order of the merging and trimming step (but again, maybe not...)
+* simplify some syntax / make sure all paths are relative (mostly done)
+
+
 Version 2 of the SAMSA pipeline - faster!  Lighter!  More options!  Less waiting!  
 
 ### New in version 2:

diff --git a/R_scripts/combining_umerged.R b/R_scripts/combining_umerged.R
@@ -0,0 +1,61 @@
+#!/cvmfs/soft.computecanada.ca/easybuild/software/2017/avx2/Compiler/intel2016.4/r/3.5.0/bin/Rscript
+
+args = commandArgs(TRUE)
+start = args[1] # Specify which sequences in "list_ind" file you want to align, directlty from the shell. Alternatively, you can do this from the "alignments" function itself.
+#start="/Users/jerry/Documents/CSBQ/shapiro/results"
+
+setwd(start)
+
+###files to work with 
+system(paste("ls -1 *unassembled.forward.fastq >umerged.forward.files",sep = ""))
+system(paste("ls -1 *unassembled.reverse.fastq >umerged.reverse.files",sep = ""))
+system(paste("ls -1 *.assembled.fastq >assembled.files",sep = ""))
+
+files_f = read.table("umerged.forward.files",stringsAsFactors = F)      
+files_r = read.table("umerged.reverse.files",stringsAsFactors = F) 
+files_a = read.table("assembled.files",stringsAsFactors = F) 
+
+
+for(i in 1:nrow(files_f))      
+  {
+#grep 1st and 2rd lines 
+  	unassembled.fastq = paste("awk 'NR % 2 == 1' ",files_f[i,1]," >unassembled.names.seq.fastq",sep="")
+  	system(unassembled.fastq)
+
+#grep sequences forward...
+  	unassembled.forward.fastq = paste("awk 'NR % 2 == 0' ",files_f[i,1]," >unassembled.seq.forward.fastq",sep="")
+  	system(unassembled.forward.fastq)
+
+#grep sequences reverse...
+  	unassembled.reverse.fastq = paste("awk 'NR % 2 == 0' ",files_r[i,1]," >unassembled.seq.reverse.fastq",sep="")
+  	system(unassembled.reverse.fastq)
+
+#N and E quality file
+ 	system("wc -l unassembled.seq.reverse.fastq >wc")
+ 	wc = read.table("wc")
+
+ 	write.table(c(rbind(rep('NNNNNNNNNNNNNNNNNNNN',wc[1,1]/2),rep('EEEEEEEEEEEEEEEEEEEE',wc[1,1]/2))),"NE_file",row.names = F, col.names = F, quote = F)
+
+#paste the unassembled sequences into a single file
+	system("paste -d '\\0' unassembled.seq.forward.fastq NE_file unassembled.seq.reverse.fastq >unassembledN")
+
+#put everything back into a single fastq
+	back = paste("paste -d '\\n' unassembled.names.seq.fastq unassembledN >",gsub("merged.unassembled.forward","cat",files_f[i,1]),sep= "")
+	system(back)
+
+#add the merged sequences
+	all = paste("cat ",files_a[i,1]," ",gsub("merged.unassembled.forward","cat",files_f[i,1])," >",gsub("merged.assembled","merged.assembled2",files_a[i,1]),sep = "")
+	system(all)
+
+#remove the clutter (file specific)
+	remove = c("rm NE_file unassembledN wc unassembled.names.seq.fastq unassembled.seq.forward.fastq unassembled.seq.reverse.fastq")
+	system(remove)
+}
+
+#remove the clutter (listing of all files)
+system("rm assembled.files umerged.forward.files umerged.reverse.files")
+
+
+
+
+