Adds tests and usage information

pmelsted · Apr 12, 2017 · fd6e463 · fd6e463
1 parent faa694f
commit fd6e463
Show file tree

Hide file tree

Showing 9 changed files with 111 additions and 8 deletions.
diff --git a/common.h b/common.h
@@ -3,7 +3,7 @@
 
 #include <string>
 
-#define PIZZLY_VERSION "0.37.0"
+#define PIZZLY_VERSION "0.37.1"
 
 struct ProgramOptions {
   std::string gtf;
@@ -15,7 +15,7 @@ struct ProgramOptions {
   int alignScore;
   int insertSize;
   int kmerScore;
-  ProgramOptions() : kmerScore(2) {}
+  ProgramOptions() : kmerScore(2), insertSize(400) {}
 };
 
 #endif // COMMON_H
diff --git a/main.cpp b/main.cpp
@@ -18,30 +18,29 @@ parseCommandLine(ProgramOptions & options, int argc, char const ** argv) {
 
   // Define Options
   seqan::addOption(parser, seqan::ArgParseOption(
-      "k", "", "k-mer size",
+      "k", "", "k-mer size used in kallisto",
       seqan::ArgParseArgument::INTEGER, "K"));
   seqan::addOption(parser, seqan::ArgParseOption(
-      "a", "align-score", "Alignment cutoff",
+      "a", "align-score", "Maximum number of mismatches allowed (default: 2)",
       seqan::ArgParseArgument::INTEGER, "ALIGN_SCORE"));
   seqan::addOption(parser, seqan::ArgParseOption(
-      "i", "insert-size", "Maximum size of fragment",
+      "i", "insert-size", "Maximum fragment size of library (default: 400)",
       seqan::ArgParseArgument::INTEGER, "INSERT_SIZE"));
   seqan::addOption(parser, seqan::ArgParseOption(
       "o", "output", "Prefix for output files",
       seqan::ArgParseArgument::STRING, "OUTPUT_PREFIX"));
   seqan::addOption(parser, seqan::ArgParseOption(
-      "G", "gtf", "Annotation",
+      "G", "gtf", "Annotation in GTF format",
       seqan::ArgParseArgument::STRING, "GTF"));
   seqan::addOption(parser, seqan::ArgParseOption(
-      "C", "cache", "Annotation",
+      "C", "cache", "File for caching annotation (created if not present, otherwise reused from previous runs)",
       seqan::ArgParseArgument::STRING, "cache"));
   seqan::addOption(parser, seqan::ArgParseOption(
       "F", "fasta", "Fasta reference",
       seqan::ArgParseArgument::STRING, "FASTA"));
 
   seqan::setRequired(parser, "k");
   seqan::setRequired(parser, "o");
-  seqan::setRequired(parser, "i");
   seqan::setRequired(parser, "F");
   seqan::setVersion(parser, std::string(PIZZLY_VERSION));  
 

diff --git a/test/.gitignore b/test/.gitignore
@@ -0,0 +1,5 @@
+kallisto_out
+pizzly_*
+*.kidx
+cache*
+.snakemake
diff --git a/test/README.md b/test/README.md
@@ -0,0 +1,16 @@
+# test data set
+
+Here you will find a small test data set to ensure that kallisto and pizzly are working properly. The contents are:
+
+- `transcripts.fasta.gz`: a gzip compressed transcriptome
+- `transcripts.gtf.gz`: a small subset of the GTF file for Ensembl version 85 corresponding to the transcripts provided
+- `reads_X.fastq.gz`: gzip-compressed "left" (`reads_1.fastq.gz`) and "right" (`reads_2.fastq.gz`) reads
+- `Snakefile`: a sample [`snakemake`](https://bitbucket.org/johanneskoester/snakemake/wiki/Home) file (not required for running kallisto or pizzly)
+
+Running `snakemake` will go through the following pipeline:
+
+1. Creates the index for use with `kallisto`
+2. Runs `kallisto` with `--fusion` to identify potential fusions
+3. Runs `pizzly` on the `kallisto` output to identify potential fusions
+4. Creates a new index based on the transcriptome and the fusion transcripts identified by `pizzly`
+5. Runs `kallisto` in normal quantification mode on the expanded index to quantify both normal transcripts and fusions.
diff --git a/test/Snakefile b/test/Snakefile
@@ -0,0 +1,83 @@
+import os
+
+PRE = "transcripts"
+FASTA = "{0}.fasta.gz".format(PRE)
+GTF = "{0}.gtf.gz".format(PRE)
+INDEX = "{0}.kidx".format(PRE)
+K = 31
+
+
+ZCAT = 'gzcat' if os.uname()[0] == 'Darwin' else 'zcat'
+
+rule all:
+    input:
+        "kallisto_out/abundance.h5",
+        "pizzly_out/output.json",
+        "pizzly_post/abundance.h5"
+
+rule index:
+    input: FASTA
+    output: INDEX
+    shell:
+        "kallisto index -k {K} -i {output} {input}"
+
+rule kallisto_quant:
+    input:
+        "reads_1.fastq.gz",
+        "reads_2.fastq.gz",
+        INDEX
+    output:
+        "kallisto_out",
+        "kallisto_out/abundance.h5",
+        "kallisto_out/abundance.tsv",
+        "kallisto_out/run_info.json",
+        "kallisto_out/fusion.txt"
+    shell:
+        "kallisto quant "
+        "-i {INDEX} "
+        "-o {output[0]} "
+        "--fusion "
+        "{input[0]} {input[1]}"
+
+rule pizzly:
+    input:
+        FASTA,
+        "kallisto_out/fusion.txt"
+    output:
+        "pizzly_out/output.json",
+        "pizzly_out/output.fusions.fasta"
+    shell:
+        "../build/pizzly "
+        "-k {K} "
+        "--gtf {GTF} "
+        "--cache cache.txt "
+        "--align-score 2 "
+        "--insert-size 400 "
+        "--fasta {FASTA} "
+        "--output pizzly_out/output "
+        "kallisto_out/fusion.txt "
+
+rule append_index:
+    input:
+        FASTA,
+        "pizzly_out/output.fusions.fasta"
+    output:
+        "pizzly_post/transcripts_with_fusions.fasta.gz",
+        "pizzly_post/transcripts_with_fusions.kidx"
+    shell:
+        "cat <({ZCAT} {FASTA}) {input[1]} | gzip - > {output[0]} && "
+        "kallisto index -k {K} -i {output[1]} {output[0]}"
+
+rule requant_kallisto:
+    input:
+        "reads_1.fastq.gz",
+        "reads_2.fastq.gz",
+        "pizzly_post/transcripts_with_fusions.kidx"
+    output:
+        "pizzly_post",
+        "pizzly_post/abundance.h5"
+    shell:
+        "kallisto quant "
+        "-i {input[2]} "
+        "-o {output[0]} "
+        "{input[0]} {input[1]}"
diff --git a/test/reads_1.fastq.gz b/test/reads_1.fastq.gz
diff --git a/test/reads_2.fastq.gz b/test/reads_2.fastq.gz
diff --git a/test/transcripts.fasta.gz b/test/transcripts.fasta.gz
diff --git a/test/transcripts.gtf.gz b/test/transcripts.gtf.gz