From cb42c976c1a2b69020d27736b32e22bf196e0c80 Mon Sep 17 00:00:00 2001
From: John Vivian
Date: Mon, 25 Jul 2016 14:36:37 -0700
Subject: [PATCH 1/6] Fix output for single-end (resolves #378)

---
 src/toil_scripts/tools/preprocessing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/toil_scripts/tools/preprocessing.py b/src/toil_scripts/tools/preprocessing.py
index 5cc9c3d1..48658955 100644
--- a/src/toil_scripts/tools/preprocessing.py
+++ b/src/toil_scripts/tools/preprocessing.py
@@ -31,7 +31,7 @@ def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
                            '/data/R1.fastq', '/data/R2.fastq'])
     else:
         job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
-        parameters.extend(['-o', '/data/R1.fastq', '/data/R1.fastq'])
+        parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])
     # Call: CutAdapt
     docker_call(tool='quay.io/ucsc_cgl/cutadapt:1.9--6bd44edd2b8f8f17e25c5a268fedaab65fa851d2',
                 work_dir=work_dir, parameters=parameters)
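
The one-line fix above is easy to misread: in the single-end branch, cutadapt was told to write its trimmed
output to the same path as its positional input ('-o', '/data/R1.fastq', '/data/R1.fastq'), so the
R1_cutadapt.fastq file that the paired-end branch produces, and that the rest of run_cutadapt presumably
looks for, never appeared. A minimal sketch of the corrected assembly logic follows;
build_cutadapt_parameters is a hypothetical helper (not a function in preprocessing.py), the paired-end
parameter layout is inferred rather than shown in the hunk, and -a/-A/-o/-p are real cutadapt 1.9 flags:

    def build_cutadapt_parameters(paired, fwd_3pr_adapter, rev_3pr_adapter):
        # -a trims the forward 3' adapter; arguments mirror run_cutadapt above
        parameters = ['-a', fwd_3pr_adapter]
        if paired:
            # Paired-end: -A trims read 2, -o/-p name the two trimmed outputs
            parameters.extend(['-A', rev_3pr_adapter,
                               '-o', '/data/R1_cutadapt.fastq',
                               '-p', '/data/R2_cutadapt.fastq',
                               '/data/R1.fastq', '/data/R2.fastq'])
        else:
            # Single-end: the output name must differ from the input (the fix above)
            parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])
        return parameters
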
From 45f425df9f8b3a658923c7f8fce1dec316014d5c Mon Sep 17 00:00:00 2001
From: John Vivian
Date: Tue, 26 Jul 2016 15:30:23 -0700
Subject: [PATCH 2/6] Don't pass s3_key to config inputs (resolves #377)

---
 .../exome_variant_pipeline/exome_variant_pipeline.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/toil_scripts/exome_variant_pipeline/exome_variant_pipeline.py b/src/toil_scripts/exome_variant_pipeline/exome_variant_pipeline.py
index fd1c6de2..5855d9ba 100644
--- a/src/toil_scripts/exome_variant_pipeline/exome_variant_pipeline.py
+++ b/src/toil_scripts/exome_variant_pipeline/exome_variant_pipeline.py
@@ -40,7 +40,7 @@ def download_shared_files(job, samples, config):
     urls = [config.reference, config.phase, config.mills, config.dbsnp, config.cosmic]
     for name, url in zip(file_names, urls):
         if url:
-            vars(config)[name] = job.addChildJobFn(download_url_job, url=url, s3_key_path=config.ssec).rv()
+            vars(config)[name] = job.addChildJobFn(download_url_job, url=url).rv()
     job.addFollowOnJobFn(reference_preprocessing, samples, config)


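
Patch 2 stops threading the per-user SSE-C key into downloads of the shared reference inputs, presumably
because those public objects are not SSE-C encrypted and supplying s3_key_path broke the download (#377);
the key still applies to the per-sample downloads. The vars(config)[name] idiom is worth a note: it rebinds
config attributes by string name, replacing URL strings with Toil job promises. A toy illustration, using
argparse.Namespace as a stand-in for the pipeline's config object and a plain string in place of the .rv()
promise:

    from argparse import Namespace

    config = Namespace(reference='s3://bucket/hg19.fa', phase=None)
    file_names = ['reference', 'phase']
    urls = [config.reference, config.phase]
    for name, url in zip(file_names, urls):
        if url:
            # The pipeline assigns job.addChildJobFn(download_url_job, url=url).rv() here
            vars(config)[name] = 'FileStoreID-for-' + url
    print(config.reference)  # FileStoreID-for-s3://bucket/hg19.fa
    print(config.phase)      # None -- inputs left blank are skipped
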
From 440282ce7e295d7782529de99a4206dc6027366f Mon Sep 17 00:00:00 2001
From: John Vivian
Date: Wed, 27 Jul 2016 10:28:46 -0700
Subject: [PATCH 3/6] Standardize output-dir (resolves #385, resolves #391)

---
 .../exome_variant_pipeline.py                 | 32 +++++++--------
 .../exome_variant_pipeline/test/test_exome.py |  3 +-
 .../rnaseq_cgl/rnaseq_cgl_pipeline.py         | 41 +++++++++----------
 .../rnaseq_cgl/test/test_rnaseq_cgl.py        |  3 +-
 4 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/src/toil_scripts/exome_variant_pipeline/exome_variant_pipeline.py b/src/toil_scripts/exome_variant_pipeline/exome_variant_pipeline.py
index 5855d9ba..aab1cd23 100644
--- a/src/toil_scripts/exome_variant_pipeline/exome_variant_pipeline.py
+++ b/src/toil_scripts/exome_variant_pipeline/exome_variant_pipeline.py
@@ -189,15 +189,14 @@ def consolidate_output(job, config, mutect, pindel, muse):
                         else:
                             tarinfo.name = os.path.join(config.uuid, 'muse', os.path.basename(tarinfo.name))
                         f_out.addfile(tarinfo, fileobj=f_in_file)
-    # Move to output directory of selected
-    if config.output_dir:
+    # Move to output location
+    if urlparse(config.output_dir).scheme == 's3':
+        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
+        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
+    else:
         job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
         mkdir_p(config.output_dir)
         copy_files(file_paths=[out_tar], output_dir=config.output_dir)
-    if config.s3_output_dir:
-        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.s3_output_dir))
-        s3am_upload(fpath=out_tar, s3_dir=config.s3_output_dir, num_cores=config.cores)
-

 def parse_manifest(path_to_manifest):
     """
@@ -226,8 +225,12 @@ def generate_config():
         # CGL Exome Pipeline configuration file
         # This configuration file is formatted in YAML. Simply write the value (at least one space) after the colon.
         # Edit the values in this configuration file and then rerun the pipeline: "toil-variant run"
-        # URLs can take the form: http://, file://, s3://, gnos://.
-        # Comments (beginning with #) do not need to be removed. Optional parameters may be left blank
+        #
+        # URLs can take the form: http://, file://, s3://, gnos://
+        # Local inputs follow the URL convention: file:///full/path/to/input
+        # S3 URLs follow the convention: s3://bucket/directory/file.txt
+        #
+        # Comments (beginning with #) do not need to be removed. Optional parameters left blank are treated as false.
         ####################################################################################################################
         # Required: URL to reference genome
         reference: s3://cgl-pipeline-inputs/variant_hg19/hg19.fa
@@ -244,6 +247,10 @@ def generate_config():
         # Required: URL to cosmic VCF
         cosmic: s3://cgl-pipeline-inputs/variant_hg19/cosmic.hg19.vcf

+        # Required: Output location of sample. Can be full path to a directory or an s3:// URL
+        # Warning: S3 buckets must exist prior to upload or it will fail.
+        output-dir:
+
         # Optional: If true, will run MuTect to do mutation calls
         run-mutect: true

@@ -256,12 +263,6 @@ def generate_config():
         # Optional: If true, will perform indel realignment and base quality score recalibration
         preprocessing: true

-        # Optional: Provide a full path to where results will appear
-        output-dir:
-
-        # Optional: Provide an s3 path (s3://bucket/dir) where results will appear
-        s3-output-dir:
-
         # Optional: Provide a full path to a 32-byte key used for SSE-C Encryption in Amazon
         ssec:

@@ -414,8 +415,7 @@ def main():

         if config.run_muse:
             require(config.reference and config.dbsnp, 'Missing inputs for MuSe, check config file.')
-        require(config.output_dir or config.s3_output_dir, 'output-dir AND/OR s3-output-dir need to be defined, '
-                                                           'otherwise sample output is not stored anywhere!')
+        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
         # Program checks
         for program in ['curl', 'docker']:
             require(next(which(program), None), program + ' must be installed on every node.'.format(program))
diff --git a/src/toil_scripts/exome_variant_pipeline/test/test_exome.py b/src/toil_scripts/exome_variant_pipeline/test/test_exome.py
index ef61b4e5..daed296a 100644
--- a/src/toil_scripts/exome_variant_pipeline/test/test_exome.py
+++ b/src/toil_scripts/exome_variant_pipeline/test/test_exome.py
@@ -48,8 +48,7 @@ def generate_config():
                 run-pindel: true
                 run-muse: true
                 preprocessing: true
-                output-dir:
-                s3-output-dir: s3://cgl-driver-projects/test/ci
+                output-dir: s3://cgl-driver-projects/test/ci
                 ssec:
                 gtkey:
                 ci-test: true
diff --git a/src/toil_scripts/rnaseq_cgl/rnaseq_cgl_pipeline.py b/src/toil_scripts/rnaseq_cgl/rnaseq_cgl_pipeline.py
index 450501b3..95378574 100644
--- a/src/toil_scripts/rnaseq_cgl/rnaseq_cgl_pipeline.py
+++ b/src/toil_scripts/rnaseq_cgl/rnaseq_cgl_pipeline.py
@@ -148,9 +148,9 @@ def rsem_quantification(job, config, star_output):
         transcriptome_id, sorted_id, wiggle_id = star_output
         wiggle_path = os.path.join(work_dir, config.uuid + '.wiggle.bg')
         job.fileStore.readGlobalFile(wiggle_id, wiggle_path)
-        if config.s3_output_dir:
-            s3am_upload(fpath=wiggle_path, s3_dir=config.s3_output_dir, s3_key_path=config.ssec)
-        if config.output_dir:
+        if urlparse(config.output_dir).scheme == 's3':
+            s3am_upload(fpath=wiggle_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
+        else:
             copy_files(file_paths=[wiggle_path], output_dir=config.output_dir)
     else:
         transcriptome_id, sorted_id = star_output
@@ -158,9 +158,9 @@ def rsem_quantification(job, config, star_output):
     if config.save_bam:
         bam_path = os.path.join(work_dir, config.uuid + '.sorted.bam')
         job.fileStore.readGlobalFile(sorted_id, bam_path)
-        if config.s3_output_dir and config.ssec:
-            s3am_upload(fpath=bam_path, s3_dir=config.s3_output_dir, s3_key_path=config.ssec)
-        if config.output_dir:
+        if urlparse(config.output_dir).scheme == 's3' and config.ssec:
+            s3am_upload(fpath=bam_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
+        else:
             copy_files(file_paths=[bam_path], output_dir=config.output_dir)
     # Declare RSEM and RSEM post-process jobs
     rsem_output = job.wrapJobFn(run_rsem, config.cores, transcriptome_id, config.rsem_ref, paired=config.paired,
@@ -234,7 +234,7 @@ def consolidate_output(job, config, kallisto_output, rsem_output, fastqc_output)
     :param tuple(str, str) rsem_output: FileStoreIDs for RSEM output
     :param str fastqc_output: FileStoreID for FastQC output
     """
-    job.fileStore.logToMaster('Consolidating input: {}'.format(config.uuid))
+    job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
     work_dir = job.fileStore.getLocalTempDir()
     # Retrieve output file paths to consolidate
     rsem_tar, hugo_tar, kallisto_tar, fastqc_tar = None, None, None, None
@@ -266,15 +266,14 @@ def consolidate_output(job, config, kallisto_output, rsem_output, fastqc_output)
                         else:
                             tarinfo.name = os.path.join(config.uuid, 'QC', os.path.basename(tarinfo.name))
                         f_out.addfile(tarinfo, fileobj=f_in_file)
-    # Move to output directory
-    if config.output_dir:
+    # Move to output location
+    if urlparse(config.output_dir).scheme == 's3':
+        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
+        s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
+    else:
         job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
         mkdir_p(config.output_dir)
         copy_files(file_paths=[os.path.join(work_dir, config.uuid + '.tar.gz')], output_dir=config.output_dir)
-    # Upload to S3
-    if config.s3_output_dir:
-        job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.s3_output_dir))
-        s3am_upload(fpath=out_tar, s3_dir=config.s3_output_dir, num_cores=config.cores)


 # Pipeline specific functions
@@ -316,8 +315,12 @@ def generate_config():
         # This configuration file is formatted in YAML. Simply write the value (at least one space) after the colon.
         # Edit the values in this configuration file and then rerun the pipeline: "toil-rnaseq run"
         # Just Kallisto or STAR/RSEM can be run by supplying only the inputs to those tools
+        #
+        # URLs can take the form: http://, file://, s3://, gnos://
         # Local inputs follow the URL convention: file:///full/path/to/input
-        # Comments (beginning with #) do not need to be removed. Optional parameters may be left blank.
+        # S3 URLs follow the convention: s3://bucket/directory/file.txt
+        #
+        # Comments (beginning with #) do not need to be removed. Optional parameters left blank are treated as false.
         ##############################################################################################################
         # Required: URL {scheme} to index tarball used by STAR
         star-index: s3://cgl-pipeline-inputs/rnaseq_cgl/starIndex_hg38_no_alt.tar.gz
@@ -328,13 +331,10 @@ def generate_config():
         # Required: URL {scheme} to reference tarball used by RSEM
         rsem-ref: s3://cgl-pipeline-inputs/rnaseq_cgl/rsem_ref_hg38_no_alt.tar.gz

-        # NOTE: Pipeline requires at least one output option
-        # Optional: Provide a full file path (/path/to/output-dir) where results will appear
+        # Required: Output location of sample. Can be full path to a directory or an s3:// URL
+        # Warning: S3 buckets must exist prior to upload or it will fail.
         output-dir:

-        # Optional: Provide an s3 path (s3://bucket/dir) where results will appear
-        s3-output-dir:
-
         # Optional: If true, will preprocess samples with cutadapt using adapter sequences.
         cutadapt: true

@@ -493,8 +493,7 @@ def main():
         if config.star_index or config.rsem_ref:
             require(config.star_index and config.rsem_ref, 'Input provided for STAR or RSEM but not both. STAR: '
                                                            '{}, RSEM: {}'.format(config.star_index, config.rsem_ref))
-        require(config.output_dir or config.s3_output_dir, 'output-dir AND/OR s3-output-dir need to be defined, '
-                                                           'otherwise sample output is not stored anywhere!')
+        require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
         for input in [x for x in [config.kallisto_index, config.star_index, config.rsem_ref] if x]:
             require(urlparse(input).scheme in schemes,
                     'Input in config must have the appropriate URL prefix: {}'.format(schemes))
diff --git a/src/toil_scripts/rnaseq_cgl/test/test_rnaseq_cgl.py b/src/toil_scripts/rnaseq_cgl/test/test_rnaseq_cgl.py
index da73ce70..a0894b04 100644
--- a/src/toil_scripts/rnaseq_cgl/test/test_rnaseq_cgl.py
+++ b/src/toil_scripts/rnaseq_cgl/test/test_rnaseq_cgl.py
@@ -89,8 +89,7 @@ def _generate_config(self):
                 star-index: {input_dir}/starIndex_chr6.tar.gz
                 kallisto-index: s3://cgl-pipeline-inputs/rnaseq_cgl/kallisto_hg38.idx
                 rsem-ref: {input_dir}/rsem_ref_chr6.tar.gz
-                output-dir:
-                s3-output-dir: {output_dir}
+                output-dir: {output_dir}
                 fastqc: true
                 cutadapt:
                 ssec:
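
This patch collapses the output-dir / s3-output-dir pair into a single output-dir whose URL scheme picks
the delivery route: an s3:// value goes to s3am_upload, anything else is treated as a local directory for
mkdir_p and copy_files. A quick sketch of why urlparse suffices to separate the two cases (urlparse here is
the Python 2 module the pipeline already imports):

    from urlparse import urlparse

    print(urlparse('s3://cgl-driver-projects/test/ci').scheme)  # 's3' -> s3am_upload branch
    print(urlparse('/mnt/output/samples').scheme)               # ''   -> mkdir_p + copy_files branch

One caveat worth knowing: the local branch expects a bare filesystem path. A file:// URL parses with scheme
'file', falls into the else branch, and would be handed to mkdir_p verbatim rather than resolved, which is
why the new config text says "full path to a directory or an s3:// URL".
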
From 5cbb451ca43414049f93faa82fba1f6cb6b31212 Mon Sep 17 00:00:00 2001
From: John Vivian
Date: Tue, 26 Jul 2016 15:49:58 -0700
Subject: [PATCH 4/6] Clarify lack of bucket creation (resolves #383)

---
 src/toil_scripts/bwa_alignment/bwa_alignment.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/toil_scripts/bwa_alignment/bwa_alignment.py b/src/toil_scripts/bwa_alignment/bwa_alignment.py
index 144b17f2..f801fb99 100644
--- a/src/toil_scripts/bwa_alignment/bwa_alignment.py
+++ b/src/toil_scripts/bwa_alignment/bwa_alignment.py
@@ -101,13 +101,18 @@ def generate_config():
         # BWA Alignment Pipeline configuration file
         # This configuration file is formatted in YAML. Simply write the value (at least one space) after the colon.
         # Edit the values in this configuration file and then rerun the pipeline: "toil-bwa run"
-        # URLs can take the form: http://, file://, s3://, gnos://.
-        # Comments (beginning with #) do not need to be removed. Optional parameters may be left blank
+        #
+        # URLs can take the form: http://, file://, s3://, gnos://
+        # Local inputs follow the URL convention: file:///full/path/to/input
+        # S3 URLs follow the convention: s3://bucket/directory/file.txt
+        #
+        # Comments (beginning with #) do not need to be removed. Optional parameters left blank are treated as false.
         ##############################################################################################################
         # Required: Reference fasta file
         ref: s3://cgl-pipeline-inputs/alignment/hg19.fa

         # Required: Output location of sample. Can be full path to a directory or an s3:// URL
+        # Warning: S3 buckets must exist prior to upload or it will fail.
         output-dir:

         # Required: The library entry to go in the BAM read group.
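
The added warning exists because s3am uploads into a bucket but does not create one, so a mistyped or
missing bucket only surfaces once the pipeline tries to deliver results. If a pre-flight check were wanted,
something along these lines would work with boto 2 (assuming boto is available in the deployment; the
bucket parsing is deliberately simple and require_bucket_exists is a hypothetical helper, not part of the
pipeline):

    import boto

    def require_bucket_exists(output_dir):
        # output_dir is expected to look like s3://bucket/optional/prefix
        bucket_name = output_dir[len('s3://'):].split('/')[0]
        if boto.connect_s3().lookup(bucket_name) is None:  # lookup returns None for a missing bucket
            raise RuntimeError('S3 bucket must exist prior to upload: ' + bucket_name)
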
From 7f19be4a56f91663fccdce10a98edf84ddbfebcd Mon Sep 17 00:00:00 2001
From: John Vivian
Date: Wed, 27 Jul 2016 16:20:56 -0700
Subject: [PATCH 5/6] Fix single-end naming error (resolves #389)

---
 src/toil_scripts/tools/aligners.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/toil_scripts/tools/aligners.py b/src/toil_scripts/tools/aligners.py
index 9dbf0ccf..c4808b25 100644
--- a/src/toil_scripts/tools/aligners.py
+++ b/src/toil_scripts/tools/aligners.py
@@ -52,7 +52,7 @@ def run_star(job, cores, r1_id, r2_id, star_index_url, wiggle=False):
         job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
         parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
     else:
-        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
+        job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
         parameters.extend(['--readFilesIn', '/data/R1.fastq'])
     # Call: STAR Mapping
     docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',

From f86e8d39b7fd3a6e6924200104d3530b9a387c56 Mon Sep 17 00:00:00 2001
From: John Vivian
Date: Sat, 30 Jul 2016 14:20:57 -0700
Subject: [PATCH 6/6] Fix index generation (resolves #396)

---
 src/toil_scripts/bwa_alignment/bwa_alignment.py | 2 +-
 src/toil_scripts/tools/indexing.py              | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/toil_scripts/bwa_alignment/bwa_alignment.py b/src/toil_scripts/bwa_alignment/bwa_alignment.py
index f801fb99..819f79cc 100644
--- a/src/toil_scripts/bwa_alignment/bwa_alignment.py
+++ b/src/toil_scripts/bwa_alignment/bwa_alignment.py
@@ -44,7 +44,7 @@ def download_reference_files(job, inputs, samples):
     faidx = job.wrapJobFn(run_samtools_faidx, download_ref.rv())
     shared_ids['fai'] = download_ref.addChild(faidx).rv()
     # If all BWA index files are provided, download them. Otherwise, generate them
-    if all(urls):
+    if all(x[1] for x in urls):
         for name, url in urls:
             shared_ids[name] = job.addChildJobFn(download_url_job, url).rv()
     else:
diff --git a/src/toil_scripts/tools/indexing.py b/src/toil_scripts/tools/indexing.py
index 50b81e22..97acdbf0 100644
--- a/src/toil_scripts/tools/indexing.py
+++ b/src/toil_scripts/tools/indexing.py
@@ -14,10 +14,10 @@ def run_bwa_index(job, ref_id):
     """
     job.fileStore.logToMaster('Created BWA index files')
     work_dir = job.fileStore.getLocalTempDir()
-    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
+    job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fa'))
     command = ['index', '/data/ref.fa']
     docker_call(work_dir=work_dir, parameters=command,
-                tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
+                tool='quay.io/ucsc_cgl/bwa:0.7.12--256539928ea162949d8a65ca5c79a72ef557ce7c')
     ids = {}
     for output in ['ref.fa.amb', 'ref.fa.ann', 'ref.fa.bwt', 'ref.fa.pac', 'ref.fa.sa']:
         ids[output.split('.')[-1]] = (job.fileStore.writeGlobalFile(os.path.join(work_dir, output)))
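
The first hunk of patch 6 fixes a truthiness bug: urls holds (name, url) pairs, and any non-empty tuple is
truthy, so all(urls) was True even when every url was None. The pipeline therefore always took the download
branch (failing on the missing URLs) and never fell through to generate the index. A two-line demonstration
(the amb/ann/bwt/pac/sa names mirror run_bwa_index's outputs; the None values are illustrative):

    urls = [('amb', None), ('ann', None), ('bwt', None), ('pac', None), ('sa', None)]

    print(all(urls))                # True  -- every (name, url) tuple is non-empty
    print(all(x[1] for x in urls))  # False -- checks the url half, as the fix does

The second hunk repairs run_bwa_index itself: the reference was read to ref.fasta while the command indexed
/data/ref.fa, and docker_call pointed at a samtools image rather than bwa, so the index could never have
been built.
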