
Merge pull request #398 from /issues/377-378-383-385-389-391-396-cp-to-master

Cherrypick to master (resolves #377, resolves #378, resolves #383, resolves #385, resolves #389, resolves #391, resolves #396)
jvivian authored Aug 1, 2016
2 parents f3df8ff + f86e8d3 commit 007088e
Showing 8 changed files with 51 additions and 49 deletions.
11 changes: 8 additions & 3 deletions src/toil_scripts/bwa_alignment/bwa_alignment.py
@@ -44,7 +44,7 @@ def download_reference_files(job, inputs, samples):
faidx = job.wrapJobFn(run_samtools_faidx, download_ref.rv())
shared_ids['fai'] = download_ref.addChild(faidx).rv()
# If all BWA index files are provided, download them. Otherwise, generate them
-if all(urls):
+if all(x[1] for x in urls):
for name, url in urls:
shared_ids[name] = job.addChildJobFn(download_url_job, url).rv()
else:
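A minimal sketch of the truthiness bug this hunk fixes, assuming urls is a list of (name, url) pairs as the for name, url loop suggests: all(urls) tested the tuples themselves, which are always truthy, so the generation branch could never run even when no index URLs were supplied.

# Hypothetical illustration, not pipeline code.
urls = [('amb', None), ('ann', None)]   # no index URLs provided
assert all(urls)                        # old check: non-empty tuples are truthy, so True
assert not all(x[1] for x in urls)      # new check: inspects the URLs, so False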
@@ -101,13 +101,18 @@ def generate_config():
# BWA Alignment Pipeline configuration file
# This configuration file is formatted in YAML. Simply write the value (at least one space) after the colon.
# Edit the values in this configuration file and then rerun the pipeline: "toil-bwa run"
-# URLs can take the form: http://, file://, s3://, gnos://.
-# Comments (beginning with #) do not need to be removed. Optional parameters may be left blank
+#
+# URLs can take the form: http://, file://, s3://, gnos://
+# Local inputs follow the URL convention: file:///full/path/to/input
+# S3 URLs follow the convention: s3://bucket/directory/file.txt
+#
+# Comments (beginning with #) do not need to be removed. Optional parameters left blank are treated as false.
##############################################################################################################
# Required: Reference fasta file
ref: s3://cgl-pipeline-inputs/alignment/hg19.fa
# Required: Output location of sample. Can be full path to a directory or an s3:// URL
+# Warning: S3 buckets must exist prior to upload or it will fail.
output-dir:
# Required: The library entry to go in the BAM read group.
34 changes: 17 additions & 17 deletions src/toil_scripts/exome_variant_pipeline/exome_variant_pipeline.py
@@ -40,7 +40,7 @@ def download_shared_files(job, samples, config):
urls = [config.reference, config.phase, config.mills, config.dbsnp, config.cosmic]
for name, url in zip(file_names, urls):
if url:
-vars(config)[name] = job.addChildJobFn(download_url_job, url=url, s3_key_path=config.ssec).rv()
+vars(config)[name] = job.addChildJobFn(download_url_job, url=url).rv()
job.addFollowOnJobFn(reference_preprocessing, samples, config)


@@ -189,15 +189,14 @@ def consolidate_output(job, config, mutect, pindel, muse):
else:
tarinfo.name = os.path.join(config.uuid, 'muse', os.path.basename(tarinfo.name))
f_out.addfile(tarinfo, fileobj=f_in_file)
-# Move to output directory of selected
-if config.output_dir:
+# Move to output location
+if urlparse(config.output_dir).scheme == 's3':
+job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
+s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
+else:
job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
mkdir_p(config.output_dir)
copy_files(file_paths=[out_tar], output_dir=config.output_dir)
-if config.s3_output_dir:
-job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.s3_output_dir))
-s3am_upload(fpath=out_tar, s3_dir=config.s3_output_dir, num_cores=config.cores)


def parse_manifest(path_to_manifest):
"""
@@ -226,8 +225,12 @@ def generate_config():
# CGL Exome Pipeline configuration file
# This configuration file is formatted in YAML. Simply write the value (at least one space) after the colon.
# Edit the values in this configuration file and then rerun the pipeline: "toil-variant run"
-# URLs can take the form: http://, file://, s3://, gnos://.
-# Comments (beginning with #) do not need to be removed. Optional parameters may be left blank
+#
+# URLs can take the form: http://, file://, s3://, gnos://
+# Local inputs follow the URL convention: file:///full/path/to/input
+# S3 URLs follow the convention: s3://bucket/directory/file.txt
+#
+# Comments (beginning with #) do not need to be removed. Optional parameters left blank are treated as false.
####################################################################################################################
# Required: URL to reference genome
reference: s3://cgl-pipeline-inputs/variant_hg19/hg19.fa
@@ -244,6 +247,10 @@ def generate_config():
# Required: URL to cosmic VCF
cosmic: s3://cgl-pipeline-inputs/variant_hg19/cosmic.hg19.vcf
+# Required: Output location of sample. Can be full path to a directory or an s3:// URL
+# Warning: S3 buckets must exist prior to upload or it will fail.
+output-dir:
# Optional: If true, will run MuTect to do mutation calls
run-mutect: true
@@ -256,12 +263,6 @@ def generate_config():
# Optional: If true, will perform indel realignment and base quality score recalibration
preprocessing: true
-# Optional: Provide a full path to where results will appear
-output-dir:
-# Optional: Provide an s3 path (s3://bucket/dir) where results will appear
-s3-output-dir:
# Optional: Provide a full path to a 32-byte key used for SSE-C Encryption in Amazon
ssec:
@@ -414,8 +415,7 @@ def main():
if config.run_muse:
require(config.reference and config.dbsnp,
'Missing inputs for MuSe, check config file.')
-require(config.output_dir or config.s3_output_dir, 'output-dir AND/OR s3-output-dir need to be defined, '
-'otherwise sample output is not stored anywhere!')
+require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
# Program checks
for program in ['curl', 'docker']:
require(next(which(program), None), program + ' must be installed on every node.'.format(program))
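The consolidate_output change above is the heart of this cherry-pick: the separate output-dir and s3-output-dir settings collapse into a single output-dir whose URL scheme selects the delivery route. A condensed sketch of that dispatch, with s3am_upload, mkdir_p, and copy_files taken on faith from the diff's call sites:

# Sketch only: helper signatures are inferred from the call sites above, not verified.
from urlparse import urlparse  # Python 2, as used by these pipelines

def deliver(out_tar, output_dir, cores):
    if urlparse(output_dir).scheme == 's3':
        # s3:// URL: upload; the bucket must already exist (see the config warning)
        s3am_upload(fpath=out_tar, s3_dir=output_dir, num_cores=cores)
    else:
        # anything else is treated as a local directory path
        mkdir_p(output_dir)
        copy_files(file_paths=[out_tar], output_dir=output_dir)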
3 changes: 1 addition & 2 deletions src/toil_scripts/exome_variant_pipeline/test/test_exome.py
@@ -48,8 +48,7 @@ def generate_config():
run-pindel: true
run-muse: true
preprocessing: true
-output-dir:
-s3-output-dir: s3://cgl-driver-projects/test/ci
+output-dir: s3://cgl-driver-projects/test/ci
ssec:
gtkey:
ci-test: true
41 changes: 20 additions & 21 deletions src/toil_scripts/rnaseq_cgl/rnaseq_cgl_pipeline.py
@@ -148,19 +148,19 @@ def rsem_quantification(job, config, star_output):
transcriptome_id, sorted_id, wiggle_id = star_output
wiggle_path = os.path.join(work_dir, config.uuid + '.wiggle.bg')
job.fileStore.readGlobalFile(wiggle_id, wiggle_path)
-if config.s3_output_dir:
-s3am_upload(fpath=wiggle_path, s3_dir=config.s3_output_dir, s3_key_path=config.ssec)
-if config.output_dir:
+if urlparse(config.output_dir).scheme == 's3':
+s3am_upload(fpath=wiggle_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
+else:
copy_files(file_paths=[wiggle_path], output_dir=config.output_dir)
else:
transcriptome_id, sorted_id = star_output
# Save sorted bam if flag is selected
if config.save_bam:
bam_path = os.path.join(work_dir, config.uuid + '.sorted.bam')
job.fileStore.readGlobalFile(sorted_id, bam_path)
-if config.s3_output_dir and config.ssec:
-s3am_upload(fpath=bam_path, s3_dir=config.s3_output_dir, s3_key_path=config.ssec)
-if config.output_dir:
+if urlparse(config.output_dir).scheme == 's3' and config.ssec:
+s3am_upload(fpath=bam_path, s3_dir=config.output_dir, s3_key_path=config.ssec)
+else:
copy_files(file_paths=[bam_path], output_dir=config.output_dir)
# Declare RSEM and RSEM post-process jobs
rsem_output = job.wrapJobFn(run_rsem, config.cores, transcriptome_id, config.rsem_ref, paired=config.paired,
@@ -234,7 +234,7 @@ def consolidate_output(job, config, kallisto_output, rsem_output, fastqc_output)
:param tuple(str, str) rsem_output: FileStoreIDs for RSEM output
:param str fastqc_output: FileStoreID for FastQC output
"""
-job.fileStore.logToMaster('Consolidating input: {}'.format(config.uuid))
+job.fileStore.logToMaster('Consolidating output: {}'.format(config.uuid))
work_dir = job.fileStore.getLocalTempDir()
# Retrieve output file paths to consolidate
rsem_tar, hugo_tar, kallisto_tar, fastqc_tar = None, None, None, None
@@ -266,15 +266,14 @@ def consolidate_output(job, config, kallisto_output, rsem_output, fastqc_output)
else:
tarinfo.name = os.path.join(config.uuid, 'QC', os.path.basename(tarinfo.name))
f_out.addfile(tarinfo, fileobj=f_in_file)
-# Move to output directory
-if config.output_dir:
+# Move to output location
+if urlparse(config.output_dir).scheme == 's3':
+job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.output_dir))
+s3am_upload(fpath=out_tar, s3_dir=config.output_dir, num_cores=config.cores)
+else:
job.fileStore.logToMaster('Moving {} to output dir: {}'.format(config.uuid, config.output_dir))
mkdir_p(config.output_dir)
copy_files(file_paths=[os.path.join(work_dir, config.uuid + '.tar.gz')], output_dir=config.output_dir)
-# Upload to S3
-if config.s3_output_dir:
-job.fileStore.logToMaster('Uploading {} to S3: {}'.format(config.uuid, config.s3_output_dir))
-s3am_upload(fpath=out_tar, s3_dir=config.s3_output_dir, num_cores=config.cores)


# Pipeline specific functions
@@ -316,8 +315,12 @@ def generate_config():
# This configuration file is formatted in YAML. Simply write the value (at least one space) after the colon.
# Edit the values in this configuration file and then rerun the pipeline: "toil-rnaseq run"
# Just Kallisto or STAR/RSEM can be run by supplying only the inputs to those tools
+#
+# URLs can take the form: http://, file://, s3://, gnos://
# Local inputs follow the URL convention: file:///full/path/to/input
-# Comments (beginning with #) do not need to be removed. Optional parameters may be left blank.
+# S3 URLs follow the convention: s3://bucket/directory/file.txt
+#
+# Comments (beginning with #) do not need to be removed. Optional parameters left blank are treated as false.
##############################################################################################################
# Required: URL {scheme} to index tarball used by STAR
star-index: s3://cgl-pipeline-inputs/rnaseq_cgl/starIndex_hg38_no_alt.tar.gz
@@ -328,13 +331,10 @@ def generate_config():
# Required: URL {scheme} to reference tarball used by RSEM
rsem-ref: s3://cgl-pipeline-inputs/rnaseq_cgl/rsem_ref_hg38_no_alt.tar.gz
-# NOTE: Pipeline requires at least one output option
-# Optional: Provide a full file path (/path/to/output-dir) where results will appear
+# Required: Output location of sample. Can be full path to a directory or an s3:// URL
+# Warning: S3 buckets must exist prior to upload or it will fail.
output-dir:
-# Optional: Provide an s3 path (s3://bucket/dir) where results will appear
-s3-output-dir:
# Optional: If true, will preprocess samples with cutadapt using adapter sequences.
cutadapt: true
@@ -493,8 +493,7 @@ def main():
if config.star_index or config.rsem_ref:
require(config.star_index and config.rsem_ref, 'Input provided for STAR or RSEM but not both. STAR: '
'{}, RSEM: {}'.format(config.star_index, config.rsem_ref))
-require(config.output_dir or config.s3_output_dir, 'output-dir AND/OR s3-output-dir need to be defined, '
-'otherwise sample output is not stored anywhere!')
+require(config.output_dir, 'No output location specified: {}'.format(config.output_dir))
for input in [x for x in [config.kallisto_index, config.star_index, config.rsem_ref] if x]:
require(urlparse(input).scheme in schemes,
'Input in config must have the appropriate URL prefix: {}'.format(schemes))
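One subtlety in the rsem_quantification hunk above: BAM delivery is gated on the SSE-C key as well as the URL scheme. A sketch of the resulting behavior (helper names from the diff; the comment on the fallthrough is an observation about the code as written, not a documented guarantee):

from urlparse import urlparse  # Python 2

def deliver_bam(bam_path, output_dir, ssec):
    # Encrypted upload only when the destination is s3:// AND a key is set
    if urlparse(output_dir).scheme == 's3' and ssec:
        s3am_upload(fpath=bam_path, s3_dir=output_dir, s3_key_path=ssec)
    else:
        # An s3:// output_dir without an ssec key also lands here, where a
        # plain local copy is attempted.
        copy_files(file_paths=[bam_path], output_dir=output_dir)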
3 changes: 1 addition & 2 deletions src/toil_scripts/rnaseq_cgl/test/test_rnaseq_cgl.py
@@ -89,8 +89,7 @@ def _generate_config(self):
star-index: {input_dir}/starIndex_chr6.tar.gz
kallisto-index: s3://cgl-pipeline-inputs/rnaseq_cgl/kallisto_hg38.idx
rsem-ref: {input_dir}/rsem_ref_chr6.tar.gz
-output-dir:
-s3-output-dir: {output_dir}
+output-dir: {output_dir}
fastqc: true
cutadapt:
ssec:
2 changes: 1 addition & 1 deletion src/toil_scripts/tools/aligners.py
@@ -52,7 +52,7 @@ def run_star(job, cores, r1_id, r2_id, star_index_url, wiggle=False):
job.fileStore.readGlobalFile(r2_id, os.path.join(work_dir, 'R2.fastq'))
parameters.extend(['--readFilesIn', '/data/R1.fastq', '/data/R2.fastq'])
else:
-job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1_cutadapt.fastq'))
+job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
parameters.extend(['--readFilesIn', '/data/R1.fastq'])
# Call: STAR Mapping
docker_call(tool='quay.io/ucsc_cgl/star:2.4.2a--bcbd5122b69ff6ac4ef61958e47bde94001cfe80',
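The fix above restores the invariant these docker_call tools rely on: the job's work_dir is mounted at /data inside the container, so the basename passed to readGlobalFile must match the /data/ path handed to the tool. Previously the reads landed in R1_cutadapt.fastq on the host while STAR was told to open /data/R1.fastq. A sketch of the convention with a hypothetical helper (not in the repo), assuming the work_dir-to-/data mount:

import os

def stage_for_container(job, file_id, work_dir, basename):
    # Host file and container path must share one basename
    job.fileStore.readGlobalFile(file_id, os.path.join(work_dir, basename))
    return '/data/' + basename

# e.g. parameters.extend(['--readFilesIn', stage_for_container(job, r1_id, work_dir, 'R1.fastq')])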
4 changes: 2 additions & 2 deletions src/toil_scripts/tools/indexing.py
@@ -14,10 +14,10 @@ def run_bwa_index(job, ref_id):
"""
job.fileStore.logToMaster('Created BWA index files')
work_dir = job.fileStore.getLocalTempDir()
-job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fasta'))
+job.fileStore.readGlobalFile(ref_id, os.path.join(work_dir, 'ref.fa'))
command = ['index', '/data/ref.fa']
docker_call(work_dir=work_dir, parameters=command,
-tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e')
+tool='quay.io/ucsc_cgl/bwa:0.7.12--256539928ea162949d8a65ca5c79a72ef557ce7c')
ids = {}
for output in ['ref.fa.amb', 'ref.fa.ann', 'ref.fa.bwt', 'ref.fa.pac', 'ref.fa.sa']:
ids[output.split('.')[-1]] = (job.fileStore.writeGlobalFile(os.path.join(work_dir, output)))
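Two bugs fall together here: the reference was staged as ref.fasta while the command referenced /data/ref.fa, and the command ran in the samtools image instead of the bwa image. Assuming these quay.io/ucsc_cgl images use the tool binary as their entrypoint, the parameters ['index', '/data/ref.fa'] would have invoked samtools index, which indexes BAM files and never emits BWA's five index files, so the writeGlobalFile loop would fail on missing outputs. A hypothetical sanity check (not in the repo) makes that concrete:

import os

def check_bwa_index_outputs(work_dir):
    # bwa index writes these alongside the staged ref.fa
    expected = ['ref.fa.' + ext for ext in ('amb', 'ann', 'bwt', 'pac', 'sa')]
    missing = [f for f in expected if not os.path.exists(os.path.join(work_dir, f))]
    if missing:
        raise RuntimeError('BWA index files not produced: {}'.format(missing))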
2 changes: 1 addition & 1 deletion src/toil_scripts/tools/preprocessing.py
@@ -31,7 +31,7 @@ def run_cutadapt(job, r1_id, r2_id, fwd_3pr_adapter, rev_3pr_adapter):
'/data/R1.fastq', '/data/R2.fastq'])
else:
job.fileStore.readGlobalFile(r1_id, os.path.join(work_dir, 'R1.fastq'))
-parameters.extend(['-o', '/data/R1.fastq', '/data/R1.fastq'])
+parameters.extend(['-o', '/data/R1_cutadapt.fastq', '/data/R1.fastq'])
# Call: CutAdapt
docker_call(tool='quay.io/ucsc_cgl/cutadapt:1.9--6bd44edd2b8f8f17e25c5a268fedaab65fa851d2',
work_dir=work_dir, parameters=parameters)
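With the old parameters, cutadapt's -o flag pointed at its own input (-o /data/R1.fastq /data/R1.fastq), so the tool would clobber the file it was reading; the fix writes trimmed reads to a distinct name. A sketch of the single-end parameter list, assuming the base parameters use cutadapt's standard -a flag for the forward 3' adapter (the hunk does not show them):

def single_end_cutadapt_params(fwd_3pr_adapter):
    return ['-a', fwd_3pr_adapter,            # 3' adapter, per run_cutadapt's signature
            '-o', '/data/R1_cutadapt.fastq',  # output no longer overwrites the input
            '/data/R1.fastq']                 # input read file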

