forked from gatk-workflows/gatk4-somatic-snvs-indels
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mutect2.wdl
1099 lines (964 loc) · 42.1 KB
/
mutect2.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
version 1.0
## Copyright Broad Institute, 2017
##
## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample,
## and performs additional filtering and functional annotation tasks.
##
## Main requirements/expectations :
## - One analysis-ready BAM file (and its index) for each sample
##
## Description of inputs:
##
## ** Runtime **
## gatk_docker: docker image to use for GATK 4 Mutect2
## preemptible: how many preemptions to tolerate before switching to a non-preemptible machine (on Google)
## max_retries: how many times to retry failed tasks -- very important on the cloud when there are transient errors
## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar
## in the docker image. This must be supplied when running in an environment that does not support docker
## (e.g. SGE cluster on a Broad on-prem VM)
##
## ** Workflow options **
## intervals: genomic intervals (will be used for scatter)
## scatter_count: number of parallel jobs to generate when scattering over intervals
## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional)
## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional)
## run_orientation_bias_mixture_model_filter: (optional) if true, filter orientation bias sites with the read orientation artifact mixture model.
##
## ** Primary inputs **
## ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary
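## tumor_reads, tumor_reads_index: BAM or CRAM (and its index) for the tumor sample; CRAM input is converted to BAM by the workflow
## normal_reads, normal_reads_index: (optional) BAM or CRAM (and its index) for the normal sample; CRAM input is converted to BAM by the workflow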
##
## ** Primary resources ** (optional but strongly recommended)
## pon, pon_idx: optional panel of normals (and its index) in VCF format containing probable technical artifacts (false positives)
## gnomad, gnomad_idx: optional database of known germline variants (and its index) (see http://gnomad.broadinstitute.org/downloads)
## variants_for_contamination, variants_for_contamination_idx: VCF of common variants (and its index) with allele frequencies for calculating contamination
##
## ** Secondary resources ** (for optional tasks)
## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator.
##
## Funcotator parameters (see Funcotator help for more details).
## funco_reference_version: "hg19" for hg19 or b37. "hg38" for hg38. Default: "hg19"
## funco_output_format: "MAF" to produce a MAF file, "VCF" to produce a VCF file. Default: "MAF"
## funco_compress: (Only valid if funco_output_format == "VCF" ) If true, will compress the output of Funcotator. If false, produces an uncompressed output file. Default: false
## funco_use_gnomad_AF: If true, will include gnomAD allele frequency annotations in output by connecting to the internet to query gnomAD (this impacts performance). If false, will not annotate with gnomAD. Default: false
## funco_transcript_selection_mode: How to select transcripts in Funcotator. ALL, CANONICAL, or BEST_EFFECT
## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process.
## funco_data_sources_tar_gz: Funcotator datasources tar gz file. Bucket location is recommended when running on the cloud.
## funco_annotation_defaults: Default values for annotations, when values are unspecified. Specified as <ANNOTATION>:<VALUE>. For example: "Center:Broad"
## funco_annotation_overrides: Values for annotations that replace any existing values, even when values are already specified. Specified as <ANNOTATION>:<VALUE>. For example: "Center:Broad"
## funcotator_excluded_fields: Annotations that should not appear in the output (VCF or MAF). Specified as <ANNOTATION>. For example: "ClinVar_ALLELEID"
## funco_filter_funcotations: If true, will only annotate variants that have passed filtering (. or PASS value in the FILTER column). If false, will annotate all variants in the input file. Default: true
## funcotator_extra_args: Any additional arguments to pass to Funcotator. Default: ""
##
## Outputs :
## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam
## file of reassembled reads if requested
##
## Cromwell version support
## - Successfully tested on v34
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information
## pertaining to the included programs.
# Bundle of runtime settings handed to the small GATK tasks in this file via
# their `runtime_params` input.
struct Runtime {
    # Container image, plus an optional jar exported as GATK_LOCAL_JAR by the tasks
    String gatk_docker
    File? gatk_override

    # Retry policy (mapped to the `maxRetries` and `preemptible` runtime attributes)
    Int max_retries
    Int preemptible

    # Machine shape. machine_mem is used as "<n> MB" for the `memory` attribute and
    # command_mem as the JVM heap ("-Xmx<n>m"), so both are expressed in MB.
    Int cpu
    Int machine_mem
    Int command_mem

    # Sizes passed to the `local-disk` and `bootDiskSizeGb` runtime attributes
    Int disk
    Int boot_disk_size
}
# Mutect2 somatic calling workflow.
#
# Steps, in order:
#   1. Convert the tumor (and optional normal) reads from CRAM to BAM when the file name ends in ".cram".
#   2. Split the intervals into `scatter_count` shards and run the M2 task on each shard.
#   3. Merge the per-shard VCFs, stats, optional bamouts and pileup summaries.
#   4. Optionally learn the read orientation model and calculate contamination.
#   5. Run Filter, then optionally FilterAlignmentArtifacts and Funcotate.
workflow Mutect2 {
    input {
        # Mutect2 inputs
        File? intervals
        File ref_fasta
        File ref_fai
        File ref_dict
        File tumor_reads
        File tumor_reads_index
        File? normal_reads
        File? normal_reads_index
        File? pon
        File? pon_idx
        Int scatter_count
        File? gnomad
        File? gnomad_idx
        File? variants_for_contamination
        File? variants_for_contamination_idx
        File? realignment_index_bundle
        String? realignment_extra_args
        Boolean? run_orientation_bias_mixture_model_filter
        String? m2_extra_args
        String? m2_extra_filtering_args
        String? split_intervals_extra_args
        Boolean? make_bamout
        Boolean? compress_vcfs
        File? gga_vcf
        File? gga_vcf_idx

        # Funcotator inputs
        Boolean? run_funcotator
        String? sequencing_center
        String? sequence_source
        String? funco_reference_version
        String? funco_output_format
        Boolean? funco_compress
        Boolean? funco_use_gnomad_AF
        File? funco_data_sources_tar_gz
        String? funco_transcript_selection_mode
        File? funco_transcript_selection_list
        Array[String]? funco_annotation_defaults
        Array[String]? funco_annotation_overrides
        Array[String]? funcotator_excluded_fields
        Boolean? funco_filter_funcotations
        String? funcotator_extra_args

        String funco_default_output_format = "MAF"

        # runtime
        String gatk_docker
        File? gatk_override
        String basic_bash_docker = "ubuntu:16.04"
        Boolean? filter_funcotations

        Int? preemptible
        Int? max_retries
        Int small_task_cpu = 2
        Int small_task_mem = 4
        Int small_task_disk = 100
        Int boot_disk_size = 12
        Int learn_read_orientation_mem = 8000
        Int filter_alignment_artifacts_mem = 9000

        # Use as a last resort to increase the disk given to every task in case of ill behaving data
        Int? emergency_extra_disk

        # These multipliers are applied to input sizes to make sure we have enough disk to accommodate possible output sizes
        # Large is for Bams/WGS vcfs
        # Small is for metrics/other vcfs
        Float large_input_to_output_multiplier = 2.25
        Float small_input_to_output_multiplier = 2.0
        Float cram_to_bam_multiplier = 6.0
    }

    # Resolve optional switches to concrete values
    Int preemptible_or_default = select_first([preemptible, 2])
    Int max_retries_or_default = select_first([max_retries, 2])

    Boolean compress = select_first([compress_vcfs, false])
    Boolean run_ob_filter = select_first([run_orientation_bias_mixture_model_filter, false])
    Boolean make_bamout_or_default = select_first([make_bamout, false])
    Boolean run_funcotator_or_default = select_first([run_funcotator, false])
    Boolean filter_funcotations_or_default = select_first([filter_funcotations, true])

    # Disk sizes used for dynamic sizing
    Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB"))
    Int tumor_reads_size = ceil(size(tumor_reads, "GB") + size(tumor_reads_index, "GB"))
    Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB")) else 0
    Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0

    # If no tar is provided, the task downloads one from broads ftp server
    Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100
    Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0

    # This is added to every task as padding, should increase if systematically you need more disk for every call
    Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0])

    # logic about output file names -- these are the names *without* .vcf extensions
    String output_basename = basename(basename(tumor_reads, ".bam"),".cram")  #hacky way to strip either .bam or .cram
    String unfiltered_name = output_basename + "-unfiltered"
    String filtered_name = output_basename + "-filtered"
    String funcotated_name = output_basename + "-funcotated"
    String output_vcf_name = output_basename + ".vcf"

    Int tumor_cram_to_bam_disk = ceil(tumor_reads_size * cram_to_bam_multiplier)
    Int normal_cram_to_bam_disk = ceil(normal_reads_size * cram_to_bam_multiplier)

    # Shared runtime settings for the small tasks; small_task_mem is scaled by 1000 to get MB,
    # and the JVM heap (command_mem) is kept 500 MB below the machine memory.
    Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override,
            "max_retries": max_retries_or_default, "preemptible": preemptible_or_default, "cpu": small_task_cpu,
            "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500,
            "disk": small_task_disk + disk_pad, "boot_disk_size": boot_disk_size}

    # basename(x) differs from basename(x, ".cram") only when x ends in ".cram"
    if (basename(tumor_reads) != basename(tumor_reads, ".cram")) {
        call CramToBam as TumorCramToBam {
            input:
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                cram = tumor_reads,
                crai = tumor_reads_index,
                name = output_basename,
                disk_size = tumor_cram_to_bam_disk
        }
    }

    # An absent normal becomes "", which never ends in ".cram", so the conversion is skipped
    String normal_or_empty = select_first([normal_reads, ""])
    if (basename(normal_or_empty) != basename(normal_or_empty, ".cram")) {
        String normal_basename = basename(basename(normal_or_empty, ".bam"),".cram")
        call CramToBam as NormalCramToBam {
            input:
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                cram = normal_reads,
                crai = normal_reads_index,
                name = normal_basename,
                disk_size = normal_cram_to_bam_disk
        }
    }

    # Prefer the converted BAMs when a conversion ran, otherwise use the inputs as given
    File tumor_bam = select_first([TumorCramToBam.output_bam, tumor_reads])
    File tumor_bai = select_first([TumorCramToBam.output_bai, tumor_reads_index])
    File? normal_bam = if defined(normal_reads) then select_first([NormalCramToBam.output_bam, normal_reads]) else normal_reads
    File? normal_bai = if defined(normal_reads) then select_first([NormalCramToBam.output_bai, normal_reads_index]) else normal_reads_index

    Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bai, "GB"))
    Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0

    Int m2_output_size = tumor_bam_size / scatter_count
    #TODO: do we need to change this disk size now that NIO is always going to happen (for the google backend only)
    Int m2_per_scatter_size = (tumor_bam_size + normal_bam_size) + ref_size + gnomad_vcf_size + m2_output_size + disk_pad

    call SplitIntervals {
        input:
            intervals = intervals,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            scatter_count = scatter_count,
            split_intervals_extra_args = split_intervals_extra_args,
            runtime_params = standard_runtime
    }

    scatter (subintervals in SplitIntervals.interval_files ) {
        # NOTE(review): M2 receives the raw optional preemptible/max_retries, so it falls back to its
        # own defaults rather than preemptible_or_default/max_retries_or_default -- confirm this is intended.
        call M2 {
            input:
                intervals = subintervals,
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                tumor_bam = tumor_bam,
                tumor_bai = tumor_bai,
                normal_bam = normal_bam,
                normal_bai = normal_bai,
                pon = pon,
                pon_idx = pon_idx,
                gnomad = gnomad,
                gnomad_idx = gnomad_idx,
                preemptible = preemptible,
                max_retries = max_retries,
                m2_extra_args = m2_extra_args,
                variants_for_contamination = variants_for_contamination,
                variants_for_contamination_idx = variants_for_contamination_idx,
                make_bamout = make_bamout_or_default,
                run_ob_filter = run_ob_filter,
                compress = compress,
                gga_vcf = gga_vcf,
                gga_vcf_idx = gga_vcf_idx,
                gatk_override = gatk_override,
                gatk_docker = gatk_docker,
                disk_space = m2_per_scatter_size
        }
    }

    Int merged_vcf_size = ceil(size(M2.unfiltered_vcf, "GB"))
    Int merged_bamout_size = ceil(size(M2.output_bamOut, "GB"))
    Int merged_tumor_pileups_size = ceil(size(M2.tumor_pileups, "GB"))
    # Sized from the normal pileups (previously measured the tumor pileups by mistake)
    Int merged_normal_pileups_size = ceil(size(M2.normal_pileups, "GB"))

    if (run_ob_filter) {
        call LearnReadOrientationModel {
            input:
                f1r2_tar_gz = M2.f1r2_counts,
                runtime_params = standard_runtime,
                mem = learn_read_orientation_mem
        }
    }

    call MergeVCFs {
        input:
            input_vcfs = M2.unfiltered_vcf,
            input_vcf_indices = M2.unfiltered_vcf_idx,
            output_name = unfiltered_name,
            compress = compress,
            runtime_params = standard_runtime
    }

    if (make_bamout_or_default) {
        call MergeBamOuts {
            input:
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                bam_outs = M2.output_bamOut,
                output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"),
                runtime_params = standard_runtime,
                disk_space = ceil(merged_bamout_size * large_input_to_output_multiplier) + disk_pad
        }
    }

    call MergeStats { input: stats = M2.stats, runtime_params = standard_runtime }

    if (defined(variants_for_contamination)) {
        call MergePileupSummaries as MergeTumorPileups {
            input:
                input_tables = M2.tumor_pileups,
                output_name = output_basename,
                ref_dict = ref_dict,
                runtime_params = standard_runtime
        }

        if (defined(normal_bam)){
            call MergePileupSummaries as MergeNormalPileups {
                input:
                    input_tables = M2.normal_pileups,
                    output_name = output_basename,
                    ref_dict = ref_dict,
                    runtime_params = standard_runtime
            }
        }

        call CalculateContamination {
            input:
                tumor_pileups = MergeTumorPileups.merged_table,
                normal_pileups = MergeNormalPileups.merged_table,
                runtime_params = standard_runtime
        }
    }

    # Outputs of the conditional calls above are optional and are simply omitted when they did not run
    call Filter {
        input:
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            intervals = intervals,
            unfiltered_vcf = MergeVCFs.merged_vcf,
            unfiltered_vcf_idx = MergeVCFs.merged_vcf_idx,
            output_name = filtered_name,
            compress = compress,
            mutect_stats = MergeStats.merged_stats,
            contamination_table = CalculateContamination.contamination_table,
            maf_segments = CalculateContamination.maf_segments,
            artifact_priors_tar_gz = LearnReadOrientationModel.artifact_prior_table,
            m2_extra_filtering_args = m2_extra_filtering_args,
            runtime_params = standard_runtime,
            disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad
    }

    if (defined(realignment_index_bundle)) {
        call FilterAlignmentArtifacts {
            input:
                bam = tumor_bam,
                bai = tumor_bai,
                realignment_index_bundle = select_first([realignment_index_bundle]),
                realignment_extra_args = realignment_extra_args,
                compress = compress,
                output_name = filtered_name,
                input_vcf = Filter.filtered_vcf,
                input_vcf_idx = Filter.filtered_vcf_idx,
                runtime_params = standard_runtime,
                mem = filter_alignment_artifacts_mem
        }
    }

    if (run_funcotator_or_default) {
        # Annotate the most-filtered VCF available
        File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf])
        File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx])
        call Funcotate {
            input:
                ref_fasta = ref_fasta,
                ref_fai = ref_fai,
                ref_dict = ref_dict,
                input_vcf = funcotate_vcf_input,
                input_vcf_idx = funcotate_vcf_input_index,
                reference_version = select_first([funco_reference_version, "hg19"]),
                output_file_base_name = basename(funcotate_vcf_input, ".vcf") + ".annotated",
                output_format = select_first([funco_output_format, funco_default_output_format]),
                compress = select_first([funco_compress, false]),
                use_gnomad = select_first([funco_use_gnomad_AF, false]),
                data_sources_tar_gz = funco_data_sources_tar_gz,
                case_id = M2.tumor_sample[0],
                control_id = M2.normal_sample[0],
                sequencing_center = sequencing_center,
                sequence_source = sequence_source,
                transcript_selection_mode = funco_transcript_selection_mode,
                transcript_selection_list = funco_transcript_selection_list,
                annotation_defaults = funco_annotation_defaults,
                annotation_overrides = funco_annotation_overrides,
                funcotator_excluded_fields = funcotator_excluded_fields,
                filter_funcotations = filter_funcotations_or_default,
                extra_args = funcotator_extra_args,
                runtime_params = standard_runtime,
                disk_space = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier)  + funco_tar_size + disk_pad
        }
    }

    output {
        File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf])
        File filtered_vcf_idx = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx])
        File filtering_stats = Filter.filtering_stats
        File mutect_stats = MergeStats.merged_stats
        File? contamination_table = CalculateContamination.contamination_table

        File? funcotated_file = Funcotate.funcotated_output_file
        File? funcotated_file_index = Funcotate.funcotated_output_file_index
        File? bamout = MergeBamOuts.merged_bam_out
        File? bamout_index = MergeBamOuts.merged_bam_out_index
        File? maf_segments = CalculateContamination.maf_segments
        File? read_orientation_model_params = LearnReadOrientationModel.artifact_prior_table
    }
}
# Converts a CRAM to a coordinate-indexed BAM with samtools.
# Outputs: ~{name}.bam and its index ~{name}.bai.
task CramToBam {
    input {
        File ref_fasta
        File ref_fai
        File ref_dict
        #cram and crai must be optional since Normal cram is optional
        File? cram
        File? crai
        String name
        Int disk_size
        Int? mem
    }

    # mem is scaled by 1000 and requested as MB below; 6 gives the original 6000 MB default.
    # select_first unwraps the optional before the multiplication, since arithmetic on Int? is
    # not a defined operation in WDL 1.0 and is rejected by strict type checkers.
    Int machine_mem = select_first([mem, 6]) * 1000

    #Calls samtools view to do the conversion
    command {
        #Set -e and -o says if any command I run fails in this script, make sure to return a failure
        set -e
        set -o pipefail

        samtools view -h -T ~{ref_fasta} ~{cram} |
            samtools view -b -o ~{name}.bam -
        samtools index -b ~{name}.bam
        mv ~{name}.bam.bai ~{name}.bai
    }

    runtime {
        docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.3-1513176735"
        memory: machine_mem + " MB"
        disks: "local-disk " + disk_size + " HDD"
    }

    output {
        File output_bam = "~{name}.bam"
        File output_bai = "~{name}.bai"
    }
}
# Runs GATK SplitIntervals to produce the interval lists that the workflow scatters over.
# Output: every *.interval_list file written by the tool, copied into the working directory.
task SplitIntervals {
    input {
        File? intervals
        File ref_fasta
        File ref_fai
        File ref_dict
        Int scatter_count
        String? split_intervals_extra_args

        # runtime
        Runtime runtime_params
    }

    command {
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override}

        mkdir interval-files
        gatk --java-options "-Xmx~{runtime_params.command_mem}m" SplitIntervals \
            -R ~{ref_fasta} \
            ~{"-L " + intervals} \
            -scatter ~{scatter_count} \
            -O interval-files \
            ~{split_intervals_extra_args}
        cp interval-files/*.interval_list .
    }

    output {
        Array[File] interval_files = glob("*.interval_list")
    }

    runtime {
        docker: runtime_params.gatk_docker
        cpu: runtime_params.cpu
        memory: runtime_params.machine_mem + " MB"
        disks: "local-disk " + runtime_params.disk + " HDD"
        bootDiskSizeGb: runtime_params.boot_disk_size
        preemptible: runtime_params.preemptible
        maxRetries: runtime_params.max_retries
    }
}
# Runs Mutect2 on one interval shard and, when variants_for_contamination is given,
# GetPileupSummaries for the tumor (and the normal, if present).
# Placeholder files are always created because every output below is non-optional.
task M2 {
    input {
        File? intervals
        File ref_fasta
        File ref_fai
        File ref_dict
        File tumor_bam
        File tumor_bai
        File? normal_bam
        File? normal_bai
        File? pon
        File? pon_idx
        File? gnomad
        File? gnomad_idx
        String? m2_extra_args
        Boolean? make_bamout
        Boolean? run_ob_filter
        Boolean compress
        File? gga_vcf
        File? gga_vcf_idx
        File? variants_for_contamination
        File? variants_for_contamination_idx

        File? gatk_override

        # runtime
        String gatk_docker
        Int? mem
        Int? preemptible
        Int? max_retries
        Int? disk_space
        Int? cpu
        Boolean use_ssd = false
    }

    String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx"

    String output_stats = output_vcf + ".stats"

    # Mem is in units of GB but our command and memory runtime values are in MB
    # select_first unwraps the optional before multiplying: arithmetic on Int? is not a defined
    # operation in WDL 1.0 and is rejected by strict type checkers.
    Int machine_mem = if defined(mem) then select_first([mem]) * 1000 else 3500
    Int command_mem = machine_mem - 500

    parameter_meta{
        intervals: {localization_optional: true}
        ref_fasta: {localization_optional: true}
        ref_fai: {localization_optional: true}
        ref_dict: {localization_optional: true}
        tumor_bam: {localization_optional: true}
        tumor_bai: {localization_optional: true}
        normal_bam: {localization_optional: true}
        normal_bai: {localization_optional: true}
        pon: {localization_optional: true}
        pon_idx: {localization_optional: true}
        gnomad: {localization_optional: true}
        gnomad_idx: {localization_optional: true}
        gga_vcf: {localization_optional: true}
        gga_vcf_idx: {localization_optional: true}
        variants_for_contamination: {localization_optional: true}
        variants_for_contamination_idx: {localization_optional: true}
    }

    command <<<
        set -e

        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

        # We need to create these files regardless, even if they stay empty
        touch bamout.bam
        touch f1r2.tar.gz
        echo "" > normal_name.txt

        gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{tumor_bam} -O tumor_name.txt -encode
        tumor_command_line="-I ~{tumor_bam} -tumor `cat tumor_name.txt`"

        if [[ ! -z "~{normal_bam}" ]]; then
            gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_bam} -O normal_name.txt -encode
            normal_command_line="-I ~{normal_bam} -normal `cat normal_name.txt`"
        fi

        gatk --java-options "-Xmx~{command_mem}m" Mutect2 \
            -R ~{ref_fasta} \
            $tumor_command_line \
            $normal_command_line \
            ~{"--germline-resource " + gnomad} \
            ~{"-pon " + pon} \
            ~{"-L " + intervals} \
            ~{"--alleles " + gga_vcf} \
            -O "~{output_vcf}" \
            ~{true='--bam-output bamout.bam' false='' make_bamout} \
            ~{true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \
            ~{m2_extra_args}

        ### GetPileupSummaries
        # These must be created, even if they remain empty, as cromwell doesn't support optional output
        touch tumor-pileups.table
        touch normal-pileups.table

        if [[ ! -z "~{variants_for_contamination}" ]]; then
            gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries -R ~{ref_fasta} -I ~{tumor_bam} ~{"--interval-set-rule INTERSECTION -L " + intervals} \
                -V ~{variants_for_contamination} -L ~{variants_for_contamination} -O tumor-pileups.table

            if [[ ! -z "~{normal_bam}" ]]; then
                gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries -R ~{ref_fasta} -I ~{normal_bam} ~{"--interval-set-rule INTERSECTION -L " + intervals} \
                    -V ~{variants_for_contamination} -L ~{variants_for_contamination} -O normal-pileups.table
            fi
        fi
    >>>

    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible, 10])
        maxRetries: select_first([max_retries, 0])
        cpu: select_first([cpu, 1])
    }

    output {
        File unfiltered_vcf = "~{output_vcf}"
        File unfiltered_vcf_idx = "~{output_vcf_idx}"
        File output_bamOut = "bamout.bam"
        String tumor_sample = read_string("tumor_name.txt")
        String normal_sample = read_string("normal_name.txt")
        File stats = "~{output_stats}"
        File f1r2_counts = "f1r2.tar.gz"
        File tumor_pileups = "tumor-pileups.table"
        File normal_pileups = "normal-pileups.table"
    }
}
# Merges the per-shard VCFs into a single VCF named after output_name.
# The index inputs are not referenced by the command itself.
task MergeVCFs {
    input {
        Array[File] input_vcfs
        Array[File] input_vcf_indices
        String output_name
        Boolean compress
        Runtime runtime_params
    }

    # Extension and index suffix follow the compression choice
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx"

    # using MergeVcfs instead of GatherVcfs so we can create indices
    # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs.
    command {
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override}
        gatk --java-options "-Xmx~{runtime_params.command_mem}m" MergeVcfs -I ~{sep=' -I ' input_vcfs} -O ~{output_vcf}
    }

    output {
        File merged_vcf = "~{output_vcf}"
        File merged_vcf_idx = "~{output_vcf_idx}"
    }

    runtime {
        docker: runtime_params.gatk_docker
        cpu: runtime_params.cpu
        memory: runtime_params.machine_mem + " MB"
        disks: "local-disk " + runtime_params.disk + " HDD"
        bootDiskSizeGb: runtime_params.boot_disk_size
        preemptible: runtime_params.preemptible
        maxRetries: runtime_params.max_retries
    }
}
# Gathers the per-shard bamouts into one BAM, coordinate-sorts it and builds its index.
# Outputs: ~{output_vcf_name}.out.bam and ~{output_vcf_name}.out.bai.
task MergeBamOuts {
    input {
        File ref_fasta
        File ref_fai
        File ref_dict
        Array[File]+ bam_outs
        String output_vcf_name
        Runtime runtime_params
        Int? disk_space   #override to request more disk than default small task params
    }

    command <<<
        # This command block assumes that there is at least one file in bam_outs.
        #  Do not call this task if len(bam_outs) == 0
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override}
        gatk --java-options "-Xmx~{runtime_params.command_mem}m" GatherBamFiles \
            -I ~{sep=" -I " bam_outs} -O unsorted.out.bam -R ~{ref_fasta}

        # We must sort because adjacent scatters may have overlapping (padded) assembly regions, hence
        # overlapping bamouts
        gatk --java-options "-Xmx~{runtime_params.command_mem}m" SortSam -I unsorted.out.bam \
            -O ~{output_vcf_name}.out.bam \
            --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT
        gatk --java-options "-Xmx~{runtime_params.command_mem}m" BuildBamIndex -I ~{output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT
    >>>

    output {
        File merged_bam_out = "~{output_vcf_name}.out.bam"
        File merged_bam_out_index = "~{output_vcf_name}.out.bai"
    }

    runtime {
        docker: runtime_params.gatk_docker
        cpu: runtime_params.cpu
        memory: runtime_params.machine_mem + " MB"
        # disk_space, when supplied, takes precedence over the shared runtime disk size
        disks: "local-disk " + select_first([disk_space, runtime_params.disk]) + " HDD"
        bootDiskSizeGb: runtime_params.boot_disk_size
        preemptible: runtime_params.preemptible
        maxRetries: runtime_params.max_retries
    }
}
# Combines the per-shard Mutect2 stats files into a single file, merged.stats.
task MergeStats {
    input {
        Array[File]+ stats
        Runtime runtime_params
    }

    command {
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override}

        gatk --java-options "-Xmx~{runtime_params.command_mem}m" MergeMutectStats \
            -stats ~{sep=" -stats " stats} -O merged.stats
    }

    output {
        File merged_stats = "merged.stats"
    }

    runtime {
        docker: runtime_params.gatk_docker
        cpu: runtime_params.cpu
        memory: runtime_params.machine_mem + " MB"
        disks: "local-disk " + runtime_params.disk + " HDD"
        bootDiskSizeGb: runtime_params.boot_disk_size
        preemptible: runtime_params.preemptible
        maxRetries: runtime_params.max_retries
    }
}
# Gathers the per-shard pileup summary tables into one table, ~{output_name}.tsv.
task MergePileupSummaries {
    input {
        Array[File] input_tables
        String output_name
        File ref_dict
        Runtime runtime_params
    }

    command {
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override}

        gatk --java-options "-Xmx~{runtime_params.command_mem}m" GatherPileupSummaries \
            --sequence-dictionary ~{ref_dict} \
            -I ~{sep=' -I ' input_tables} \
            -O ~{output_name}.tsv
    }

    output {
        File merged_table = "~{output_name}.tsv"
    }

    runtime {
        docker: runtime_params.gatk_docker
        cpu: runtime_params.cpu
        memory: runtime_params.machine_mem + " MB"
        disks: "local-disk " + runtime_params.disk + " HDD"
        bootDiskSizeGb: runtime_params.boot_disk_size
        preemptible: runtime_params.preemptible
        maxRetries: runtime_params.max_retries
    }
}
# Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018
# Runs GATK LearnReadOrientationModel over the per-shard f1r2 archives and writes
# artifact-priors.tar.gz.
task LearnReadOrientationModel {
    input {
        Array[File] f1r2_tar_gz
        Runtime runtime_params
        Int? mem  #override memory
    }

    # mem, when given, replaces the shared machine memory (MB); the JVM heap is kept 1000 MB below it
    Int machine_mem = select_first([mem, runtime_params.machine_mem])
    Int command_mem = machine_mem - 1000

    command {
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override}

        gatk --java-options "-Xmx~{command_mem}m" LearnReadOrientationModel \
            -I ~{sep=" -I " f1r2_tar_gz} \
            -O "artifact-priors.tar.gz"
    }

    output {
        File artifact_prior_table = "artifact-priors.tar.gz"
    }

    runtime {
        docker: runtime_params.gatk_docker
        cpu: runtime_params.cpu
        memory: machine_mem + " MB"
        disks: "local-disk " + runtime_params.disk + " HDD"
        bootDiskSizeGb: runtime_params.boot_disk_size
        preemptible: runtime_params.preemptible
        maxRetries: runtime_params.max_retries
    }
}
# Runs GATK CalculateContamination on the merged tumor pileups, adding the normal pileups
# as "-matched" when they are supplied.
# Outputs: contamination.table and segments.table.
task CalculateContamination {
    input {
        # NOTE(review): intervals is accepted but not referenced by the command below
        String? intervals
        File tumor_pileups
        File? normal_pileups
        Runtime runtime_params
    }

    command {
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override}

        gatk --java-options "-Xmx~{runtime_params.command_mem}m" CalculateContamination -I ~{tumor_pileups} \
            -O contamination.table --tumor-segmentation segments.table ~{"-matched " + normal_pileups}
    }

    output {
        File contamination_table = "contamination.table"
        File maf_segments = "segments.table"
    }

    runtime {
        docker: runtime_params.gatk_docker
        cpu: runtime_params.cpu
        memory: runtime_params.machine_mem + " MB"
        disks: "local-disk " + runtime_params.disk + " HDD"
        bootDiskSizeGb: runtime_params.boot_disk_size
        preemptible: runtime_params.preemptible
        maxRetries: runtime_params.max_retries
    }
}
# Runs GATK FilterMutectCalls on the merged unfiltered VCF.
# The contamination table, tumor segmentation, orientation-bias priors and stats are each
# added to the command line only when the corresponding optional input is supplied.
# Outputs: the filtered VCF, its index, and filtering.stats.
task Filter {
    input {
        # NOTE(review): intervals is accepted but not referenced by the command below
        File? intervals
        File ref_fasta
        File ref_fai
        File ref_dict
        File unfiltered_vcf
        File unfiltered_vcf_idx
        String output_name
        Boolean compress
        File? mutect_stats
        File? artifact_priors_tar_gz
        File? contamination_table
        File? maf_segments
        String? m2_extra_filtering_args

        Runtime runtime_params
        Int? disk_space
    }

    # Extension and index suffix follow the compression choice
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx"

    parameter_meta{
        ref_fasta: {localization_optional: true}
        ref_fai: {localization_optional: true}
        ref_dict: {localization_optional: true}
    }

    command {
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override}

        gatk --java-options "-Xmx~{runtime_params.command_mem}m" FilterMutectCalls -V ~{unfiltered_vcf} \
            -R ~{ref_fasta} \
            -O ~{output_vcf} \
            ~{"--contamination-table " + contamination_table} \
            ~{"--tumor-segmentation " + maf_segments} \
            ~{"--ob-priors " + artifact_priors_tar_gz} \
            ~{"-stats " + mutect_stats} \
            --filtering-stats filtering.stats \
            ~{m2_extra_filtering_args}
    }

    output {
        File filtered_vcf = "~{output_vcf}"
        File filtered_vcf_idx = "~{output_vcf_idx}"
        File filtering_stats = "filtering.stats"
    }

    runtime {
        docker: runtime_params.gatk_docker
        cpu: runtime_params.cpu
        memory: runtime_params.machine_mem + " MB"
        # disk_space, when supplied, takes precedence over the shared runtime disk size
        disks: "local-disk " + select_first([disk_space, runtime_params.disk]) + " HDD"
        bootDiskSizeGb: runtime_params.boot_disk_size
        preemptible: runtime_params.preemptible
        maxRetries: runtime_params.max_retries
    }
}
# Runs GATK FilterAlignmentArtifacts on an already-filtered VCF, using the tumor BAM and
# a BWA-MEM index image.
# Outputs: the re-filtered VCF and its index.
task FilterAlignmentArtifacts {
    input {
        File input_vcf
        File input_vcf_idx
        File bam
        File bai
        String output_name
        Boolean compress
        File realignment_index_bundle
        String? realignment_extra_args
        Runtime runtime_params
        Int mem
    }

    # Extension and index suffix follow the compression choice
    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
    String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx"

    # mem is used directly as the machine memory in MB; the JVM heap is kept 500 MB below it
    Int machine_mem = mem
    Int command_mem = machine_mem - 500

    parameter_meta{
        input_vcf: {localization_optional: true}
        input_vcf_idx: {localization_optional: true}
        bam: {localization_optional: true}
        bai: {localization_optional: true}
    }

    command {
        set -e
        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override}

        gatk --java-options "-Xmx~{command_mem}m" FilterAlignmentArtifacts \
            -V ~{input_vcf} \
            -I ~{bam} \
            --bwa-mem-index-image ~{realignment_index_bundle} \
            ~{realignment_extra_args} \
            -O ~{output_vcf}
    }

    output {
        File filtered_vcf = "~{output_vcf}"
        File filtered_vcf_idx = "~{output_vcf_idx}"
    }

    runtime {
        docker: runtime_params.gatk_docker
        cpu: runtime_params.cpu
        memory: machine_mem + " MB"
        disks: "local-disk " + runtime_params.disk + " HDD"
        bootDiskSizeGb: runtime_params.boot_disk_size
        preemptible: runtime_params.preemptible
        maxRetries: runtime_params.max_retries
    }
}
task Funcotate {
input {
File ref_fasta
File ref_fai
File ref_dict
File input_vcf
File input_vcf_idx
String reference_version
String output_file_base_name
String output_format
Boolean compress
Boolean use_gnomad
# This should be updated when a new version of the data sources is released
# TODO: Make this dynamically chosen in the command.
File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz"
String? control_id
String? case_id
String? sequencing_center
String? sequence_source
String? transcript_selection_mode
File? transcript_selection_list
Array[String]? annotation_defaults
Array[String]? annotation_overrides
Array[String]? funcotator_excluded_fields
Boolean? filter_funcotations
File? interval_list
String? extra_args
# ==============
Runtime runtime_params
Int? disk_space #override to request more disk than default small task params