-
Notifications
You must be signed in to change notification settings - Fork 21
/
maf_fillout.py
executable file
·73 lines (62 loc) · 2.95 KB
/
maf_fillout.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
##########################################################################################
# MSKCC CMO
descr = 'Perform fillout operation on MAF file using GetBaseCountsMultiSample'
##########################################################################################
import argparse
import subprocess
import os
import string
import uuid
import cmo
parser = argparse.ArgumentParser(description = descr, formatter_class = argparse.RawTextHelpFormatter)
parser.add_argument('-m', '--maf', help = 'MAF file on which to filllout', required = True)
parser.add_argument('-b', '--bams', help = 'BAM files to fillout with', required = True, nargs='+')
parser.add_argument('-g', '--genome', help = 'Reference assembly of BAM files, e.g. hg19/grch37/b37', required = True, choices = cmo.util.genomes.keys())
parser.add_argument('-o', '--output', help = 'Prefix for output file', required = False)
parser.add_argument('-n', '--n_threads', help = 'Multithread', default = 10, required = False)
parser.add_argument('-mo', '--maf_output', help = 'Output MAF (boolean)', action = 'store_true', default = False, required = False)
args = parser.parse_args()
maf = args.maf
bams = args.bams
genome = args.genome.lower()
n = args.n_threads
mo = args.maf_output
if args.output is None:
output = os.path.splitext(os.path.basename(maf))[0]+'.fillout'
else:
output = args.output
### Path to GetBaseCountsMultiSample
#should be cmo.util.programs['GetBaseCountsMultiSample]['1.0.0'], but zeng hasn't packaged this for release yet
gbcmPath = '/opt/common/CentOS_6-dev/getbasecountsmultisample/v1.2.2/GetBaseCountsMultiSample'
### Set genome path
genomePath = cmo.util.genomes[args.genome]['fasta']
### Parse BAM files into string
bamString = []
for bam in bams:
bamString.append('--bam '+os.path.splitext(os.path.basename(bam))[0]+':'+bam)
bamString = string.join(bamString)
### Check if MAF has right genome
mafGenome = subprocess.check_output('grep -v ^# '+maf+' | tail -1 | cut -f4', shell = True)
print 'Using '+genome
print 'MAF genome seems to be '+mafGenome.strip().lower()
if mafGenome.strip().lower() is not genome.lower():
print 'Genome build different from that in MAF file, might fail'
### Check if genome in BAM header
for bam in bams:
try:
out = subprocess.check_output('samtools view -H '+bam+' | grep '+genome, shell = True)
except subprocess.CalledProcessError:
print 'Genome in '+bam+' does not agree with input genome'
### Make a temporary simplified MAF
tmpMaf = uuid.uuid4().hex+'_tmp.maf'
uniqRscript = os.path.dirname(os.path.realpath(__file__))+'/maf_uniq_tags.R'
uniqRCall = uniqRscript+' '+maf+' > '+tmpMaf
subprocess.call(uniqRCall, shell = True)
### Call GetBaseCountsMultiSample
gbcmCall = gbcmPath+' --thread %s --filter_improper_pair 0 --fasta %s --maf %s --output %s %s' % (n, genomePath, tmpMaf, output, bamString)
if mo:
gbcmCall += ' --omaf'
subprocess.call(gbcmCall, shell = True)
### Remove temporary MAF
subprocess.call('rm -f '+tmpMaf, shell = True)