-
Notifications
You must be signed in to change notification settings - Fork 9
/
add_bam_readcount_to_vcf_helper.py
182 lines (161 loc) · 7.04 KB
/
add_bam_readcount_to_vcf_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python
import sys
import os
import re
import vcfpy
import tempfile
import csv
from collections import OrderedDict
def parse_brct_field(brcts):
parsed_brct = {}
for brct in brcts:
(base, count, rest) = brct.split(':', 2)
parsed_brct[base.upper()] = count
return parsed_brct
def parse_bam_readcount_file(bam_readcount_files, samples):
coverage = {}
for bam_readcount_file, sample in zip(bam_readcount_files, samples):
coverage[sample] = {}
with open(bam_readcount_file, 'r') as reader:
coverage_tsv_reader = csv.reader(reader, delimiter='\t')
for row in coverage_tsv_reader:
chromosome = row[0]
position = row[1]
reference_base = row[2].upper()
depth = row[3]
brct = row[4:]
if chromosome not in coverage[sample]:
coverage[sample][chromosome] = {}
if position not in coverage[sample][chromosome]:
coverage[sample][chromosome][position] = {}
coverage[sample][chromosome][position][reference_base] = parse_brct_field(brct)
coverage[sample][chromosome][position][reference_base]['depth'] = depth
return coverage
def is_insertion(ref, alt):
return len(alt) > len(ref)
def is_deletion(ref, alt):
return len(alt) < len(ref)
def simplify_indel_allele(ref, alt):
while len(ref)> 0 and len(alt) > 0 and ref[-1] == alt[-1]:
ref = ref[0:-1]
alt = alt[0:-1]
while len(ref)> 0 and len(alt) > 0 and ref[0] == alt[0]:
ref = ref[1:]
alt = alt[1:]
return ref, alt
def calculate_coverage(ref, var):
return ref + var
def calculate_vaf(var, depth):
return format(var / int(depth), '.5f')
def parse_to_bam_readcount(start, reference, alt):
if len(alt) != len(reference):
if is_deletion(reference, alt):
bam_readcount_position = str(start + 2)
(simplified_reference, simplified_alt) = simplify_indel_allele(reference, alt)
ref_base = reference[1:2]
var_base = '-' + simplified_reference
elif is_insertion(reference, alt):
bam_readcount_position = str(start)
(simplified_reference, simplified_alt) = simplify_indel_allele(reference, alt)
ref_base = reference
var_base = '+' + simplified_alt
else:
bam_readcount_position = str(entry.POS)
ref_base = reference
var_base = alt
return (bam_readcount_position, ref_base, var_base)
(script, vcf_filename, bam_readcount_filenames, samples_list, output_dir) = sys.argv
samples = samples_list.split(',')
bam_readcount_files = bam_readcount_filenames.split(',')
read_counts = parse_bam_readcount_file(bam_readcount_files, samples)
vcf_reader = vcfpy.Reader.from_path(vcf_filename)
new_header = vcfpy.Header(samples = vcf_reader.header.samples)
for line in vcf_reader.header.lines:
if not (line.key == 'FORMAT' and line.id in ['DP', 'AD', 'AF']):
new_header.add_line(line)
new_header.add_format_line(OrderedDict([('ID', 'DP'), ('Number', '1'), ('Type', 'Integer'), ('Description', 'Read depth')]))
new_header.add_format_line(OrderedDict([('ID', 'AD'), ('Number', 'R'), ('Type', 'Integer'), ('Description', 'Allelic depths for the ref and alt alleles in the order listed')]))
new_header.add_format_line(OrderedDict([('ID', 'AF'), ('Number', 'A'), ('Type', 'Float'), ('Description', 'Variant-allele frequency for the alt alleles')]))
vcf_writer = vcfpy.Writer.from_path(os.path.join(output_dir, 'annotated.bam_readcount.vcf.gz'), new_header)
for entry in vcf_reader:
chromosome = entry.CHROM
start = entry.affected_start
stop = entry.affected_end
reference = entry.REF
alts = entry.ALT
for sample in samples:
#DP - read depth
if 'DP' not in entry.FORMAT:
entry.FORMAT += ['DP']
alt = alts[0].serialize()
(bam_readcount_position, ref_base, var_base) = parse_to_bam_readcount(start, reference, alt)
if (
chromosome in read_counts[sample]
and bam_readcount_position in read_counts[sample][chromosome]
and ref_base in read_counts[sample][chromosome][bam_readcount_position]
):
brct = read_counts[sample][chromosome][bam_readcount_position][ref_base]
if ref_base in brct:
depth = read_counts[sample][chromosome][bam_readcount_position][ref_base]['depth']
else:
depth = 0
else:
depth = 0
entry.call_for_sample[sample].data['DP'] = depth
#AF - variant allele frequencies
if 'AF' not in entry.FORMAT:
entry.FORMAT += ['AF']
vafs = []
for alt in alts:
alt = alt.serialize()
(bam_readcount_position, ref_base, var_base) = parse_to_bam_readcount(start, reference, alt)
if (
chromosome in read_counts[sample]
and bam_readcount_position in read_counts[sample][chromosome]
and ref_base in read_counts[sample][chromosome][bam_readcount_position]
):
brct = read_counts[sample][chromosome][bam_readcount_position][ref_base]
if var_base in brct:
vafs.append(calculate_vaf(int(brct[var_base]), depth))
else:
vafs.append(0)
else:
vafs.append(0)
entry.call_for_sample[sample].data['AF'] = vafs
#AD - ref, var1..varN counts
if 'AD' not in entry.FORMAT:
entry.FORMAT += ['AD']
ads = []
(bam_readcount_position, ref_base, var_base) = parse_to_bam_readcount(start, reference, alts[0].serialize())
if (
chromosome in read_counts[sample]
and bam_readcount_position in read_counts[sample][chromosome]
and ref_base in read_counts[sample][chromosome][bam_readcount_position]
):
brct = read_counts[sample][chromosome][bam_readcount_position][ref_base]
if ref_base in brct:
ads.append(brct[ref_base])
else:
ads.append(0)
else:
ads.append(0)
for alt in alts:
if type(alt) is not str:
alt = alt.serialize()
(bam_readcount_position, ref_base, var_base) = parse_to_bam_readcount(start, reference, alt)
if (
chromosome in read_counts[sample]
and bam_readcount_position in read_counts[sample][chromosome]
and ref_base in read_counts[sample][chromosome][bam_readcount_position]
):
brct = read_counts[sample][chromosome][bam_readcount_position][ref_base]
if var_base in brct:
ads.append(brct[var_base])
else:
ads.append(0)
else:
ads.append(0)
entry.call_for_sample[sample].data['AD'] = ads
vcf_writer.write_record(entry)
vcf_writer.close()
vcf_reader.close()