####################################################
# An example parser that parses a dbSNP VCF file   #
# into a generator of JSON objects                 #
####################################################
'''
Parse a dbSNP VCF file into variant documents.
Requires the PyVCF module.
Chunlei Wu
'''
from collections import OrderedDict
import copy
import time
from vcf import Reader
from utils.common import timesofar
#######################################################
# Optional, but you can put some metadata fields here #
#######################################################
__METADATA__ = {
"requirements": [ # your can list any extra dependancies for this parser here
"PyVCF>=0.6.7",
],
"src_name": 'dbSNP',
"src_url": 'http://www.ncbi.nlm.nih.gov/SNP/',
"version": '147',
"field": "dbsnp"
}
#####################################################
# First some useful constants and helper functions #
#####################################################
# the key name for the pos in var_doc
POS_KEY = 'hg19'
infile = "<path_to>/dbsnp/hg19/20160413/All_20160407.vcf.gz"
def get_hgvs_name(record, as_list=False):
"""construct the valid HGVS name as the _id field"""
_id_list = []
_alt_list = []
_pos_list = []
chrom = record.CHROM
for alt in record.ALT:
alt = str(alt)
_id = None
if record.is_snp:
assert record.POS == record.INFO['RSPOS']
# make a HGVS id
_id = 'chr{}:g.{}{}>{}'.format(chrom,
record.POS,
record.REF,
alt)
_alt_list.append(alt)
_pos_list.append(OrderedDict(start=record.POS, end=record.POS)) # end is the same as start for snp
elif record.is_indel:
if record.is_deletion:
if len(record.ALT) == 1 and len(alt) == 1:
                    # this is a simple deletion only when ALT is a single allele
assert alt == record.REF[0]
if record.POS + 1 == record.INFO['RSPOS']:
if len(record.REF) == 2:
# this is a single nt deletion
pos = record.INFO['RSPOS']
_pos_list.append(OrderedDict(start=pos, end=pos + 1)) # end is start+1 for single nt deletion
else:
# this is a multiple nt deletion
end = record.INFO['RSPOS'] + len(record.REF) - 2
pos = '{}_{}'.format(record.INFO['RSPOS'], end)
_pos_list.append(OrderedDict(start=record.INFO['RSPOS'], end=end))
_id = 'chr{}:g.{}del'.format(chrom, pos)
_alt_list.append(alt)
                    else:
                        # record.POS and RSPOS do not match;
                        # something ambiguous here
                        pass
                else:
                    # other deletion cases are currently ignored,
                    # e.g. rs369371434, rs386822484
                    pass
else:
# insertion
if len(record.REF) == 1 and alt[0] == record.REF:
# simple insertion cases
if record.POS == record.INFO['RSPOS']:
pos = '{}_{}'.format(record.POS, record.POS + 1)
_id = 'chr{}:g.{}ins{}'.format(chrom, pos, alt[1:])
_alt_list.append(alt)
_pos_list.append(OrderedDict(start=record.POS, end=record.POS + 1))
                    else:
                        # record.POS and RSPOS do not match;
                        # something ambiguous here
                        pass
                else:
                    # other insertion cases are currently ignored,
                    # e.g. rs398121698, rs71320640
                    pass
if _id:
_id_list.append(_id)
if not as_list and len(_id_list) == 1:
_id_list = _id_list[0]
_alt_list = _alt_list[0]
_pos_list = _pos_list[0]
return _id_list, _alt_list, _pos_list
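
# For illustration, the ids built above follow standard HGVS genomic
# notation; the coordinates below are made up, not from a real record:
#   SNP:                chr1:g.12345A>G         hg19: {start: 12345, end: 12345}
#   single-nt deletion: chr1:g.12346del         hg19: {start: 12346, end: 12347}
#   multi-nt deletion:  chr1:g.12346_12348del   hg19: {start: 12346, end: 12348}
#   insertion:          chr1:g.12345_12346insT  hg19: {start: 12345, end: 12346}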
def parse_one_rec(record):
snp = OrderedDict()
snp['rsid'] = record.ID
snp['vartype'] = record.var_type
snp['var_subtype'] = record.var_subtype
info = record.INFO
snp['dbsnp_build'] = info['dbSNPBuildID']
if 'GENEINFO' in info:
snp['gene'] = [dict(zip(('symbol', 'geneid'), x.split(':'))) for x in info['GENEINFO'].split('|')]
if len(snp['gene']) == 1:
snp['gene'] = snp['gene'][0]
if 'SAO' in info:
_d = {
0: 'unspecified',
1: 'germline',
2: 'somatic',
3: 'both'
}
snp['allele_origin'] = _d[info['SAO']]
if 'VC' in info:
# ref http://www.ncbi.nlm.nih.gov/projects/SNP/snp_legend.cgi?legend=snpClass
snp['class'] = info['VC']
snp['validated'] = info.get('VLD', None) is True
# flags
flags_included = [
# reverse
'RV',
# functional
'PM', 'TPA', 'PMC', 'S3D', 'SLO', 'NSF', 'NSM', 'NSN', 'REF', 'SYN',
        'U3', 'U5', 'ASS', 'DSS', 'INT', 'R3', 'R5', 'MUT', 'CDA', 'MTP', 'OM',
# mapping:
'OTH', 'CFL', 'ASP', 'LSD', 'NOC', 'WTD', 'NOV',
# freqs:
'G5A', 'G5', 'HD', 'GNO', 'KGPhase1', 'KGPhase3'
]
flags = [f for f in flags_included if info.get(f, False)]
if flags:
snp['flags'] = sorted(flags)
# CAF and alleles
snp['alleles'] = [{"allele": str(a)} for a in record.alleles]
if 'CAF' in info:
assert len(info['CAF']) == len(snp['alleles'])
for i, freq in enumerate(info['CAF']):
if freq:
snp['alleles'][i]['freq'] = float(freq)
        # GMAF: the minor allele is the second largest value in the list,
        # and was previously reported in VCF as the GMAF.
        # This is the GMAF reported on the RefSNP and EntrezSNP pages and
        # VariationReporter.
freq_list = [float(x) for x in info['CAF'] if x]
if len(freq_list) >= 2:
snp['gmaf'] = freq_list[1]
    # INFO fields skipped: SSR, COMMON
snp['chrom'] = record.CHROM
snp['ref'] = record.REF
_id_list, _alt_list, _pos_list = get_hgvs_name(record)
snp['alt'] = _alt_list
snp[POS_KEY] = _pos_list
snp['_id'] = _id_list
return snp
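
# For reference, a parsed SNP record yields a document roughly like this
# (a sketch with made-up values, trimmed to a few fields):
#   {'_id': 'chr1:g.12345A>G', 'rsid': 'rs0000000', 'vartype': 'snp',
#    'chrom': '1', 'ref': 'A', 'alt': 'G',
#    'hg19': {'start': 12345, 'end': 12345},
#    'alleles': [{'allele': 'A', 'freq': 0.9}, {'allele': 'G', 'freq': 0.1}]}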
def parse_vcf(vcf_infile, compressed=True, verbose=True, by_id=True, **tabix_params):
t0 = time.time()
    compressed = vcf_infile.endswith('.gz')  # infer compression from the file extension
vcf_r = Reader(filename=vcf_infile, compressed=compressed)
vcf_r.fetch('1', 1) # call a dummy fetch to initialize vcf_r._tabix
if tabix_params:
vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)
cnt_1, cnt_2, cnt_3 = 0, 0, 0
for rec in vcf_r:
doc = parse_one_rec(rec)
if by_id:
# one hgvs id, one doc
if doc['_id']:
if isinstance(doc['_id'], list):
for i, _id in enumerate(doc['_id']):
_doc = copy.copy(doc)
_doc['alt'] = doc['alt'][i]
_doc[POS_KEY] = doc[POS_KEY][i]
_doc['_id'] = _id
yield _doc
cnt_2 += 1
if verbose:
print(_doc['rsid'], '\t', _doc['_id'])
else:
yield doc
cnt_2 += 1
if verbose:
print(doc['rsid'], '\t', doc['_id'])
else:
cnt_3 += 1
else:
# one rsid, one doc
if doc['_id']:
yield doc
cnt_2 += 1
if verbose:
print(doc['rsid'], '\t', doc['_id'])
else:
cnt_3 += 1
cnt_1 += 1
print("Done. [{}]".format(timesofar(t0)))
print("Total rs: {}; total docs: {}; skipped rs: {}".format(cnt_1, cnt_2, cnt_3))
#################################################################
# This REQUIRED function must be called load_data.              #
# It takes an input file (or folder) as its first argument      #
# and outputs a list or a generator of structured JSON objects. #
#################################################################
def load_data(infile):
chrom_list = [str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']
for chrom in chrom_list:
print("Processing chr{}...".format(chrom))
snpdoc_iter = parse_vcf(infile, compressed=True, verbose=False, by_id=True, reference=chrom)
for doc in snpdoc_iter:
_doc = {'dbsnp': doc}
_doc['_id'] = doc['_id']
del doc['_id']
yield _doc
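
# Example usage (a sketch): each yielded document nests the parsed variant
# under the "dbsnp" field, matching __METADATA__["field"] above:
#   for doc in load_data(infile):
#       # doc == {'_id': <hgvs id>, 'dbsnp': {...}}
#       pass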
##############################################################
# OPTIONAL get_mapping function returns a mapping dictionary #
# to describe the JSON data structure and customize indexing.#
# You can skip it at first and we can help create one.       #
##############################################################
def get_mapping():
mapping = {
"dbsnp": {
"properties": {
"allele_origin": {
"type": "string",
"analyzer": "string_lowercase"
},
"alt": {
"type": "string",
"analyzer": "string_lowercase"
},
"chrom": {
"type": "string",
"analyzer": "string_lowercase"
},
"class": {
"type": "string",
"analyzer": "string_lowercase"
},
"flags": {
"type": "string",
"analyzer": "string_lowercase"
},
"gmaf": {
"type": "float"
},
"hg19": {
"properties": {
"end": {
"type": "long"
},
"start": {
"type": "long"
}
}
},
"ref": {
"type": "string",
"analyzer": "string_lowercase"
},
"rsid": {
"type": "string",
"include_in_all": True,
"analyzer": "string_lowercase"
},
"var_subtype": {
"type": "string",
"analyzer": "string_lowercase"
},
"vartype": {
"type": "string",
"analyzer": "string_lowercase"
},
"validated": {
"type": "boolean"
},
"gene": {
"properties": {
"symbol": {
"type": "string",
"analyzer": "string_lowercase",
"include_in_all": True
},
"geneid": {
"type": "string",
"analyzer": "string_lowercase"
}
}
}
}
}
}
return mapping
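
if __name__ == '__main__':
    # Minimal smoke test (a sketch, assuming a local copy of the dbSNP VCF
    # exists at the `infile` path defined above): print the _id of the
    # first few parsed documents.
    import itertools
    for _doc in itertools.islice(load_data(infile), 5):
        print(_doc['_id'])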