From 00a3c36b4e521a41fd7d5e75de1d2672c4cb1af1 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Mon, 9 Aug 2021 12:37:20 -0700 Subject: [PATCH 1/4] added comments; formatted code --- .../sources/clinvar/clinvar_xml_parser.py | 127 +++++++++--------- 1 file changed, 61 insertions(+), 66 deletions(-) diff --git a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py index 6d3f4661..3e4640a5 100644 --- a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py +++ b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py @@ -34,8 +34,8 @@ def merge_rcv_accession(generator): if len(item) > 1: json_item = item[0] for _item in item: - rcv_info = _item['clinvar']['rcv'] - rcv_new.append(rcv_info) + rcv_info = _item['clinvar']['rcv'] + rcv_new.append(rcv_info) json_item['clinvar']['rcv'] = rcv_new yield json_item else: @@ -71,6 +71,7 @@ def parse_measure(Measure, hg19=True): alt = SequenceLocation.alternateAllele if not alt: alt = SequenceLocation.alternateAlleleVCF + if 'GRCh38' in SequenceLocation.Assembly: chromStart_38 = SequenceLocation.start chromEnd_38 = SequenceLocation.stop @@ -82,6 +83,7 @@ def parse_measure(Measure, hg19=True): alt = SequenceLocation.alternateAllele if not alt: alt = SequenceLocation.alternateAlleleVCF + if Measure.MeasureRelationship: try: symbol = Measure.MeasureRelationship[0].\ @@ -92,14 +94,17 @@ def parse_measure(Measure, hg19=True): else: symbol = None gene_id = None + if Measure.Name: name = Measure.Name[0].ElementValue.valueOf_ else: name = None + if len(Measure.CytogeneticLocation) == 1: cytogenic = Measure.CytogeneticLocation[0] else: cytogenic = Measure.CytogeneticLocation + hgvs_coding = None hgvs_genome = None HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []} @@ -111,49 +116,40 @@ def parse_measure(Measure, hg19=True): else: chromStart = chromStart_38 chromEnd = chromEnd_38 + # hgvs_not_validated = None if Measure.AttributeSet: # 'copy number loss' or 'gain' have format different\ # from other types, should be dealt with seperately - if (variation_type == 'copy number loss') or \ - (variation_type == 'copy number gain'): + if (variation_type == 'copy number loss') or (variation_type == 'copy number gain'): for AttributeSet in Measure.AttributeSet: - if 'HGVS, genomic, top level' in AttributeSet.\ - Attribute.Type: + if 'HGVS, genomic, top level' in AttributeSet.Attribute.Type: if AttributeSet.Attribute.integerValue == 37: hgvs_genome = AttributeSet.Attribute.get_valueOf_() if 'genomic' in AttributeSet.Attribute.Type: - HGVS['genomic'].append(AttributeSet.Attribute. - get_valueOf_()) + HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_()) elif 'non-coding' in AttributeSet.Attribute.Type: - HGVS['non-coding'].append(AttributeSet.Attribute. - get_valueOf_()) + HGVS['non-coding'].append(AttributeSet.Attribute.get_valueOf_()) elif 'coding' in AttributeSet.Attribute.Type: - HGVS['coding'].append(AttributeSet.Attribute. - get_valueOf_()) + HGVS['coding'].append(AttributeSet.Attribute.get_valueOf_()) elif 'protein' in AttributeSet.Attribute.Type: - HGVS['protein'].append(AttributeSet. - Attribute.get_valueOf_()) + HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_()) else: for AttributeSet in Measure.AttributeSet: if 'genomic' in AttributeSet.Attribute.Type: - HGVS['genomic'].append(AttributeSet. - Attribute.get_valueOf_()) + HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_()) elif 'non-coding' in AttributeSet.Attribute.Type: - HGVS['non-coding'].append(AttributeSet. - Attribute.get_valueOf_()) + HGVS['non-coding'].append(AttributeSet.Attribute.get_valueOf_()) elif 'coding' in AttributeSet.Attribute.Type: - HGVS['coding'].append(AttributeSet.Attribute. - get_valueOf_()) + HGVS['coding'].append(AttributeSet.Attribute.get_valueOf_()) elif 'protein' in AttributeSet.Attribute.Type: - HGVS['protein'].append(AttributeSet. - Attribute.get_valueOf_()) + HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_()) if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq': hgvs_coding = AttributeSet.Attribute.get_valueOf_() - elif AttributeSet.Attribute.Type == \ - 'HGVS, genomic, top level, previous': + elif AttributeSet.Attribute.Type == 'HGVS, genomic, top level, previous': hgvs_genome = AttributeSet.Attribute.get_valueOf_() break + if chrom and chromStart and chromEnd: # if its SNP, make sure chrom, chromStart, chromEnd, ref, alt are all provided if variation_type == 'single nucleotide variant': @@ -169,8 +165,7 @@ def parse_measure(Measure, hg19=True): if hgvs_genome: indel_position = hgvs_genome.find('del') indel_alt = hgvs_genome[indel_position+3:] - hgvs_id = "chr%s:g.%s_%sdel%s" % \ - (chrom, chromStart, chromEnd, indel_alt) + hgvs_id = "chr%s:g.%s_%sdel%s" % (chrom, chromStart, chromEnd, indel_alt) elif variation_type == 'Deletion': if chromStart == chromEnd: # RCV000048406, chr17:g.41243547del @@ -183,24 +178,19 @@ def parse_measure(Measure, hg19=True): if 'ins' in hgvs_genome: ins_ref = hgvs_genome[ins_position+3:] if chromStart == chromEnd: - hgvs_id = "chr%s:g.%sins%s" % \ - (chrom, chromStart, ins_ref) + hgvs_id = "chr%s:g.%sins%s" % (chrom, chromStart, ins_ref) else: - hgvs_id = "chr%s:g.%s_%sins%s" % \ - (chrom, chromStart, chromEnd, ins_ref) + hgvs_id = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, ins_ref) elif variation_type == 'Duplication': if hgvs_genome: dup_position = hgvs_genome.find('dup') if 'dup' in hgvs_genome: dup_ref = hgvs_genome[dup_position+3:] if chromStart == chromEnd: - hgvs_id = "chr%s:g.%sdup%s" % \ - (chrom, chromStart, dup_ref) + hgvs_id = "chr%s:g.%sdup%s" % (chrom, chromStart, dup_ref) else: - hgvs_id = "chr%s:g.%s_%sdup%s" % \ - (chrom, chromStart, chromEnd, dup_ref) - elif variation_type == 'copy number loss' or\ - variation_type == 'copy number gain': + hgvs_id = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, dup_ref) + elif variation_type == 'copy number loss' or variation_type == 'copy number gain': if hgvs_genome and chrom: hgvs_id = "chr" + chrom + ":" + hgvs_genome.split('.')[2] elif hgvs_coding: @@ -210,8 +200,10 @@ def parse_measure(Measure, hg19=True): return else: return + for key in HGVS: HGVS[key].sort() + rsid = None cosmic = None dbvar = None @@ -234,7 +226,6 @@ def parse_measure(Measure, hg19=True): # make sure the hgvs_id is not none if hgvs_id: one_snp_json = { - "_id": hgvs_id, "clinvar": { @@ -244,26 +235,22 @@ def parse_measure(Measure, hg19=True): "cosmic": cosmic, "uniprot": uniprot, "dbvar": dbvar, - "hg19": - { - "start": chromStart_19, - "end": chromEnd_19 - }, - "hg38": - { - "start": chromStart_38, - "end": chromEnd_38 - }, + "hg19": { + "start": chromStart_19, + "end": chromEnd_19 + }, + "hg38": { + "start": chromStart_38, + "end": chromEnd_38 + }, "type": variation_type, - "gene": - { - "id": gene_id, - "symbol": symbol - }, - "rcv": - { - "preferred_name": name, - }, + "gene": { + "id": gene_id, + "symbol": symbol + }, + "rcv": { + "preferred_name": name, + }, "rsid": rsid, "cytogenic": cytogenic, "hgvs": HGVS, @@ -277,19 +264,19 @@ def parse_measure(Measure, hg19=True): def _map_line_to_json(cp, hg19): try: - clinical_significance = cp.ReferenceClinVarAssertion.\ - ClinicalSignificance.Description + clinical_significance = cp.ReferenceClinVarAssertion.ClinicalSignificance.Description except: clinical_significance = None + rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc + try: - review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ - ReviewStatus + review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus except: review_status = None + try: - last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\ - DateLastEvaluated + last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated except: last_evaluated = None @@ -299,6 +286,7 @@ def _map_line_to_json(cp, hg19): origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin except: origin = None + conditions = [] for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait: synonyms = [] @@ -308,6 +296,7 @@ def _map_line_to_json(cp, hg19): synonyms.append(name.ElementValue.get_valueOf_()) if name.ElementValue.Type == 'Preferred': conditions_name += name.ElementValue.get_valueOf_() + identifiers = {} for item in _trait.XRef: if item.DB == 'Human Phenotype Ontology': @@ -318,16 +307,19 @@ def _map_line_to_json(cp, hg19): for symbol in _trait.Symbol: if symbol.ElementValue.Type == 'Preferred': conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')' + age_of_onset = '' for _set in _trait.AttributeSet: if _set.Attribute.Type == 'age of onset': age_of_onset = _set.Attribute.get_valueOf_() + conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset}) try: genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet except: genotypeset = None + if genotypeset: obj_list = [] id_list = [] @@ -351,9 +343,9 @@ def _map_line_to_json(cp, hg19): id_list.append(json_obj['_id']) for _obj in obj_list: _obj['clinvar'].update({'genotypeset': { - 'type': 'CompoundHeterozygote', - 'genotype': id_list - }}) + 'type': 'CompoundHeterozygote', + 'genotype': id_list + }}) yield _obj else: variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID @@ -382,11 +374,13 @@ def rcv_feeder(input_file, hg19): # some exceptions if record.startswith('\n'): continue + try: record_parsed = clinvarlib.parseString(record, silence=1) except: logging.debug(record) raise + for record_mapped in _map_line_to_json(record_parsed, hg19): yield record_mapped @@ -402,7 +396,8 @@ def load_data(data_folder, version): input_file = files[0] data_generator = rcv_feeder(input_file, version == "hg19") data_list = list(data_generator) - # TODO: why do we sort this list ? this prevent from using yield/iterator + # Sort the `data_list` because `merge_rcv_accession` will call `itertools.groupby()` + # which cannot group non-adjacent items with the same key into a group data_list_sorted = sorted(data_list, key=lambda k: k['_id']) data_merge_rcv = merge_rcv_accession(data_list_sorted) return data_merge_rcv From 24e3a9ccfb746e59edbc892b29412e3f34eaeee3 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 10 Aug 2021 15:37:55 -0700 Subject: [PATCH 2/4] formatted code; renamed variables and functions --- .../sources/clinvar/clinvar_xml_parser.py | 266 +++++++++--------- 1 file changed, 132 insertions(+), 134 deletions(-) diff --git a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py index 3e4640a5..fd7f3b93 100644 --- a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py +++ b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py @@ -1,104 +1,89 @@ -# a python lib is generated on the fly, in data folder -# adjust path - - -import sys, os, glob +import glob +import os +import sys from itertools import groupby -from config import DATA_ARCHIVE_ROOT, logger as logging -import biothings, config +import biothings +import config biothings.config_for_app(config) -from biothings.utils.dataload import unlist, dict_sweep, \ - value_convert_to_number, rec_handler + +from biothings.utils.dataload import unlist, dict_sweep, value_convert_to_number, rec_handler GLOB_PATTERN = "ClinVarFullRelease_*.xml.gz" clinvarlib = None + def import_clinvar_lib(data_folder): - sys.path.insert(0,data_folder) + # a python lib is generated on the fly, in data folder + sys.path.insert(0, data_folder) import genclinvar as clinvar_mod global clinvarlib clinvarlib = clinvar_mod -def merge_rcv_accession(generator): - groups = [] - for key, group in groupby(generator, lambda x: x['_id']): - groups.append(list(group)) + +def merge_rcv_accession(records): + record_groups = [] + for key, group in groupby(records, lambda x: x['_id']): + record_groups.append(list(group)) # get the number of groups, and uniquekeys - logging.info("number of groups: %s" % len(groups)) - - # loop through each item, if item number >1, merge rcv accession number - for item in groups: - rcv_new = [] - if len(item) > 1: - json_item = item[0] - for _item in item: - rcv_info = _item['clinvar']['rcv'] - rcv_new.append(rcv_info) - json_item['clinvar']['rcv'] = rcv_new - yield json_item + logging.info("number of groups: %s" % len(record_groups)) + + # Each group of records is a list + # Loop through each group, if record number >1, merge rcv accessions + for record_list in record_groups: + if len(record_list) == 1: + yield record_list[0] else: - yield item[0] + rcv_list = [record['clinvar']['rcv'] for record in record_list] + + merged_record = record_list[0] + merged_record['clinvar']['rcv'] = rcv_list + yield merged_record def parse_measure(Measure, hg19=True): + # exclude any item of which types belong to 'Variation', 'protein only' or 'Microsatellite' variation_type = Measure.Type - # exclude any item of which types belong to - # 'Variation', 'protein only' or 'Microsatellite' if variation_type == 'Variation' or variation_type == 'protein only' or variation_type == 'Microsatellite': return None + allele_id = Measure.ID + chrom = None - chromStart_19 = None - chromEnd_19 = None - chromStart_38 = None - chromEnd_38 = None - ref = None - alt = None + chromStart_19, chromEnd_19, chromStart_38, chromEnd_38 = None, None, None, None + ref, alt = None, None if Measure.SequenceLocation: for SequenceLocation in Measure.SequenceLocation: - # In this version, only accept information concerning GRCh37 if 'GRCh37' in SequenceLocation.Assembly: chrom = SequenceLocation.Chr chromStart_19 = SequenceLocation.start chromEnd_19 = SequenceLocation.stop if not ref: - ref = SequenceLocation.referenceAllele - if not ref: - ref = SequenceLocation.referenceAlleleVCF - if not alt: - alt = SequenceLocation.alternateAllele + ref = SequenceLocation.referenceAllele or SequenceLocation.referenceAlleleVCF if not alt: - alt = SequenceLocation.alternateAlleleVCF + alt = SequenceLocation.alternateAllele or SequenceLocation.alternateAlleleVCF if 'GRCh38' in SequenceLocation.Assembly: chromStart_38 = SequenceLocation.start chromEnd_38 = SequenceLocation.stop if not ref: - ref = SequenceLocation.referenceAllele - if not ref: - ref = SequenceLocation.referenceAlleleVCF - if not alt: - alt = SequenceLocation.alternateAllele + ref = SequenceLocation.referenceAllele or SequenceLocation.referenceAlleleVCF if not alt: - alt = SequenceLocation.alternateAlleleVCF + alt = SequenceLocation.alternateAllele or SequenceLocation.alternateAlleleVCF + symbol = None + gene_id = None if Measure.MeasureRelationship: try: - symbol = Measure.MeasureRelationship[0].\ - Symbol[0].get_ElementValue().valueOf_ + symbol = Measure.MeasureRelationship[0].Symbol[0].get_ElementValue().valueOf_ except: symbol = None gene_id = Measure.MeasureRelationship[0].XRef[0].ID - else: - symbol = None - gene_id = None + name = None if Measure.Name: name = Measure.Name[0].ElementValue.valueOf_ - else: - name = None if len(Measure.CytogeneticLocation) == 1: cytogenic = Measure.CytogeneticLocation[0] @@ -126,6 +111,7 @@ def parse_measure(Measure, hg19=True): if 'HGVS, genomic, top level' in AttributeSet.Attribute.Type: if AttributeSet.Attribute.integerValue == 37: hgvs_genome = AttributeSet.Attribute.get_valueOf_() + if 'genomic' in AttributeSet.Attribute.Type: HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_()) elif 'non-coding' in AttributeSet.Attribute.Type: @@ -144,11 +130,11 @@ def parse_measure(Measure, hg19=True): HGVS['coding'].append(AttributeSet.Attribute.get_valueOf_()) elif 'protein' in AttributeSet.Attribute.Type: HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_()) - if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq': + + if not hgvs_coding and AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq': hgvs_coding = AttributeSet.Attribute.get_valueOf_() - elif AttributeSet.Attribute.Type == 'HGVS, genomic, top level, previous': + if not hgvs_genome and AttributeSet.Attribute.Type == 'HGVS, genomic, top level, previous': hgvs_genome = AttributeSet.Attribute.get_valueOf_() - break if chrom and chromStart and chromEnd: # if its SNP, make sure chrom, chromStart, chromEnd, ref, alt are all provided @@ -156,12 +142,13 @@ def parse_measure(Measure, hg19=True): if ref and alt: hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt) else: - print('hgvs not found chr {}, chromStart {}, chromEnd {}, ref {}, alt {}, allele id {}'.format(chrom, chromStart, chromEnd, ref, alt, allele_id)) + print('hgvs not found chr {}, chromStart {}, chromEnd {}, ref {}, alt {}, allele id {}'. + format(chrom, chromStart, chromEnd, ref, alt, allele_id)) # items whose type belong to 'Indel, Insertion, \ # Duplication' might not hava explicit alt information, \ # so we will parse from hgvs_genome elif variation_type == 'Indel': - # RCV000156073, NC_000010.10:g.112581638_112581639delinsG + # RCV000156073, NC_000010.10:g.112581638_112581639delinsG if hgvs_genome: indel_position = hgvs_genome.find('del') indel_alt = hgvs_genome[indel_position+3:] @@ -227,68 +214,69 @@ def parse_measure(Measure, hg19=True): if hgvs_id: one_snp_json = { "_id": hgvs_id, - "clinvar": - { - "allele_id": allele_id, - "chrom": chrom, - "omim": omim, - "cosmic": cosmic, - "uniprot": uniprot, - "dbvar": dbvar, - "hg19": { - "start": chromStart_19, - "end": chromEnd_19 - }, - "hg38": { - "start": chromStart_38, - "end": chromEnd_38 - }, - "type": variation_type, - "gene": { - "id": gene_id, - "symbol": symbol - }, - "rcv": { - "preferred_name": name, - }, - "rsid": rsid, - "cytogenic": cytogenic, - "hgvs": HGVS, - "coding_hgvs_only": coding_hgvs_only, - "ref": ref, - "alt": alt - } + "clinvar": { + "allele_id": allele_id, + "chrom": chrom, + "omim": omim, + "cosmic": cosmic, + "uniprot": uniprot, + "dbvar": dbvar, + "hg19": { + "start": chromStart_19, + "end": chromEnd_19 + }, + "hg38": { + "start": chromStart_38, + "end": chromEnd_38 + }, + "type": variation_type, + "gene": { + "id": gene_id, + "symbol": symbol + }, + "rcv": { + "preferred_name": name, + }, + "rsid": rsid, + "cytogenic": cytogenic, + "hgvs": HGVS, + "coding_hgvs_only": coding_hgvs_only, + "ref": ref, + "alt": alt + } } return one_snp_json -def _map_line_to_json(cp, hg19): + +def _map_parsed_record_to_json(parsed_record, hg19): try: - clinical_significance = cp.ReferenceClinVarAssertion.ClinicalSignificance.Description + clinical_significance = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.Description except: clinical_significance = None - rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc + rcv_accession = parsed_record.ReferenceClinVarAssertion.ClinVarAccession.Acc try: - review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus + review_status = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus except: review_status = None try: - last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated + last_evaluated = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated except: last_evaluated = None - number_submitters = len(cp.ClinVarAssertion) + number_submitters = len(parsed_record.ClinVarAssertion) + # some items in clinvar_xml doesn't have origin information try: - origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin + origin = parsed_record.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin except: origin = None conditions = [] - for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait: + for _trait in parsed_record.ReferenceClinVarAssertion.TraitSet.Trait: synonyms = [] conditions_name = '' for name in _trait.Name: @@ -313,32 +301,35 @@ def _map_line_to_json(cp, hg19): if _set.Attribute.Type == 'age of onset': age_of_onset = _set.Attribute.get_valueOf_() - conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset}) + conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, + "age_of_onset": age_of_onset}) try: - genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet + genotypeset = parsed_record.ReferenceClinVarAssertion.GenotypeSet except: genotypeset = None if genotypeset: obj_list = [] id_list = [] - for _set in cp.ReferenceClinVarAssertion.GenotypeSet.MeasureSet: + for _set in parsed_record.ReferenceClinVarAssertion.GenotypeSet.MeasureSet: variant_id = _set.ID for _measure in _set.Measure: json_obj = parse_measure(_measure, hg19=hg19) if json_obj: - json_obj['clinvar']['rcv'].update({'accession': rcv_accession, + json_obj['clinvar']['rcv'].update({ + 'accession': rcv_accession, 'clinical_significance': clinical_significance, 'number_submitters': number_submitters, 'review_status': review_status, 'last_evaluated': str(last_evaluated), 'origin': origin, - 'conditions': conditions}) + 'conditions': conditions + }) json_obj['clinvar'].update({'variant_id': variant_id}) json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj, - ['chrom', 'omim', 'id', 'orphanet', 'gene', - 'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None'])) + ['chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), + [None, '', 'None'])) obj_list.append(json_obj) id_list.append(json_obj['_id']) for _obj in obj_list: @@ -348,17 +339,19 @@ def _map_line_to_json(cp, hg19): }}) yield _obj else: - variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID - for _measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure: + variant_id = parsed_record.ReferenceClinVarAssertion.MeasureSet.ID + for _measure in parsed_record.ReferenceClinVarAssertion.MeasureSet.Measure: json_obj = parse_measure(_measure, hg19=hg19) if json_obj: - json_obj['clinvar']['rcv'].update({'accession': rcv_accession, - 'clinical_significance': clinical_significance, - 'number_submitters': number_submitters, - 'review_status': review_status, - 'last_evaluated': str(last_evaluated), - 'origin': origin, - 'conditions': conditions}) + json_obj['clinvar']['rcv'].update({ + 'accession': rcv_accession, + 'clinical_significance': clinical_significance, + 'number_submitters': number_submitters, + 'review_status': review_status, + 'last_evaluated': str(last_evaluated), + 'origin': origin, + 'conditions': conditions + }) json_obj['clinvar'].update({'variant_id': variant_id}) json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj, ['chrom', 'omim', 'id', 'orphanet', 'gene', @@ -366,23 +359,23 @@ def _map_line_to_json(cp, hg19): yield json_obj -def rcv_feeder(input_file, hg19): +def clinvar_json_record_feeder(input_file, hg19): # the first two line of clinvar_xml is not useful information - cv_data = rec_handler(input_file, block_end='\n', - skip=2, include_block_end=True) - for record in cv_data: + xml_records = rec_handler(input_file, block_end='\n', skip=2, include_block_end=True) + for xml_record in xml_records: # some exceptions - if record.startswith('\n'): + if xml_record.startswith('\n'): continue try: - record_parsed = clinvarlib.parseString(record, silence=1) + parsed_record = clinvarlib.parseString(xml_record, silence=1) except: - logging.debug(record) + logging.debug(xml_record) raise - for record_mapped in _map_line_to_json(record_parsed, hg19): - yield record_mapped + for json_record in _map_parsed_record_to_json(parsed_record, hg19): + yield json_record + def load_data(data_folder, version): # try to get logger from uploader @@ -391,18 +384,23 @@ def load_data(data_folder, version): logging = loggingmod.getLogger("clinvar_upload") import_clinvar_lib(data_folder) - files = glob.glob(os.path.join(data_folder,GLOB_PATTERN)) - assert len(files) == 1, "Expecting only one file matching '%s', got: %s" % (GLOB_PATTERN,files) + + files = glob.glob(os.path.join(data_folder, GLOB_PATTERN)) + assert len(files) == 1, "Expecting only one file matching '%s', got: %s" % (GLOB_PATTERN, files) input_file = files[0] - data_generator = rcv_feeder(input_file, version == "hg19") - data_list = list(data_generator) - # Sort the `data_list` because `merge_rcv_accession` will call `itertools.groupby()` - # which cannot group non-adjacent items with the same key into a group - data_list_sorted = sorted(data_list, key=lambda k: k['_id']) - data_merge_rcv = merge_rcv_accession(data_list_sorted) - return data_merge_rcv + + json_record_generator = clinvar_json_record_feeder(input_file, hg19=(version == "hg19")) + + # Sorting is necessary because `merge_rcv_accession` will call `itertools.groupby()` + # which cannot put non-adjacent items with the same key into a group + json_record_list = list(json_record_generator) + json_record_list = sorted(json_record_list, key=lambda k: k['_id']) + + merged_json_records = merge_rcv_accession(json_record_list) + return merged_json_records + if __name__ == "__main__": from biothings.utils.mongo import get_data_folder data_folder = get_data_folder("clinvar") - load_data(data_folder=data_folder) + load_data(data_folder=data_folder, version="hg19") From 1f679bdde8bef8b072f0f4aeb406f4df4937578a Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Tue, 10 Aug 2021 15:40:52 -0700 Subject: [PATCH 3/4] deleted the legacy parser --- .../sources/clinvar/clinvar_parser.py | 164 ------------------ 1 file changed, 164 deletions(-) delete mode 100644 src/hub/dataload/sources/clinvar/clinvar_parser.py diff --git a/src/hub/dataload/sources/clinvar/clinvar_parser.py b/src/hub/dataload/sources/clinvar/clinvar_parser.py deleted file mode 100644 index ef3014e8..00000000 --- a/src/hub/dataload/sources/clinvar/clinvar_parser.py +++ /dev/null @@ -1,164 +0,0 @@ -# -*- coding: utf-8 -*- -# DO NOT USE THIS SCRIPT, USE THE XML PARSER INSTEAD -import csv -import re -from itertools import imap, groupby -import os - -import vcf - -from biothings.utils.dataload import unlist, dict_sweep, value_convert_to_number, merge_duplicate_rows - -''' vcf file for clinvar downloaded from -ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/ -and tab_delimited file for clinvar downloaded from -ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/''' - -VALID_COLUMN_NO = 25 -vcf_reader = vcf.Reader(filename='clinvar_20150305.vcf.gz') - - -# split id lists into dictionary -def other_id(other_ids): - p = other_ids.strip(";").replace(";", ",").split(",") - other_id = {} - for id in p: - try: - ind = id.index(":") - key, value = id[:ind], id[ind+1:] - if key in other_id: - if not isinstance(other_id[key], list): - other_id[key] = [other_id[key]] - other_id[key].append(value) - else: - other_id[key] = value - except: - continue - return other_id - - -# convert one snp to json -def _map_line_to_json(fields): - assert len(fields) == VALID_COLUMN_NO - chrom = fields[13] - chromStart = fields[14] - chromEnd = fields[15] - - HGVS = None - cds = fields[18].split(":") - cds = cds[1] - replace = re.findall(r'[ATCGMNYR=]+', cds) - sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds) - ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds) - delete = fields[1] == 'deletion' - indel = fields[1] == 'indel' - dup = re.search(r'dup', cds) - inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds) - if ins: - delete = None - indel = None - elif delete: - ins = None - indel = None - # parse from vcf file. Input chrom number - # and chromStart, and return REF, ALT - if chromStart: - record = vcf_reader.fetch(chrom, int(chromStart)) - else: - record = None - if record: - REF = record.REF - ALT = record.ALT - ALT = ALT[0] - if record.is_snp and len(ALT) < 2: - mod = [REF, ALT] - else: - mod = ALT - else: - return - - if sub and record.is_snp: - HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1]) - elif ins: - HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod) - elif delete: - HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd) - elif indel: - try: - HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod) - except AttributeError: - print "ERROR:", fields[1], cds - elif dup: - HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod) - elif inv: - HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod) - elif replace: - HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod) - else: - print 'ERROR:', fields[1], cds - - # load as json data - if HGVS is None: - print 'None:', fields[1], cds - return None - - one_snp_json = { - - "_id": HGVS, - "clinvar": - { - "allele_id": fields[0], - "hg19": - { - "chr": fields[13], - "start": fields[14], - "end": fields[15] - }, - "type": fields[1], - "name": fields[2], - "gene": - { - "id": fields[3], - "symbol": fields[4] - }, - "clinical_significance": fields[5].split(";"), - "rsid": 'rs' + str(fields[6]), - "nsv_dbvar": fields[7], - "rcv_accession": fields[8].split(";"), - "tested_in_gtr": fields[9], - "phenotype_id": other_id(fields[10]), - "origin": fields[11], - "cytogenic": fields[16], - "review_status": fields[17], - "hgvs": - { - "coding": fields[18], - "protein": fields[19] - }, - "number_submitters": fields[20], - "last_evaluated": fields[21], - "guidelines": fields[22], - "other_ids": other_id(fields[23]), - "clinvar_id": fields[24] - } - } - return dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["-"]) - - -# open file, parse, pass to json mapper -def load_data(input_file): - os.system("sort -t$'\t' -k14 -k15 -k20 -n %s > %s_sorted.tsv" \ - % (input_file, input_file)) - open_file = open("%s_sorted.tsv" % (input_file)) - print input_file - clinvar = csv.reader(open_file, delimiter="\t") - clinvar = (row for row in clinvar - if row[18] != '-' and - row[18].find('?') == -1 and - row[13] != "" and - row[12] == "GRCh37" and - not re.search(r'p.', row[18])) - json_rows = (row for row in imap(_map_line_to_json, clinvar) if row) - row_groups = (it for (key, it) in groupby(json_rows, lambda row: - row["_id"])) - return (merge_duplicate_rows(rg, "clinvar") for rg in row_groups) From 51fa2f58210c65ec4f9b4d6895e6a2ff10b88ca2 Mon Sep 17 00:00:00 2001 From: Yao Yao Date: Wed, 11 Aug 2021 17:44:23 -0700 Subject: [PATCH 4/4] rename variables and functions; added detailed comments --- .../sources/clinvar/clinvar_xml_parser.py | 215 ++++++++++++------ 1 file changed, 151 insertions(+), 64 deletions(-) diff --git a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py index fd7f3b93..00631e54 100644 --- a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py +++ b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py @@ -1,3 +1,25 @@ +""" +Two major parsing functions in this script are: + + 1. `clinvar_doc_feeder()` + 2. `merge_rcv_accession()` + +`clinvar_doc_feeder()` is responsible for the following jobs: + + 1. Receive a ClinVarFullRelease_*.xml.gz file, and split the xml file into `...` blocks + 2. Parse each `...` block into an `PublicSetType` object, which is defined in the + dynamically imported `clinvarlib` + 3. Convert each `PublicSetType` object into a clinvar document + +Note that The `PublicSetType` class is defined to map the block structure of ``, which can be inspected +through its XSD, https://ftp.ncbi.nlm.nih.gov/pub/clinvar/clinvar_public.xsd. + +`merge_rcv_accession()` is responsible for only one job: + + 1. Group all the document by `doc['_id']`, and then inside each group, merge all the `doc['clinvar']['rcv']`. + Each group of documents will be merged into a single document. +""" + import glob import os import sys @@ -21,40 +43,57 @@ def import_clinvar_lib(data_folder): clinvarlib = clinvar_mod -def merge_rcv_accession(records): - record_groups = [] - for key, group in groupby(records, lambda x: x['_id']): - record_groups.append(list(group)) +def merge_rcv_accession(docs): + doc_groups = [] + for key, group in groupby(docs, lambda x: x['_id']): + doc_groups.append(list(group)) - # get the number of groups, and uniquekeys - logging.info("number of groups: %s" % len(record_groups)) + # get the number of groups, and unique keys + logging.info("number of groups: %s" % len(doc_groups)) - # Each group of records is a list - # Loop through each group, if record number >1, merge rcv accessions - for record_list in record_groups: - if len(record_list) == 1: - yield record_list[0] + # Each group is a list of documents + # Loop through each group, if doc number >1, merge rcv accessions + for doc_list in doc_groups: + if len(doc_list) == 1: + yield doc_list[0] else: - rcv_list = [record['clinvar']['rcv'] for record in record_list] + rcv_list = [doc['clinvar']['rcv'] for doc in doc_list] - merged_record = record_list[0] - merged_record['clinvar']['rcv'] = rcv_list - yield merged_record - - -def parse_measure(Measure, hg19=True): + merged_doc = doc_list[0] + merged_doc['clinvar']['rcv'] = rcv_list + yield merged_doc + + +def _map_measure_to_json(measure_obj, hg19=True): + """ + Convert a `clinvarlib.MeasureType` object into json. + + Each `clinvarlib.MeasureType` object is mapped to a `` block in the XML, which is part of the hierarchy + below: + + + + + ... + ... + ... + + + + """ + # exclude any item of which types belong to 'Variation', 'protein only' or 'Microsatellite' - variation_type = Measure.Type + variation_type = measure_obj.Type if variation_type == 'Variation' or variation_type == 'protein only' or variation_type == 'Microsatellite': return None - allele_id = Measure.ID + allele_id = measure_obj.ID chrom = None chromStart_19, chromEnd_19, chromStart_38, chromEnd_38 = None, None, None, None ref, alt = None, None - if Measure.SequenceLocation: - for SequenceLocation in Measure.SequenceLocation: + if measure_obj.SequenceLocation: + for SequenceLocation in measure_obj.SequenceLocation: if 'GRCh37' in SequenceLocation.Assembly: chrom = SequenceLocation.Chr chromStart_19 = SequenceLocation.start @@ -74,21 +113,21 @@ def parse_measure(Measure, hg19=True): symbol = None gene_id = None - if Measure.MeasureRelationship: + if measure_obj.MeasureRelationship: try: - symbol = Measure.MeasureRelationship[0].Symbol[0].get_ElementValue().valueOf_ + symbol = measure_obj.MeasureRelationship[0].Symbol[0].get_ElementValue().valueOf_ except: symbol = None - gene_id = Measure.MeasureRelationship[0].XRef[0].ID + gene_id = measure_obj.MeasureRelationship[0].XRef[0].ID name = None - if Measure.Name: - name = Measure.Name[0].ElementValue.valueOf_ + if measure_obj.Name: + name = measure_obj.Name[0].ElementValue.valueOf_ - if len(Measure.CytogeneticLocation) == 1: - cytogenic = Measure.CytogeneticLocation[0] + if len(measure_obj.CytogeneticLocation) == 1: + cytogenic = measure_obj.CytogeneticLocation[0] else: - cytogenic = Measure.CytogeneticLocation + cytogenic = measure_obj.CytogeneticLocation hgvs_coding = None hgvs_genome = None @@ -103,11 +142,11 @@ def parse_measure(Measure, hg19=True): chromEnd = chromEnd_38 # hgvs_not_validated = None - if Measure.AttributeSet: + if measure_obj.AttributeSet: # 'copy number loss' or 'gain' have format different\ # from other types, should be dealt with seperately if (variation_type == 'copy number loss') or (variation_type == 'copy number gain'): - for AttributeSet in Measure.AttributeSet: + for AttributeSet in measure_obj.AttributeSet: if 'HGVS, genomic, top level' in AttributeSet.Attribute.Type: if AttributeSet.Attribute.integerValue == 37: hgvs_genome = AttributeSet.Attribute.get_valueOf_() @@ -121,7 +160,7 @@ def parse_measure(Measure, hg19=True): elif 'protein' in AttributeSet.Attribute.Type: HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_()) else: - for AttributeSet in Measure.AttributeSet: + for AttributeSet in measure_obj.AttributeSet: if 'genomic' in AttributeSet.Attribute.Type: HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_()) elif 'non-coding' in AttributeSet.Attribute.Type: @@ -197,8 +236,8 @@ def parse_measure(Measure, hg19=True): uniprot = None omim = None # loop through XRef to find rsid as well as other ids - if Measure.XRef: - for XRef in Measure.XRef: + if measure_obj.XRef: + for XRef in measure_obj.XRef: if XRef.Type == 'rs': rsid = 'rs' + str(XRef.ID) elif XRef.DB == 'COSMIC': @@ -249,34 +288,49 @@ def parse_measure(Measure, hg19=True): return one_snp_json -def _map_parsed_record_to_json(parsed_record, hg19): +def _map_public_set_to_json(public_set_obj, hg19: bool): + """ + Convert a `clinvarlib.PublicSetType` object into a json document. + + Each `clinvarlib.PublicSetType` object is mapped to a `` block in the XML. + + E.g., `public_set_obj.ReferenceClinVarAssertion.MeasureSet` is the parsed value from a block structure below: + + + + + ... + + + + """ try: - clinical_significance = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.Description + clinical_significance = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.Description except: clinical_significance = None - rcv_accession = parsed_record.ReferenceClinVarAssertion.ClinVarAccession.Acc + rcv_accession = public_set_obj.ReferenceClinVarAssertion.ClinVarAccession.Acc try: - review_status = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus + review_status = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus except: review_status = None try: - last_evaluated = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated + last_evaluated = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated except: last_evaluated = None - number_submitters = len(parsed_record.ClinVarAssertion) + number_submitters = len(public_set_obj.ClinVarAssertion) # some items in clinvar_xml doesn't have origin information try: - origin = parsed_record.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin + origin = public_set_obj.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin except: origin = None conditions = [] - for _trait in parsed_record.ReferenceClinVarAssertion.TraitSet.Trait: + for _trait in public_set_obj.ReferenceClinVarAssertion.TraitSet.Trait: synonyms = [] conditions_name = '' for name in _trait.Name: @@ -305,17 +359,17 @@ def _map_parsed_record_to_json(parsed_record, hg19): "age_of_onset": age_of_onset}) try: - genotypeset = parsed_record.ReferenceClinVarAssertion.GenotypeSet + genotypeset = public_set_obj.ReferenceClinVarAssertion.GenotypeSet except: genotypeset = None if genotypeset: obj_list = [] id_list = [] - for _set in parsed_record.ReferenceClinVarAssertion.GenotypeSet.MeasureSet: + for _set in public_set_obj.ReferenceClinVarAssertion.GenotypeSet.MeasureSet: variant_id = _set.ID for _measure in _set.Measure: - json_obj = parse_measure(_measure, hg19=hg19) + json_obj = _map_measure_to_json(_measure, hg19=hg19) if json_obj: json_obj['clinvar']['rcv'].update({ 'accession': rcv_accession, @@ -339,9 +393,9 @@ def _map_parsed_record_to_json(parsed_record, hg19): }}) yield _obj else: - variant_id = parsed_record.ReferenceClinVarAssertion.MeasureSet.ID - for _measure in parsed_record.ReferenceClinVarAssertion.MeasureSet.Measure: - json_obj = parse_measure(_measure, hg19=hg19) + variant_id = public_set_obj.ReferenceClinVarAssertion.MeasureSet.ID + for _measure in public_set_obj.ReferenceClinVarAssertion.MeasureSet.Measure: + json_obj = _map_measure_to_json(_measure, hg19=hg19) if json_obj: json_obj['clinvar']['rcv'].update({ 'accession': rcv_accession, @@ -359,22 +413,55 @@ def _map_parsed_record_to_json(parsed_record, hg19): yield json_obj -def clinvar_json_record_feeder(input_file, hg19): - # the first two line of clinvar_xml is not useful information - xml_records = rec_handler(input_file, block_end='\n', skip=2, include_block_end=True) - for xml_record in xml_records: - # some exceptions - if xml_record.startswith('\n'): +def clinvar_doc_feeder(input_file, hg19: bool): + """ + This function will split the xml file into `...` blocks, then parse each block into an + `PublicSetType` object (which is defined in the dynamically imported `clinvarlib`), and finally convert each + `PublicSetType` object into a clinvar document. + """ + + """ + A ClinVarFullRelease_*.xml.gz file has the following structure: + + + + + + ... + + + + ... + + + ... + + + + Therefore when splitting the xml into `` blocks, the first 2 lines and the last 1 line should be + skipped. + + However the `rec_handler` function cannot skip the last 1 line, and will return "\n..." as + the last block in this scenario. Therefore in the for-loop below, we will skip any block starting with + "\n". + """ + clinvar_set_blocks = rec_handler(input_file, block_end='\n', skip=2, include_block_end=True) + for clinvar_set_block in clinvar_set_blocks: + # Skip any block starting with "\n" + # Actually only the last block will be skipped. See comments above + if clinvar_set_block.startswith('\n'): continue try: - parsed_record = clinvarlib.parseString(xml_record, silence=1) + # Parse each `` block into a `clinvarlib.PublicSetType` object + public_set_obj = clinvarlib.parseString(clinvar_set_block, silence=1) except: - logging.debug(xml_record) + logging.debug(clinvar_set_block) raise - for json_record in _map_parsed_record_to_json(parsed_record, hg19): - yield json_record + # Convert each `clinvarlib.PublicSetType` object into a json document + for doc in _map_public_set_to_json(public_set_obj, hg19): + yield doc def load_data(data_folder, version): @@ -389,15 +476,15 @@ def load_data(data_folder, version): assert len(files) == 1, "Expecting only one file matching '%s', got: %s" % (GLOB_PATTERN, files) input_file = files[0] - json_record_generator = clinvar_json_record_feeder(input_file, hg19=(version == "hg19")) + doc_generator = clinvar_doc_feeder(input_file, hg19=(version == "hg19")) # Sorting is necessary because `merge_rcv_accession` will call `itertools.groupby()` # which cannot put non-adjacent items with the same key into a group - json_record_list = list(json_record_generator) - json_record_list = sorted(json_record_list, key=lambda k: k['_id']) + doc_list = list(doc_generator) + doc_list = sorted(doc_list, key=lambda k: k['_id']) - merged_json_records = merge_rcv_accession(json_record_list) - return merged_json_records + merged_doc_generator = merge_rcv_accession(doc_list) + return merged_doc_generator if __name__ == "__main__":