From 00a3c36b4e521a41fd7d5e75de1d2672c4cb1af1 Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Mon, 9 Aug 2021 12:37:20 -0700
Subject: [PATCH 1/4] added comments; formatted code

---
 .../sources/clinvar/clinvar_xml_parser.py     | 127 +++++++++---------
 1 file changed, 61 insertions(+), 66 deletions(-)

diff --git a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
index 6d3f4661..3e4640a5 100644
--- a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
+++ b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
@@ -34,8 +34,8 @@ def merge_rcv_accession(generator):
         if len(item) > 1:
             json_item = item[0]
             for _item in item:
-                    rcv_info = _item['clinvar']['rcv']
-                    rcv_new.append(rcv_info)
+                rcv_info = _item['clinvar']['rcv']
+                rcv_new.append(rcv_info)
             json_item['clinvar']['rcv'] = rcv_new
             yield json_item
         else:
@@ -71,6 +71,7 @@ def parse_measure(Measure, hg19=True):
                     alt = SequenceLocation.alternateAllele
                 if not alt:
                     alt = SequenceLocation.alternateAlleleVCF
+
             if 'GRCh38' in SequenceLocation.Assembly:
                 chromStart_38 = SequenceLocation.start
                 chromEnd_38 = SequenceLocation.stop
@@ -82,6 +83,7 @@ def parse_measure(Measure, hg19=True):
                     alt = SequenceLocation.alternateAllele
                 if not alt:
                     alt = SequenceLocation.alternateAlleleVCF
+
     if Measure.MeasureRelationship:
         try:
             symbol = Measure.MeasureRelationship[0].\
@@ -92,14 +94,17 @@ def parse_measure(Measure, hg19=True):
     else:
         symbol = None
         gene_id = None
+
     if Measure.Name:
         name = Measure.Name[0].ElementValue.valueOf_
     else:
         name = None
+
     if len(Measure.CytogeneticLocation) == 1:
         cytogenic = Measure.CytogeneticLocation[0]
     else:
         cytogenic = Measure.CytogeneticLocation
+
     hgvs_coding = None
     hgvs_genome = None
     HGVS = {'genomic': [], 'coding': [], 'non-coding': [], 'protein': []}
@@ -111,49 +116,40 @@ def parse_measure(Measure, hg19=True):
     else:
         chromStart = chromStart_38
         chromEnd = chromEnd_38
+
     # hgvs_not_validated = None
     if Measure.AttributeSet:
         # 'copy number loss' or 'gain' have format different\
         # from other types, should be dealt with seperately
-        if (variation_type == 'copy number loss') or \
-                (variation_type == 'copy number gain'):
+        if (variation_type == 'copy number loss') or (variation_type == 'copy number gain'):
             for AttributeSet in Measure.AttributeSet:
-                if 'HGVS, genomic, top level' in AttributeSet.\
-                        Attribute.Type:
+                if 'HGVS, genomic, top level' in AttributeSet.Attribute.Type:
                     if AttributeSet.Attribute.integerValue == 37:
                         hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                 if 'genomic' in AttributeSet.Attribute.Type:
-                    HGVS['genomic'].append(AttributeSet.Attribute.
-                                           get_valueOf_())
+                    HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_())
                 elif 'non-coding' in AttributeSet.Attribute.Type:
-                    HGVS['non-coding'].append(AttributeSet.Attribute.
-                                              get_valueOf_())
+                    HGVS['non-coding'].append(AttributeSet.Attribute.get_valueOf_())
                 elif 'coding' in AttributeSet.Attribute.Type:
-                    HGVS['coding'].append(AttributeSet.Attribute.
-                                          get_valueOf_())
+                    HGVS['coding'].append(AttributeSet.Attribute.get_valueOf_())
                 elif 'protein' in AttributeSet.Attribute.Type:
-                    HGVS['protein'].append(AttributeSet.
-                                           Attribute.get_valueOf_())
+                    HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_())
         else:
             for AttributeSet in Measure.AttributeSet:
                 if 'genomic' in AttributeSet.Attribute.Type:
-                    HGVS['genomic'].append(AttributeSet.
-                                           Attribute.get_valueOf_())
+                    HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_())
                 elif 'non-coding' in AttributeSet.Attribute.Type:
-                    HGVS['non-coding'].append(AttributeSet.
-                                              Attribute.get_valueOf_())
+                    HGVS['non-coding'].append(AttributeSet.Attribute.get_valueOf_())
                 elif 'coding' in AttributeSet.Attribute.Type:
-                    HGVS['coding'].append(AttributeSet.Attribute.
-                                          get_valueOf_())
+                    HGVS['coding'].append(AttributeSet.Attribute.get_valueOf_())
                 elif 'protein' in AttributeSet.Attribute.Type:
-                    HGVS['protein'].append(AttributeSet.
-                                           Attribute.get_valueOf_())
+                    HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_())
                 if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                     hgvs_coding = AttributeSet.Attribute.get_valueOf_()
-                elif AttributeSet.Attribute.Type == \
-                        'HGVS, genomic, top level, previous':
+                elif AttributeSet.Attribute.Type == 'HGVS, genomic, top level, previous':
                     hgvs_genome = AttributeSet.Attribute.get_valueOf_()
                     break
+
         if chrom and chromStart and chromEnd:
             # if its SNP, make sure chrom, chromStart, chromEnd, ref, alt are all provided
             if variation_type == 'single nucleotide variant':
@@ -169,8 +165,7 @@ def parse_measure(Measure, hg19=True):
                 if hgvs_genome:
                     indel_position = hgvs_genome.find('del')
                     indel_alt = hgvs_genome[indel_position+3:]
-                    hgvs_id = "chr%s:g.%s_%sdel%s" % \
-                              (chrom, chromStart, chromEnd, indel_alt)
+                    hgvs_id = "chr%s:g.%s_%sdel%s" % (chrom, chromStart, chromEnd, indel_alt)
             elif variation_type == 'Deletion':
                 if chromStart == chromEnd:
                     # RCV000048406, chr17:g.41243547del
@@ -183,24 +178,19 @@ def parse_measure(Measure, hg19=True):
                     if 'ins' in hgvs_genome:
                         ins_ref = hgvs_genome[ins_position+3:]
                         if chromStart == chromEnd:
-                            hgvs_id = "chr%s:g.%sins%s" % \
-                                      (chrom, chromStart, ins_ref)
+                            hgvs_id = "chr%s:g.%sins%s" % (chrom, chromStart, ins_ref)
                         else:
-                            hgvs_id = "chr%s:g.%s_%sins%s" % \
-                                      (chrom, chromStart, chromEnd, ins_ref)
+                            hgvs_id = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, ins_ref)
             elif variation_type == 'Duplication':
                 if hgvs_genome:
                     dup_position = hgvs_genome.find('dup')
                     if 'dup' in hgvs_genome:
                         dup_ref = hgvs_genome[dup_position+3:]
                         if chromStart == chromEnd:
-                            hgvs_id = "chr%s:g.%sdup%s" % \
-                                  (chrom, chromStart, dup_ref)
+                            hgvs_id = "chr%s:g.%sdup%s" % (chrom, chromStart, dup_ref)
                         else:
-                            hgvs_id = "chr%s:g.%s_%sdup%s" % \
-                                  (chrom, chromStart, chromEnd, dup_ref)
-        elif variation_type == 'copy number loss' or\
-                variation_type == 'copy number gain':
+                            hgvs_id = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, dup_ref)
+        elif variation_type == 'copy number loss' or variation_type == 'copy number gain':
             if hgvs_genome and chrom:
                 hgvs_id = "chr" + chrom + ":" + hgvs_genome.split('.')[2]
         elif hgvs_coding:
@@ -210,8 +200,10 @@ def parse_measure(Measure, hg19=True):
             return
     else:
         return
+
     for key in HGVS:
         HGVS[key].sort()
+
     rsid = None
     cosmic = None
     dbvar = None
@@ -234,7 +226,6 @@ def parse_measure(Measure, hg19=True):
     # make sure the hgvs_id is not none
     if hgvs_id:
         one_snp_json = {
-
             "_id": hgvs_id,
             "clinvar":
                 {
@@ -244,26 +235,22 @@ def parse_measure(Measure, hg19=True):
                     "cosmic": cosmic,
                     "uniprot": uniprot,
                     "dbvar": dbvar,
-                    "hg19":
-                        {
-                            "start": chromStart_19,
-                            "end": chromEnd_19
-                        },
-                    "hg38":
-                        {
-                            "start": chromStart_38,
-                            "end": chromEnd_38
-                        },
+                    "hg19": {
+                        "start": chromStart_19,
+                        "end": chromEnd_19
+                    },
+                    "hg38": {
+                        "start": chromStart_38,
+                        "end": chromEnd_38
+                    },
                     "type": variation_type,
-                    "gene":
-                        {
-                            "id": gene_id,
-                            "symbol": symbol
-                        },
-                    "rcv":
-                        {
-                            "preferred_name": name,
-                        },
+                    "gene": {
+                        "id": gene_id,
+                        "symbol": symbol
+                    },
+                    "rcv": {
+                        "preferred_name": name,
+                    },
                     "rsid": rsid,
                     "cytogenic": cytogenic,
                     "hgvs": HGVS,
@@ -277,19 +264,19 @@ def parse_measure(Measure, hg19=True):
 
 def _map_line_to_json(cp, hg19):
     try:
-        clinical_significance = cp.ReferenceClinVarAssertion.\
-            ClinicalSignificance.Description
+        clinical_significance = cp.ReferenceClinVarAssertion.ClinicalSignificance.Description
     except:
         clinical_significance = None
+
     rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
+
     try:
-        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
-            ReviewStatus
+        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus
     except:
         review_status = None
+
     try:
-        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.\
-            DateLastEvaluated
+        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated
     except:
         last_evaluated = None
     
@@ -299,6 +286,7 @@ def _map_line_to_json(cp, hg19):
         origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
     except:
         origin = None
+
     conditions = []
     for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait:
         synonyms = []
@@ -308,6 +296,7 @@ def _map_line_to_json(cp, hg19):
                 synonyms.append(name.ElementValue.get_valueOf_())
             if name.ElementValue.Type == 'Preferred':
                 conditions_name += name.ElementValue.get_valueOf_()
+
         identifiers = {}
         for item in _trait.XRef:
             if item.DB == 'Human Phenotype Ontology':
@@ -318,16 +307,19 @@ def _map_line_to_json(cp, hg19):
         for symbol in _trait.Symbol:
             if symbol.ElementValue.Type == 'Preferred':
                 conditions_name += ' (' + symbol.ElementValue.get_valueOf_() + ')'
+
         age_of_onset = ''
         for _set in _trait.AttributeSet:
             if _set.Attribute.Type == 'age of onset':
                 age_of_onset = _set.Attribute.get_valueOf_()
+
         conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset})
 
     try:
         genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet
     except:
         genotypeset = None
+
     if genotypeset:
         obj_list = []
         id_list = []
@@ -351,9 +343,9 @@ def _map_line_to_json(cp, hg19):
                     id_list.append(json_obj['_id'])
         for _obj in obj_list:
             _obj['clinvar'].update({'genotypeset': {
-                    'type': 'CompoundHeterozygote',
-                    'genotype': id_list
-                    }})
+                'type': 'CompoundHeterozygote',
+                'genotype': id_list
+            }})
             yield _obj
     else:
         variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
@@ -382,11 +374,13 @@ def rcv_feeder(input_file, hg19):
         # some exceptions
         if record.startswith('\n</ReleaseSet>'):
             continue
+
         try:
             record_parsed = clinvarlib.parseString(record, silence=1)
         except:
             logging.debug(record)
             raise
+
         for record_mapped in _map_line_to_json(record_parsed, hg19):
             yield record_mapped
 
@@ -402,7 +396,8 @@ def load_data(data_folder, version):
     input_file = files[0]
     data_generator = rcv_feeder(input_file, version == "hg19")
     data_list = list(data_generator)
-    # TODO: why do we sort this list ? this prevent from using yield/iterator
+    # Sort the `data_list` because `merge_rcv_accession` will call `itertools.groupby()`
+    #   which cannot group non-adjacent items with the same key into a group
     data_list_sorted = sorted(data_list, key=lambda k: k['_id'])
     data_merge_rcv = merge_rcv_accession(data_list_sorted)
     return data_merge_rcv

From 24e3a9ccfb746e59edbc892b29412e3f34eaeee3 Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Tue, 10 Aug 2021 15:37:55 -0700
Subject: [PATCH 2/4] formatted code; renamed variables and functions

---
 .../sources/clinvar/clinvar_xml_parser.py     | 266 +++++++++---------
 1 file changed, 132 insertions(+), 134 deletions(-)

diff --git a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
index 3e4640a5..fd7f3b93 100644
--- a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
+++ b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
@@ -1,104 +1,89 @@
-# a python lib is generated on the fly, in data folder
-# adjust path
-
-
-import sys, os, glob
+import glob
+import os
+import sys
 from itertools import groupby
 
-from config import DATA_ARCHIVE_ROOT, logger as logging
-import biothings, config
+import biothings
+import config
 biothings.config_for_app(config)
-from biothings.utils.dataload import unlist, dict_sweep, \
-                                     value_convert_to_number, rec_handler
+
+from biothings.utils.dataload import unlist, dict_sweep, value_convert_to_number, rec_handler
 
 GLOB_PATTERN = "ClinVarFullRelease_*.xml.gz"
 clinvarlib = None
 
+
 def import_clinvar_lib(data_folder):
-    sys.path.insert(0,data_folder)
+    # a python lib is generated on the fly, in data folder
+    sys.path.insert(0, data_folder)
     import genclinvar as clinvar_mod
     global clinvarlib
     clinvarlib = clinvar_mod
 
-def merge_rcv_accession(generator):
-    groups = []
-    for key, group in groupby(generator, lambda x: x['_id']):
-        groups.append(list(group))
+
+def merge_rcv_accession(records):
+    record_groups = []
+    for key, group in groupby(records, lambda x: x['_id']):
+        record_groups.append(list(group))
 
     # get the number of groups, and uniquekeys
-    logging.info("number of groups: %s" % len(groups))
-
-    # loop through each item, if item number >1, merge rcv accession number
-    for item in groups:
-        rcv_new = []
-        if len(item) > 1:
-            json_item = item[0]
-            for _item in item:
-                rcv_info = _item['clinvar']['rcv']
-                rcv_new.append(rcv_info)
-            json_item['clinvar']['rcv'] = rcv_new
-            yield json_item
+    logging.info("number of groups: %s" % len(record_groups))
+
+    # Each group of records is a list
+    # Loop through each group, if record number >1, merge rcv accessions
+    for record_list in record_groups:
+        if len(record_list) == 1:
+            yield record_list[0]
         else:
-            yield item[0]
+            rcv_list = [record['clinvar']['rcv'] for record in record_list]
+            
+            merged_record = record_list[0]
+            merged_record['clinvar']['rcv'] = rcv_list
+            yield merged_record
 
 
 def parse_measure(Measure, hg19=True):
+    # exclude any item of which types belong to 'Variation', 'protein only' or 'Microsatellite'
     variation_type = Measure.Type
-    # exclude any item of which types belong to
-    # 'Variation', 'protein only' or 'Microsatellite'
     if variation_type == 'Variation' or variation_type == 'protein only' or variation_type == 'Microsatellite':
         return None
+
     allele_id = Measure.ID
+
     chrom = None
-    chromStart_19 = None
-    chromEnd_19 = None
-    chromStart_38 = None
-    chromEnd_38 = None
-    ref = None
-    alt = None
+    chromStart_19, chromEnd_19, chromStart_38, chromEnd_38 = None, None, None, None
+    ref, alt = None, None
     if Measure.SequenceLocation:
         for SequenceLocation in Measure.SequenceLocation:
-            # In this version, only accept information concerning GRCh37
             if 'GRCh37' in SequenceLocation.Assembly:
                 chrom = SequenceLocation.Chr
                 chromStart_19 = SequenceLocation.start
                 chromEnd_19 = SequenceLocation.stop
                 if not ref:
-                    ref = SequenceLocation.referenceAllele
-                if not ref:
-                    ref = SequenceLocation.referenceAlleleVCF
-                if not alt:
-                    alt = SequenceLocation.alternateAllele
+                    ref = SequenceLocation.referenceAllele or SequenceLocation.referenceAlleleVCF
                 if not alt:
-                    alt = SequenceLocation.alternateAlleleVCF
+                    alt = SequenceLocation.alternateAllele or SequenceLocation.alternateAlleleVCF
 
             if 'GRCh38' in SequenceLocation.Assembly:
                 chromStart_38 = SequenceLocation.start
                 chromEnd_38 = SequenceLocation.stop
                 if not ref:
-                    ref = SequenceLocation.referenceAllele
-                if not ref:
-                    ref = SequenceLocation.referenceAlleleVCF
-                if not alt:
-                    alt = SequenceLocation.alternateAllele
+                    ref = SequenceLocation.referenceAllele or SequenceLocation.referenceAlleleVCF
                 if not alt:
-                    alt = SequenceLocation.alternateAlleleVCF
+                    alt = SequenceLocation.alternateAllele or SequenceLocation.alternateAlleleVCF
 
+    symbol = None
+    gene_id = None
     if Measure.MeasureRelationship:
         try:
-            symbol = Measure.MeasureRelationship[0].\
-                Symbol[0].get_ElementValue().valueOf_
+            symbol = Measure.MeasureRelationship[0].Symbol[0].get_ElementValue().valueOf_
         except:
             symbol = None
         gene_id = Measure.MeasureRelationship[0].XRef[0].ID
-    else:
-        symbol = None
-        gene_id = None
 
+    name = None
     if Measure.Name:
         name = Measure.Name[0].ElementValue.valueOf_
-    else:
-        name = None
 
     if len(Measure.CytogeneticLocation) == 1:
         cytogenic = Measure.CytogeneticLocation[0]
@@ -126,6 +111,7 @@ def parse_measure(Measure, hg19=True):
                 if 'HGVS, genomic, top level' in AttributeSet.Attribute.Type:
                     if AttributeSet.Attribute.integerValue == 37:
                         hgvs_genome = AttributeSet.Attribute.get_valueOf_()
+
                 if 'genomic' in AttributeSet.Attribute.Type:
                     HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_())
                 elif 'non-coding' in AttributeSet.Attribute.Type:
@@ -144,11 +130,11 @@ def parse_measure(Measure, hg19=True):
                     HGVS['coding'].append(AttributeSet.Attribute.get_valueOf_())
                 elif 'protein' in AttributeSet.Attribute.Type:
                     HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_())
-                if AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
+
+                if not hgvs_coding and AttributeSet.Attribute.Type == 'HGVS, coding, RefSeq':
                     hgvs_coding = AttributeSet.Attribute.get_valueOf_()
-                elif AttributeSet.Attribute.Type == 'HGVS, genomic, top level, previous':
+                if not hgvs_genome and AttributeSet.Attribute.Type == 'HGVS, genomic, top level, previous':
                     hgvs_genome = AttributeSet.Attribute.get_valueOf_()
-                    break
 
         if chrom and chromStart and chromEnd:
             # if its SNP, make sure chrom, chromStart, chromEnd, ref, alt are all provided
@@ -156,12 +142,13 @@ def parse_measure(Measure, hg19=True):
                 if ref and alt:
                     hgvs_id = "chr%s:g.%s%s>%s" % (chrom, chromStart, ref, alt)
                 else:
-                    print('hgvs not found chr {}, chromStart {}, chromEnd {}, ref {}, alt {}, allele id {}'.format(chrom, chromStart, chromEnd, ref, alt, allele_id))
+                    print('hgvs not found chr {}, chromStart {}, chromEnd {}, ref {}, alt {}, allele id {}'.
+                          format(chrom, chromStart, chromEnd, ref, alt, allele_id))
             # items whose type belong to 'Indel, Insertion, \
             # Duplication' might not hava explicit alt information, \
             # so we will parse from hgvs_genome
             elif variation_type == 'Indel':
-            # RCV000156073, NC_000010.10:g.112581638_112581639delinsG
+                # RCV000156073, NC_000010.10:g.112581638_112581639delinsG
                 if hgvs_genome:
                     indel_position = hgvs_genome.find('del')
                     indel_alt = hgvs_genome[indel_position+3:]
@@ -227,68 +214,69 @@ def parse_measure(Measure, hg19=True):
     if hgvs_id:
         one_snp_json = {
             "_id": hgvs_id,
-            "clinvar":
-                {
-                    "allele_id": allele_id,
-                    "chrom": chrom,
-                    "omim": omim,
-                    "cosmic": cosmic,
-                    "uniprot": uniprot,
-                    "dbvar": dbvar,
-                    "hg19": {
-                        "start": chromStart_19,
-                        "end": chromEnd_19
-                    },
-                    "hg38": {
-                        "start": chromStart_38,
-                        "end": chromEnd_38
-                    },
-                    "type": variation_type,
-                    "gene": {
-                        "id": gene_id,
-                        "symbol": symbol
-                    },
-                    "rcv": {
-                        "preferred_name": name,
-                    },
-                    "rsid": rsid,
-                    "cytogenic": cytogenic,
-                    "hgvs": HGVS,
-                    "coding_hgvs_only": coding_hgvs_only,
-                    "ref": ref,
-                    "alt": alt
-                }
+            "clinvar": {
+                "allele_id": allele_id,
+                "chrom": chrom,
+                "omim": omim,
+                "cosmic": cosmic,
+                "uniprot": uniprot,
+                "dbvar": dbvar,
+                "hg19": {
+                    "start": chromStart_19,
+                    "end": chromEnd_19
+                },
+                "hg38": {
+                    "start": chromStart_38,
+                    "end": chromEnd_38
+                },
+                "type": variation_type,
+                "gene": {
+                    "id": gene_id,
+                    "symbol": symbol
+                },
+                "rcv": {
+                    "preferred_name": name,
+                },
+                "rsid": rsid,
+                "cytogenic": cytogenic,
+                "hgvs": HGVS,
+                "coding_hgvs_only": coding_hgvs_only,
+                "ref": ref,
+                "alt": alt
+            }
         }
 
         return one_snp_json
 
-def _map_line_to_json(cp, hg19):
+
+def _map_parsed_record_to_json(parsed_record, hg19):
     try:
-        clinical_significance = cp.ReferenceClinVarAssertion.ClinicalSignificance.Description
+        clinical_significance = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.Description
     except:
         clinical_significance = None
 
-    rcv_accession = cp.ReferenceClinVarAssertion.ClinVarAccession.Acc
+    rcv_accession = parsed_record.ReferenceClinVarAssertion.ClinVarAccession.Acc
 
     try:
-        review_status = cp.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus
+        review_status = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus
     except:
         review_status = None
 
     try:
-        last_evaluated = cp.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated
+        last_evaluated = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated
     except:
         last_evaluated = None
     
-    number_submitters = len(cp.ClinVarAssertion)
+    number_submitters = len(parsed_record.ClinVarAssertion)
+
     # some items in clinvar_xml doesn't have origin information
     try:
-        origin = cp.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
+        origin = parsed_record.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
     except:
         origin = None
 
     conditions = []
-    for _trait in cp.ReferenceClinVarAssertion.TraitSet.Trait:
+    for _trait in parsed_record.ReferenceClinVarAssertion.TraitSet.Trait:
         synonyms = []
         conditions_name = ''
         for name in _trait.Name:
@@ -313,32 +301,35 @@ def _map_line_to_json(cp, hg19):
             if _set.Attribute.Type == 'age of onset':
                 age_of_onset = _set.Attribute.get_valueOf_()
 
-        conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers, "age_of_onset": age_of_onset})
+        conditions.append({"name": conditions_name, "synonyms": synonyms, "identifiers": identifiers,
+                           "age_of_onset": age_of_onset})
 
     try:
-        genotypeset = cp.ReferenceClinVarAssertion.GenotypeSet
+        genotypeset = parsed_record.ReferenceClinVarAssertion.GenotypeSet
     except:
         genotypeset = None
 
     if genotypeset:
         obj_list = []
         id_list = []
-        for _set in cp.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
+        for _set in parsed_record.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
             variant_id = _set.ID
             for _measure in _set.Measure:
                 json_obj = parse_measure(_measure, hg19=hg19)
                 if json_obj:
-                    json_obj['clinvar']['rcv'].update({'accession': rcv_accession,
+                    json_obj['clinvar']['rcv'].update({
+                        'accession': rcv_accession,
                         'clinical_significance': clinical_significance,
                         'number_submitters': number_submitters,
                         'review_status': review_status,
                         'last_evaluated': str(last_evaluated),
                         'origin': origin,
-                        'conditions': conditions})
+                        'conditions': conditions
+                    })
                     json_obj['clinvar'].update({'variant_id': variant_id})
                     json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj,
-                                               ['chrom', 'omim', 'id', 'orphanet', 'gene',
-                                                'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])), [None, '', 'None']))
+                                ['chrom', 'omim', 'id', 'orphanet', 'gene', 'rettbase_(cdkl5)', 'cosmic', 'dbrbc'])),
+                                [None, '', 'None']))
                     obj_list.append(json_obj)
                     id_list.append(json_obj['_id'])
         for _obj in obj_list:
@@ -348,17 +339,19 @@ def _map_line_to_json(cp, hg19):
             }})
             yield _obj
     else:
-        variant_id = cp.ReferenceClinVarAssertion.MeasureSet.ID
-        for _measure in cp.ReferenceClinVarAssertion.MeasureSet.Measure:
+        variant_id = parsed_record.ReferenceClinVarAssertion.MeasureSet.ID
+        for _measure in parsed_record.ReferenceClinVarAssertion.MeasureSet.Measure:
             json_obj = parse_measure(_measure, hg19=hg19)
             if json_obj:
-                json_obj['clinvar']['rcv'].update({'accession': rcv_accession,
-                        'clinical_significance': clinical_significance,
-                        'number_submitters': number_submitters,
-                        'review_status': review_status,
-                        'last_evaluated': str(last_evaluated),
-                        'origin': origin,
-                        'conditions': conditions})
+                json_obj['clinvar']['rcv'].update({
+                    'accession': rcv_accession,
+                    'clinical_significance': clinical_significance,
+                    'number_submitters': number_submitters,
+                    'review_status': review_status,
+                    'last_evaluated': str(last_evaluated),
+                    'origin': origin,
+                    'conditions': conditions
+                })
                 json_obj['clinvar'].update({'variant_id': variant_id})
                 json_obj = (dict_sweep(unlist(value_convert_to_number(json_obj,
                                                ['chrom', 'omim', 'id', 'orphanet', 'gene',
@@ -366,23 +359,23 @@ def _map_line_to_json(cp, hg19):
                 yield json_obj
 
 
-def rcv_feeder(input_file, hg19):
+def clinvar_json_record_feeder(input_file, hg19):
     # the first two line of clinvar_xml is not useful information
-    cv_data = rec_handler(input_file, block_end='</ClinVarSet>\n',
-                          skip=2, include_block_end=True)
-    for record in cv_data:
+    xml_records = rec_handler(input_file, block_end='</ClinVarSet>\n', skip=2, include_block_end=True)
+    for xml_record in xml_records:
         # some exceptions
-        if record.startswith('\n</ReleaseSet>'):
+        if xml_record.startswith('\n</ReleaseSet>'):
             continue
 
         try:
-            record_parsed = clinvarlib.parseString(record, silence=1)
+            parsed_record = clinvarlib.parseString(xml_record, silence=1)
         except:
-            logging.debug(record)
+            logging.debug(xml_record)
             raise
 
-        for record_mapped in _map_line_to_json(record_parsed, hg19):
-            yield record_mapped
+        for json_record in _map_parsed_record_to_json(parsed_record, hg19):
+            yield json_record
+
 
 def load_data(data_folder, version):
     # try to get logger from uploader
@@ -391,18 +384,23 @@ def load_data(data_folder, version):
     logging = loggingmod.getLogger("clinvar_upload")
 
     import_clinvar_lib(data_folder)
-    files = glob.glob(os.path.join(data_folder,GLOB_PATTERN))
-    assert len(files) == 1, "Expecting only one file matching '%s', got: %s" % (GLOB_PATTERN,files)
+
+    files = glob.glob(os.path.join(data_folder, GLOB_PATTERN))
+    assert len(files) == 1, "Expecting only one file matching '%s', got: %s" % (GLOB_PATTERN, files)
     input_file = files[0]
-    data_generator = rcv_feeder(input_file, version == "hg19")
-    data_list = list(data_generator)
-    # Sort the `data_list` because `merge_rcv_accession` will call `itertools.groupby()`
-    #   which cannot group non-adjacent items with the same key into a group
-    data_list_sorted = sorted(data_list, key=lambda k: k['_id'])
-    data_merge_rcv = merge_rcv_accession(data_list_sorted)
-    return data_merge_rcv
+
+    json_record_generator = clinvar_json_record_feeder(input_file, hg19=(version == "hg19"))
+
+    # Sorting is necessary because `merge_rcv_accession` will call `itertools.groupby()`
+    #   which cannot put non-adjacent items with the same key into a group
+    json_record_list = list(json_record_generator)
+    json_record_list = sorted(json_record_list, key=lambda k: k['_id'])
+
+    merged_json_records = merge_rcv_accession(json_record_list)
+    return merged_json_records
+
 
 if __name__ == "__main__":
     from biothings.utils.mongo import get_data_folder
     data_folder = get_data_folder("clinvar")
-    load_data(data_folder=data_folder)
+    load_data(data_folder=data_folder, version="hg19")

From 1f679bdde8bef8b072f0f4aeb406f4df4937578a Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Tue, 10 Aug 2021 15:40:52 -0700
Subject: [PATCH 3/4] deleted the legacy parser

---
 .../sources/clinvar/clinvar_parser.py         | 164 ------------------
 1 file changed, 164 deletions(-)
 delete mode 100644 src/hub/dataload/sources/clinvar/clinvar_parser.py

diff --git a/src/hub/dataload/sources/clinvar/clinvar_parser.py b/src/hub/dataload/sources/clinvar/clinvar_parser.py
deleted file mode 100644
index ef3014e8..00000000
--- a/src/hub/dataload/sources/clinvar/clinvar_parser.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# -*- coding: utf-8 -*-
-# DO NOT USE THIS SCRIPT, USE THE XML PARSER INSTEAD
-import csv
-import re
-from itertools import imap, groupby
-import os
-
-import vcf
-
-from biothings.utils.dataload import unlist, dict_sweep, value_convert_to_number, merge_duplicate_rows
-
-''' vcf file for clinvar downloaded from
-ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/
-and tab_delimited file for clinvar downloaded from
-ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/'''
-
-VALID_COLUMN_NO = 25
-vcf_reader = vcf.Reader(filename='clinvar_20150305.vcf.gz')
-
-
-# split id lists into dictionary
-def other_id(other_ids):
-    p = other_ids.strip(";").replace(";", ",").split(",")
-    other_id = {}
-    for id in p:
-        try:
-            ind = id.index(":")
-            key, value = id[:ind], id[ind+1:]
-            if key in other_id:
-                if not isinstance(other_id[key], list):
-                    other_id[key] = [other_id[key]]
-                other_id[key].append(value)
-            else:
-                other_id[key] = value
-        except:
-            continue
-    return other_id
-
-
-# convert one snp to json
-def _map_line_to_json(fields):
-    assert len(fields) == VALID_COLUMN_NO
-    chrom = fields[13]
-    chromStart = fields[14]
-    chromEnd = fields[15]
-
-    HGVS = None
-    cds = fields[18].split(":")
-    cds = cds[1]
-    replace = re.findall(r'[ATCGMNYR=]+', cds)
-    sub = re.search(r'\d([ATCGMNHKRY]>[ATCGMNHKRY])', cds)
-    ins = re.search(r'ins[ATCGMNHYR]+|ins[0-9]+', cds)
-    delete = fields[1] == 'deletion'
-    indel = fields[1] == 'indel'
-    dup = re.search(r'dup', cds)
-    inv = re.search(r'inv|inv[0-9]+|inv[ATCGMNHYR]+', cds)
-    if ins:
-        delete = None
-        indel = None
-    elif delete:
-        ins = None
-        indel = None
-    # parse from vcf file. Input chrom number
-    # and chromStart, and return REF, ALT
-    if chromStart:
-        record = vcf_reader.fetch(chrom, int(chromStart))
-    else:
-        record = None
-    if record:
-        REF = record.REF
-        ALT = record.ALT
-        ALT = ALT[0]
-        if record.is_snp and len(ALT) < 2:
-            mod = [REF, ALT]
-        else:
-            mod = ALT
-    else:
-        return
-
-    if sub and record.is_snp:
-            HGVS = "chr%s:g.%s%s>%s" % (chrom, chromStart, mod[0], mod[1])
-    elif ins:
-        HGVS = "chr%s:g.%s_%sins%s" % (chrom, chromStart, chromEnd, mod)
-    elif delete:
-        HGVS = "chr%s:g.%s_%sdel" % (chrom, chromStart, chromEnd)
-    elif indel:
-        try:
-            HGVS = "chr%s:g.%s_%sdelins%s" % (chrom, chromStart, chromEnd, mod)
-        except AttributeError:
-            print "ERROR:", fields[1], cds
-    elif dup:
-        HGVS = "chr%s:g.%s_%sdup%s" % (chrom, chromStart, chromEnd, mod)
-    elif inv:
-        HGVS = "chr%s:g.%s_%sinv%s" % (chrom, chromStart, chromEnd, mod)
-    elif replace:
-        HGVS = "chr%s:g.%s_%s%s" % (chrom, chromStart, chromEnd, mod)
-    else:
-        print 'ERROR:', fields[1], cds
-
-    # load as json data
-    if HGVS is None:
-        print 'None:', fields[1], cds
-        return None
-
-    one_snp_json = {
-
-        "_id": HGVS,
-        "clinvar":
-            {
-                "allele_id": fields[0],
-                "hg19":
-                    {
-                        "chr": fields[13],
-                        "start": fields[14],
-                        "end": fields[15]
-                    },
-                "type": fields[1],
-                "name": fields[2],
-                "gene":
-                    {
-                        "id": fields[3],
-                        "symbol": fields[4]
-                    },
-                "clinical_significance": fields[5].split(";"),
-                "rsid": 'rs' + str(fields[6]),
-                "nsv_dbvar": fields[7],
-                "rcv_accession": fields[8].split(";"),
-                "tested_in_gtr": fields[9],
-                "phenotype_id": other_id(fields[10]),
-                "origin": fields[11],
-                "cytogenic": fields[16],
-                "review_status": fields[17],
-                "hgvs":
-                    {
-                        "coding": fields[18],
-                        "protein": fields[19]
-                    },
-                "number_submitters": fields[20],
-                "last_evaluated": fields[21],
-                "guidelines": fields[22],
-                "other_ids": other_id(fields[23]),
-                "clinvar_id": fields[24]
-            }
-        }
-    return dict_sweep(unlist(value_convert_to_number(one_snp_json)), vals=["-"])
-
-
-# open file, parse, pass to json mapper
-def load_data(input_file):
-    os.system("sort -t$'\t' -k14 -k15 -k20 -n %s > %s_sorted.tsv" \
-              % (input_file, input_file))
-    open_file = open("%s_sorted.tsv" % (input_file))
-    print input_file
-    clinvar = csv.reader(open_file, delimiter="\t")
-    clinvar = (row for row in clinvar
-               if row[18] != '-' and
-               row[18].find('?') == -1 and
-               row[13] != "" and
-               row[12] == "GRCh37" and
-               not re.search(r'p.', row[18]))
-    json_rows = (row for row in imap(_map_line_to_json, clinvar) if row)
-    row_groups = (it for (key, it) in groupby(json_rows, lambda row:
-                  row["_id"]))
-    return (merge_duplicate_rows(rg, "clinvar") for rg in row_groups)

From 51fa2f58210c65ec4f9b4d6895e6a2ff10b88ca2 Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Wed, 11 Aug 2021 17:44:23 -0700
Subject: [PATCH 4/4] rename variables and functions; added detailed comments

---
 .../sources/clinvar/clinvar_xml_parser.py     | 215 ++++++++++++------
 1 file changed, 151 insertions(+), 64 deletions(-)

diff --git a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
index fd7f3b93..00631e54 100644
--- a/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
+++ b/src/hub/dataload/sources/clinvar/clinvar_xml_parser.py
@@ -1,3 +1,25 @@
+"""
+Two major parsing functions in this script are:
+
+    1. `clinvar_doc_feeder()`
+    2. `merge_rcv_accession()`
+
+`clinvar_doc_feeder()` is responsible for the following jobs:
+
+    1. Receive a ClinVarFullRelease_*.xml.gz file, and split the xml file into `<ClinVarSet>...</ClinVarSet>` blocks
+    2. Parse each `<ClinVarSet>...</ClinVarSet>` block into an `PublicSetType` object, which is defined in the
+        dynamically imported `clinvarlib`
+    3. Convert each `PublicSetType` object into a clinvar document
+
+Note that The `PublicSetType` class is defined to map the block structure of `<ClinVarSet>`, which can be inspected
+through its XSD, https://ftp.ncbi.nlm.nih.gov/pub/clinvar/clinvar_public.xsd.
+
+`merge_rcv_accession()` is responsible for only one job:
+
+    1. Group all the document by `doc['_id']`, and then inside each group, merge all the `doc['clinvar']['rcv']`.
+        Each group of documents will be merged into a single document.
+"""
+
 import glob
 import os
 import sys
@@ -21,40 +43,57 @@ def import_clinvar_lib(data_folder):
     clinvarlib = clinvar_mod
 
 
-def merge_rcv_accession(records):
-    record_groups = []
-    for key, group in groupby(records, lambda x: x['_id']):
-        record_groups.append(list(group))
+def merge_rcv_accession(docs):
+    doc_groups = []
+    for key, group in groupby(docs, lambda x: x['_id']):
+        doc_groups.append(list(group))
 
-    # get the number of groups, and uniquekeys
-    logging.info("number of groups: %s" % len(record_groups))
+    # get the number of groups, and unique keys
+    logging.info("number of groups: %s" % len(doc_groups))
 
-    # Each group of records is a list
-    # Loop through each group, if record number >1, merge rcv accessions
-    for record_list in record_groups:
-        if len(record_list) == 1:
-            yield record_list[0]
+    # Each group is a list of documents
+    # Loop through each group, if doc number >1, merge rcv accessions
+    for doc_list in doc_groups:
+        if len(doc_list) == 1:
+            yield doc_list[0]
         else:
-            rcv_list = [record['clinvar']['rcv'] for record in record_list]
+            rcv_list = [doc['clinvar']['rcv'] for doc in doc_list]
             
-            merged_record = record_list[0]
-            merged_record['clinvar']['rcv'] = rcv_list
-            yield merged_record
-
-
-def parse_measure(Measure, hg19=True):
+            merged_doc = doc_list[0]
+            merged_doc['clinvar']['rcv'] = rcv_list
+            yield merged_doc
+
+
+def _map_measure_to_json(measure_obj, hg19=True):
+    """
+    Convert a `clinvarlib.MeasureType` object into json.
+
+    Each `clinvarlib.MeasureType` object is mapped to a `<Measure>` block in the XML, which is part of the hierarchy
+    below:
+
+        <ClinVarSet ...>
+          <ReferenceClinVarAssertion ...>
+            <MeasureSet ...>
+              <Measure ...> ... </Measure>
+              <Measure ...> ... </Measure>
+              ...
+            </MeasureSet>
+          </ReferenceClinVarAssertion>
+        </ClinVarSet>
+    """
+    
     # exclude any item of which types belong to 'Variation', 'protein only' or 'Microsatellite'
-    variation_type = Measure.Type
+    variation_type = measure_obj.Type
     if variation_type == 'Variation' or variation_type == 'protein only' or variation_type == 'Microsatellite':
         return None
 
-    allele_id = Measure.ID
+    allele_id = measure_obj.ID
 
     chrom = None
     chromStart_19, chromEnd_19, chromStart_38, chromEnd_38 = None, None, None, None
     ref, alt = None, None
-    if Measure.SequenceLocation:
-        for SequenceLocation in Measure.SequenceLocation:
+    if measure_obj.SequenceLocation:
+        for SequenceLocation in measure_obj.SequenceLocation:
             if 'GRCh37' in SequenceLocation.Assembly:
                 chrom = SequenceLocation.Chr
                 chromStart_19 = SequenceLocation.start
@@ -74,21 +113,21 @@ def parse_measure(Measure, hg19=True):
 
     symbol = None
     gene_id = None
-    if Measure.MeasureRelationship:
+    if measure_obj.MeasureRelationship:
         try:
-            symbol = Measure.MeasureRelationship[0].Symbol[0].get_ElementValue().valueOf_
+            symbol = measure_obj.MeasureRelationship[0].Symbol[0].get_ElementValue().valueOf_
         except:
             symbol = None
-        gene_id = Measure.MeasureRelationship[0].XRef[0].ID
+        gene_id = measure_obj.MeasureRelationship[0].XRef[0].ID
 
     name = None
-    if Measure.Name:
-        name = Measure.Name[0].ElementValue.valueOf_
+    if measure_obj.Name:
+        name = measure_obj.Name[0].ElementValue.valueOf_
 
-    if len(Measure.CytogeneticLocation) == 1:
-        cytogenic = Measure.CytogeneticLocation[0]
+    if len(measure_obj.CytogeneticLocation) == 1:
+        cytogenic = measure_obj.CytogeneticLocation[0]
     else:
-        cytogenic = Measure.CytogeneticLocation
+        cytogenic = measure_obj.CytogeneticLocation
 
     hgvs_coding = None
     hgvs_genome = None
@@ -103,11 +142,11 @@ def parse_measure(Measure, hg19=True):
         chromEnd = chromEnd_38
 
     # hgvs_not_validated = None
-    if Measure.AttributeSet:
+    if measure_obj.AttributeSet:
         # 'copy number loss' or 'gain' have format different\
         # from other types, should be dealt with seperately
         if (variation_type == 'copy number loss') or (variation_type == 'copy number gain'):
-            for AttributeSet in Measure.AttributeSet:
+            for AttributeSet in measure_obj.AttributeSet:
                 if 'HGVS, genomic, top level' in AttributeSet.Attribute.Type:
                     if AttributeSet.Attribute.integerValue == 37:
                         hgvs_genome = AttributeSet.Attribute.get_valueOf_()
@@ -121,7 +160,7 @@ def parse_measure(Measure, hg19=True):
                 elif 'protein' in AttributeSet.Attribute.Type:
                     HGVS['protein'].append(AttributeSet.Attribute.get_valueOf_())
         else:
-            for AttributeSet in Measure.AttributeSet:
+            for AttributeSet in measure_obj.AttributeSet:
                 if 'genomic' in AttributeSet.Attribute.Type:
                     HGVS['genomic'].append(AttributeSet.Attribute.get_valueOf_())
                 elif 'non-coding' in AttributeSet.Attribute.Type:
@@ -197,8 +236,8 @@ def parse_measure(Measure, hg19=True):
     uniprot = None
     omim = None
     # loop through XRef to find rsid as well as other ids
-    if Measure.XRef:
-        for XRef in Measure.XRef:
+    if measure_obj.XRef:
+        for XRef in measure_obj.XRef:
             if XRef.Type == 'rs':
                 rsid = 'rs' + str(XRef.ID)
             elif XRef.DB == 'COSMIC':
@@ -249,34 +288,49 @@ def parse_measure(Measure, hg19=True):
         return one_snp_json
 
 
-def _map_parsed_record_to_json(parsed_record, hg19):
+def _map_public_set_to_json(public_set_obj, hg19: bool):
+    """
+    Convert a `clinvarlib.PublicSetType` object into a json document.
+
+    Each `clinvarlib.PublicSetType` object is mapped to a `<ClinVarSet>` block in the XML.
+
+    E.g., `public_set_obj.ReferenceClinVarAssertion.MeasureSet` is the parsed value from a block structure below:
+
+        <ClinVarSet ...>
+          <ReferenceClinVarAssertion ...>
+            <MeasureSet ...>
+              ...
+            </MeasureSet>
+          </ReferenceClinVarAssertion>
+        </ClinVarSet>
+    """
     try:
-        clinical_significance = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.Description
+        clinical_significance = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.Description
     except:
         clinical_significance = None
 
-    rcv_accession = parsed_record.ReferenceClinVarAssertion.ClinVarAccession.Acc
+    rcv_accession = public_set_obj.ReferenceClinVarAssertion.ClinVarAccession.Acc
 
     try:
-        review_status = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus
+        review_status = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.ReviewStatus
     except:
         review_status = None
 
     try:
-        last_evaluated = parsed_record.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated
+        last_evaluated = public_set_obj.ReferenceClinVarAssertion.ClinicalSignificance.DateLastEvaluated
     except:
         last_evaluated = None
     
-    number_submitters = len(parsed_record.ClinVarAssertion)
+    number_submitters = len(public_set_obj.ClinVarAssertion)
 
     # some items in clinvar_xml doesn't have origin information
     try:
-        origin = parsed_record.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
+        origin = public_set_obj.ReferenceClinVarAssertion.ObservedIn[0].Sample.Origin
     except:
         origin = None
 
     conditions = []
-    for _trait in parsed_record.ReferenceClinVarAssertion.TraitSet.Trait:
+    for _trait in public_set_obj.ReferenceClinVarAssertion.TraitSet.Trait:
         synonyms = []
         conditions_name = ''
         for name in _trait.Name:
@@ -305,17 +359,17 @@ def _map_parsed_record_to_json(parsed_record, hg19):
                            "age_of_onset": age_of_onset})
 
     try:
-        genotypeset = parsed_record.ReferenceClinVarAssertion.GenotypeSet
+        genotypeset = public_set_obj.ReferenceClinVarAssertion.GenotypeSet
     except:
         genotypeset = None
 
     if genotypeset:
         obj_list = []
         id_list = []
-        for _set in parsed_record.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
+        for _set in public_set_obj.ReferenceClinVarAssertion.GenotypeSet.MeasureSet:
             variant_id = _set.ID
             for _measure in _set.Measure:
-                json_obj = parse_measure(_measure, hg19=hg19)
+                json_obj = _map_measure_to_json(_measure, hg19=hg19)
                 if json_obj:
                     json_obj['clinvar']['rcv'].update({
                         'accession': rcv_accession,
@@ -339,9 +393,9 @@ def _map_parsed_record_to_json(parsed_record, hg19):
             }})
             yield _obj
     else:
-        variant_id = parsed_record.ReferenceClinVarAssertion.MeasureSet.ID
-        for _measure in parsed_record.ReferenceClinVarAssertion.MeasureSet.Measure:
-            json_obj = parse_measure(_measure, hg19=hg19)
+        variant_id = public_set_obj.ReferenceClinVarAssertion.MeasureSet.ID
+        for _measure in public_set_obj.ReferenceClinVarAssertion.MeasureSet.Measure:
+            json_obj = _map_measure_to_json(_measure, hg19=hg19)
             if json_obj:
                 json_obj['clinvar']['rcv'].update({
                     'accession': rcv_accession,
@@ -359,22 +413,55 @@ def _map_parsed_record_to_json(parsed_record, hg19):
                 yield json_obj
 
 
-def clinvar_json_record_feeder(input_file, hg19):
-    # the first two line of clinvar_xml is not useful information
-    xml_records = rec_handler(input_file, block_end='</ClinVarSet>\n', skip=2, include_block_end=True)
-    for xml_record in xml_records:
-        # some exceptions
-        if xml_record.startswith('\n</ReleaseSet>'):
+def clinvar_doc_feeder(input_file, hg19: bool):
+    """
+    This function will split the xml file into `<ClinVarSet>...</ClinVarSet>` blocks, then parse each block into an
+    `PublicSetType` object (which is defined in the dynamically imported `clinvarlib`), and finally convert each
+    `PublicSetType` object into a clinvar document.
+    """
+
+    """
+    A ClinVarFullRelease_*.xml.gz file has the following structure:
+    
+        <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+        <ReleaseSet Dated="2021-06-26" ...>
+
+            <ClinVarSet ID="38756179">
+              ...
+            </ClinVarSet>
+            
+            <ClinVarSet ID="38756180">
+              ...
+            </ClinVarSet>
+            
+            ...
+
+        </ReleaseSet>
+    
+    Therefore when splitting the xml into `<ClinVarSet>` blocks, the first 2 lines and the last 1 line should be 
+    skipped. 
+    
+    However the `rec_handler` function cannot skip the last 1 line, and will return "\n</ReleaseSet>...</ClinVarSet>" as 
+    the last block in this scenario. Therefore in the for-loop below, we will skip any block starting with 
+    "\n</ReleaseSet>".
+    """
+    clinvar_set_blocks = rec_handler(input_file, block_end='</ClinVarSet>\n', skip=2, include_block_end=True)
+    for clinvar_set_block in clinvar_set_blocks:
+        # Skip any block starting with "\n</ReleaseSet>"
+        # Actually only the last block will be skipped. See comments above
+        if clinvar_set_block.startswith('\n</ReleaseSet>'):
             continue
 
         try:
-            parsed_record = clinvarlib.parseString(xml_record, silence=1)
+            # Parse each `<ClinVarSet>` block into a `clinvarlib.PublicSetType` object
+            public_set_obj = clinvarlib.parseString(clinvar_set_block, silence=1)
         except:
-            logging.debug(xml_record)
+            logging.debug(clinvar_set_block)
             raise
 
-        for json_record in _map_parsed_record_to_json(parsed_record, hg19):
-            yield json_record
+        # Convert each `clinvarlib.PublicSetType` object into a json document
+        for doc in _map_public_set_to_json(public_set_obj, hg19):
+            yield doc
 
 
 def load_data(data_folder, version):
@@ -389,15 +476,15 @@ def load_data(data_folder, version):
     assert len(files) == 1, "Expecting only one file matching '%s', got: %s" % (GLOB_PATTERN, files)
     input_file = files[0]
 
-    json_record_generator = clinvar_json_record_feeder(input_file, hg19=(version == "hg19"))
+    doc_generator = clinvar_doc_feeder(input_file, hg19=(version == "hg19"))
 
     # Sorting is necessary because `merge_rcv_accession` will call `itertools.groupby()`
     #   which cannot put non-adjacent items with the same key into a group
-    json_record_list = list(json_record_generator)
-    json_record_list = sorted(json_record_list, key=lambda k: k['_id'])
+    doc_list = list(doc_generator)
+    doc_list = sorted(doc_list, key=lambda k: k['_id'])
 
-    merged_json_records = merge_rcv_accession(json_record_list)
-    return merged_json_records
+    merged_doc_generator = merge_rcv_accession(doc_list)
+    return merged_doc_generator
 
 
 if __name__ == "__main__":