From 5c2abe3dfb9269bd446f0c00958a315ab2866087 Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Tue, 19 Sep 2023 22:45:06 -0700
Subject: [PATCH 01/13] initial implementation of Issue#172; deprecate v4.3.a
 parser

---
 .../dataload/sources/dbnsfp/dbnsfp_mapping.py | 1006 +++++++++---
 .../dataload/sources/dbnsfp/dbnsfp_parser.py  | 1382 +++++++++--------
 .../sources/dbnsfp/dbnsfp_parser_43a.py       |  675 ++++++++
 .../dataload/sources/dbnsfp/dbnsfp_upload.py  |   16 +-
 src/utils/dotfield.py                         |   43 +
 src/utils/table.py                            |   49 +
 6 files changed, 2295 insertions(+), 876 deletions(-)
 create mode 100644 src/hub/dataload/sources/dbnsfp/dbnsfp_parser_43a.py
 create mode 100644 src/utils/dotfield.py
 create mode 100644 src/utils/table.py

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
index fcccb139..42675907 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
@@ -75,11 +75,11 @@
                     }
                 }
             },
-            "genename": {  # Column 13
+            "genename": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "ensembl": {  # Column 14-16
+            "ensembl": {
                 "properties": {
                     "geneid": {
                         "type": "keyword",
@@ -95,7 +95,7 @@
                     }
                 }
             },
-            "uniprot": {  # Column 17-18
+            "uniprot": {
                 "properties": {
                     "acc": {
                         "type": "keyword",
@@ -107,54 +107,54 @@
                     }
                 }
             },
-            "hgvsc": {  # Column 19-21
+            "hgvsc": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "hgvsp": {  # Column 22-24
+            "hgvsp": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "appris": {  # Column 25
+            "appris": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "genecode_basic": {  # Column 26
+            "genecode_basic": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "tsl": {  # Column 27
+            "tsl": {
                 "type": "integer"
             },
-            "vep_canonical": {  # Column 28
+            "vep_canonical": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "cds_strand": {  # Column 29
+            "cds_strand": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "ancestral_allele": {  # Column 33
+            "ancestral_allele": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "altai_neandertal": {  # Column 34
+            "altai_neandertal": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "denisova": {  # Column 35
+            "denisova": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "vindijia_neandertal": {  # Column 36
+            "vindijia_neandertal": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "chagyrskaya_neandertal": {  # Column 37
+            "chagyrskaya_neandertal": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "sift": {  # Column 38-40
+            "sift": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -168,7 +168,7 @@
                     }
                 }
             },
-            "sift4g": {  # Column 41-43
+            "sift4g": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -182,7 +182,7 @@
                     }
                 }
             },
-            "polyphen2": {  # Column 44-49
+            "polyphen2": {
                 "properties": {
                     "hdiv": {
                         "properties": {
@@ -214,7 +214,7 @@
                     }
                 }
             },
-            "lrt": {  # Column 50-53
+            "lrt": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -231,7 +231,7 @@
                     }
                 }
             },
-            "mutationtaster": {  # Column 54-58
+            "mutationtaster": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -247,12 +247,12 @@
                         "type": "keyword",
                         "normalizer": "keyword_lowercase_normalizer"
                     },
-                    "AAE": {
+                    "aae": {
                         "type": "text"
                     }
                 }
             },
-            "mutationassessor": {  # Column 59-61
+            "mutationassessor": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -266,7 +266,7 @@
                     }
                 }
             },
-            "fathmm": {  # Column 62-64
+            "fathmm": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -280,7 +280,7 @@
                     }
                 }
             },
-            "provean": {  # Column 65-67
+            "provean": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -294,7 +294,7 @@
                     }
                 }
             },
-            "vest4": {  # Column 68-69
+            "vest4": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -304,7 +304,7 @@
                     }
                 }
             },
-            "metasvm": {  # Column 70-72
+            "metasvm": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -318,7 +318,7 @@
                     }
                 }
             },
-            "metalr": {  # Column 73-75
+            "metalr": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -332,10 +332,10 @@
                     }
                 }
             },
-            "reliability_index": {  # Column 76
+            "reliability_index": {
                 "type": "integer"
             },
-            "metarnn": {  # Column 77-79
+            "metarnn": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -349,7 +349,7 @@
                     }
                 }
             },
-            "m-cap": {  # Column 80-82
+            "m-cap": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -363,7 +363,7 @@
                     }
                 }
             },
-            "revel": {  # Column 83-84
+            "revel": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -373,7 +373,7 @@
                     }
                 }
             },
-            "mutpred": {  # Column 85-89
+            "mutpred": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -401,7 +401,7 @@
                     }
                 }
             },
-            "mvp": {  # Column 90-91
+            "mvp": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -411,7 +411,7 @@
                     }
                 }
             },
-            "mpc": {  # Column 92-93
+            "gmvp": {  # new in 4.4.a
                 "properties": {
                     "score": {
                         "type": "float"
@@ -421,7 +421,17 @@
                     }
                 }
             },
-            "primateai": {  # Column 94-96
+            "mpc": {
+                "properties": {
+                    "score": {
+                        "type": "float"
+                    },
+                    "rankscore": {
+                        "type": "float"
+                    }
+                }
+            },
+            "primateai": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -435,7 +445,7 @@
                     }
                 }
             },
-            "deogen2": {  # Column 97-99
+            "deogen2": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -449,7 +459,7 @@
                     }
                 }
             },
-            "bayesdel": {  # Column 100-105
+            "bayesdel": {
                 "properties": {
                     "add_af": {
                         "properties": {
@@ -481,7 +491,7 @@
                     }
                 }
             },
-            "clinpred": {  # Column 106-108
+            "clinpred": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -495,7 +505,7 @@
                     }
                 }
             },
-            "list-s2": {  # Column 109-111
+            "list-s2": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -509,8 +519,52 @@
                     }
                 }
             },
-            "aloft": {  # Column 112-117
+            "varity_r": {
+                "properties": {
+                    "score": {
+                        "type": "float"
+                    },
+                    "rankscore": {
+                        "type": "float"
+                    }
+                }
+            },
+            "varity_er": {
                 "properties": {
+                    "score": {
+                        "type": "float"
+                    },
+                    "rankscore": {
+                        "type": "float"
+                    }
+                }
+            },
+            "varity_r_loo": {
+                "properties": {
+                    "score": {
+                        "type": "float"
+                    },
+                    "rankscore": {
+                        "type": "float"
+                    }
+                }
+            },
+            "varity_er_loo": {
+                "properties": {
+                    "score": {
+                        "type": "float"
+                    },
+                    "rankscore": {
+                        "type": "float"
+                    }
+                }
+            },
+            "aloft": {
+                "properties": {
+                    "fraction_transcripts_affected": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
                     "prob_tolerant": {
                         "type": "keyword",
                         "normalizer": "keyword_lowercase_normalizer"
@@ -527,21 +581,14 @@
                         "type": "keyword",
                         "normalizer": "keyword_lowercase_normalizer"
                     },
-                    "fraction_transcripts_affected": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    },
                     "confidence": {
                         "type": "text"
                     }
                 }
             },
             "cadd": {
-                # Column 118-123
-                #   Column 118-120 are hg38
-                #   Column 121-123 are hg19
-                # Only column 117-119 will be included in the document for "hg38"
-                # No CADD fields will be included when "hg19"
+                # Only for "hg38"
+                # No CADD fields will be included for "hg19"
                 "properties": {
                     "raw_score": {
                         "type": "float"
@@ -555,7 +602,7 @@
                     }
                 }
             },
-            "dann": {  # Column 124-125
+            "dann": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -565,7 +612,7 @@
                     }
                 }
             },
-            "fathmm-mkl": {  # Column 126-129
+            "fathmm-mkl": {
                 "properties": {
                     "coding_score": {
                         "type": "float"
@@ -583,7 +630,7 @@
                     }
                 }
             },
-            "fathmm-xf": {  # Column 130-132
+            "fathmm-xf": {
                 "properties": {
                     "coding_score": {
                         "type": "float"
@@ -597,7 +644,7 @@
                     }
                 }
             },
-            "eigen": {  # Column 133-135
+            "eigen": {
                 "properties": {
                     "raw_coding": {
                         "type": "float"
@@ -610,7 +657,7 @@
                     }
                 }
             },
-            "eigen-pc": {  # Column 136-138
+            "eigen-pc": {
                 "properties": {
                     "raw_coding": {
                         "type": "float"
@@ -623,7 +670,7 @@
                     },
                 }
             },
-            "genocanyon": {  # Column 139-140
+            "genocanyon": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -633,59 +680,115 @@
                     }
                 }
             },
-            "integrated": {  # Column 141-143
+            # "integrated": {
+            #     "properties": {
+            #         "fitcons_score": {
+            #             "type": "float"
+            #         },
+            #         "fitcons_rankscore": {
+            #             "type": "float"
+            #         },
+            #         "confidence_value": {
+            #             "type": "integer"
+            #         }
+            #     }
+            # },
+            # "gm12878": {
+            #     "properties": {
+            #         "fitcons_score": {
+            #             "type": "float"
+            #         },
+            #         "fitcons_rankscore": {
+            #             "type": "float"
+            #         },
+            #         "confidence_value": {
+            #             "type": "integer"
+            #         }
+            #     }
+            # },
+            # "h1-hesc": {
+            #     "properties": {
+            #         "fitcons_score": {
+            #             "type": "float"
+            #         },
+            #         "fitcons_rankscore": {
+            #             "type": "float"
+            #         },
+            #         "confidence_value": {
+            #             "type": "integer"
+            #         }
+            #     }
+            # },
+            # "huvec": {
+            #     "properties": {
+            #         "fitcons_score": {
+            #             "type": "float"
+            #         },
+            #         "fitcons_rankscore": {
+            #             "type": "float"
+            #         },
+            #         "confidence_value": {
+            #             "type": "integer"
+            #         }
+            #     }
+            # },
+            "fitcons": {
                 "properties": {
-                    "fitcons_score": {
-                        "type": "float"
-                    },
-                    "fitcons_rankscore": {
-                        "type": "float"
-                    },
-                    "confidence_value": {
-                        "type": "integer"
-                    }
-                }
-            },
-            "gm12878": {  # Column 144-146
-                "properties": {
-                    "fitcons_score": {
-                        "type": "float"
-                    },
-                    "fitcons_rankscore": {
-                        "type": "float"
-                    },
-                    "confidence_value": {
-                        "type": "integer"
-                    }
-                }
-            },
-            "h1-hesc": {  # Column 147-149
-                "properties": {
-                    "fitcons_score": {
-                        "type": "float"
+                    "integrated": {
+                        "properties": {
+                            "score": {
+                                "type": "float"
+                            },
+                            "rankscore": {
+                                "type": "float"
+                            },
+                            "confidence_value": {
+                                "type": "int"
+                            }
+                        }
                     },
-                    "fitcons_rankscore": {
-                        "type": "float"
+                    "gm12878": {
+                        "properties": {
+                            "score": {
+                                "type": "float"
+                            },
+                            "rankscore": {
+                                "type": "float"
+                            },
+                            "confidence_value": {
+                                "type": "int"
+                            }
+                        }
                     },
-                    "confidence_value": {
-                        "type": "integer"
-                    }
-                }
-            },
-            "huvec": {  # Column 150-152
-                "properties": {
-                    "fitcons_score": {
-                        "type": "float"
+                    "h1-hesc": {
+                        "properties": {
+                            "score": {
+                                "type": "float"
+                            },
+                            "rankscore": {
+                                "type": "float"
+                            },
+                            "confidence_value": {
+                                "type": "int"
+                            }
+                        }
                     },
-                    "fitcons_rankscore": {
-                        "type": "float"
+                    "huvec": {
+                        "properties": {
+                            "score": {
+                                "type": "float"
+                            },
+                            "rankscore": {
+                                "type": "float"
+                            },
+                            "confidence_value": {
+                                "type": "int"
+                            }
+                        }
                     },
-                    "confidence_value": {
-                        "type": "integer"
-                    }
                 }
             },
-            "linsight": {  # Column 153-154
+            "linsight": {
                 "properties": {
                     "score": {
                         "type": "float"
@@ -695,7 +798,7 @@
                     }
                 }
             },
-            "gerp++": {  # Column 155-157
+            "gerp++": {
                 "properties": {
                     "nr": {
                         "type": "float"
@@ -708,7 +811,7 @@
                     }
                 }
             },
-            "phylop": {  # Column 158-163
+            "phylop": {
                 "properties": {
                     "100way_vertebrate": {
                         "properties": {
@@ -720,7 +823,7 @@
                             }
                         }
                     },
-                    "30way_mammalian": {
+                    "470way_mammalian": {  # replaced 30way_mammalian in 4.4.a
                         "properties": {
                             "score": {
                                 "type": "float"
@@ -742,7 +845,7 @@
                     }
                 }
             },
-            "phastcons": {  # Column 164-169
+            "phastcons": {
                 "properties": {
                     "100way_vertebrate": {
                         "properties": {
@@ -754,7 +857,7 @@
                             }
                         }
                     },
-                    "30way_mammalian": {
+                    "470way_mammalian": {  # replaced 30way_mammalian in 4.4.a
                         "properties": {
                             "score": {
                                 "type": "float"
@@ -776,7 +879,7 @@
                     }
                 }
             },
-            "siphy_29way": {  # Column 170-172
+            "siphy_29way": {
                 "properties": {
                     "pi": {
                         "properties": {
@@ -802,7 +905,7 @@
                     }
                 }
             },
-            "bstatistic": {  # Column 173-174
+            "bstatistic": {
                 "properties": {
                     "score": {
                         "type": "integer"
@@ -812,7 +915,7 @@
                     }
                 }
             },
-            "1000gp3": {  # Column 175-186
+            "1000gp3": {  # changed since 4.4.a
                 "properties": {
                     "ac": {
                         "type": "integer"
@@ -820,39 +923,89 @@
                     "af": {
                         "type": "float"
                     },
-                    "afr_ac": {
-                        "type": "integer"
-                    },
-                    "afr_af": {
-                        "type": "float"
-                    },
-                    "eur_ac": {
-                        "type": "integer"
-                    },
-                    "eur_af": {
-                        "type": "float"
-                    },
-                    "amr_ac": {
-                        "type": "integer"
-                    },
-                    "amr_af": {
-                        "type": "float"
+                    # "afr_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "afr_af": {
+                    #     "type": "float"
+                    # },
+                    "afr": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "eas_ac": {
-                        "type": "integer"
+                    # "eur_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "eur_af": {
+                    #     "type": "float"
+                    # },
+                    "eur": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "eas_af": {
-                        "type": "float"
+                    # "amr_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "amr_af": {
+                    #     "type": "float"
+                    # },
+                    "amr": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "sas_ac": {
-                        "type": "integer"
+                    # "eas_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "eas_af": {
+                    #     "type": "float"
+                    # },
+                    "eas": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "sas_af": {
-                        "type": "float"
+                    # "sas_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "sas_af": {
+                    #     "type": "float"
+                    # }
+                    "sas": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     }
                 }
             },
-            "twinsuk": {  # Column 187-188
+            "twinsuk": {
                 "properties": {
                     "ac": {
                         "type": "integer"
@@ -862,7 +1015,7 @@
                     }
                 }
             },
-            "alspac": {  # Column 189-190
+            "alspac": {
                 "properties": {
                     "ac": {
                         "type": "integer"
@@ -872,7 +1025,7 @@
                     }
                 }
             },
-            "uk10k": {  # Column 191-192
+            "uk10k": {
                 "properties": {
                     "ac": {
                         "type": "integer"
@@ -882,23 +1035,43 @@
                     }
                 }
             },
-            "esp6500": {  # Column 193-196
+            "esp6500": {  # changed since 4.4.a
                 "properties": {
-                    "aa_ac": {
-                        "type": "integer"
-                    },
-                    "aa_af": {
-                        "type": "float"
+                    # "aa_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "aa_af": {
+                    #     "type": "float"
+                    # },
+                    "aa": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "ea_ac": {
-                        "type": "integer"
+                    # "ea_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "ea_af": {
+                    #     "type": "float"
+                    # }
+                    "ea": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "ea_af": {
-                        "type": "float"
-                    }
                 }
             },
-            "exac": {  # Column 197-212
+            "exac": {  # changed since 4.4.a
                 "properties": {
                     "ac": {
                         "type": "integer"
@@ -912,45 +1085,105 @@
                     "adj_af": {
                         "type": "float"
                     },
-                    "afr_ac": {
-                        "type": "integer"
-                    },
-                    "afr_af": {
-                        "type": "float"
-                    },
-                    "amr_ac": {
-                        "type": "integer"
-                    },
-                    "amr_af": {
-                        "type": "float"
-                    },
-                    "eas_ac": {
-                        "type": "integer"
-                    },
-                    "eas_af": {
-                        "type": "float"
-                    },
-                    "fin_ac": {
-                        "type": "integer"
+                    # "afr_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "afr_af": {
+                    #     "type": "float"
+                    # },
+                    "afr": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "fin_af": {
-                        "type": "float"
+                    # "amr_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "amr_af": {
+                    #     "type": "float"
+                    # },
+                    "amr": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "nfe_ac": {
-                        "type": "integer"
+                    # "eas_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "eas_af": {
+                    #     "type": "float"
+                    # },
+                    "eas": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "nfe_af": {
-                        "type": "float"
+                    # "fin_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "fin_af": {
+                    #     "type": "float"
+                    # },
+                    "fin": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "sas_ac": {
-                        "type": "integer"
+                    # "nfe_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "nfe_af": {
+                    #     "type": "float"
+                    # },
+                    "nfe": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "sas_af": {
-                        "type": "float"
+                    # "sas_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "sas_af": {
+                    #     "type": "float"
+                    # }
+                    "sas": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     }
                 }
             },
-            "exac_nontcga": {  # Column 213-228
+            "exac_nontcga": {  # changed since 4.4.a
                 "properties": {
                     "ac": {
                         "type": "integer"
@@ -964,45 +1197,105 @@
                     "adj_af": {
                         "type": "float"
                     },
-                    "afr_ac": {
-                        "type": "integer"
-                    },
-                    "afr_af": {
-                        "type": "float"
-                    },
-                    "amr_ac": {
-                        "type": "integer"
-                    },
-                    "amr_af": {
-                        "type": "float"
-                    },
-                    "eas_ac": {
-                        "type": "integer"
-                    },
-                    "eas_af": {
-                        "type": "float"
-                    },
-                    "fin_ac": {
-                        "type": "integer"
+                    # "afr_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "afr_af": {
+                    #     "type": "float"
+                    # },
+                    "afr": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "fin_af": {
-                        "type": "float"
+                    # "amr_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "amr_af": {
+                    #     "type": "float"
+                    # },
+                    "amr": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "nfe_ac": {
-                        "type": "integer"
+                    # "eas_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "eas_af": {
+                    #     "type": "float"
+                    # },
+                    "eas": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "nfe_af": {
-                        "type": "float"
+                    # "fin_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "fin_af": {
+                    #     "type": "float"
+                    # },
+                    "fin": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "sas_ac": {
-                        "type": "integer"
+                    # "nfe_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "nfe_af": {
+                    #     "type": "float"
+                    # },
+                    "nfe": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "sas_af": {
-                        "type": "float"
+                    # "sas_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "sas_af": {
+                    #     "type": "float"
+                    # }
+                    "sas": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     }
                 }
             },
-            "exac_nonpsych": {  # Column 229-244
+            "exac_nonpsych": {  # changed since 4.4.a
                 "properties": {
                     "ac": {
                         "type": "integer"
@@ -1016,48 +1309,265 @@
                     "adj_af": {
                         "type": "float"
                     },
-                    "afr_ac": {
-                        "type": "integer"
+                    # "afr_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "afr_af": {
+                    #     "type": "float"
+                    # },
+                    "afr": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "afr_af": {
-                        "type": "float"
+                    # "amr_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "amr_af": {
+                    #     "type": "float"
+                    # },
+                    "amr": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "amr_ac": {
-                        "type": "integer"
+                    # "eas_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "eas_af": {
+                    #     "type": "float"
+                    # },
+                    "eas": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "amr_af": {
-                        "type": "float"
+                    # "fin_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "fin_af": {
+                    #     "type": "float"
+                    # },
+                    "fin": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "eas_ac": {
-                        "type": "integer"
+                    # "nfe_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "nfe_af": {
+                    #     "type": "float"
+                    # },
+                    "nfe": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "eas_af": {
-                        "type": "float"
+                    # "sas_ac": {
+                    #     "type": "integer"
+                    # },
+                    # "sas_af": {
+                    #     "type": "float"
+                    # }
+                    "sas": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
+                    }
+                }
+            },
+            "alfa": {  # new in 4.4.a
+                "properties": {
+                    "european": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "fin_ac": {
-                        "type": "integer"
+                    "african_others": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "fin_af": {
-                        "type": "float"
+                    "east_asian": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "nfe_ac": {
-                        "type": "integer"
+                    "african_american": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "nfe_af": {
-                        "type": "float"
+                    "latin_american_1": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "sas_ac": {
-                        "type": "integer"
+                    "latin_american_2": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
+                    },
+                    "other_asian": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
+                    },
+                    "south_asian": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
+                    },
+                    "other": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
+                    },
+                    "african": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
+                    },
+                    "asian": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
+                    },
+                    "total": {
+                        "properties": {
+                            "ac": {
+                                "type": "integer"
+                            },
+                            "an": {
+                                "type": "integer"
+                            },
+                            "af": {
+                                "type": "float"
+                            }
+                        }
                     },
-                    "sas_af": {
-                        "type": "float"
-                    }
                 }
             },
-
-            # Column 245-630 are gnomAD_* columns. Skipped.
-
-            "clinvar": {  # Column 631-639
+            "clinvar": {
                 "properties": {
                     "clinvar_id": {
                         "type": "integer"
@@ -1078,24 +1588,28 @@
                         "type": "keyword",
                         "normalizer": "keyword_lowercase_normalizer"
                     },
+                    "var_source": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
                     "medgen": {
                         "type": "keyword",
                         "normalizer": "keyword_lowercase_normalizer"
                     },
-                    "orphanet": {
+                    "omim": {
                         "type": "keyword",
                         "normalizer": "keyword_lowercase_normalizer"
                     },
-                    "var_source": {
+                    "orphanet": {
                         "type": "keyword",
                         "normalizer": "keyword_lowercase_normalizer"
                     }
                 }
             },
-            "interpro_domain": {  # Column 640
+            "interpro_domain": {
                 "type": "text"
             },
-            "gtex": {  # Column 641-642
+            "gtex": {
                 "properties": {
                     "gene": {
                         "type": "keyword",
@@ -1107,10 +1621,10 @@
                     }
                 }
             },
-            "geuvadis_eqtl_target_gene": {  # Column 643
+            "geuvadis_eqtl_target_gene": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
index cbda413b..05ebfcd6 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
@@ -1,677 +1,815 @@
+import re
 import csv
 import glob
-import itertools
-import re
-
-from biothings.utils.dataload import dict_sweep  # list_split, unlist, value_convert_to_number
+from enum import Flag
+from dataclasses import dataclass
+from itertools import chain
+from typing import Callable
+from types import SimpleNamespace
+from utils.table import TableColumn, create_tag_column_map
+from utils.dotfield import parse_dot_fields
 from biothings.utils.common import anyfile
 
+
 # VALID_COLUMN_NO = 367  # for 4.1a
 # VALID_COLUMN_NO = 642  # for 4.2a
-VALID_COLUMN_NO = 643  # for 4.2a
-
-
-"""
-this parser is for dbNSFP v4.3a downloaded from
-https://sites.google.com/site/jpopgen/dbNSFP
-"""
-
-
-class DbnsfpReader:
-    # dbNSFP_variant use "." for missing values;
-    # other none values are borrowed from the `biothings.utils.dataload.dict_sweep` function and
-    #   from the default `na_values` argument of pandas.read_csv().
-    # see https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
-    none_values = {r'.', r'', r" ", r"-", r'#N/A', r'#N/A N/A', r'#NA', r'-1.#IND', r'-1.#QNAN', r'-NaN', r'-nan',
-                   r'1.#IND', r'1.#QNAN', r'<NA>', r'N/A', r'NA', r'NULL', r'NaN', r'n/a', r'nan', r'null', r'none',
-                   r"Not Available", r"unknown"}
-
-    mutpred_top5features_pattern = re.compile(r" \(P = ([eE0-9.-]*)\)$")
-
-    # A general rule from observation: for some of the columns, data type can be inferred from the suffix of their
-    # column names. E.g. "xxx_score" is usually float.
-    # This rule is usually true especially when dealing with grouped columns (like "DANN_score" and "DANN_rankscore").
-    # Grouped columns are represented by their common col_prefix (like "DANN").
-    # Column names in one group can be restored by concatenating their common col_prefix and individual suffixes.
-    col_suffix_to_type = {
-        "score": float,
-        "rankscore": float,
-        "converted_rankscore": float,
-        "fitCons_score": float,
-        "fitCons_rankscore": float,
-        "confidence_value": int,
-        "AC": int,
-        "AF": float
-    }
+# VALID_COLUMN_NO = 643  # for 4.3a
+VALID_COLUMN_NO = 689  # for 4.4a
+
+MUTPRED_TOP5FEATURES_PATTERN = re.compile(r" \(P = ([eE0-9.-]*)\)$")
+
+# dbNSFP_variant use "." for missing values;
+# other none values are borrowed from the `biothings.utils.dataload.dict_sweep` function and
+#   from the default `na_values` argument of pandas.read_csv().
+# see https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
+NA_VALUES = frozenset({
+    r'.', r'', r" ", r"-", r'#N/A', r'#N/A N/A', r'#NA', r'-1.#IND', r'-1.#QNAN', r'-NaN', r'-nan',
+    r'1.#IND', r'1.#QNAN', r'<NA>', r'N/A', r'NA', r'NULL', r'NaN', r'n/a', r'nan', r'null', r'none',
+    r"Not Available", r"unknown"
+})
+
+COLUMN_TAG = SimpleNamespace()
+COLUMN_TAG.HG38_POS = "hg38_pos"  # for "pos(1-based)"
+COLUMN_TAG.HG19_POS = "hg19_pos"  # for "hg19_pos(1-based)"
+COLUMN_TAG.HG38_CHROM = "hg38_chrom"  # for "#chr"
+COLUMN_TAG.HG19_CHROM = "hg19_chrom"  # for "hg19_chr"
+COLUMN_TAG.REF_ALLELE = "ref"
+COLUMN_TAG.ALT_ALLELE = "alt"
+COLUMN_TAG.UNIPROT_ACC = "uniprot_acc"
+COLUMN_TAG.UNIPROT_ENTRY = "uniprot_entry"
+COLUMN_TAG.HGVS_CODING = "hgvsc"  # for "HGVSc_ANNOVAR", "HGVSc_snpEff", and "HGVSc_VEP"
+COLUMN_TAG.HGVS_PROTEIN = "hgvsp"  # for "HGVSp_ANNOVAR", "HGVSp_snpEff", and "HGVSp_VEP"
+COLUMN_TAG.GTEX_GENE = "gtex_gene"
+COLUMN_TAG.GTEX_TISSUE = "gtex_tissue"
+
+
+def _check_length(lst: list):
+    """
+    If the input list is empty (i.e. length is 0), return None;
+    if the input list has only 1 element (i.e. length is 1), return the element;
+    otherwise return the list as-is.
+    """
+    if not lst:
+        return None
+    if len(lst) == 1:
+        return lst[0]
+    return lst
+
+
+class Assembly(Flag):
+    HG19 = 1  # indicates that a column belongs to hg19 docs
+    HG38 = 2  # indicates that a column belongs to hg38 docs
+    BOTH = HG19 | HG38  # (BOTH == 3) applies to both assemblies
 
     @classmethod
-    def read_string(cls, row, col, sep=None, transform=None):
-        """
-        Read `row[col]` as a string. If `sep` is None, return the single string (with transformation).
-        If `sep` is not None, separate the string into a list of substrings, transform each substring, and then return
-        the list.
-        """
-        def apply_transformation(_string, _transform):
-            """
-            If the transformation is to convert the string into an integer or float, wrap it in try-catch;
-            otherwise simply apply the transformation as a function to the string.
-            """
-            if _transform is int or _transform is float:
-                try:
-                    return _transform(_string)
-                except ValueError:
-                    raise ValueError("Cannot convert {col} value {string} to {type}".format(col=col, string=_string, type=_transform))
-            return _transform(_string)
-
-        string = row[col]
-        if string in cls.none_values:
-            return None
-
-        if sep is None:
-            if transform is not None:
-                string = apply_transformation(string, transform)
-            return string
+    def assembly_of(cls, name: str):
+        # E.g. when member_name == "HG19", member is Assembly.HG19
+        for member_name, member in cls.__members__.items():
+            if name.upper() == member_name:
+                return member
         else:
-            string_list = [s for s in string.split(sep=sep) if s not in cls.none_values]
-            if not string_list:  # `string_list` is empty after none-values are removed
-                return None
+            raise ValueError(f"'{cls.__name__}' enum not found for '{name}'")
 
-            if transform is not None:
-                string_list = [apply_transformation(string, transform) for string in string_list]
-            return string_list if len(string_list) > 1 else string_list[0]
 
-    @classmethod
-    def read_unique_strings(cls, row, cols, sep=None):
-        """
-        Return the unique strings from the readout union of multiple columns.
-        No transformation is applied.
-        """
-        string_list = [row[key].split(sep=sep) for key in cols]
-        string_list = list(set(string for string in itertools.chain.from_iterable(string_list) if string not in cls.none_values))
-
-        if not string_list:  # `string_list` is empty after none-values are removed
-            return None
-        return string_list if len(string_list) > 1 else string_list[0]
+@dataclass
+class Column(TableColumn):
+    """
+    Assembly-specific column configuration
+    """
+    assembly: str | Assembly = None  # which assembly or assemblies this column belongs to
 
-    @classmethod
-    def _iter_read_group(cls, row, col_prefix, col_suffixes, sep=";"):
-        """
-        Some columns other than `*_AC` and `*_AF` are grouped by their prefixes, e.g. "ClinPred_pred",
-        "ClinPred_rankscore", and "ClinPred_score". Such groups of columns usually contains scores of some metrics.
-
-        To play safe, I assume each column in such a group is a separable string (although it may not contain any
-        separator at all); and I assume all columns in a group should use the same separator, semicolon by default.
-
-        `col_prefix` and `col_suffixes`, when joined with "_", form the column names in the `row` to be read.
-        Each column will be read and yielded as in an iterator.
-
-        Args:
-            row (dict-like): the data to read in
-            col_prefix (string): col_prefix of the keys of interest in `row`, e.g "ClinPred"
-            col_suffixes (collection of strings): suffixes of the columns interest in `row`,
-                                                  e.g. ("pred", "rankscore", "score")
-            sep (string): a single string of the separator
-        """
-        for col_suffix in col_suffixes:
-            # filter(function, iterable):
-            #   If function is None, the identity function is assumed, that is,
-            #   all elements of iterable that are false are removed.
-            col = "_".join(filter(None, (col_prefix, col_suffix)))
-            transform = cls.col_suffix_to_type.get(col_suffix)
-            yield cls.read_string(row, col, sep=sep, transform=transform)
+    def __post_init__(self):
+        super().__post_init__()
 
-    @classmethod
-    def map_score_rankscore_to_json(cls, row, col_prefix):
-        """
-        A common case of calling `_iter_read_group`.
+        if self.assembly is None:
+            self.assembly = Assembly.BOTH
+            return
 
-        Dozens of columns are grouped into tuple, like "<col_prefix>_score" and "<col_prefix>_rankscore".
-        Their data types are usually float.
-        """
-        col_suffixes = ("score", "rankscore")  # col_suffixes are also the json keys
-        return dict(zip(col_suffixes, cls._iter_read_group(row, col_prefix, col_suffixes)))
+        if isinstance(self.assembly, Assembly):
+            return
 
-    @classmethod
-    def map_score_rankscore_pred_to_json(cls, row, col_prefix):
-        """
-        A common case of calling `_iter_read_group`.
+        if isinstance(self.assembly, str):
+            self.assembly = Assembly.assembly_of(self.assembly)
+            return
 
-        Dozens of columns are grouped into triples, like "<col_prefix>_score", "<col_prefix>_rankscore", and
-        "<col_prefix>_pred". Their data types are usually float, float and string.
-        """
-        col_suffixes = ("score", "rankscore", "pred")  # col_suffixes are also the json keys
-        return dict(zip(col_suffixes, cls._iter_read_group(row, col_prefix, col_suffixes)))
+        raise ValueError(f"Cannot recognize assembly {self.assembly}")
 
-    @classmethod
-    def map_score_converted_rankscore_pred_to_json(cls, row, col_prefix):
-        """
-        A common case of calling `_iter_read_group`.
+    def is_hg19(self):
+        return bool(self.assembly & Assembly.HG19)  # true if self.assembly is HG19 or BOTH
 
-        Dozens of columns are grouped into triples, like "<col_prefix>_score", "<col_prefix>_converted_rankscore", and
-        "<col_prefix>_pred". Their data types are usually float, float and string.
-        """
-        col_suffixes = ("score", "converted_rankscore", "pred")  # col_suffixes are also the json keys
-        return dict(zip(col_suffixes, cls._iter_read_group(row, col_prefix, col_suffixes)))
+    def is_hg38(self):
+        return bool(self.assembly & Assembly.HG38)  # true if self.assembly is HG38 or BOTH
 
-    @classmethod
-    def map_fitcons_score_rankscore_confidence_value_to_json(cls, row, col_prefix):
-        """
-        A common case of calling `_iter_read_group`.
 
-        Dozens of columns are grouped into triples, like "<col_prefix>_fitCons_score", "<col_prefix>_fitCons_rankscore",
-        and "<col_prefix>_confidence_value". Their data types are usually float, float and int.
-        """
-        col_suffixes = ("fitCons_score", "fitCons_rankscore", "confidence_value")
-        json_keys = ("fitcons_score", "fitcons_rankscore", "confidence_value")
-        return dict(zip(json_keys, cls._iter_read_group(row, col_prefix, col_suffixes)))
+def split(sep: str, na_values: set = NA_VALUES):
+    def _func(value: str):
+        result = [v for v in value.split(sep) if v not in na_values]
+        return _check_length(result)
 
-    @classmethod
-    def map_AC_AF_to_json(cls, row, col_prefix, col_infixes, whole_group=False):
-        """
-        Read `<col_prefix>_<col_infix>_AC` and `<col_prefix>_<col_infix>_AF` columns for each `col_infix` in
-        `col_infixes`.
-        When `whole_group` is True, read two extra columns, `<col_prefix>_AC` and `<col_prefix>_AF`.
+    return _func
 
-        AC (allele counts) will be parsed into integers; AF (allele freqs) will be parsed into floats.
-        No separator is assumed to exist in such AC/AF columns.
 
-        The readout will be returned as a dict like:
+def split_cast(sep: str, astype: Callable, na_values: set = NA_VALUES):
+    def _func(value: str):
+        result = [astype(v) for v in value.split(sep) if v not in na_values]
+        return _check_length(result)
 
-            {
-                <col_infix.lower()>_ac : int(<col_prefix>_<col_infix>_AC)
-                <col_infix.lower()>_af : float(<col_prefix>_<col_infix>_AF)
-            }
+    return _func
 
-        E.g. to read `ESP6500_AA_AC`, `ESP6500_AA_AF`, `ESP6500_EA_AC` and `ESP6500_EA_AF` columns, we can simply call
-        map_AC_AF_to_json(col_prefix="ESP6500", col_infixes=["AA", "EA"], whole_group=False)
-        """
-        if col_infixes is None:
-            col_infixes = []
 
-        if whole_group:
-            col_infixes = [""] + col_infixes
+# transforming functions for common data sources
+split_str = split(";")
+split_float = split_cast(";", float)
+split_int = split_cast(";", int)
 
-        col_suffixes = ("AC", "AF")
+# transforming functions for specific data sources
+split_clinvar = split(r"|")
+split_genotype = split(r"/")  # for "AltaiNeandertal", "Denisova", "VindijiaNeandertal", and "ChagyrskayaNeandertal"
 
-        def _generate_json_fields():
-            for col_infix in col_infixes:
-                for col_suffix in col_suffixes:
-                    # filter(function, iterable):
-                    #   If function is None, the identity function is assumed, that is,
-                    #   all elements of iterable that are false are removed.
-                    col = "_".join(filter(None, (col_prefix, col_infix, col_suffix)))
-                    transform = cls.col_suffix_to_type.get(col_suffix)
 
-                    json_key = "_".join(filter(None, (col_infix, col_suffix))).lower()
-                    yield json_key, cls.read_string(row, col, sep=None, transform=transform)
+def normalize_chrom(chr: str):
+    """
+    In dbNSFP, chromosomes are marked 1-22, "X", "Y", and "M" (Mitochondrial).
+    However, in MyVariant, we mark Mitochondrial chromosome "MT".
+    """
+    return "MT" if chr == "M" else chr
 
-        return dict(_generate_json_fields())
 
-    @classmethod
-    def parse_mutpred_top5features(cls, row, col):
-        """
-        `mutpred_mechanisms` is a string combined from 5 clauses, separated by semicolons (with whitespaces).
-        Each clause has the same pattern of "<mechanism> (P = <p_val>)".
+def make_zero_based(pos: str):
+    """
+    Convert a 1-based chromosomal position to a 0-based start-end pair.
+    """
+    _pos = int(pos)
+    return {"start": _pos, "end": _pos}
 
-        E.g. "Loss of helix (P = 0.0444); Gain of loop (P = 0.0502); Gain of catalytic residue at A444 (P = 0.1876); \
-        Gain of solvent accessibility (P = 0.2291); Loss of disorder (P = 0.9475)"
 
-        Here we apply regex to parse this string
+def parse_mutpred_top5features(value):
+    """
+    `mutpred_mechanisms` is a string combined from 5 clauses, separated by semicolons (with whitespaces).
+    Each clause has the same pattern of "<mechanism> (P = <p_val>)".
 
-            regex = re.compile(r" \(P = ([eE0-9.-]*)\)$")
-            [(e for e in regex.split(s) if e.strip()) for s in string.split("; ")]
+    E.g. "Loss of helix (P = 0.0444); Gain of loop (P = 0.0502); Gain of catalytic residue at A444 (P = 0.1876); \
+    Gain of solvent accessibility (P = 0.2291); Loss of disorder (P = 0.9475)"
 
-        and get a list of 5 tuples like
+    Here we apply regex to parse this string
 
-            [('Loss of helix', '0.0444'), ('Gain of loop', '0.0502'), ('Gain of catalytic residue at A444', '0.1876'),
-            ('Gain of solvent accessibility', '0.2291'), ('Loss of disorder', '0.9475')]
+        regex = re.compile(r" \(P = ([eE0-9.-]*)\)$")
+        [(e for e in regex.split(s) if e.strip()) for s in string.split("; ")]
 
-        Then construct a list of 5 dictionaries of <"mechanism": xxx, "p_val": xxx> and return
-        """
-        string = cls.read_string(row, col)
-        if string is None:
-            return None
+    and get a list of 5 tuples like
 
-        mp_list = [tuple(e for e in cls.mutpred_top5features_pattern.split(s) if e.strip()) for s in string.split("; ")]
-        return [{"mechanism": mp[0], "p_val": float(mp[1])} for mp in mp_list if mp and len(mp) == 2]
+        [('Loss of helix', '0.0444'), ('Gain of loop', '0.0502'), ('Gain of catalytic residue at A444', '0.1876'),
+        ('Gain of solvent accessibility', '0.2291'), ('Loss of disorder', '0.9475')]
 
-    @classmethod
-    def parse_uniprot(cls, row, acc_col, entry_col):
-        """
-        Read uniprot accession numbers and entry names as two strings from `row`. Map each accession number and entry
-        name into a dictionary, and return a list of such dictionaries.
+    Then construct a list of 5 dictionaries of <"mechanism": xxx, "p_val": xxx> and return
+    """
+    if value is None:
+        return None
 
-        E.g. suppose we have the following readouts
+    mp_list = [tuple(e for e in MUTPRED_TOP5FEATURES_PATTERN.split(s) if e.strip()) for s in value.split("; ")]
+    result = [{"mechanism": mp[0], "p_val": float(mp[1])} for mp in mp_list if mp and len(mp) == 2]
 
-            row[acc_col] = "P54578-2;P54578-3;A6NJA2;P54578"
-            row[entry_col] = UBP14_HUMAN;UBP14_HUMAN;A6NJA2_HUMAN;UBP14_HUMAN
+    return _check_length(result)
+
+
+def parse_siphy_29way_pi(value: str):
+    """
+    A "SiPhy_29way_pi" value, if not None, is a string separated by ":", representing an estimated stationary
+    distribution of A, C, G and T at a variant site. E.g. "0.0:0.5259:0.0:0.4741".
+
+    Here we split the string and convert it to a dict of {<nt>: <freq>}.
+    """
+    if value is None:
+        return None
+
+    freq = [float(v) for v in value.split(":")]
+    pi_dict = {"a": freq[0], "c": freq[1], "g": freq[2], "t": freq[3]}
+    return pi_dict
+
+
+def split_zip(a_value: str, b_value: str, sep: str, na_values: set = NA_VALUES):
+    """
+    Split a_value and b_value by sep into two lists, and generate pairs from the two lists.
+
+    This function assumes that the split two lists have the same length.
+
+    E.g. with the following input,
+
+        a_value = "P54578-2;P54578-3;A6NJA2;P54578"
+        b_value = UBP14_HUMAN;UBP14_HUMAN;A6NJA2_HUMAN;UBP14_HUMAN
+
+    the returned generator can make:
+
+        [('P54578-2', 'UBP14_HUMAN'),
+         ('P54578-3', 'UBP14_HUMAN'),
+         ('A6NJA2', 'A6NJA2_HUMAN'),
+         ('P54578', 'UBP14_HUMAN')]
+    """
+    a_list = [v if v not in na_values else None for v in a_value.split(sep)]
+    b_list = [v if v not in na_values else None for v in b_value.split(sep)]
+
+    result = ((a, b) for (a, b) in zip(a_list, b_list) if (a, b) != (None, None))
+    # DO NOT use _check_length(result) otherwise the generator will be consumed
+    return result
+
+
+def split_dedup(values: list, sep: str, na_values: set = NA_VALUES):
+    """
+    Split each value from the input values by the separator, merge all the split results, and remove duplicates from the merged result.
+
+    E.g. when values=["a;b;c", "b;c", "d"] and sep=";", the result is ["a", "b", "c", "d"]
+    """
+    value_list = [value.split(sep=sep) for value in values]  # a list of lists
+    value_set = set(chain.from_iterable(value_list))  # flatten and dedup
+
+    result = list(v for v in value_set if v not in na_values)
+    return _check_length(result)
+
+
+COLUMNS = [
+    Column("#chr", dest="chrom", transform=normalize_chrom, assembly="hg38", tag=COLUMN_TAG.HG38_CHROM),  # representing "chrom" only for assembly 'hg38'
+    Column("pos(1-based)", dest="hg38", transform=make_zero_based, tag=COLUMN_TAG.HG38_POS),
+    Column("ref", transform=str.upper, tag=COLUMN_TAG.REF_ALLELE),
+    Column("alt", transform=str.upper, tag=COLUMN_TAG.ALT_ALLELE),
+    Column("aaref", dest="aa.ref"),
+    Column("aaalt", dest="aa.alt"),
+    Column("rs_dbSNP", dest="rsid"),
+    Column("hg19_chr", dest="chrom", transform=normalize_chrom, assembly="hg19", tag=COLUMN_TAG.HG19_CHROM),  # representing "chrom" only for assembly 'hg19'
+    Column("hg19_pos(1-based)", dest="hg19", transform=make_zero_based, tag=COLUMN_TAG.HG19_POS),
+    # Column("hg18_chr"),  # Not Used
+    Column("hg18_pos(1-based)", dest="hg18", transform=make_zero_based),
+    Column("aapos", dest="aa.pos", transform=split_int),
+    Column("genename", transform=split_str),
+    Column("Ensembl_geneid", transform=split_str),
+    Column("Ensembl_transcriptid", transform=split_str),
+    Column("Ensembl_proteinid", transform=split_str),
+    Column("Uniprot_acc", tag=COLUMN_TAG.UNIPROT_ACC),  # special column, see prune_uniprot()
+    Column("Uniprot_entry", tag=COLUMN_TAG.UNIPROT_ENTRY),  # special column, see prune_uniprot()
+    Column("HGVSc_ANNOVAR", tag=COLUMN_TAG.HGVS_CODING),  # special column, see prune_hgvsc_hgvsp()
+    Column("HGVSp_ANNOVAR", tag=COLUMN_TAG.HGVS_PROTEIN),  # ditto
+    Column("HGVSc_snpEff", tag=COLUMN_TAG.HGVS_CODING),  # ditto
+    Column("HGVSp_snpEff", tag=COLUMN_TAG.HGVS_PROTEIN),  # ditto
+    Column("HGVSc_VEP", tag=COLUMN_TAG.HGVS_CODING),  # ditto
+    Column("HGVSp_VEP", tag=COLUMN_TAG.HGVS_PROTEIN),  # ditto
+    Column("APPRIS", transform=split_str),
+    Column("GENCODE_basic", dest="genecode_basic", transform=split_str),
+    Column("TSL", transform=split_int),
+    Column("VEP_canonical", dest="vep_canonical", transform=split_str),
+    Column("cds_strand", dest="cds_strand", transform=split_str),
+    Column("refcodon", dest="aa.refcodon", transform=split_str),
+    Column("codonpos", dest="aa.codonpos", transform=split_int),
+    Column("codon_degeneracy", dest="aa.codon_degeneracy", transform=split_int),
+    Column("Ancestral_allele", dest="ancestral_allele", transform=split_str),
+    Column("AltaiNeandertal", dest="altai_neandertal", transform=split_genotype),
+    Column("Denisova", transform=split_genotype),
+    Column("VindijiaNeandertal", dest="vindijia_neandertal", transform=split_genotype),
+    Column("ChagyrskayaNeandertal", dest="chagyrskaya_neandertal", transform=split_genotype),
+    Column("SIFT_score", transform=split_float),
+    Column("SIFT_converted_rankscore", transform=split_float),
+    Column("SIFT_pred", transform=split_str),
+    Column("SIFT4G_score", transform=split_float),
+    Column("SIFT4G_converted_rankscore", transform=split_float),
+    Column("SIFT4G_pred", transform=split_str),
+    Column("Polyphen2_HDIV_score", transform=split_float),
+    Column("Polyphen2_HDIV_rankscore", transform=split_float),
+    Column("Polyphen2_HDIV_pred", transform=split_str),
+    Column("Polyphen2_HVAR_score", transform=split_float),
+    Column("Polyphen2_HVAR_rankscore", transform=split_float),
+    Column("Polyphen2_HVAR_pred", transform=split_str),
+    Column("LRT_score", transform=split_float),
+    Column("LRT_converted_rankscore", transform=split_float),
+    Column("LRT_pred", transform=split_str),
+    Column("LRT_Omega", transform=split_float),
+    Column("MutationTaster_score", transform=split_float),
+    Column("MutationTaster_converted_rankscore", transform=split_float),
+    Column("MutationTaster_pred", transform=split_str),
+    Column("MutationTaster_model", transform=split_str),
+    Column("MutationTaster_AAE", transform=split_str),
+    Column("MutationAssessor_score", transform=split_float),
+    Column("MutationAssessor_rankscore", transform=split_float),
+    Column("MutationAssessor_pred", transform=split_str),
+    Column("FATHMM_score", transform=split_float),
+    Column("FATHMM_converted_rankscore", transform=split_float),
+    Column("FATHMM_pred", transform=split_str),
+    Column("PROVEAN_score", transform=split_float),
+    Column("PROVEAN_converted_rankscore", transform=split_float),
+    Column("PROVEAN_pred", transform=split_str),
+    Column("VEST4_score", transform=split_float),
+    Column("VEST4_rankscore", transform=split_float),
+    Column("MetaSVM_score", transform=split_float),
+    Column("MetaSVM_rankscore", transform=split_float),
+    Column("MetaSVM_pred", transform=split_str),
+    Column("MetaLR_score", transform=split_float),
+    Column("MetaLR_rankscore", transform=split_float),
+    Column("MetaLR_pred", transform=split_str),
+    Column("Reliability_index", dest="reliability_index", transform=int),
+    Column("MetaRNN_score", transform=split_float),
+    Column("MetaRNN_rankscore", transform=split_float),
+    Column("MetaRNN_pred", transform=split_str),
+    Column("M-CAP_score", transform=split_float),
+    Column("M-CAP_rankscore", transform=split_float),
+    Column("M-CAP_pred", transform=split_str),
+    Column("REVEL_score", transform=split_float),
+    Column("REVEL_rankscore", transform=split_float),
+    Column("MutPred_score", transform=split_float),
+    Column("MutPred_rankscore", transform=split_float),
+    Column("MutPred_protID", dest="mutpred.accession", transform=split_str),
+    Column("MutPred_AAchange", dest="mutpred.aa_change", transform=split_str),
+    Column("MutPred_Top5features", dest="mutpred.pred", transform=parse_mutpred_top5features),
+    Column("MVP_score", transform=split_float),
+    Column("MVP_rankscore", transform=split_float),
+    Column("gMVP_score", transform=split_float),  # new in 4.4.a
+    Column("gMVP_rankscore", transform=split_float),  # new in 4.4.a
+    Column("MPC_score", transform=split_float),
+    Column("MPC_rankscore", transform=split_float),
+    Column("PrimateAI_score", transform=split_float),
+    Column("PrimateAI_rankscore", transform=split_float),
+    Column("PrimateAI_pred", transform=split_str),
+    Column("DEOGEN2_score", transform=split_float),
+    Column("DEOGEN2_rankscore", transform=split_float),
+    Column("DEOGEN2_pred", transform=split_str),
+    Column("BayesDel_addAF_score", dest="bayesdel.add_af.score", transform=split_float),
+    Column("BayesDel_addAF_rankscore", dest="bayesdel.add_af.rankscore", transform=split_float),
+    Column("BayesDel_addAF_pred", dest="bayesdel.add_af.pred", transform=split_str),
+    Column("BayesDel_noAF_score", dest="bayesdel.no_af.score", transform=split_float),
+    Column("BayesDel_noAF_rankscore", dest="bayesdel.no_af.rankscore", transform=split_float),
+    Column("BayesDel_noAF_pred", dest="bayesdel.no_af.pred", transform=split_str),
+    Column("ClinPred_score", transform=split_float),
+    Column("ClinPred_rankscore", transform=split_float),
+    Column("ClinPred_pred", transform=split_str),
+    Column("LIST-S2_score", transform=split_float),
+    Column("LIST-S2_rankscore", transform=split_float),
+    Column("LIST-S2_pred", transform=split_str),
+    Column("VARITY_R_score", dest="varity_r.score", transform=split_float),  # VARITY new in 4.4.a
+    Column("VARITY_R_rankscore", dest="varity_r.rankscore", transform=split_float),
+    Column("VARITY_ER_score", dest="varity_er.score", transform=split_float),
+    Column("VARITY_ER_rankscore", dest="varity_er.rankscore", transform=split_float),
+    Column("VARITY_R_LOO_score", dest="varity_r_loo.score", transform=split_float),
+    Column("VARITY_R_LOO_rankscore", dest="varity_r_loo.rankscore", transform=split_float),
+    Column("VARITY_ER_LOO_score", dest="varity_er_loo.score", transform=split_float),
+    Column("VARITY_ER_LOO_rankscore", dest="varity_er_loo.rankscore", transform=split_float),
+    Column("Aloft_Fraction_transcripts_affected", dest="aloft.fraction_transcripts_affected", transform=split_str),
+    Column("Aloft_prob_Tolerant", dest="aloft.prob_tolerant", transform=split_str),
+    Column("Aloft_prob_Recessive", dest="aloft.prob_recessive", transform=split_str),
+    Column("Aloft_prob_Dominant", dest="aloft.prob_dominant", transform=split_str),
+    Column("Aloft_pred", transform=split_str),
+    Column("Aloft_Confidence", transform=split_str),
+    Column("CADD_raw", dest="cadd.raw_score", transform=split_float, assembly="hg38"),  # TODO CADD will have hg38 next update
+    Column("CADD_raw_rankscore", dest="cadd.raw_rankscore", transform=split_float, assembly="hg38"),
+    Column("CADD_phred", transform=split_float, assembly="hg38"),  # CADD phred-like scores, not as other predications of string type
+    # Column("CADD_raw_hg19", assembly="hg19"),  # discarded because Myvariant.info already has a hg19-only datasource of CADD.
+    # Column("CADD_raw_rankscore_hg19", assembly="hg19"),  # ditto
+    # Column("CADD_phred_hg19", assembly="hg19"),  # ditto
+    Column("DANN_score", transform=split_float),
+    Column("DANN_rankscore", transform=split_float),
+    Column("fathmm-MKL_coding_score", dest="fathmm-mkl.coding_score", transform=split_float),
+    Column("fathmm-MKL_coding_rankscore", dest="fathmm-mkl.coding_rankscore", transform=split_float),
+    Column("fathmm-MKL_coding_pred", dest="fathmm-mkl.coding_pred", transform=split_str),
+    Column("fathmm-MKL_coding_group", dest="fathmm-mkl.coding_group", transform=split_str),
+    Column("fathmm-XF_coding_score", dest="fathmm-xf.coding_score", transform=split_float),
+    Column("fathmm-XF_coding_rankscore", dest="fathmm-xf.coding_rankscore", transform=split_float),
+    Column("fathmm-XF_coding_pred", dest="fathmm-xf.coding_pred", transform=split_str),
+    Column("Eigen-raw_coding", dest="eigen.raw_coding", transform=split_float),
+    Column("Eigen-raw_coding_rankscore", dest="eigen.raw_coding_rankscore", transform=split_float),
+    Column("Eigen-phred_coding", dest="eigen.phred_coding", transform=split_float),
+    Column("Eigen-PC-raw_coding", dest="eigen-pc.raw_coding", transform=split_float),
+    Column("Eigen-PC-raw_coding_rankscore", dest="eigen-pc.raw_coding_rankscore", transform=split_float),
+    Column("Eigen-PC-phred_coding", dest="eigen-pc.phred_coding", transform=split_float),
+    Column("GenoCanyon_score", transform=split_float),
+    Column("GenoCanyon_rankscore", transform=split_float),
+    Column("integrated_fitCons_score", dest="fitcons.integrated.score", transform=split_float),
+    Column("integrated_fitCons_rankscore", dest="fitcons.integrated.rankscore", transform=split_float),
+    Column("integrated_confidence_value", dest="fitcons.integrated.confidence_value", transform=split_int),
+    Column("GM12878_fitCons_score", dest="fitcons.gm12878.score", transform=split_float),
+    Column("GM12878_fitCons_rankscore", dest="fitcons.gm12878.rankscore", transform=split_float),
+    Column("GM12878_confidence_value", dest="fitcons.gm12878.confidence_value", transform=split_int),
+    Column("H1-hESC_fitCons_score", dest="fitcons.h1-hesc.score", transform=split_float),
+    Column("H1-hESC_fitCons_rankscore", dest="fitcons.h1-hesc.rankscore", transform=split_float),
+    Column("H1-hESC_confidence_value", dest="fitcons.h1-hesc.confidence_value", transform=split_int),
+    Column("HUVEC_fitCons_score", dest="fitcons.huvec.score", transform=split_float),
+    Column("HUVEC_fitCons_rankscore", dest="fitcons.huvec.rankscore", transform=split_float),
+    Column("HUVEC_confidence_value", dest="fitcons.huvec.confidence_value", transform=split_int),
+    Column("LINSIGHT", dest="linsight.score", transform=split_float),
+    Column("LINSIGHT_rankscore", transform=split_float),
+    Column("GERP++_NR", transform=split_float),
+    Column("GERP++_RS", transform=split_float),
+    Column("GERP++_RS_rankscore", dest="gerp++.rs_rankscore", transform=split_float),
+    Column("phyloP100way_vertebrate", dest="phylop.100way_vertebrate.score", transform=split_float),
+    Column("phyloP100way_vertebrate_rankscore", dest="phylop.100way_vertebrate.rankscore", transform=split_float),
+    Column("phyloP470way_mammalian", dest="phylop.470way_mammalian.score", transform=split_float),  # replaced 30way_mammalian in 4.4.a
+    Column("phyloP470way_mammalian_rankscore", dest="phylop.470way_mammalian.rankscore", transform=split_float),  # replaced 30way_mammalian in 4.4.a
+    Column("phyloP17way_primate", dest="phylop.17way_primate.score", transform=split_float),
+    Column("phyloP17way_primate_rankscore", dest="phylop.17way_primate.rankscore", transform=split_float),
+    Column("phastCons100way_vertebrate", dest="phastcons.100way_vertebrate.score", transform=split_float),
+    Column("phastCons100way_vertebrate_rankscore", dest="phastcons.100way_vertebrate.rankscore", transform=split_float),
+    Column("phastCons470way_mammalian", dest="phastcons.470way_mammalian.score", transform=split_float),  # replaced 30way_mammalian in 4.4.a
+    Column("phastCons470way_mammalian_rankscore", dest="phastcons.470way_mammalian.rankscore", transform=split_float),  # replaced 30way_mammalian in 4.4.a
+    Column("phastCons17way_primate", dest="phastcons.17way_primate.score", transform=split_float),
+    Column("phastCons17way_primate_rankscore", dest="phastcons.17way_primate.rankscore", transform=split_float),
+    Column("SiPhy_29way_pi", dest="siphy_29way.pi", transform=parse_siphy_29way_pi),
+    Column("SiPhy_29way_logOdds", dest="siphy_29way.logodds_score", transform=split_float),
+    Column("SiPhy_29way_logOdds_rankscore", dest="siphy_29way.logodds_rankscore", transform=split_float),
+    Column("bStatistic", dest="bstatistic.score", transform=split_float),
+    Column("bStatistic_converted_rankscore", dest="bstatistic.converted_rankscore", transform=split_float),
+    Column("1000Gp3_AC", dest="1000gp3.ac", transform=int),
+    Column("1000Gp3_AF", dest="1000gp3.af", transform=float),
+    Column("1000Gp3_AFR_AC", dest="1000gp3.afr.ac", transform=int),  # dest changed since 4.4.a
+    Column("1000Gp3_AFR_AF", dest="1000gp3.afr.af", transform=float),
+    Column("1000Gp3_EUR_AC", dest="1000gp3.eur.ac", transform=int),
+    Column("1000Gp3_EUR_AF", dest="1000gp3.eur.af", transform=float),
+    Column("1000Gp3_AMR_AC", dest="1000gp3.amr.ac", transform=int),
+    Column("1000Gp3_AMR_AF", dest="1000gp3.amr.af", transform=float),
+    Column("1000Gp3_EAS_AC", dest="1000gp3.eas.ac", transform=int),
+    Column("1000Gp3_EAS_AF", dest="1000gp3.eas.af", transform=float),
+    Column("1000Gp3_SAS_AC", dest="1000gp3.sas.ac", transform=int),
+    Column("1000Gp3_SAS_AF", dest="1000gp3.sas.af", transform=float),
+    Column("TWINSUK_AC", dest="twinsuk.ac", transform=int),
+    Column("TWINSUK_AF", dest="twinsuk.af", transform=float),
+    Column("ALSPAC_AC", dest="alspac.ac", transform=int),
+    Column("ALSPAC_AF", dest="alspac.af", transform=float),
+    Column("UK10K_AC", dest="uk10k.ac", transform=int),
+    Column("UK10K_AF", dest="uk10k.af", transform=float),
+    Column("ESP6500_AA_AC", dest="esp6500.aa.ac", transform=int),  # dest changed since 4.4.a
+    Column("ESP6500_AA_AF", dest="esp6500.aa.af", transform=float),
+    Column("ESP6500_EA_AC", dest="esp6500.ea.ac", transform=int),
+    Column("ESP6500_EA_AF", dest="esp6500.ea.af", transform=float),
+    Column("ExAC_AC", dest="exac.ac", transform=int),  # dest changed since 4.4.a
+    Column("ExAC_AF", dest="exac.af", transform=float),
+    Column("ExAC_Adj_AC", dest="exac.adj_ac", transform=int),
+    Column("ExAC_Adj_AF", dest="exac.adj_af", transform=float),
+    Column("ExAC_AFR_AC", dest="exac.afr.ac", transform=int),
+    Column("ExAC_AFR_AF", dest="exac.afr.af", transform=float),
+    Column("ExAC_AMR_AC", dest="exac.amr.ac", transform=int),
+    Column("ExAC_AMR_AF", dest="exac.amr.af", transform=float),
+    Column("ExAC_EAS_AC", dest="exac.eas.ac", transform=int),
+    Column("ExAC_EAS_AF", dest="exac.eas.af", transform=float),
+    Column("ExAC_FIN_AC", dest="exac.fin.ac", transform=int),
+    Column("ExAC_FIN_AF", dest="exac.fin.af", transform=float),
+    Column("ExAC_NFE_AC", dest="exac.nfe.ac", transform=int),
+    Column("ExAC_NFE_AF", dest="exac.nfe.af", transform=float),
+    Column("ExAC_SAS_AC", dest="exac.sas.ac", transform=int),
+    Column("ExAC_SAS_AF", dest="exac.sas.af", transform=float),
+    Column("ExAC_nonTCGA_AC", dest="exac_nontcga.ac", transform=int),
+    Column("ExAC_nonTCGA_AF", dest="exac_nontcga.af", transform=float),
+    Column("ExAC_nonTCGA_Adj_AC", dest="exac_nontcga.adj_ac", transform=int),
+    Column("ExAC_nonTCGA_Adj_AF", dest="exac_nontcga.adj_af", transform=float),
+    Column("ExAC_nonTCGA_AFR_AC", dest="exac_nontcga.afr.ac", transform=int),
+    Column("ExAC_nonTCGA_AFR_AF", dest="exac_nontcga.afr.af", transform=float),
+    Column("ExAC_nonTCGA_AMR_AC", dest="exac_nontcga.amr.ac", transform=int),
+    Column("ExAC_nonTCGA_AMR_AF", dest="exac_nontcga.amr.af", transform=float),
+    Column("ExAC_nonTCGA_EAS_AC", dest="exac_nontcga.eas.ac", transform=int),
+    Column("ExAC_nonTCGA_EAS_AF", dest="exac_nontcga.eas.af", transform=float),
+    Column("ExAC_nonTCGA_FIN_AC", dest="exac_nontcga.fin.ac", transform=int),
+    Column("ExAC_nonTCGA_FIN_AF", dest="exac_nontcga.fin.af", transform=float),
+    Column("ExAC_nonTCGA_NFE_AC", dest="exac_nontcga.nfe.ac", transform=int),
+    Column("ExAC_nonTCGA_NFE_AF", dest="exac_nontcga.nfe.af", transform=float),
+    Column("ExAC_nonTCGA_SAS_AC", dest="exac_nontcga.sas.ac", transform=int),
+    Column("ExAC_nonTCGA_SAS_AF", dest="exac_nontcga.sas.af", transform=float),
+    Column("ExAC_nonpsych_AC", dest="exac_nonpsych.ac", transform=int),
+    Column("ExAC_nonpsych_AF", dest="exac_nonpsych.af", transform=float),
+    Column("ExAC_nonpsych_Adj_AC", dest="exac_nonpsych.adj_ac", transform=int),
+    Column("ExAC_nonpsych_Adj_AF", dest="exac_nonpsych.adj_af", transform=float),
+    Column("ExAC_nonpsych_AFR_AC", dest="exac_nonpsych.afr.ac", transform=int),
+    Column("ExAC_nonpsych_AFR_AF", dest="exac_nonpsych.afr.af", transform=float),
+    Column("ExAC_nonpsych_AMR_AC", dest="exac_nonpsych.amr.ac", transform=int),
+    Column("ExAC_nonpsych_AMR_AF", dest="exac_nonpsych.amr.af", transform=float),
+    Column("ExAC_nonpsych_EAS_AC", dest="exac_nonpsych.eas.ac", transform=int),
+    Column("ExAC_nonpsych_EAS_AF", dest="exac_nonpsych.eas.af", transform=float),
+    Column("ExAC_nonpsych_FIN_AC", dest="exac_nonpsych.fin.ac", transform=int),
+    Column("ExAC_nonpsych_FIN_AF", dest="exac_nonpsych.fin.af", transform=float),
+    Column("ExAC_nonpsych_NFE_AC", dest="exac_nonpsych.nfe.ac", transform=int),
+    Column("ExAC_nonpsych_NFE_AF", dest="exac_nonpsych.nfe.af", transform=float),
+    Column("ExAC_nonpsych_SAS_AC", dest="exac_nonpsych.sas.ac", transform=int),
+    Column("ExAC_nonpsych_SAS_AF", dest="exac_nonpsych.sas.af", transform=float),
+    Column("ALFA_European_AC", dest="alfa.european.ac", transform=int),  # new ALFA field, add mapping
+    Column("ALFA_European_AN", dest="alfa.european.an", transform=int),
+    Column("ALFA_European_AF", dest="alfa.european.af", transform=float),
+    Column("ALFA_African_Others_AC", dest="alfa.african_others.ac", transform=int),
+    Column("ALFA_African_Others_AN", dest="alfa.african_others.an", transform=int),
+    Column("ALFA_African_Others_AF", dest="alfa.african_others.af", transform=float),
+    Column("ALFA_East_Asian_AC", dest="alfa.east_asian.ac", transform=int),
+    Column("ALFA_East_Asian_AN", dest="alfa.east_asian.an", transform=int),
+    Column("ALFA_East_Asian_AF", dest="alfa.east_asian.af", transform=float),
+    Column("ALFA_African_American_AC", dest="alfa.african_american.ac", transform=int),
+    Column("ALFA_African_American_AN", dest="alfa.african_american.an", transform=int),
+    Column("ALFA_African_American_AF", dest="alfa.african_american.af", transform=float),
+    Column("ALFA_Latin_American_1_AC", dest="alfa.latin_american_1.ac", transform=int),
+    Column("ALFA_Latin_American_1_AN", dest="alfa.latin_american_1.an", transform=int),
+    Column("ALFA_Latin_American_1_AF", dest="alfa.latin_american_1.af", transform=float),
+    Column("ALFA_Latin_American_2_AC", dest="alfa.latin_american_2.ac", transform=int),
+    Column("ALFA_Latin_American_2_AN", dest="alfa.latin_american_2.an", transform=int),
+    Column("ALFA_Latin_American_2_AF", dest="alfa.latin_american_2.af", transform=float),
+    Column("ALFA_Other_Asian_AC", dest="alfa.other_asian.ac", transform=int),
+    Column("ALFA_Other_Asian_AN", dest="alfa.other_asian.an", transform=int),
+    Column("ALFA_Other_Asian_AF", dest="alfa.other_asian.af", transform=float),
+    Column("ALFA_South_Asian_AC", dest="alfa.south_asian.ac", transform=int),
+    Column("ALFA_South_Asian_AN", dest="alfa.south_asian.an", transform=int),
+    Column("ALFA_South_Asian_AF", dest="alfa.south_asian.af", transform=float),
+    Column("ALFA_Other_AC", dest="alfa.other.ac", transform=int),
+    Column("ALFA_Other_AN", dest="alfa.other.an", transform=int),
+    Column("ALFA_Other_AF", dest="alfa.other.af", transform=float),
+    Column("ALFA_African_AC", dest="alfa.african.ac", transform=int),
+    Column("ALFA_African_AN", dest="alfa.african.an", transform=int),
+    Column("ALFA_African_AF", dest="alfa.african.af", transform=float),
+    Column("ALFA_Asian_AC", dest="alfa.asian.ac", transform=int),
+    Column("ALFA_Asian_AN", dest="alfa.asian.an", transform=int),
+    Column("ALFA_Asian_AF", dest="alfa.asian.af", transform=float),
+    Column("ALFA_Total_AC", dest="alfa.total.ac", transform=int),
+    Column("ALFA_Total_AN", dest="alfa.total.an", transform=int),
+    Column("ALFA_Total_AF", dest="alfa.total.af", transform=float),
+    Column("clinvar_id", dest="clinvar.clinvar_id", transform=split_clinvar),
+    Column("clinvar_clnsig", transform=split_clinvar),
+    Column("clinvar_trait", transform=split_clinvar),
+    Column("clinvar_review", transform=split_clinvar),
+    Column("clinvar_hgvs", transform=split_clinvar),
+    Column("clinvar_var_source", dest="clinvar.var_source", transform=split_clinvar),
+    Column("clinvar_MedGen_id", dest="clinvar.medgen", transform=split_clinvar),
+    Column("clinvar_OMIM_id", dest="clinvar.omim", transform=split_clinvar),
+    Column("clinvar_Orphanet_id", dest="clinvar.orphanet", transform=split_clinvar),
+    Column("Interpro_domain", transform=split_str),
+    Column("GTEx_V8_gene", dest="gtex.gene", tag=COLUMN_TAG.GTEX_GENE),  # special column, see prune_uniprot()
+    Column("GTEx_V8_tissue", dest="gtex.tissue", tag=COLUMN_TAG.GTEX_TISSUE),  # special column, see prune_uniprot()
+    Column("Geuvadis_eQTL_target_gene", transform=split_str)
+]
+
+HG19_COLUMNS = [c for c in COLUMNS if c.is_hg19()]
+HG38_COLUMNS = [c for c in COLUMNS if c.is_hg38()]
+
+# Currently not necessary to make assembly-specific tag-column maps.
+TAG_COLUMN_MAP = create_tag_column_map(COLUMNS)
+
+
+def verify_pos(row, pos_column: Column, na_values: set = NA_VALUES):
+    pos_value = row[pos_column.name]
+
+    if pos_value in na_values:
+        return False
+
+    return True
+
+
+def verify_hg19_row(row: dict, na_values: set = NA_VALUES):
+    pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG19_POS][0]
+    return verify_pos(row, pos_column=pos_column, na_values=na_values)
+
+
+def verify_hg38_row(row: dict, na_values: set = NA_VALUES):
+    pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG38_POS][0]
+    return verify_pos(row, pos_column=pos_column, na_values=na_values)
+
+
+def prune_uniprot(raw_doc: dict, acc_column: Column, entry_column: Column, na_values: set = NA_VALUES):
+    """
+    Map each UniProt accession number and entry name from the raw document into a dictionary,
+    and assign all such dictionaries to the raw document's top "uniprot" field.
+
+    E.g. with the following input value:
+
+        raw_doc["uniprot.acc"] = "P54578-2;P54578-3;A6NJA2;P54578"
+        raw_doc["uniprot.entry"] = "UBP14_HUMAN;UBP14_HUMAN;A6NJA2_HUMAN;UBP14_HUMAN"
+
+    raw_doc will be assigned as:
+
+        raw_doc["uniprot"] = [
+            {'acc': 'P54578-2', 'entry': 'UBP14_HUMAN'},
+            {'acc': 'P54578-3', 'entry': 'UBP14_HUMAN'},
+            {'acc': 'A6NJA2', 'entry': 'A6NJA2_HUMAN'},
+            {'acc': 'P54578', 'entry': 'UBP14_HUMAN'}
+        ]
+    """
+    # acc_column = TAG_COLUMN_MAP[COLUMN_TAG.UNIPROT_ACC][0]
+    # entry_column = TAG_COLUMN_MAP[COLUMN_TAG.UNIPROT_ENTRY][0]
+
+    if (acc_column.dest in raw_doc) and (entry_column.dest in raw_doc):
+        acc_value = raw_doc[acc_column.dest]
+        entry_value = raw_doc[entry_column.dest]
+
+        uniprot_result = [{"acc": acc, "entry": entry} for (acc, entry) in split_zip(acc_value, entry_value, sep=";", na_values=na_values)]
+        uniprot_result = _check_length(uniprot_result)
+        if uniprot_result is not None:
+            raw_doc["uniprot"] = uniprot_result
+
+        del raw_doc[acc_column.dest]
+        del raw_doc[entry_column.dest]
+
+    return raw_doc
+
+
+def prune_hgvsc_hgvsp(raw_doc: dict, hgvsc_columns: list[Column], hgvsp_columns: list[Column], na_values: set = NA_VALUES):
+    """
+    Split "HGVSc_ANNOVAR", "HGVSc_snpEff", and "HGVSc_VEP" values into "hgvsc" field;
+    split "HGVSp_ANNOVAR", "HGVSp_snpEff", and "HGVSp_VEP" values into "hgvsp" field.
+    """
+    coding_values = [raw_doc[c.dest] for c in hgvsc_columns if c.dest in raw_doc]
+    protein_values = [raw_doc[c.dest] for c in hgvsp_columns if c.dest in raw_doc]
 
-        Then we will return a list of dictionaries like:
+    coding_result = split_dedup(coding_values, sep=";", na_values=na_values)
+    protein_result = split_dedup(protein_values, sep=";", na_values=na_values)
 
-            [{'acc': 'P54578-2', 'entry': 'UBP14_HUMAN'},
-             {'acc': 'P54578-3', 'entry': 'UBP14_HUMAN'},
-             {'acc': 'A6NJA2', 'entry': 'A6NJA2_HUMAN'},
-             {'acc': 'P54578', 'entry': 'UBP14_HUMAN'}]
-        """
-        # cls.read_string() is not used here because it will remove the NA substrings from the split string
-        acc_list = [s if s not in cls.none_values else None for s in row[acc_col].split(";")]
-        entry_list = [s if s not in cls.none_values else None for s in row[entry_col].split(";")]
+    if coding_result is not None:
+        raw_doc["hgvsc"] = coding_result
+    if protein_result is not None:
+        raw_doc["hgvsp"] = protein_result
 
-        return [{"acc": acc, "entry": entry} for (acc, entry) in zip(acc_list, entry_list)
-                if (acc, entry) != (None, None)]
+    for c in hgvsc_columns:
+        raw_doc.pop(c.dest, None)  # safely delete the key because it can be absent
+    for c in hgvsp_columns:
+        raw_doc.pop(c.dest, None)  # safely delete the key because it can be absent
 
-    @classmethod
-    def parse_gtex(cls, row, gene_col, tissue_col):
-        """
-        Read GTEx genes and tissues as two strings from `row`. Map each gene and tissue into a dictionary, and return
-        a list of such dictionaries.
+    return raw_doc
 
-        E.g. suppose we have the following readouts
 
-            row[gene_col] = "ENOSF1|ENOSF1"
-            row[tissue_col] = Adipose_Subcutaneous|Muscle_Skeletal
+def prune_gtex(raw_doc: dict, gene_column: Column, tissue_column: Column, na_values: set = NA_VALUES):
+    """
+    Map each GTEx gene name and tissue name from the raw document into a dictionary,
+    and assign all such dictionaries to the raw document's top "gtex" field.
 
-        Then we will return a list of dictionaries like:
+    E.g. with the following input value:
 
-            [{'gene': 'ENOSF1', 'tissue': 'Adipose_Subcutaneous'},
-             {'gene': 'ENOSF1', 'tissue': 'Muscle_Skeletal'}]
-        """
-        # cls.read_string() is not used here because it will remove the NA substrings from the split string
-        gene_list = [s if s not in cls.none_values else None for s in row[gene_col].split(r"|")]
-        tissue_list = [s if s not in cls.none_values else None for s in row[tissue_col].split(r"|")]
+        row["gtex_gene"] = "ENOSF1|ENOSF1"
+        row["gtex_tissue"] = "Adipose_Subcutaneous|Muscle_Skeletal"
 
-        return [{"gene": gene, "tissue": tissue} for (gene, tissue) in zip(gene_list, tissue_list)
-                if (gene, tissue) != (None, None)]
+    raw_doc will be assigned as:
 
-    @classmethod
-    def parse_siphy_29way_pi(cls, row, col):
-        """
-        A "SiPhy_29way_pi" value, if not None, is a string separated by ":", representing an estimated stationary
-        distribution of A, C, G and T at a variant site. E.g. "0.0:0.5259:0.0:0.4741".
+         row["gtex"] = [
+            {'gene': 'ENOSF1', 'tissue': 'Adipose_Subcutaneous'},
+            {'gene': 'ENOSF1', 'tissue': 'Muscle_Skeletal'}
+        ]
+    """
+    # when these two keys are not present in the doc, it means the responding two values in tsv files are NA values
+    if (gene_column.dest in raw_doc) and (tissue_column.dest in raw_doc):
+        gene_value = raw_doc[gene_column.dest]
+        tissue_value = raw_doc[tissue_column.dest]
 
-        Here we split the string and convert it to a dict of {<nt>: <freq>}.
-        """
-        string = cls.read_string(row, col)
-        if string is None:
-            return None
+        # special separator "|" for GTEx
+        gtex_result = [{"gene": acc, "tissue": entry} for (acc, entry) in split_zip(gene_value, tissue_value, sep=r"|", na_values=na_values)]
+        gtex_result = _check_length(gtex_result)
+        if gtex_result is not None:
+            raw_doc["gtex"] = gtex_result
 
-        freq = [float(s) for s in string.split(":")]
-        pi_dict = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
-        return pi_dict
+        del raw_doc[gene_column.dest]
+        del raw_doc[tissue_column.dest]
 
-    @classmethod
-    def map_CADD_to_json(cls, row, version):
-        """
-        Myvariant.info already has a datasource of CADD, but it's hg19 only.
-        When version == "hg19", we will discard all CADD fields in dbNSFP.
-        When version == "hg38", we will only include the hg38 CADD fields in dbNSFP.
-        """
-        if version == "hg38":
-            cadd_dict = {
-                "raw_score": cls.read_string(row, "CADD_raw", sep=";", transform=float),
-                "raw_rankscore": cls.read_string(row, "CADD_raw_rankscore", sep=";", transform=float),
-                "phred": cls.read_string(row, "CADD_phred", sep=";", transform=float),
-                # "raw_score_hg19": cls.read_string(row, "CADD_raw_hg19", sep=";", transform=float),
-                # "raw_rankscore_hg19": cls.read_string(row, "CADD_raw_rankscore_hg19", sep=";", transform=float),
-                # "phred_hg19": cls.read_string(row, "CADD_phred_hg19", sep=";", transform=float)
-            }
-            return cadd_dict
-        elif version == "hg19":
-            return None
-        else:
-            raise ValueError("Cannot recognize version. Should be either hg19 or hg38. Got version={}".format(version))
+    return raw_doc
 
-    @classmethod
-    def map_row_to_json(cls, row, version):
-        """
-        Parse each row into a json object
-        """
-
-        """
-        Step 1: Read basic variant information
-        """
-        # in case of no hg19 position provided, remove the item
-        pos_hg19 = cls.read_string(row, "hg19_pos(1-based)", transform=int)  # Column 9
-        if pos_hg19 is None:
-            return None
-
-        pos_hg18 = cls.read_string(row, "hg18_pos(1-based)", transform=int)  # Column 11
-        pos_hg38 = cls.read_string(row, "pos(1-based)", transform=int)  # Column 2
-
-        # ref and alt cannot be None else hgvs_id is invalid
-        ref = cls.read_string(row, "ref", transform=lambda s: s.upper())  # Column 3
-        alt = cls.read_string(row, "alt", transform=lambda s: s.upper())  # Column 4
-
-        if version == 'hg19':
-            chrom = cls.read_string(row, "hg19_chr", transform=lambda s: "MT" if s == "M" else s)  # Column 1
-            hgvs_id = "chr%s:g.%d%s>%s" % (chrom, pos_hg19, ref, alt)
-        elif version == 'hg38':
-            chrom = cls.read_string(row, "#chr", transform=lambda s: "MT" if s == "M" else s)  # Column 8
-            hgvs_id = "chr%s:g.%d%s>%s" % (chrom, pos_hg38, ref, alt)
-        else:
-            raise ValueError("Cannot recognize version. Should be either hg19 or hg38. Got version={}".format(version))
-
-        rsid = cls.read_string(row, "rs_dbSNP")  # Column 7
-
-        # Column 10 "hg18_chr" is skipped
-
-        """
-        Step 2: Construct the JSON object
-        """
-        one_snp_json = {
-            "_id": hgvs_id,
-            "dbnsfp": {
-                "rsid": rsid,  # Column 7
-                "chrom": chrom,  # Column 1 or 8
-                "hg19": {  # Column 9
-                    "start": pos_hg19,
-                    "end": pos_hg19
-                },
-                "hg18": {  # Column 11
-                    "start": pos_hg18,
-                    "end": pos_hg18
-                },
-                "hg38": {  # Column 2
-                    "start": pos_hg38,
-                    "end": pos_hg38
-                },
-                "ref": ref,  # Column 3
-                "alt": alt,  # Column 4
-                "aa": {  # Column 5-6, 12, 30-32
-                    "ref": cls.read_string(row, "aaref"),
-                    "alt": cls.read_string(row, "aaalt"),
-                    "pos": cls.read_string(row, "aapos", sep=";", transform=int),
-                    "refcodon": cls.read_string(row, "refcodon", sep=";"),
-                    "codonpos": cls.read_string(row, "codonpos", sep=";", transform=int),
-                    "codon_degeneracy": cls.read_string(row, "codon_degeneracy", sep=";", transform=int),
-                },
-                # Column 13
-                "genename": cls.read_string(row, "genename", sep=";"),
-                # Column 14-16
-                "ensembl": {
-                    "geneid": cls.read_string(row, "Ensembl_geneid", sep=";"),
-                    "transcriptid": cls.read_string(row, "Ensembl_transcriptid", sep=";"),
-                    "proteinid": cls.read_string(row, "Ensembl_proteinid", sep=";")
-                },
-                # Column 17-18
-                "uniprot": cls.parse_uniprot(row, "Uniprot_acc", "Uniprot_entry"),
-                # Column 19-24
-                "hgvsc": cls.read_unique_strings(row, cols=["HGVSc_ANNOVAR", "HGVSc_snpEff", "HGVSc_VEP"], sep=";"),
-                "hgvsp": cls.read_unique_strings(row, cols=["HGVSp_ANNOVAR", "HGVSp_snpEff", "HGVSp_VEP"], sep=";"),
-                # Column 25-29
-                "appris": cls.read_string(row, "APPRIS", sep=";"),
-                "genecode_basic": cls.read_string(row, "GENCODE_basic", sep=";"),
-                "tsl": cls.read_string(row, "TSL", sep=";", transform=int),
-                "vep_canonical": cls.read_string(row, "VEP_canonical", sep=";"),
-                "cds_strand": cls.read_string(row, "cds_strand", sep=";"),
-                # Column 33-37
-                "ancestral_allele": cls.read_string(row, "Ancestral_allele", sep=";"),
-                "altai_neandertal": cls.read_string(row, "AltaiNeandertal", sep=r"/"),
-                "denisova": cls.read_string(row, "Denisova", sep=r"/"),
-                "vindijia_neandertal": cls.read_string(row, "VindijiaNeandertal", sep=r"/"),
-                "chagyrskaya_neandertal": cls.read_string(row, "ChagyrskayaNeandertal", sep=r"/"),
-                # Column 38-43
-                "sift": cls.map_score_converted_rankscore_pred_to_json(row, col_prefix="SIFT"),
-                "sift4g": cls.map_score_converted_rankscore_pred_to_json(row, col_prefix="SIFT4G"),
-                # Column 44-49
-                "polyphen2": {
-                    "hdiv": cls.map_score_rankscore_pred_to_json(row, col_prefix="Polyphen2_HDIV"),
-                    "hvar": cls.map_score_rankscore_pred_to_json(row, col_prefix="Polyphen2_HVAR"),
-                },
-                # Column 50-53
-                "lrt": {
-                    "score": cls.read_string(row, "LRT_score", sep=";", transform=float),
-                    "converted_rankscore": cls.read_string(row, "LRT_converted_rankscore", sep=";", transform=float),
-                    "pred": cls.read_string(row, "LRT_pred", sep=";"),
-                    "omega": cls.read_string(row, "LRT_Omega", sep=";", transform=float)
-                },
-                # Column 54-61
-                "mutationtaster": {
-                    "score": cls.read_string(row, "MutationTaster_score", sep=";", transform=float),
-                    "converted_rankscore": cls.read_string(row, "MutationTaster_converted_rankscore", sep=";", transform=float),
-                    "pred": cls.read_string(row, "MutationTaster_pred", sep=";"),
-                    "model": cls.read_string(row, "MutationTaster_model", sep=";"),
-                    "AAE": cls.read_string(row, "MutationTaster_AAE", sep=";")
-                },
-                "mutationassessor": cls.map_score_rankscore_pred_to_json(row, col_prefix="MutationAssessor"),
-                # Column 62-64
-                "fathmm": cls.map_score_converted_rankscore_pred_to_json(row, col_prefix="FATHMM"),
-                # Column 65-67
-                "provean": cls.map_score_converted_rankscore_pred_to_json(row, col_prefix="PROVEAN"),
-                # Column 68-69
-                "vest4": cls.map_score_rankscore_to_json(row, col_prefix="VEST4"),
-                # Column 70-79
-                "metasvm": cls.map_score_rankscore_pred_to_json(row, col_prefix="MetaSVM"),
-                "metalr": cls.map_score_rankscore_pred_to_json(row, col_prefix="MetaLR"),
-                "reliability_index": cls.read_string(row, "Reliability_index", transform=int),
-                "metarnn": cls.map_score_rankscore_pred_to_json(row, col_prefix="MetaRNN"),
-                # Column 80-82
-                "m-cap": cls.map_score_rankscore_pred_to_json(row, col_prefix="M-CAP"),
-                # Column 83-84
-                "revel": cls.map_score_rankscore_to_json(row, col_prefix="REVEL"),
-                # Column 85-89
-                "mutpred": {
-                    "score": cls.read_string(row, "MutPred_score", sep=";", transform=float),
-                    "rankscore": cls.read_string(row, "MutPred_rankscore", sep=";", transform=float),
-                    "accession": cls.read_string(row, "MutPred_protID", sep=";"),
-                    "aa_change": cls.read_string(row, "MutPred_AAchange", sep=";"),
-                    "pred": cls.parse_mutpred_top5features(row, "MutPred_Top5features"),
-                },
-                # Column 90-93
-                "mvp": cls.map_score_rankscore_to_json(row, col_prefix="MVP"),
-                "mpc": cls.map_score_rankscore_to_json(row, col_prefix="MPC"),
-                # Column 94-96
-                "primateai": cls.map_score_rankscore_pred_to_json(row, col_prefix="PrimateAI"),
-                # Column 97-99
-                "deogen2": cls.map_score_rankscore_pred_to_json(row, col_prefix="DEOGEN2"),
-                # Column 100-105
-                "bayesdel": {
-                    "add_af": cls.map_score_rankscore_pred_to_json(row, col_prefix="BayesDel_addAF"),
-                    "no_af": cls.map_score_rankscore_pred_to_json(row, col_prefix="BayesDel_noAF")
-                },
-                # Column 106-108
-                "clinpred": cls.map_score_rankscore_pred_to_json(row, col_prefix="ClinPred"),
-                # Column 109-111
-                "list-s2": cls.map_score_rankscore_pred_to_json(row, col_prefix="LIST-S2"),
-                # Column 112-117
-                "aloft": {
-                    "fraction_transcripts_affected": cls.read_string(row, "Aloft_Fraction_transcripts_affected", sep=";"),
-                    "prob_tolerant": cls.read_string(row, "Aloft_prob_Tolerant", sep=";"),
-                    "prob_recessive": cls.read_string(row, "Aloft_prob_Recessive", sep=";"),
-                    "prob_dominant": cls.read_string(row, "Aloft_prob_Dominant", sep=";"),
-                    "pred": cls.read_string(row, "Aloft_pred", sep=";"),
-                    "confidence": cls.read_string(row, "Aloft_Confidence", sep=";")
-                },
-                # Column 118-123
-                #   Column 118-120 are hg38
-                #   Column 121-123 are hg19
-                # Only column 117-119 will be included in the document when verison == "hg38"
-                # No CADD fields will be included when verison == "hg19"
-                "cadd": cls.map_CADD_to_json(row, version),
-                # Column 124-125
-                "dann": cls.map_score_rankscore_to_json(row, col_prefix="DANN"),
-                # Column 126-132
-                "fathmm-mkl": {
-                    "coding_score": cls.read_string(row, "fathmm-MKL_coding_score", sep=";", transform=float),
-                    "coding_rankscore": cls.read_string(row, "fathmm-MKL_coding_rankscore", sep=";", transform=float),
-                    "coding_pred": cls.read_string(row, "fathmm-MKL_coding_pred", sep=";"),
-                    "coding_group": cls.read_string(row, "fathmm-MKL_coding_group", sep=";")
-                },
-                "fathmm-xf": {
-                    "coding_score": cls.read_string(row, "fathmm-XF_coding_score", sep=";", transform=float),
-                    "coding_rankscore": cls.read_string(row, "fathmm-XF_coding_rankscore", sep=";", transform=float),
-                    "coding_pred": cls.read_string(row, "fathmm-XF_coding_pred", sep=";")
-                },
-                # Column 133-138
-                # Please note that Eigen uses "-", NOT "_", to connect column name prefix and suffixes
-                # Cannot use cls._iter_read_group here
-                "eigen":  {
-                    "raw_coding": cls.read_string(row, "Eigen-raw_coding", sep=";", transform=float),
-                    "raw_coding_rankscore": cls.read_string(row, "Eigen-raw_coding_rankscore", sep=";", transform=float),
-                    "phred_coding": cls.read_string(row, "Eigen-phred_coding", sep=";", transform=float)
-                },
-                "eigen-pc": {
-                    "raw_coding": cls.read_string(row, "Eigen-PC-raw_coding", sep=";", transform=float),
-                    "raw_coding_rankscore": cls.read_string(row, "Eigen-PC-raw_coding_rankscore", sep=";", transform=float),
-                    "phred_coding": cls.read_string(row, "Eigen-PC-phred_coding", sep=";", transform=float),
-                },
-                # Column 139-140
-                # Please note that column 140 in dbNSFP4.3a.readme.txt is "GenoCanyon_score_rankscore" and it's a typo
-                "genocanyon": cls.map_score_rankscore_to_json(row, col_prefix="GenoCanyon"),
-                # Column 141-143
-                "integrated": cls.map_fitcons_score_rankscore_confidence_value_to_json(row, col_prefix="integrated"),
-                # Column 144-146
-                "gm12878": cls.map_fitcons_score_rankscore_confidence_value_to_json(row, col_prefix="GM12878"),
-                # Column 147-149
-                "h1-hesc": cls.map_fitcons_score_rankscore_confidence_value_to_json(row, col_prefix="H1-hESC"),
-                # Column 150-152
-                "huvec": cls.map_fitcons_score_rankscore_confidence_value_to_json(row, col_prefix="HUVEC"),
-                # Column 153-154
-                # Note that column 153 is "LINSIGHT", not "LINSIGHT_score" (possibly a typo in the source data file)
-                "linsight": {
-                    "score": cls.read_string(row, 'LINSIGHT', sep=";", transform=float),
-                    "rankscore": cls.read_string(row, "LINSIGHT_rankscore", sep=";", transform=float)
-                },
-                # Column 155-157
-                "gerp++": {
-                    "nr": cls.read_string(row, "GERP++_NR", sep=";", transform=float),
-                    "rs": cls.read_string(row, "GERP++_RS", sep=";", transform=float),
-                    "rs_rankscore": cls.read_string(row, "GERP++_RS_rankscore", sep=";", transform=float)
-                },
-                # Column 158-163
-                "phylop": {
-                    "100way_vertebrate": {
-                        "score": cls.read_string(row, "phyloP100way_vertebrate", sep=";", transform=float),
-                        "rankscore": cls.read_string(row, "phyloP100way_vertebrate_rankscore", sep=";", transform=float)
-                    },
-                    "30way_mammalian": {
-                        "score": cls.read_string(row, "phyloP30way_mammalian", sep=";", transform=float),
-                        "rankscore": cls.read_string(row, "phyloP30way_mammalian_rankscore", sep=";", transform=float)
-                    },
-                    "17way_primate": {
-                        "score": cls.read_string(row, "phyloP17way_primate", sep=";", transform=float),
-                        "rankscore": cls.read_string(row, "phyloP17way_primate_rankscore", sep=";", transform=float)
-                    }
-                },
-                # Column 164-169
-                "phastcons": {
-                    "100way_vertebrate": {
-                        "score": cls.read_string(row, "phastCons100way_vertebrate", sep=";", transform=float),
-                        "rankscore": cls.read_string(row, "phastCons100way_vertebrate_rankscore", sep=";", transform=float)
-                    },
-                    "30way_mammalian": {
-                        "score": cls.read_string(row, "phastCons30way_mammalian", sep=";", transform=float),
-                        "rankscore": cls.read_string(row, "phastCons30way_mammalian_rankscore", sep=";", transform=float)
-                    },
-                    "17way_primate": {
-                        "score": cls.read_string(row, "phastCons17way_primate", sep=";", transform=float),
-                        "rankscore": cls.read_string(row, "phastCons17way_primate_rankscore", sep=";", transform=float)
-                    }
-                },
-                # Column 170-172
-                "siphy_29way": {
-                    "pi": cls.parse_siphy_29way_pi(row, "SiPhy_29way_pi"),
-                    # Note that the column name is "SiPhy_29way_logOdds", not "SiPhy_29way_logOdds_score"
-                    "logodds_score": cls.read_string(row, "SiPhy_29way_logOdds", sep=";", transform=float),
-                    "logodds_rankscore": cls.read_string(row, "SiPhy_29way_logOdds_rankscore", sep=";", transform=float)
-                },
-                # Column 173-174
-                "bstatistic": {
-                    # Note that the column name is "bStatistic", not "bStatistic_score"
-                    "score": cls.read_string(row, 'bStatistic', sep=";", transform=float),
-                    "converted_rankscore": cls.read_string(row, "bStatistic_converted_rankscore", sep=";", transform=float)
-                },
-                # Column 175-186
-                "1000gp3": cls.map_AC_AF_to_json(row, col_prefix="1000Gp3", col_infixes=["AFR", "EUR", "AMR", "EAS", "SAS"], whole_group=True),
-                # Column 187-188
-                "twinsuk": cls.map_AC_AF_to_json(row, col_prefix="TWINSUK", col_infixes=None, whole_group=True),
-                # Column 189-190
-                "alspac": cls.map_AC_AF_to_json(row, col_prefix="ALSPAC", col_infixes=None, whole_group=True),
-                # Column 191-192
-                "uk10k": cls.map_AC_AF_to_json(row, col_prefix="UK10K", col_infixes=None, whole_group=True),
-                # Column 193-196
-                "esp6500": cls.map_AC_AF_to_json(row, col_prefix="ESP6500", col_infixes=["AA", "EA"], whole_group=False),
-                # Column 197-212
-                "exac": cls.map_AC_AF_to_json(row, col_prefix="ExAC", col_infixes=["Adj", "AFR", "AMR", "EAS", "FIN", "NFE", "SAS"], whole_group=True),
-                # Column 213-228
-                "exac_nontcga": cls.map_AC_AF_to_json(row, col_prefix="ExAC_nonTCGA", col_infixes=["Adj", "AFR", "AMR", "EAS", "FIN", "NFE", "SAS"], whole_group=True),
-                # Column 229-244
-                "exac_nonpsych": cls.map_AC_AF_to_json(row, col_prefix="ExAC_nonpsych", col_infixes=["Adj", "AFR", "AMR", "EAS", "FIN", "NFE", "SAS"], whole_group=True),
-
-                # Column 245-630 are gnomAD_* columns. Skipped.
-
-                # Column 631-637
-                "clinvar": {
-                    "clinvar_id": cls.read_string(row, "clinvar_id"),
-                    "clinsig": cls.read_string(row, "clinvar_clnsig", sep=r"|"),
-                    "trait": cls.read_string(row, "clinvar_trait", sep=r"|"),
-                    "review": cls.read_string(row, "clinvar_review", sep=r"|"),
-                    "hgvs": cls.read_string(row, "clinvar_hgvs", sep=r"|"),
-                    "omim": cls.read_string(row, "clinvar_OMIM_id", sep=r"|"),
-                    "medgen": cls.read_string(row, "clinvar_MedGen_id", sep=r"|"),
-                    "orphanet": cls.read_string(row, "clinvar_Orphanet_id", sep=r"|"),
-                    "var_source": cls.read_string(row, "clinvar_var_source", sep=r"|")
-                },
-                # Column 640-643
-                "interpro_domain": cls.read_string(row, "Interpro_domain", sep=";"),
-                "gtex": cls.parse_gtex(row, "GTEx_V8_gene", "GTEx_V8_tissue"),
-                "geuvadis_eqtl_target_gene": cls.read_string(row, "Geuvadis_eQTL_target_gene", sep=";")
-
-                # End of row
-            }
-        }
-
-        """
-        Step 3: Prune the JSON object and return
-        """
-        # `value_convert_to_number` converts strings to numeric values inside the dictionary
-        # `unlist` regresses lists of length 1 to single values inside the dictionary
-        # `dict_sweep` remove NA values indicated by `vals` inside the dictionary
-        # `list_split` separate fields by `sep` into lists
-        one_snp_json = dict_sweep(one_snp_json, vals=[None], remove_invalid_list=True)
-        return one_snp_json
-
-
-# open file, parse, pass to json mapper
-def data_generator(input_file, version):
-    with anyfile(input_file) as file:
-        file_reader = csv.reader(file, delimiter="\t")
-
-        header = next(file_reader)
-        assert len(header) == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(header))
-
-        previous_row = None
-        for row in file_reader:
-            row = dict(zip(header, row))
-
-            # use transposed matrix to have 1 line with N 187 columns
-            current_row = DbnsfpReader.map_row_to_json(row, version=version)
-            if previous_row and current_row:
-                if current_row["_id"] == previous_row["_id"]:
-                    aa = previous_row["dbnsfp"]["aa"]
-                    if not isinstance(aa, list):
-                        aa = [aa]
-                    aa.append(current_row["dbnsfp"]["aa"])
-                    previous_row["dbnsfp"]["aa"] = aa
-                    if len(previous_row["dbnsfp"]["aa"]) > 1:
-                        continue
-                else:
-                    yield previous_row
-
-            previous_row = current_row
-
-        if previous_row:
-            yield previous_row
-
-
-def load_data_file(input_file, version):
-    data = data_generator(input_file, version=version)
-    for one_snp_json in data:
-        yield one_snp_json
-
-
-# load path and find files, pass to data_generator
-def load_data(path_glob, version='hg19'):
-    for input_file in sorted(glob.glob(path_glob)):
-        for d in load_data_file(input_file, version):
-            yield d
+
+def prune_hg19_doc(doc: dict, na_values: set = NA_VALUES):
+    uniprot_acc_column = TAG_COLUMN_MAP[COLUMN_TAG.UNIPROT_ACC][0]
+    uniprot_entry_column = TAG_COLUMN_MAP[COLUMN_TAG.UNIPROT_ENTRY][0]
+    doc = prune_uniprot(doc, acc_column=uniprot_acc_column, entry_column=uniprot_entry_column, na_values=na_values)
+
+    hgvs_coding_columns = TAG_COLUMN_MAP[COLUMN_TAG.HGVS_CODING]
+    hgvs_protein_columns = TAG_COLUMN_MAP[COLUMN_TAG.HGVS_PROTEIN]
+    doc = prune_hgvsc_hgvsp(doc, hgvsc_columns=hgvs_coding_columns, hgvsp_columns=hgvs_protein_columns, na_values=na_values)
+
+    gtex_gene_column = TAG_COLUMN_MAP[COLUMN_TAG.GTEX_GENE][0]
+    gtex_tissue_column = TAG_COLUMN_MAP[COLUMN_TAG.GTEX_TISSUE][0]
+    doc = prune_gtex(doc, gene_column=gtex_gene_column, tissue_column=gtex_tissue_column, na_values=na_values)
+
+    return doc
+
+
+def prune_hg38_doc(doc: dict, na_values: set = NA_VALUES):
+    return prune_hg19_doc(doc, na_values=na_values)
+
+
+def construct_raw_doc(row: dict, columns: list, na_values: set = NA_VALUES):
+    """
+    Construct a raw dbnsfp doc from a dict-like row read from the csv file.
+    "Raw" means 1) the doc may contain dot fields that are not parsed, and 2) some values in the doc need further treatment/processing.
+
+    Args:
+        row: a dict representing a csv row's content
+        columns: a list of Column object indicating how to construct each column
+        na_values: a set of values seen as NA
+    Returns:
+        a dict representing the doc's json object
+    """
+    result = dict()
+
+    for column in columns:
+        value = row[column.name]
+        if value in na_values:
+            continue
+
+        value = column.transform(value)
+        if value is None:
+            continue
+
+        result[column.dest] = value
+
+    return result
+
+
+def construct_hg19_raw_doc(row: dict, na_values: set = NA_VALUES):
+    return construct_raw_doc(row, columns=HG19_COLUMNS, na_values=na_values)
+
+
+def construct_hg38_raw_doc(row: dict, na_values: set = NA_VALUES):
+    return construct_raw_doc(row, columns=HG38_COLUMNS, na_values=na_values)
+
+
+def make_hgvs_id(doc: dict, chrom_column: Column, pos_column: Column, ref_column: Column, alt_column: Column):
+    chrom_value = doc[chrom_column.dest]
+    pos_value = doc[pos_column.dest]["start"]  # see make_zero_based()
+    ref_value = doc[ref_column.dest]
+    alt_value = doc[alt_column.dest]
+
+    hgvs_id = "chr%s:g.%d%s>%s" % (chrom_value, pos_value, ref_value, alt_value)
+    return hgvs_id
+
+
+def make_hg19_hgvs_id(doc: dict):
+    chrom_column = TAG_COLUMN_MAP[COLUMN_TAG.HG19_CHROM][0]
+    pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG19_POS][0]
+    ref_column = TAG_COLUMN_MAP[COLUMN_TAG.REF_ALLELE][0]
+    alt_column = TAG_COLUMN_MAP[COLUMN_TAG.ALT_ALLELE][0]
+
+    return make_hgvs_id(doc, chrom_column=chrom_column, pos_column=pos_column, ref_column=ref_column, alt_column=alt_column)
+
+
+def make_hg38_hgvs_id(doc: dict):
+    chrom_column = TAG_COLUMN_MAP[COLUMN_TAG.HG38_CHROM][0]
+    pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG38_POS][0]
+    ref_column = TAG_COLUMN_MAP[COLUMN_TAG.REF_ALLELE][0]
+    alt_column = TAG_COLUMN_MAP[COLUMN_TAG.ALT_ALLELE][0]
+
+    return make_hgvs_id(doc, chrom_column=chrom_column, pos_column=pos_column, ref_column=ref_column, alt_column=alt_column)
+
+
+def construct_hg19_doc(row: dict, na_values: set = NA_VALUES):
+    verified = verify_hg19_row(row, na_values=na_values)
+    if not verified:
+        return None
+
+    raw_doc = construct_hg19_raw_doc(row, na_values=na_values)
+    raw_doc = prune_hg19_doc(raw_doc, na_values=na_values)
+    hgvs_id = make_hg19_hgvs_id(raw_doc)
+
+    doc = {
+        "_id": hgvs_id,
+        "dbnsfp": parse_dot_fields(raw_doc)  # convert dot-fields into nested dictionaries
+    }
+    return doc
+
+
+def construct_hg38_doc(row: dict, na_values: set = NA_VALUES):
+    verified = verify_hg38_row(row, na_values=na_values)
+    if not verified:
+        return None
+
+    raw_doc = construct_hg38_raw_doc(row, na_values=na_values)
+    raw_doc = prune_hg38_doc(raw_doc, na_values=na_values)
+    hgvs_id = make_hg38_hgvs_id(raw_doc)
+
+    doc = {
+        "_id": hgvs_id,
+        "dbnsfp": parse_dot_fields(raw_doc)  # convert dot-fields into nested dictionaries
+    }
+    return doc
+
+
+def load_file(path: str, assembly: str):
+    file = anyfile(path)
+    file_reader = csv.DictReader(file, delimiter="\t")
+
+    num_columns = len(file_reader.fieldnames)
+    assert num_columns == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, num_columns)
+
+    _construct_doc = None
+    match assembly:
+        case "hg19":
+            _construct_doc = construct_hg19_doc
+        case "hg38":
+            _construct_doc = construct_hg38_doc
+        case _:
+            raise ValueError(f"Cannot recognize assembly. Accept 'hg19' or 'hg38', got '{assembly}'.")
+
+    last_doc = None
+    for row in file_reader:
+        current_doc = _construct_doc(row, na_values=NA_VALUES)
+
+        if current_doc is None:
+            continue
+
+        if last_doc is not None:
+            if current_doc["_id"] == last_doc["_id"]:
+                last_aa = last_doc["dbnsfp"]["aa"]
+                current_aa = current_doc["dbnsfp"]["aa"]
+
+                if not isinstance(last_aa, list):
+                    last_aa = [last_aa]
+                last_aa.append(current_aa)
+
+                last_doc["dbnsfp"]["aa"] = last_aa
+                continue
+            else:
+                yield last_doc
+
+        last_doc = current_doc
+
+    # yield the very last doc
+    if last_doc:
+        yield last_doc
+
+    file.close()
diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_43a.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_43a.py
new file mode 100644
index 00000000..ea84bb95
--- /dev/null
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_43a.py
@@ -0,0 +1,675 @@
+import csv
+import glob
+import itertools
+import re
+
+from biothings.utils.dataload import dict_sweep  # list_split, unlist, value_convert_to_number
+from biothings.utils.common import anyfile
+
+# VALID_COLUMN_NO = 367  # for 4.1a
+# VALID_COLUMN_NO = 642  # for 4.2a
+VALID_COLUMN_NO = 643  # for 4.2a
+
+
+"""
+This parser is for dbNSFP v4.3a downloaded from https://sites.google.com/site/jpopgen/dbNSFP
+"""
+
+
+class DbnsfpReader:
+    # dbNSFP_variant use "." for missing values;
+    # other none values are borrowed from the `biothings.utils.dataload.dict_sweep` function and
+    #   from the default `na_values` argument of pandas.read_csv().
+    # see https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
+    none_values = {r'.', r'', r" ", r"-", r'#N/A', r'#N/A N/A', r'#NA', r'-1.#IND', r'-1.#QNAN', r'-NaN', r'-nan',
+                   r'1.#IND', r'1.#QNAN', r'<NA>', r'N/A', r'NA', r'NULL', r'NaN', r'n/a', r'nan', r'null', r'none',
+                   r"Not Available", r"unknown"}
+
+    mutpred_top5features_pattern = re.compile(r" \(P = ([eE0-9.-]*)\)$")
+
+    # A general rule from observation: for some of the columns, data type can be inferred from the suffix of their
+    # column names. E.g. "xxx_score" is usually float.
+    # This rule is usually true especially when dealing with grouped columns (like "DANN_score" and "DANN_rankscore").
+    # Grouped columns are represented by their common col_prefix (like "DANN").
+    # Column names in one group can be restored by concatenating their common col_prefix and individual suffixes.
+    col_suffix_to_type = {
+        "score": float,
+        "rankscore": float,
+        "converted_rankscore": float,
+        "fitCons_score": float,
+        "fitCons_rankscore": float,
+        "confidence_value": int,
+        "AC": int,
+        "AF": float
+    }
+
+    @classmethod
+    def read_string(cls, row, col, sep=None, transform=None):
+        """
+        Read `row[col]` as a string. If `sep` is None, return the single string (with transformation).
+        If `sep` is not None, separate the string into a list of substrings, transform each substring, and then return
+        the list.
+        """
+        def apply_transformation(_string, _transform):
+            """
+            If the transformation is to convert the string into an integer or float, wrap it in try-catch;
+            otherwise simply apply the transformation as a function to the string.
+            """
+            if _transform is int or _transform is float:
+                try:
+                    return _transform(_string)
+                except ValueError:
+                    raise ValueError("Cannot convert {col} value {string} to {type}".format(col=col, string=_string, type=_transform))
+            return _transform(_string)
+
+        string = row[col]
+        if string in cls.none_values:
+            return None
+
+        if sep is None:
+            if transform is not None:
+                string = apply_transformation(string, transform)
+            return string
+        else:
+            string_list = [s for s in string.split(sep=sep) if s not in cls.none_values]
+            if not string_list:  # `string_list` is empty after none-values are removed
+                return None
+
+            if transform is not None:
+                string_list = [apply_transformation(string, transform) for string in string_list]
+            return string_list if len(string_list) > 1 else string_list[0]
+
+    @classmethod
+    def read_unique_strings(cls, row, cols, sep=None):
+        """
+        Return the unique strings from the readout union of multiple columns.
+        No transformation is applied.
+        """
+        string_list = [row[key].split(sep=sep) for key in cols]
+        string_list = list(set(string for string in itertools.chain.from_iterable(string_list) if string not in cls.none_values))
+
+        if not string_list:  # `string_list` is empty after none-values are removed
+            return None
+        return string_list if len(string_list) > 1 else string_list[0]
+
+    @classmethod
+    def _iter_read_group(cls, row, col_prefix, col_suffixes, sep=";"):
+        """
+        Some columns other than `*_AC` and `*_AF` are grouped by their prefixes, e.g. "ClinPred_pred",
+        "ClinPred_rankscore", and "ClinPred_score". Such groups of columns usually contains scores of some metrics.
+
+        To play safe, I assume each column in such a group is a separable string (although it may not contain any
+        separator at all); and I assume all columns in a group should use the same separator, semicolon by default.
+
+        `col_prefix` and `col_suffixes`, when joined with "_", form the column names in the `row` to be read.
+        Each column will be read and yielded as in an iterator.
+
+        Args:
+            row (dict-like): the data to read in
+            col_prefix (string): col_prefix of the keys of interest in `row`, e.g "ClinPred"
+            col_suffixes (collection of strings): suffixes of the columns interest in `row`,
+                                                  e.g. ("pred", "rankscore", "score")
+            sep (string): a single string of the separator
+        """
+        for col_suffix in col_suffixes:
+            # filter(function, iterable):
+            #   If function is None, the identity function is assumed, that is,
+            #   all elements of iterable that are false are removed.
+            col = "_".join(filter(None, (col_prefix, col_suffix)))
+            transform = cls.col_suffix_to_type.get(col_suffix)
+            yield cls.read_string(row, col, sep=sep, transform=transform)
+
+    @classmethod
+    def map_score_rankscore_to_json(cls, row, col_prefix):
+        """
+        A common case of calling `_iter_read_group`.
+
+        Dozens of columns are grouped into tuple, like "<col_prefix>_score" and "<col_prefix>_rankscore".
+        Their data types are usually float.
+        """
+        col_suffixes = ("score", "rankscore")  # col_suffixes are also the json keys
+        return dict(zip(col_suffixes, cls._iter_read_group(row, col_prefix, col_suffixes)))
+
+    @classmethod
+    def map_score_rankscore_pred_to_json(cls, row, col_prefix):
+        """
+        A common case of calling `_iter_read_group`.
+
+        Dozens of columns are grouped into triples, like "<col_prefix>_score", "<col_prefix>_rankscore", and
+        "<col_prefix>_pred". Their data types are usually float, float and string.
+        """
+        col_suffixes = ("score", "rankscore", "pred")  # col_suffixes are also the json keys
+        return dict(zip(col_suffixes, cls._iter_read_group(row, col_prefix, col_suffixes)))
+
+    @classmethod
+    def map_score_converted_rankscore_pred_to_json(cls, row, col_prefix):
+        """
+        A common case of calling `_iter_read_group`.
+
+        Dozens of columns are grouped into triples, like "<col_prefix>_score", "<col_prefix>_converted_rankscore", and
+        "<col_prefix>_pred". Their data types are usually float, float and string.
+        """
+        col_suffixes = ("score", "converted_rankscore", "pred")  # col_suffixes are also the json keys
+        return dict(zip(col_suffixes, cls._iter_read_group(row, col_prefix, col_suffixes)))
+
+    @classmethod
+    def map_fitcons_score_rankscore_confidence_value_to_json(cls, row, col_prefix):
+        """
+        A common case of calling `_iter_read_group`.
+
+        Dozens of columns are grouped into triples, like "<col_prefix>_fitCons_score", "<col_prefix>_fitCons_rankscore",
+        and "<col_prefix>_confidence_value". Their data types are usually float, float and int.
+        """
+        col_suffixes = ("fitCons_score", "fitCons_rankscore", "confidence_value")
+        json_keys = ("fitcons_score", "fitcons_rankscore", "confidence_value")
+        return dict(zip(json_keys, cls._iter_read_group(row, col_prefix, col_suffixes)))
+
+    @classmethod
+    def map_AC_AF_to_json(cls, row, col_prefix, col_infixes, whole_group=False):
+        """
+        Read `<col_prefix>_<col_infix>_AC` and `<col_prefix>_<col_infix>_AF` columns for each `col_infix` in
+        `col_infixes`.
+        When `whole_group` is True, read two extra columns, `<col_prefix>_AC` and `<col_prefix>_AF`.
+
+        AC (allele counts) will be parsed into integers; AF (allele freqs) will be parsed into floats.
+        No separator is assumed to exist in such AC/AF columns.
+
+        The readout will be returned as a dict like:
+
+            {
+                <col_infix.lower()>_ac : int(<col_prefix>_<col_infix>_AC)
+                <col_infix.lower()>_af : float(<col_prefix>_<col_infix>_AF)
+            }
+
+        E.g. to read `ESP6500_AA_AC`, `ESP6500_AA_AF`, `ESP6500_EA_AC` and `ESP6500_EA_AF` columns, we can simply call
+        map_AC_AF_to_json(col_prefix="ESP6500", col_infixes=["AA", "EA"], whole_group=False)
+        """
+        if col_infixes is None:
+            col_infixes = []
+
+        if whole_group:
+            col_infixes = [""] + col_infixes
+
+        col_suffixes = ("AC", "AF")
+
+        def _generate_json_fields():
+            for col_infix in col_infixes:
+                for col_suffix in col_suffixes:
+                    # filter(function, iterable):
+                    #   If function is None, the identity function is assumed, that is,
+                    #   all elements of iterable that are false are removed.
+                    col = "_".join(filter(None, (col_prefix, col_infix, col_suffix)))
+                    transform = cls.col_suffix_to_type.get(col_suffix)
+
+                    json_key = "_".join(filter(None, (col_infix, col_suffix))).lower()
+                    yield json_key, cls.read_string(row, col, sep=None, transform=transform)
+
+        return dict(_generate_json_fields())
+
+    @classmethod
+    def parse_mutpred_top5features(cls, row, col):
+        """
+        `mutpred_mechanisms` is a string combined from 5 clauses, separated by semicolons (with whitespaces).
+        Each clause has the same pattern of "<mechanism> (P = <p_val>)".
+
+        E.g. "Loss of helix (P = 0.0444); Gain of loop (P = 0.0502); Gain of catalytic residue at A444 (P = 0.1876); \
+        Gain of solvent accessibility (P = 0.2291); Loss of disorder (P = 0.9475)"
+
+        Here we apply regex to parse this string
+
+            regex = re.compile(r" \(P = ([eE0-9.-]*)\)$")
+            [(e for e in regex.split(s) if e.strip()) for s in string.split("; ")]
+
+        and get a list of 5 tuples like
+
+            [('Loss of helix', '0.0444'), ('Gain of loop', '0.0502'), ('Gain of catalytic residue at A444', '0.1876'),
+            ('Gain of solvent accessibility', '0.2291'), ('Loss of disorder', '0.9475')]
+
+        Then construct a list of 5 dictionaries of <"mechanism": xxx, "p_val": xxx> and return
+        """
+        string = cls.read_string(row, col)
+        if string is None:
+            return None
+
+        mp_list = [tuple(e for e in cls.mutpred_top5features_pattern.split(s) if e.strip()) for s in string.split("; ")]
+        return [{"mechanism": mp[0], "p_val": float(mp[1])} for mp in mp_list if mp and len(mp) == 2]
+
+    @classmethod
+    def parse_uniprot(cls, row, acc_col, entry_col):
+        """
+        Read uniprot accession numbers and entry names as two strings from `row`. Map each accession number and entry
+        name into a dictionary, and return a list of such dictionaries.
+
+        E.g. suppose we have the following readouts
+
+            row[acc_col] = "P54578-2;P54578-3;A6NJA2;P54578"
+            row[entry_col] = UBP14_HUMAN;UBP14_HUMAN;A6NJA2_HUMAN;UBP14_HUMAN
+
+        Then we will return a list of dictionaries like:
+
+            [{'acc': 'P54578-2', 'entry': 'UBP14_HUMAN'},
+             {'acc': 'P54578-3', 'entry': 'UBP14_HUMAN'},
+             {'acc': 'A6NJA2', 'entry': 'A6NJA2_HUMAN'},
+             {'acc': 'P54578', 'entry': 'UBP14_HUMAN'}]
+        """
+        # cls.read_string() is not used here because it will remove the NA substrings from the split string
+        acc_list = [s if s not in cls.none_values else None for s in row[acc_col].split(";")]
+        entry_list = [s if s not in cls.none_values else None for s in row[entry_col].split(";")]
+
+        return [{"acc": acc, "entry": entry} for (acc, entry) in zip(acc_list, entry_list)
+                if (acc, entry) != (None, None)]
+
+    @classmethod
+    def parse_gtex(cls, row, gene_col, tissue_col):
+        """
+        Read GTEx genes and tissues as two strings from `row`. Map each gene and tissue into a dictionary, and return
+        a list of such dictionaries.
+
+        E.g. suppose we have the following readouts
+
+            row[gene_col] = "ENOSF1|ENOSF1"
+            row[tissue_col] = Adipose_Subcutaneous|Muscle_Skeletal
+
+        Then we will return a list of dictionaries like:
+
+            [{'gene': 'ENOSF1', 'tissue': 'Adipose_Subcutaneous'},
+             {'gene': 'ENOSF1', 'tissue': 'Muscle_Skeletal'}]
+        """
+        # cls.read_string() is not used here because it will remove the NA substrings from the split string
+        gene_list = [s if s not in cls.none_values else None for s in row[gene_col].split(r"|")]
+        tissue_list = [s if s not in cls.none_values else None for s in row[tissue_col].split(r"|")]
+
+        return [{"gene": gene, "tissue": tissue} for (gene, tissue) in zip(gene_list, tissue_list)
+                if (gene, tissue) != (None, None)]
+
+    @classmethod
+    def parse_siphy_29way_pi(cls, row, col):
+        """
+        A "SiPhy_29way_pi" value, if not None, is a string separated by ":", representing an estimated stationary
+        distribution of A, C, G and T at a variant site. E.g. "0.0:0.5259:0.0:0.4741".
+
+        Here we split the string and convert it to a dict of {<nt>: <freq>}.
+        """
+        string = cls.read_string(row, col)
+        if string is None:
+            return None
+
+        freq = [float(s) for s in string.split(":")]
+        pi_dict = {'a': freq[0], 'c': freq[1], 'g': freq[2], 't': freq[3]}
+        return pi_dict
+
+    @classmethod
+    def map_CADD_to_json(cls, row, version):
+        """
+        Myvariant.info already has a datasource of CADD, but it's hg19 only.
+        When version == "hg19", we will discard all CADD fields in dbNSFP.
+        When version == "hg38", we will only include the hg38 CADD fields in dbNSFP.
+        """
+        if version == "hg38":
+            cadd_dict = {
+                "raw_score": cls.read_string(row, "CADD_raw", sep=";", transform=float),
+                "raw_rankscore": cls.read_string(row, "CADD_raw_rankscore", sep=";", transform=float),
+                "phred": cls.read_string(row, "CADD_phred", sep=";", transform=float),
+                # "raw_score_hg19": cls.read_string(row, "CADD_raw_hg19", sep=";", transform=float),
+                # "raw_rankscore_hg19": cls.read_string(row, "CADD_raw_rankscore_hg19", sep=";", transform=float),
+                # "phred_hg19": cls.read_string(row, "CADD_phred_hg19", sep=";", transform=float)
+            }
+            return cadd_dict
+        elif version == "hg19":
+            return None
+        else:
+            raise ValueError("Cannot recognize version. Should be either hg19 or hg38. Got version={}".format(version))
+
+    @classmethod
+    def map_row_to_json(cls, row, version):
+        """
+        Parse each row into a json object
+        """
+
+        """
+        Step 1: Read basic variant information
+        """
+        # in case of no hg19 position provided, remove the item
+        pos_hg19 = cls.read_string(row, "hg19_pos(1-based)", transform=int)  # Column 9
+        if pos_hg19 is None:
+            return None
+
+        pos_hg18 = cls.read_string(row, "hg18_pos(1-based)", transform=int)  # Column 11
+        pos_hg38 = cls.read_string(row, "pos(1-based)", transform=int)  # Column 2
+
+        # ref and alt cannot be None else hgvs_id is invalid
+        ref = cls.read_string(row, "ref", transform=lambda s: s.upper())  # Column 3
+        alt = cls.read_string(row, "alt", transform=lambda s: s.upper())  # Column 4
+
+        if version == 'hg19':
+            chrom = cls.read_string(row, "hg19_chr", transform=lambda s: "MT" if s == "M" else s)  # Column 1
+            hgvs_id = "chr%s:g.%d%s>%s" % (chrom, pos_hg19, ref, alt)
+        elif version == 'hg38':
+            chrom = cls.read_string(row, "#chr", transform=lambda s: "MT" if s == "M" else s)  # Column 8
+            hgvs_id = "chr%s:g.%d%s>%s" % (chrom, pos_hg38, ref, alt)
+        else:
+            raise ValueError("Cannot recognize version. Should be either hg19 or hg38. Got version={}".format(version))
+
+        rsid = cls.read_string(row, "rs_dbSNP")  # Column 7
+
+        # Column 10 "hg18_chr" is skipped
+
+        """
+        Step 2: Construct the JSON object
+        """
+        one_snp_json = {
+            "_id": hgvs_id,
+            "dbnsfp": {
+                "rsid": rsid,  # Column 7
+                "chrom": chrom,  # Column 1 or 8
+                "hg19": {  # Column 9
+                    "start": pos_hg19,
+                    "end": pos_hg19
+                },
+                "hg18": {  # Column 11
+                    "start": pos_hg18,
+                    "end": pos_hg18
+                },
+                "hg38": {  # Column 2
+                    "start": pos_hg38,
+                    "end": pos_hg38
+                },
+                "ref": ref,  # Column 3
+                "alt": alt,  # Column 4
+                "aa": {  # Column 5-6, 12, 30-32
+                    "ref": cls.read_string(row, "aaref"),
+                    "alt": cls.read_string(row, "aaalt"),
+                    "pos": cls.read_string(row, "aapos", sep=";", transform=int),
+                    "refcodon": cls.read_string(row, "refcodon", sep=";"),
+                    "codonpos": cls.read_string(row, "codonpos", sep=";", transform=int),
+                    "codon_degeneracy": cls.read_string(row, "codon_degeneracy", sep=";", transform=int),
+                },
+                # Column 13
+                "genename": cls.read_string(row, "genename", sep=";"),
+                # Column 14-16
+                "ensembl": {
+                    "geneid": cls.read_string(row, "Ensembl_geneid", sep=";"),
+                    "transcriptid": cls.read_string(row, "Ensembl_transcriptid", sep=";"),
+                    "proteinid": cls.read_string(row, "Ensembl_proteinid", sep=";")
+                },
+                # Column 17-18
+                "uniprot": cls.parse_uniprot(row, "Uniprot_acc", "Uniprot_entry"),
+                # Column 19-24
+                "hgvsc": cls.read_unique_strings(row, cols=["HGVSc_ANNOVAR", "HGVSc_snpEff", "HGVSc_VEP"], sep=";"),
+                "hgvsp": cls.read_unique_strings(row, cols=["HGVSp_ANNOVAR", "HGVSp_snpEff", "HGVSp_VEP"], sep=";"),
+                # Column 25-29
+                "appris": cls.read_string(row, "APPRIS", sep=";"),
+                "genecode_basic": cls.read_string(row, "GENCODE_basic", sep=";"),
+                "tsl": cls.read_string(row, "TSL", sep=";", transform=int),
+                "vep_canonical": cls.read_string(row, "VEP_canonical", sep=";"),
+                "cds_strand": cls.read_string(row, "cds_strand", sep=";"),
+                # Column 33-37
+                "ancestral_allele": cls.read_string(row, "Ancestral_allele", sep=";"),
+                "altai_neandertal": cls.read_string(row, "AltaiNeandertal", sep=r"/"),
+                "denisova": cls.read_string(row, "Denisova", sep=r"/"),
+                "vindijia_neandertal": cls.read_string(row, "VindijiaNeandertal", sep=r"/"),
+                "chagyrskaya_neandertal": cls.read_string(row, "ChagyrskayaNeandertal", sep=r"/"),
+                # Column 38-43
+                "sift": cls.map_score_converted_rankscore_pred_to_json(row, col_prefix="SIFT"),
+                "sift4g": cls.map_score_converted_rankscore_pred_to_json(row, col_prefix="SIFT4G"),
+                # Column 44-49
+                "polyphen2": {
+                    "hdiv": cls.map_score_rankscore_pred_to_json(row, col_prefix="Polyphen2_HDIV"),
+                    "hvar": cls.map_score_rankscore_pred_to_json(row, col_prefix="Polyphen2_HVAR"),
+                },
+                # Column 50-53
+                "lrt": {
+                    "score": cls.read_string(row, "LRT_score", sep=";", transform=float),
+                    "converted_rankscore": cls.read_string(row, "LRT_converted_rankscore", sep=";", transform=float),
+                    "pred": cls.read_string(row, "LRT_pred", sep=";"),
+                    "omega": cls.read_string(row, "LRT_Omega", sep=";", transform=float)
+                },
+                # Column 54-61
+                "mutationtaster": {
+                    "score": cls.read_string(row, "MutationTaster_score", sep=";", transform=float),
+                    "converted_rankscore": cls.read_string(row, "MutationTaster_converted_rankscore", sep=";", transform=float),
+                    "pred": cls.read_string(row, "MutationTaster_pred", sep=";"),
+                    "model": cls.read_string(row, "MutationTaster_model", sep=";"),
+                    "AAE": cls.read_string(row, "MutationTaster_AAE", sep=";")
+                },
+                "mutationassessor": cls.map_score_rankscore_pred_to_json(row, col_prefix="MutationAssessor"),
+                # Column 62-64
+                "fathmm": cls.map_score_converted_rankscore_pred_to_json(row, col_prefix="FATHMM"),
+                # Column 65-67
+                "provean": cls.map_score_converted_rankscore_pred_to_json(row, col_prefix="PROVEAN"),
+                # Column 68-69
+                "vest4": cls.map_score_rankscore_to_json(row, col_prefix="VEST4"),
+                # Column 70-79
+                "metasvm": cls.map_score_rankscore_pred_to_json(row, col_prefix="MetaSVM"),
+                "metalr": cls.map_score_rankscore_pred_to_json(row, col_prefix="MetaLR"),
+                "reliability_index": cls.read_string(row, "Reliability_index", transform=int),
+                "metarnn": cls.map_score_rankscore_pred_to_json(row, col_prefix="MetaRNN"),
+                # Column 80-82
+                "m-cap": cls.map_score_rankscore_pred_to_json(row, col_prefix="M-CAP"),
+                # Column 83-84
+                "revel": cls.map_score_rankscore_to_json(row, col_prefix="REVEL"),
+                # Column 85-89
+                "mutpred": {
+                    "score": cls.read_string(row, "MutPred_score", sep=";", transform=float),
+                    "rankscore": cls.read_string(row, "MutPred_rankscore", sep=";", transform=float),
+                    "accession": cls.read_string(row, "MutPred_protID", sep=";"),
+                    "aa_change": cls.read_string(row, "MutPred_AAchange", sep=";"),
+                    "pred": cls.parse_mutpred_top5features(row, "MutPred_Top5features"),
+                },
+                # Column 90-93
+                "mvp": cls.map_score_rankscore_to_json(row, col_prefix="MVP"),
+                "mpc": cls.map_score_rankscore_to_json(row, col_prefix="MPC"),
+                # Column 94-96
+                "primateai": cls.map_score_rankscore_pred_to_json(row, col_prefix="PrimateAI"),
+                # Column 97-99
+                "deogen2": cls.map_score_rankscore_pred_to_json(row, col_prefix="DEOGEN2"),
+                # Column 100-105
+                "bayesdel": {
+                    "add_af": cls.map_score_rankscore_pred_to_json(row, col_prefix="BayesDel_addAF"),
+                    "no_af": cls.map_score_rankscore_pred_to_json(row, col_prefix="BayesDel_noAF")
+                },
+                # Column 106-108
+                "clinpred": cls.map_score_rankscore_pred_to_json(row, col_prefix="ClinPred"),
+                # Column 109-111
+                "list-s2": cls.map_score_rankscore_pred_to_json(row, col_prefix="LIST-S2"),
+                # Column 112-117
+                "aloft": {
+                    "fraction_transcripts_affected": cls.read_string(row, "Aloft_Fraction_transcripts_affected", sep=";"),
+                    "prob_tolerant": cls.read_string(row, "Aloft_prob_Tolerant", sep=";"),
+                    "prob_recessive": cls.read_string(row, "Aloft_prob_Recessive", sep=";"),
+                    "prob_dominant": cls.read_string(row, "Aloft_prob_Dominant", sep=";"),
+                    "pred": cls.read_string(row, "Aloft_pred", sep=";"),
+                    "confidence": cls.read_string(row, "Aloft_Confidence", sep=";")
+                },
+                # Column 118-123
+                #   Column 118-120 are hg38
+                #   Column 121-123 are hg19
+                # Only column 117-119 will be included in the document when verison == "hg38"
+                # No CADD fields will be included when verison == "hg19"
+                "cadd": cls.map_CADD_to_json(row, version),
+                # Column 124-125
+                "dann": cls.map_score_rankscore_to_json(row, col_prefix="DANN"),
+                # Column 126-132
+                "fathmm-mkl": {
+                    "coding_score": cls.read_string(row, "fathmm-MKL_coding_score", sep=";", transform=float),
+                    "coding_rankscore": cls.read_string(row, "fathmm-MKL_coding_rankscore", sep=";", transform=float),
+                    "coding_pred": cls.read_string(row, "fathmm-MKL_coding_pred", sep=";"),
+                    "coding_group": cls.read_string(row, "fathmm-MKL_coding_group", sep=";")
+                },
+                "fathmm-xf": {
+                    "coding_score": cls.read_string(row, "fathmm-XF_coding_score", sep=";", transform=float),
+                    "coding_rankscore": cls.read_string(row, "fathmm-XF_coding_rankscore", sep=";", transform=float),
+                    "coding_pred": cls.read_string(row, "fathmm-XF_coding_pred", sep=";")
+                },
+                # Column 133-138
+                # Please note that Eigen uses "-", NOT "_", to connect column name prefix and suffixes
+                # Cannot use cls._iter_read_group here
+                "eigen":  {
+                    "raw_coding": cls.read_string(row, "Eigen-raw_coding", sep=";", transform=float),
+                    "raw_coding_rankscore": cls.read_string(row, "Eigen-raw_coding_rankscore", sep=";", transform=float),
+                    "phred_coding": cls.read_string(row, "Eigen-phred_coding", sep=";", transform=float)
+                },
+                "eigen-pc": {
+                    "raw_coding": cls.read_string(row, "Eigen-PC-raw_coding", sep=";", transform=float),
+                    "raw_coding_rankscore": cls.read_string(row, "Eigen-PC-raw_coding_rankscore", sep=";", transform=float),
+                    "phred_coding": cls.read_string(row, "Eigen-PC-phred_coding", sep=";", transform=float),
+                },
+                # Column 139-140
+                # Please note that column 140 in dbNSFP4.3a.readme.txt is "GenoCanyon_score_rankscore" and it's a typo
+                "genocanyon": cls.map_score_rankscore_to_json(row, col_prefix="GenoCanyon"),
+                # Column 141-143
+                "integrated": cls.map_fitcons_score_rankscore_confidence_value_to_json(row, col_prefix="integrated"),
+                # Column 144-146
+                "gm12878": cls.map_fitcons_score_rankscore_confidence_value_to_json(row, col_prefix="GM12878"),
+                # Column 147-149
+                "h1-hesc": cls.map_fitcons_score_rankscore_confidence_value_to_json(row, col_prefix="H1-hESC"),
+                # Column 150-152
+                "huvec": cls.map_fitcons_score_rankscore_confidence_value_to_json(row, col_prefix="HUVEC"),
+                # Column 153-154
+                # Note that column 153 is "LINSIGHT", not "LINSIGHT_score" (possibly a typo in the source data file)
+                "linsight": {
+                    "score": cls.read_string(row, 'LINSIGHT', sep=";", transform=float),
+                    "rankscore": cls.read_string(row, "LINSIGHT_rankscore", sep=";", transform=float)
+                },
+                # Column 155-157
+                "gerp++": {
+                    "nr": cls.read_string(row, "GERP++_NR", sep=";", transform=float),
+                    "rs": cls.read_string(row, "GERP++_RS", sep=";", transform=float),
+                    "rs_rankscore": cls.read_string(row, "GERP++_RS_rankscore", sep=";", transform=float)
+                },
+                # Column 158-163
+                "phylop": {
+                    "100way_vertebrate": {
+                        "score": cls.read_string(row, "phyloP100way_vertebrate", sep=";", transform=float),
+                        "rankscore": cls.read_string(row, "phyloP100way_vertebrate_rankscore", sep=";", transform=float)
+                    },
+                    "30way_mammalian": {
+                        "score": cls.read_string(row, "phyloP30way_mammalian", sep=";", transform=float),
+                        "rankscore": cls.read_string(row, "phyloP30way_mammalian_rankscore", sep=";", transform=float)
+                    },
+                    "17way_primate": {
+                        "score": cls.read_string(row, "phyloP17way_primate", sep=";", transform=float),
+                        "rankscore": cls.read_string(row, "phyloP17way_primate_rankscore", sep=";", transform=float)
+                    }
+                },
+                # Column 164-169
+                "phastcons": {
+                    "100way_vertebrate": {
+                        "score": cls.read_string(row, "phastCons100way_vertebrate", sep=";", transform=float),
+                        "rankscore": cls.read_string(row, "phastCons100way_vertebrate_rankscore", sep=";", transform=float)
+                    },
+                    "30way_mammalian": {
+                        "score": cls.read_string(row, "phastCons30way_mammalian", sep=";", transform=float),
+                        "rankscore": cls.read_string(row, "phastCons30way_mammalian_rankscore", sep=";", transform=float)
+                    },
+                    "17way_primate": {
+                        "score": cls.read_string(row, "phastCons17way_primate", sep=";", transform=float),
+                        "rankscore": cls.read_string(row, "phastCons17way_primate_rankscore", sep=";", transform=float)
+                    }
+                },
+                # Column 170-172
+                "siphy_29way": {
+                    "pi": cls.parse_siphy_29way_pi(row, "SiPhy_29way_pi"),
+                    # Note that the column name is "SiPhy_29way_logOdds", not "SiPhy_29way_logOdds_score"
+                    "logodds_score": cls.read_string(row, "SiPhy_29way_logOdds", sep=";", transform=float),
+                    "logodds_rankscore": cls.read_string(row, "SiPhy_29way_logOdds_rankscore", sep=";", transform=float)
+                },
+                # Column 173-174
+                "bstatistic": {
+                    # Note that the column name is "bStatistic", not "bStatistic_score"
+                    "score": cls.read_string(row, 'bStatistic', sep=";", transform=float),
+                    "converted_rankscore": cls.read_string(row, "bStatistic_converted_rankscore", sep=";", transform=float)
+                },
+                # Column 175-186
+                "1000gp3": cls.map_AC_AF_to_json(row, col_prefix="1000Gp3", col_infixes=["AFR", "EUR", "AMR", "EAS", "SAS"], whole_group=True),
+                # Column 187-188
+                "twinsuk": cls.map_AC_AF_to_json(row, col_prefix="TWINSUK", col_infixes=None, whole_group=True),
+                # Column 189-190
+                "alspac": cls.map_AC_AF_to_json(row, col_prefix="ALSPAC", col_infixes=None, whole_group=True),
+                # Column 191-192
+                "uk10k": cls.map_AC_AF_to_json(row, col_prefix="UK10K", col_infixes=None, whole_group=True),
+                # Column 193-196
+                "esp6500": cls.map_AC_AF_to_json(row, col_prefix="ESP6500", col_infixes=["AA", "EA"], whole_group=False),
+                # Column 197-212
+                "exac": cls.map_AC_AF_to_json(row, col_prefix="ExAC", col_infixes=["Adj", "AFR", "AMR", "EAS", "FIN", "NFE", "SAS"], whole_group=True),
+                # Column 213-228
+                "exac_nontcga": cls.map_AC_AF_to_json(row, col_prefix="ExAC_nonTCGA", col_infixes=["Adj", "AFR", "AMR", "EAS", "FIN", "NFE", "SAS"], whole_group=True),
+                # Column 229-244
+                "exac_nonpsych": cls.map_AC_AF_to_json(row, col_prefix="ExAC_nonpsych", col_infixes=["Adj", "AFR", "AMR", "EAS", "FIN", "NFE", "SAS"], whole_group=True),
+
+                # Column 245-630 are gnomAD_* columns. Skipped.
+
+                # Column 631-637
+                "clinvar": {
+                    "clinvar_id": cls.read_string(row, "clinvar_id"),
+                    "clinsig": cls.read_string(row, "clinvar_clnsig", sep=r"|"),
+                    "trait": cls.read_string(row, "clinvar_trait", sep=r"|"),
+                    "review": cls.read_string(row, "clinvar_review", sep=r"|"),
+                    "hgvs": cls.read_string(row, "clinvar_hgvs", sep=r"|"),
+                    "omim": cls.read_string(row, "clinvar_OMIM_id", sep=r"|"),
+                    "medgen": cls.read_string(row, "clinvar_MedGen_id", sep=r"|"),
+                    "orphanet": cls.read_string(row, "clinvar_Orphanet_id", sep=r"|"),
+                    "var_source": cls.read_string(row, "clinvar_var_source", sep=r"|")
+                },
+                # Column 640-643
+                "interpro_domain": cls.read_string(row, "Interpro_domain", sep=";"),
+                "gtex": cls.parse_gtex(row, "GTEx_V8_gene", "GTEx_V8_tissue"),
+                "geuvadis_eqtl_target_gene": cls.read_string(row, "Geuvadis_eQTL_target_gene", sep=";")
+
+                # End of row
+            }
+        }
+
+        """
+        Step 3: Prune the JSON object and return
+        """
+        # `value_convert_to_number` converts strings to numeric values inside the dictionary
+        # `unlist` regresses lists of length 1 to single values inside the dictionary
+        # `dict_sweep` remove NA values indicated by `vals` inside the dictionary
+        # `list_split` separate fields by `sep` into lists
+        one_snp_json = dict_sweep(one_snp_json, vals=[None], remove_invalid_list=True)
+        return one_snp_json
+
+
+# open file, parse, pass to json mapper
+def data_generator(input_file, version):
+    with anyfile(input_file) as file:
+        file_reader = csv.reader(file, delimiter="\t")
+
+        header = next(file_reader)
+        assert len(header) == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, len(header))
+
+        previous_row = None
+        for row in file_reader:
+            row = dict(zip(header, row))
+
+            current_row = DbnsfpReader.map_row_to_json(row, version=version)
+            if previous_row and current_row:
+                if current_row["_id"] == previous_row["_id"]:
+                    aa = previous_row["dbnsfp"]["aa"]
+                    if not isinstance(aa, list):
+                        aa = [aa]
+                    aa.append(current_row["dbnsfp"]["aa"])
+                    previous_row["dbnsfp"]["aa"] = aa
+                    if len(previous_row["dbnsfp"]["aa"]) > 1:
+                        continue
+                else:
+                    yield previous_row
+
+            previous_row = current_row
+
+        if previous_row:
+            yield previous_row
+
+
+def load_data_file(input_file, version):
+    data = data_generator(input_file, version=version)
+    for one_snp_json in data:
+        yield one_snp_json
+
+
+# load path and find files, pass to data_generator
+def load_data(path_glob, version='hg19'):
+    for input_file in sorted(glob.glob(path_glob)):
+        for d in load_data_file(input_file, version):
+            yield d
diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py
index b659370e..908ab05c 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py
@@ -2,7 +2,7 @@
 import glob
 
 from .dbnsfp_mapping import mapping
-from .dbnsfp_parser import load_data_file as load_common
+from .dbnsfp_parser import load_file
 import biothings.hub.dataload.uploader as uploader
 from hub.dataload.uploader import SnpeffPostUpdateUploader
 from hub.dataload.storage import MyVariantIgnoreDuplicatedStorage
@@ -22,17 +22,17 @@ class DBNSFPBaseUploader(uploader.ParallelizedSourceUploader,
     GLOB_PATTERN = "dbNSFP*_variant.chr*"
 
     @classmethod
-    def get_mapping(klass):
+    def get_mapping(cls):
         return mapping
 
     def jobs(self):
-        # tuple(input_file,version), where version is either hg38 or hg19)
-        return map(lambda e: (e, self.__class__.__metadata__["assembly"]),
-                   glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN)))
+        paths = glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN))
+        assembly = self.__class__.__metadata__["assembly"]
+        return map(lambda path: (path, assembly), paths)
 
-    def load_data(self, input_file, hg):
-        self.logger.debug("loading file " + input_file)
-        return load_common(input_file, version=hg)
+    def load_data(self, path, assembly):
+        self.logger.debug("loading file " + path)
+        return load_file(path, version=assembly)
 
 
 class DBNSFPHG38Uploader(DBNSFPBaseUploader):
diff --git a/src/utils/dotfield.py b/src/utils/dotfield.py
new file mode 100644
index 00000000..c79b5aea
--- /dev/null
+++ b/src/utils/dotfield.py
@@ -0,0 +1,43 @@
+import orjson
+from biothings.utils.dotfield import merge_object
+
+
+def make_object(attr, value):
+    """
+    Create dictionary following the input dot notation and the value
+    Example::
+
+        make_object('a.b.c', 100) --> {a:{b:{c:100}}}, or
+        make_object(['a','b','c'], 100) --> {a:{b:{c:100}}}
+
+    This is an orjson implementation of biothings.utils.dotfield.make_object, for better performance.
+    TODO Merge into biothings.utils.dotfield if necessary. (And delete this function then.)
+    """
+    attr_list = attr.split(".")
+    s = ""
+    for k in attr_list:
+        s += '{"' + k + '":'
+    s += orjson.dumps(value).decode("utf-8")  # decoding is necessary because orjson dumps into bytes
+    s += "}" * (len(attr_list))
+    return orjson.loads(s)
+
+
+def parse_dot_fields(genedoc):
+    """
+    parse_dot_fields({'a': 1, 'b.c': 2, 'b.a.c': 3})
+     should return
+        {'a': 1, 'b': {'a': {'c': 3}, 'c': 2}}
+
+    This is a copy of biothings.utils.dotfield.parse_dot_fields. However here it uses the orjson make_object() function.
+    TODO If orjson make_object() function is merged to biothings.utils.dotfield, this function can be deleted.
+    """
+    dot_fields = []
+    expanded_doc = {}
+    for key in genedoc:
+        if key.find(".") != -1:
+            dot_fields.append(key)
+            expanded_doc = merge_object(expanded_doc, make_object(key, genedoc[key]))
+    genedoc.update(expanded_doc)
+    for key in dot_fields:
+        del genedoc[key]
+    return genedoc
diff --git a/src/utils/table.py b/src/utils/table.py
new file mode 100644
index 00000000..367291a8
--- /dev/null
+++ b/src/utils/table.py
@@ -0,0 +1,49 @@
+from dataclasses import dataclass
+from itertools import groupby
+from typing import Callable
+
+
+@dataclass
+class TableColumn:
+    """
+    Configuration marker for each column in a tabular file.
+
+    A TableColumn object indicates that a value from the named column must be transformed before its assignment to a destination field inside a JSON doc.
+
+    E.g. TableColumn(name="AF", dest="allele_freq", transform=float) means that a value from the "AF" column must be cast to float and then be assigned to the
+    "allele_freq" field inside its associated JSON doc.
+    """
+    name: str  # column name
+    dest: str = None  # destination field name
+    transform: Callable = None  # transforming function applied to the column values
+    tag: str = None  # tagging columns that need special prior sanity check or post-processing
+
+    @classmethod
+    def identity_function(cls, value):
+        return value
+
+    def __post_init__(self):
+        if self.dest is None:
+            # This is very common practice of determining field name.
+            # E.g. a value in column "SIFT_score" is often wrapped to field "sift.score" (dotfield)
+            self.dest = self.name.lower().replace("_", ".")
+
+        # Default transformation is identity function; therefore we don't have to check if self.transform is None.
+        # The choice is made because most columns have transforming function in our application.
+        if self.transform is None:
+            self.transform = self.identity_function
+
+
+def create_tag_column_map(columns: list[TableColumn]):
+    """
+    Map each tag to its associated column or columns.
+
+    Args:
+        columns: a list of TableColumn objects
+
+    Returns:
+        a dictionary of { <tag> : <list-of-columns> }
+    """
+    tagged_columns = sorted([c for c in columns if c.tag is not None], key=lambda c: c.tag)
+    result = {tag: list(columns) for tag, columns in groupby(tagged_columns, lambda c: c.tag)}
+    return result

From a77dfaedc3d1ec675006c1731813b92453fd575d Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Wed, 20 Sep 2023 14:47:23 -0700
Subject: [PATCH 02/13] add comments

---
 src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py     | 2 +-
 src/hub/dataload/sources/dbnsfp/dbnsfp_parser_43a.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
index 05ebfcd6..1f1fc669 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
@@ -354,7 +354,7 @@ def split_dedup(values: list, sep: str, na_values: set = NA_VALUES):
     Column("Aloft_prob_Dominant", dest="aloft.prob_dominant", transform=split_str),
     Column("Aloft_pred", transform=split_str),
     Column("Aloft_Confidence", transform=split_str),
-    Column("CADD_raw", dest="cadd.raw_score", transform=split_float, assembly="hg38"),  # TODO CADD will have hg38 next update
+    Column("CADD_raw", dest="cadd.raw_score", transform=split_float, assembly="hg38"),  # TODO CADD will have hg38 next update. Deprecate these 3 field then.
     Column("CADD_raw_rankscore", dest="cadd.raw_rankscore", transform=split_float, assembly="hg38"),
     Column("CADD_phred", transform=split_float, assembly="hg38"),  # CADD phred-like scores, not as other predications of string type
     # Column("CADD_raw_hg19", assembly="hg19"),  # discarded because Myvariant.info already has a hg19-only datasource of CADD.
diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_43a.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_43a.py
index ea84bb95..83316b85 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_43a.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_43a.py
@@ -12,7 +12,7 @@
 
 
 """
-This parser is for dbNSFP v4.3a downloaded from https://sites.google.com/site/jpopgen/dbNSFP
+Deprecated. This parser is for dbNSFP v4.3a downloaded from https://sites.google.com/site/jpopgen/dbNSFP
 """
 
 

From 11481b0421479a5ea3498a16ab0b6ad75c10ea6d Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Wed, 20 Sep 2023 15:11:28 -0700
Subject: [PATCH 03/13] fix mapping; revise mapping construction

---
 .../dataload/sources/dbnsfp/dbnsfp_mapping.py | 1174 +++++------------
 1 file changed, 304 insertions(+), 870 deletions(-)

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
index 42675907..2bf02501 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
@@ -1,3 +1,23 @@
+start_pos_field = {"start": {"type": "integer"}}
+end_pos_field = {"end": {"type": "integer"}}
+
+score_field = {"score": {"type": "float"}}
+converted_rankscore_field = {"converted_rankscore": {"type": "float"}}
+rankscore_field = {"rankscore": {"type": "float"}}
+confidence_value_field = {"confidence_value": {"type": "int"}}
+pred_field = {
+    "pred": {
+        "type": "keyword",
+        "normalizer": "keyword_lowercase_normalizer"
+    }
+}
+
+allele_count_field = {"ac": {"type": "integer"}}
+allele_num_field = {"an": {"type": "integer"}}
+allele_freq_field = {"af": {"type": "float"}}
+adj_allele_count_field = {"adj_ac": {"type": "integer"}}
+adj_allele_freq_field = {"adj_af": {"type": "float"}}
+
 mapping = {
     "dbnsfp": {
         "properties": {
@@ -11,32 +31,20 @@
             },
             "hg19": {
                 "properties": {
-                    "start": {
-                        "type": "integer"
-                    },
-                    "end": {
-                        "type": "integer"
-                    }
+                    **start_pos_field,
+                    **end_pos_field
                 }
             },
             "hg18": {
                 "properties": {
-                    "start": {
-                        "type": "integer"
-                    },
-                    "end": {
-                        "type": "integer"
-                    }
+                    **start_pos_field,
+                    **end_pos_field
                 }
             },
             "hg38": {
                 "properties": {
-                    "start": {
-                        "type": "integer"
-                    },
-                    "end": {
-                        "type": "integer"
-                    }
+                    **start_pos_field,
+                    **end_pos_field
                 }
             },
             "ref": {
@@ -156,76 +164,41 @@
             },
             "sift": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "converted_rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **converted_rankscore_field,
+                    **pred_field
                 }
             },
             "sift4g": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    },
-                    "converted_rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **converted_rankscore_field,
+                    **pred_field
                 }
             },
             "polyphen2": {
                 "properties": {
                     "hdiv": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            },
-                            "pred": {
-                                "type": "keyword",
-                                "normalizer": "keyword_lowercase_normalizer"
-                            }
+                            **score_field,
+                            **rankscore_field,
+                            **pred_field
                         }
                     },
                     "hvar": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            },
-                            "pred": {
-                                "type": "keyword",
-                                "normalizer": "keyword_lowercase_normalizer"
-                            }
+                            **score_field,
+                            **rankscore_field,
+                            **pred_field
                         }
                     }
                 }
             },
             "lrt": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "converted_rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    },
+                    **score_field,
+                    **converted_rankscore_field,
+                    **pred_field,
                     "omega": {
                         "type": "float"
                     }
@@ -233,16 +206,9 @@
             },
             "mutationtaster": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "converted_rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    },
+                    **score_field,
+                    **converted_rankscore_field,
+                    **pred_field,
                     "model": {
                         "type": "keyword",
                         "normalizer": "keyword_lowercase_normalizer"
@@ -254,82 +220,43 @@
             },
             "mutationassessor": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field,
                 }
             },
             "fathmm": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **converted_rankscore_field,
+                    **pred_field
                 }
             },
             "provean": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **converted_rankscore_field,
+                    **pred_field
                 }
             },
             "vest4": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "metasvm": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
                 }
             },
             "metalr": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
                 }
             },
             "reliability_index": {
@@ -337,50 +264,28 @@
             },
             "metarnn": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
                 }
             },
             "m-cap": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
                 }
             },
             "revel": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "mutpred": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
+                    **score_field,
+                    **rankscore_field,
                     "accession": {
                         "type": "keyword",
                         "normalizer": "keyword_lowercase_normalizer"
@@ -403,160 +308,90 @@
             },
             "mvp": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "gmvp": {  # new in 4.4.a
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "mpc": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "primateai": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
                 }
             },
             "deogen2": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
                 }
             },
             "bayesdel": {
                 "properties": {
                     "add_af": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            },
-                            "pred": {
-                                "type": "keyword",
-                                "normalizer": "keyword_lowercase_normalizer"
-                            }
+                            **score_field,
+                            **rankscore_field,
+                            **pred_field
                         }
                     },
                     "no_af": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            },
-                            "pred": {
-                                "type": "keyword",
-                                "normalizer": "keyword_lowercase_normalizer"
-                            }
+                            **score_field,
+                            **rankscore_field,
+                            **pred_field
                         }
                     }
                 }
             },
             "clinpred": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
                 }
             },
             "list-s2": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    },
-                    "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    }
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
                 }
             },
             "varity_r": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "varity_er": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "varity_r_loo": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "varity_er_loo": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "aloft": {
@@ -597,19 +432,14 @@
                         "type": "float"
                     },
                     "pred": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
+                        "type": "float"  # CADD phred-like scores, not as other predications of string type
                     }
                 }
             },
             "dann": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "fathmm-mkl": {
@@ -672,130 +502,74 @@
             },
             "genocanyon": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             # "integrated": {
             #     "properties": {
-            #         "fitcons_score": {
-            #             "type": "float"
-            #         },
-            #         "fitcons_rankscore": {
-            #             "type": "float"
-            #         },
-            #         "confidence_value": {
-            #             "type": "integer"
-            #         }
+            #         "fitcons_score": { "type": "float" },
+            #         "fitcons_rankscore": { "type": "float" },
+            #         "confidence_value": { "type": "integer" }
             #     }
             # },
             # "gm12878": {
             #     "properties": {
-            #         "fitcons_score": {
-            #             "type": "float"
-            #         },
-            #         "fitcons_rankscore": {
-            #             "type": "float"
-            #         },
-            #         "confidence_value": {
-            #             "type": "integer"
-            #         }
+            #         "fitcons_score": { "type": "float" },
+            #         "fitcons_rankscore": { "type": "float" },
+            #         "confidence_value": { "type": "integer" }
             #     }
             # },
             # "h1-hesc": {
             #     "properties": {
-            #         "fitcons_score": {
-            #             "type": "float"
-            #         },
-            #         "fitcons_rankscore": {
-            #             "type": "float"
-            #         },
-            #         "confidence_value": {
-            #             "type": "integer"
-            #         }
+            #         "fitcons_score": { "type": "float" },
+            #         "fitcons_rankscore": { "type": "float" },
+            #         "confidence_value": { "type": "integer" }
             #     }
             # },
             # "huvec": {
             #     "properties": {
-            #         "fitcons_score": {
-            #             "type": "float"
-            #         },
-            #         "fitcons_rankscore": {
-            #             "type": "float"
-            #         },
-            #         "confidence_value": {
-            #             "type": "integer"
-            #         }
+            #         "fitcons_score": { "type": "float" },
+            #         "fitcons_rankscore": { "type": "float" },
+            #         "confidence_value": { "type": "integer" }
             #     }
             # },
             "fitcons": {
                 "properties": {
                     "integrated": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            },
-                            "confidence_value": {
-                                "type": "int"
-                            }
+                            **score_field,
+                            **rankscore_field,
+                            **confidence_value_field
                         }
                     },
                     "gm12878": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            },
-                            "confidence_value": {
-                                "type": "int"
-                            }
+                            **score_field,
+                            **rankscore_field,
+                            **confidence_value_field
                         }
                     },
                     "h1-hesc": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            },
-                            "confidence_value": {
-                                "type": "int"
-                            }
+                            **score_field,
+                            **rankscore_field,
+                            **confidence_value_field
                         }
                     },
                     "huvec": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            },
-                            "confidence_value": {
-                                "type": "int"
-                            }
+                            **score_field,
+                            **rankscore_field,
+                            **confidence_value_field
                         }
                     },
                 }
             },
             "linsight": {
                 "properties": {
-                    "score": {
-                        "type": "float"
-                    },
-                    "rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **rankscore_field
                 }
             },
             "gerp++": {
@@ -815,32 +589,20 @@
                 "properties": {
                     "100way_vertebrate": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            }
+                            **score_field,
+                            **rankscore_field
                         }
                     },
                     "470way_mammalian": {  # replaced 30way_mammalian in 4.4.a
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            }
+                            **score_field,
+                            **rankscore_field
                         }
                     },
                     "17way_primate": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            }
+                            **score_field,
+                            **rankscore_field
                         }
                     }
                 }
@@ -849,32 +611,20 @@
                 "properties": {
                     "100way_vertebrate": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            }
+                            **score_field,
+                            **rankscore_field
                         }
                     },
                     "470way_mammalian": {  # replaced 30way_mammalian in 4.4.a
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            }
+                            **score_field,
+                            **rankscore_field
                         }
                     },
                     "17way_primate": {
                         "properties": {
-                            "score": {
-                                "type": "float"
-                            },
-                            "rankscore": {
-                                "type": "float"
-                            }
+                            **score_field,
+                            **rankscore_field
                         }
                     }
                 }
@@ -907,502 +657,258 @@
             },
             "bstatistic": {
                 "properties": {
-                    "score": {
-                        "type": "integer"
-                    },
-                    "converted_rankscore": {
-                        "type": "float"
-                    }
+                    **score_field,
+                    **converted_rankscore_field
                 }
             },
             "1000gp3": {  # changed since 4.4.a
                 "properties": {
-                    "ac": {
-                        "type": "integer"
-                    },
-                    "af": {
-                        "type": "float"
-                    },
-                    # "afr_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "afr_af": {
-                    #     "type": "float"
-                    # },
+                    **allele_count_field,
+                    **allele_freq_field,
+                    # "afr_ac": { "type": "integer" },
+                    # "afr_af": { "type": "float" },
                     "afr": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "eur_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "eur_af": {
-                    #     "type": "float"
-                    # },
+                    # "eur_ac": { "type": "integer" },
+                    # "eur_af": { "type": "float" },
                     "eur": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "amr_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "amr_af": {
-                    #     "type": "float"
-                    # },
+                    # "amr_ac": { "type": "integer" },
+                    # "amr_af": { "type": "float" },
                     "amr": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "eas_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "eas_af": {
-                    #     "type": "float"
-                    # },
+                    # "eas_ac": { "type": "integer" },
+                    # "eas_af": { "type": "float" },
                     "eas": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "sas_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "sas_af": {
-                    #     "type": "float"
-                    # }
+                    # "sas_ac": { "type": "integer" },
+                    # "sas_af": { "type": "float"}
                     "sas": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     }
                 }
             },
             "twinsuk": {
                 "properties": {
-                    "ac": {
-                        "type": "integer"
-                    },
-                    "af": {
-                        "type": "float"
-                    }
+                    **allele_count_field,
+                    **allele_freq_field
                 }
             },
             "alspac": {
                 "properties": {
-                    "ac": {
-                        "type": "integer"
-                    },
-                    "af": {
-                        "type": "float"
-                    }
+                    **allele_count_field,
+                    **allele_freq_field
                 }
             },
             "uk10k": {
                 "properties": {
-                    "ac": {
-                        "type": "integer"
-                    },
-                    "af": {
-                        "type": "float"
-                    }
+                    **allele_count_field,
+                    **allele_freq_field
                 }
             },
             "esp6500": {  # changed since 4.4.a
                 "properties": {
-                    # "aa_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "aa_af": {
-                    #     "type": "float"
-                    # },
+                    # "aa_ac": { "type": "integer" },
+                    # "aa_af": { "type": "float" },
                     "aa": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "ea_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "ea_af": {
-                    #     "type": "float"
-                    # }
+                    # "ea_ac": { "type": "integer" },
+                    # "ea_af": { "type": "float" }
                     "ea": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
                 }
             },
             "exac": {  # changed since 4.4.a
                 "properties": {
-                    "ac": {
-                        "type": "integer"
-                    },
-                    "af": {
-                        "type": "float"
-                    },
-                    "adj_ac": {
-                        "type": "integer"
-                    },
-                    "adj_af": {
-                        "type": "float"
-                    },
-                    # "afr_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "afr_af": {
-                    #     "type": "float"
-                    # },
+                    **allele_count_field,
+                    **allele_freq_field,
+                    **adj_allele_count_field,
+                    **adj_allele_freq_field,
+                    # "afr_ac": { "type": "integer" },
+                    # "afr_af": { "type": "float" },
                     "afr": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "amr_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "amr_af": {
-                    #     "type": "float"
-                    # },
+                    # "amr_ac": { "type": "integer" },
+                    # "amr_af": { "type": "float" },
                     "amr": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "eas_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "eas_af": {
-                    #     "type": "float"
-                    # },
+                    # "eas_ac": { "type": "integer" },
+                    # "eas_af": { "type": "float" },
                     "eas": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "fin_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "fin_af": {
-                    #     "type": "float"
-                    # },
+                    # "fin_ac": { "type": "integer" },
+                    # "fin_af": { "type": "float" },
                     "fin": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "nfe_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "nfe_af": {
-                    #     "type": "float"
-                    # },
+                    # "nfe_ac": { "type": "integer" },
+                    # "nfe_af": { "type": "float" },
                     "nfe": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "sas_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "sas_af": {
-                    #     "type": "float"
-                    # }
+                    # "sas_ac": { "type": "integer" },
+                    # "sas_af": { "type": "float" }
                     "sas": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     }
                 }
             },
             "exac_nontcga": {  # changed since 4.4.a
                 "properties": {
-                    "ac": {
-                        "type": "integer"
-                    },
-                    "af": {
-                        "type": "float"
-                    },
-                    "adj_ac": {
-                        "type": "integer"
-                    },
-                    "adj_af": {
-                        "type": "float"
-                    },
-                    # "afr_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "afr_af": {
-                    #     "type": "float"
-                    # },
+                    **allele_count_field,
+                    **allele_freq_field,
+                    **adj_allele_count_field,
+                    **adj_allele_freq_field,
+                    # "afr_ac": { "type": "integer" },
+                    # "afr_af": { "type": "float" },
                     "afr": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "amr_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "amr_af": {
-                    #     "type": "float"
-                    # },
+                    # "amr_ac": { "type": "integer" },
+                    # "amr_af": { "type": "float" },
                     "amr": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "eas_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "eas_af": {
-                    #     "type": "float"
-                    # },
+                    # "eas_ac": { "type": "integer" },
+                    # "eas_af": { "type": "float" },
                     "eas": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "fin_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "fin_af": {
-                    #     "type": "float"
-                    # },
+                    # "fin_ac": { "type": "integer" },
+                    # "fin_af": { "type": "float" },
                     "fin": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "nfe_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "nfe_af": {
-                    #     "type": "float"
-                    # },
+                    # "nfe_ac": { "type": "integer" },
+                    # "nfe_af": { "type": "float" },
                     "nfe": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "sas_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "sas_af": {
-                    #     "type": "float"
-                    # }
+                    # "sas_ac": { "type": "integer" },
+                    # "sas_af": { "type": "float" }
                     "sas": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     }
                 }
             },
             "exac_nonpsych": {  # changed since 4.4.a
                 "properties": {
-                    "ac": {
-                        "type": "integer"
-                    },
-                    "af": {
-                        "type": "float"
-                    },
-                    "adj_ac": {
-                        "type": "integer"
-                    },
-                    "adj_af": {
-                        "type": "float"
-                    },
-                    # "afr_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "afr_af": {
-                    #     "type": "float"
-                    # },
+                    **allele_count_field,
+                    **allele_freq_field,
+                    **adj_allele_count_field,
+                    **adj_allele_freq_field,
+                    # "afr_ac": { "type": "integer" },
+                    # "afr_af": { "type": "float" },
                     "afr": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "amr_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "amr_af": {
-                    #     "type": "float"
-                    # },
+                    # "amr_ac": { "type": "integer" },
+                    # "amr_af": { "type": "float" },
                     "amr": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "eas_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "eas_af": {
-                    #     "type": "float"
-                    # },
+                    # "eas_ac": { "type": "integer" },
+                    # "eas_af": { "type": "float" },
                     "eas": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "fin_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "fin_af": {
-                    #     "type": "float"
-                    # },
+                    # "fin_ac": { "type": "integer" },
+                    # "fin_af": { "type": "float" },
                     "fin": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "nfe_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "nfe_af": {
-                    #     "type": "float"
-                    # },
+                    # "nfe_ac": { "type": "integer" },
+                    # "nfe_af": { "type": "float" },
                     "nfe": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     },
-                    # "sas_ac": {
-                    #     "type": "integer"
-                    # },
-                    # "sas_af": {
-                    #     "type": "float"
-                    # }
+                    # "sas_ac": { "type": "integer" },
+                    # "sas_af": { "type": "float" }
                     "sas": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_freq_field
                         }
                     }
                 }
@@ -1411,158 +917,86 @@
                 "properties": {
                     "european": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "african_others": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "east_asian": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "african_american": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "latin_american_1": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "latin_american_2": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "other_asian": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "south_asian": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "other": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "african": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "asian": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                     "total": {
                         "properties": {
-                            "ac": {
-                                "type": "integer"
-                            },
-                            "an": {
-                                "type": "integer"
-                            },
-                            "af": {
-                                "type": "float"
-                            }
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
                         }
                     },
                 }

From b38ee7ebaef0102a2cb24f75946a935e180f42af Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Wed, 27 Sep 2023 12:20:56 -0700
Subject: [PATCH 04/13] add destination config for some converted_rankscore
 fields

---
 .../dataload/sources/dbnsfp/dbnsfp_parser.py    | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
index 1f1fc669..ea299779 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
@@ -1,6 +1,5 @@
 import re
 import csv
-import glob
 from enum import Flag
 from dataclasses import dataclass
 from itertools import chain
@@ -265,10 +264,10 @@ def split_dedup(values: list, sep: str, na_values: set = NA_VALUES):
     Column("VindijiaNeandertal", dest="vindijia_neandertal", transform=split_genotype),
     Column("ChagyrskayaNeandertal", dest="chagyrskaya_neandertal", transform=split_genotype),
     Column("SIFT_score", transform=split_float),
-    Column("SIFT_converted_rankscore", transform=split_float),
+    Column("SIFT_converted_rankscore", dest="sift.converted_rankscore", transform=split_float),
     Column("SIFT_pred", transform=split_str),
     Column("SIFT4G_score", transform=split_float),
-    Column("SIFT4G_converted_rankscore", transform=split_float),
+    Column("SIFT4G_converted_rankscore", dest="sift4g.converted_rankscore", transform=split_float),
     Column("SIFT4G_pred", transform=split_str),
     Column("Polyphen2_HDIV_score", transform=split_float),
     Column("Polyphen2_HDIV_rankscore", transform=split_float),
@@ -277,11 +276,11 @@ def split_dedup(values: list, sep: str, na_values: set = NA_VALUES):
     Column("Polyphen2_HVAR_rankscore", transform=split_float),
     Column("Polyphen2_HVAR_pred", transform=split_str),
     Column("LRT_score", transform=split_float),
-    Column("LRT_converted_rankscore", transform=split_float),
+    Column("LRT_converted_rankscore", dest="lrt.converted_rankscore", transform=split_float),
     Column("LRT_pred", transform=split_str),
     Column("LRT_Omega", transform=split_float),
     Column("MutationTaster_score", transform=split_float),
-    Column("MutationTaster_converted_rankscore", transform=split_float),
+    Column("MutationTaster_converted_rankscore", dest="mutationtaster.converted_rankscore", transform=split_float),
     Column("MutationTaster_pred", transform=split_str),
     Column("MutationTaster_model", transform=split_str),
     Column("MutationTaster_AAE", transform=split_str),
@@ -289,10 +288,10 @@ def split_dedup(values: list, sep: str, na_values: set = NA_VALUES):
     Column("MutationAssessor_rankscore", transform=split_float),
     Column("MutationAssessor_pred", transform=split_str),
     Column("FATHMM_score", transform=split_float),
-    Column("FATHMM_converted_rankscore", transform=split_float),
+    Column("FATHMM_converted_rankscore", dest="fathmm.converted_rankscore", transform=split_float),
     Column("FATHMM_pred", transform=split_str),
     Column("PROVEAN_score", transform=split_float),
-    Column("PROVEAN_converted_rankscore", transform=split_float),
+    Column("PROVEAN_converted_rankscore", dest="provean.converted_rankscore", transform=split_float),
     Column("PROVEAN_pred", transform=split_str),
     Column("VEST4_score", transform=split_float),
     Column("VEST4_rankscore", transform=split_float),
@@ -340,7 +339,7 @@ def split_dedup(values: list, sep: str, na_values: set = NA_VALUES):
     Column("LIST-S2_score", transform=split_float),
     Column("LIST-S2_rankscore", transform=split_float),
     Column("LIST-S2_pred", transform=split_str),
-    Column("VARITY_R_score", dest="varity_r.score", transform=split_float),  # VARITY new in 4.4.a
+    Column("VARITY_R_score", dest="varity_r.score", transform=split_float),  # new in 4.4.a
     Column("VARITY_R_rankscore", dest="varity_r.rankscore", transform=split_float),
     Column("VARITY_ER_score", dest="varity_er.score", transform=split_float),
     Column("VARITY_ER_rankscore", dest="varity_er.rankscore", transform=split_float),
@@ -481,7 +480,7 @@ def split_dedup(values: list, sep: str, na_values: set = NA_VALUES):
     Column("ExAC_nonpsych_NFE_AF", dest="exac_nonpsych.nfe.af", transform=float),
     Column("ExAC_nonpsych_SAS_AC", dest="exac_nonpsych.sas.ac", transform=int),
     Column("ExAC_nonpsych_SAS_AF", dest="exac_nonpsych.sas.af", transform=float),
-    Column("ALFA_European_AC", dest="alfa.european.ac", transform=int),  # new ALFA field, add mapping
+    Column("ALFA_European_AC", dest="alfa.european.ac", transform=int),
     Column("ALFA_European_AN", dest="alfa.european.an", transform=int),
     Column("ALFA_European_AF", dest="alfa.european.af", transform=float),
     Column("ALFA_African_Others_AC", dest="alfa.african_others.ac", transform=int),

From 392df191b2f60dbc79843cd8e113c770996b100e Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Wed, 11 Oct 2023 14:32:05 -0700
Subject: [PATCH 05/13] fix a typo

---
 src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py | 2 +-
 src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
index 2bf02501..954b1160 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
@@ -127,7 +127,7 @@
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
-            "genecode_basic": {
+            "gencode_basic": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
             },
diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
index ea299779..ebd2eb3b 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
@@ -251,7 +251,7 @@ def split_dedup(values: list, sep: str, na_values: set = NA_VALUES):
     Column("HGVSc_VEP", tag=COLUMN_TAG.HGVS_CODING),  # ditto
     Column("HGVSp_VEP", tag=COLUMN_TAG.HGVS_PROTEIN),  # ditto
     Column("APPRIS", transform=split_str),
-    Column("GENCODE_basic", dest="genecode_basic", transform=split_str),
+    Column("GENCODE_basic", dest="gencode_basic", transform=split_str),
     Column("TSL", transform=split_int),
     Column("VEP_canonical", dest="vep_canonical", transform=split_str),
     Column("cds_strand", dest="cds_strand", transform=split_str),

From 349c002cec3ce564714207de526cfb352a430c8f Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Mon, 16 Oct 2023 16:54:13 -0400
Subject: [PATCH 06/13] mutpred top 5 features are now split by semicolons only

---
 src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
index ebd2eb3b..848f2fb4 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
@@ -144,11 +144,11 @@ def make_zero_based(pos: str):
 
 def parse_mutpred_top5features(value):
     """
-    `mutpred_mechanisms` is a string combined from 5 clauses, separated by semicolons (with whitespaces).
+    `mutpred_mechanisms` is a string combined from 5 clauses, separated by semicolons.
     Each clause has the same pattern of "<mechanism> (P = <p_val>)".
 
-    E.g. "Loss of helix (P = 0.0444); Gain of loop (P = 0.0502); Gain of catalytic residue at A444 (P = 0.1876); \
-    Gain of solvent accessibility (P = 0.2291); Loss of disorder (P = 0.9475)"
+    E.g. "Loss of helix (P = 0.0444);Gain of loop (P = 0.0502);Gain of catalytic residue at A444 (P = 0.1876);\
+    Gain of solvent accessibility (P = 0.2291);Loss of disorder (P = 0.9475)"
 
     Here we apply regex to parse this string
 
@@ -165,7 +165,7 @@ def parse_mutpred_top5features(value):
     if value is None:
         return None
 
-    mp_list = [tuple(e for e in MUTPRED_TOP5FEATURES_PATTERN.split(s) if e.strip()) for s in value.split("; ")]
+    mp_list = [tuple(e for e in MUTPRED_TOP5FEATURES_PATTERN.split(s) if e.strip()) for s in value.split(";")]
     result = [{"mechanism": mp[0], "p_val": float(mp[1])} for mp in mp_list if mp and len(mp) == 2]
 
     return _check_length(result)

From 5770ad5b3f6765947bce5bbb2ef27ab8f1e3ad33 Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Mon, 16 Oct 2023 16:55:30 -0400
Subject: [PATCH 07/13] rename the parser script

---
 .../dbnsfp/{dbnsfp_parser.py => dbnsfp_parser_44a_v1.py}        | 0
 src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py                | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename src/hub/dataload/sources/dbnsfp/{dbnsfp_parser.py => dbnsfp_parser_44a_v1.py} (100%)

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py
similarity index 100%
rename from src/hub/dataload/sources/dbnsfp/dbnsfp_parser.py
rename to src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py
diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py
index 908ab05c..9815bd2b 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py
@@ -2,7 +2,7 @@
 import glob
 
 from .dbnsfp_mapping import mapping
-from .dbnsfp_parser import load_file
+from .dbnsfp_parser_44a_v1 import load_file
 import biothings.hub.dataload.uploader as uploader
 from hub.dataload.uploader import SnpeffPostUpdateUploader
 from hub.dataload.storage import MyVariantIgnoreDuplicatedStorage

From a8a27fc2e7111e8eb67a9fe1cd55a5fa6b2c6d6d Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Fri, 27 Oct 2023 21:29:19 -0400
Subject: [PATCH 08/13] change VARITY field structure

---
 ...fp_mapping.py => dbnsfp_mapping_44a_v1.py} |  0
 .../sources/dbnsfp/dbnsfp_parser_44a_v1.py    | 23 ++++++++-----------
 2 files changed, 9 insertions(+), 14 deletions(-)
 rename src/hub/dataload/sources/dbnsfp/{dbnsfp_mapping.py => dbnsfp_mapping_44a_v1.py} (100%)

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py
similarity index 100%
rename from src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py
rename to src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py
diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py
index 848f2fb4..014547ee 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py
@@ -150,12 +150,7 @@ def parse_mutpred_top5features(value):
     E.g. "Loss of helix (P = 0.0444);Gain of loop (P = 0.0502);Gain of catalytic residue at A444 (P = 0.1876);\
     Gain of solvent accessibility (P = 0.2291);Loss of disorder (P = 0.9475)"
 
-    Here we apply regex to parse this string
-
-        regex = re.compile(r" \(P = ([eE0-9.-]*)\)$")
-        [(e for e in regex.split(s) if e.strip()) for s in string.split("; ")]
-
-    and get a list of 5 tuples like
+    Here we apply regex to parse this string and get a list of 5 tuples like
 
         [('Loss of helix', '0.0444'), ('Gain of loop', '0.0502'), ('Gain of catalytic residue at A444', '0.1876'),
         ('Gain of solvent accessibility', '0.2291'), ('Loss of disorder', '0.9475')]
@@ -339,14 +334,14 @@ def split_dedup(values: list, sep: str, na_values: set = NA_VALUES):
     Column("LIST-S2_score", transform=split_float),
     Column("LIST-S2_rankscore", transform=split_float),
     Column("LIST-S2_pred", transform=split_str),
-    Column("VARITY_R_score", dest="varity_r.score", transform=split_float),  # new in 4.4.a
-    Column("VARITY_R_rankscore", dest="varity_r.rankscore", transform=split_float),
-    Column("VARITY_ER_score", dest="varity_er.score", transform=split_float),
-    Column("VARITY_ER_rankscore", dest="varity_er.rankscore", transform=split_float),
-    Column("VARITY_R_LOO_score", dest="varity_r_loo.score", transform=split_float),
-    Column("VARITY_R_LOO_rankscore", dest="varity_r_loo.rankscore", transform=split_float),
-    Column("VARITY_ER_LOO_score", dest="varity_er_loo.score", transform=split_float),
-    Column("VARITY_ER_LOO_rankscore", dest="varity_er_loo.rankscore", transform=split_float),
+    Column("VARITY_R_score", transform=split_float),  # new in 4.4.a
+    Column("VARITY_R_rankscore", transform=split_float),
+    Column("VARITY_ER_score", transform=split_float),
+    Column("VARITY_ER_rankscore", transform=split_float),
+    Column("VARITY_R_LOO_score", dest="varity.r_loo.score", transform=split_float),
+    Column("VARITY_R_LOO_rankscore", dest="varity.r_loo.rankscore", transform=split_float),
+    Column("VARITY_ER_LOO_score", dest="varity.er_loo.score", transform=split_float),
+    Column("VARITY_ER_LOO_rankscore", dest="varity.er_loo.rankscore", transform=split_float),
     Column("Aloft_Fraction_transcripts_affected", dest="aloft.fraction_transcripts_affected", transform=split_str),
     Column("Aloft_prob_Tolerant", dest="aloft.prob_tolerant", transform=split_str),
     Column("Aloft_prob_Recessive", dest="aloft.prob_recessive", transform=split_str),

From 8c3483f0aa32d72f0b594d84323c2e64bf0677c5 Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Fri, 27 Oct 2023 21:29:58 -0400
Subject: [PATCH 09/13] change VARITY and MutationTaster mappings

---
 .../sources/dbnsfp/dbnsfp_mapping_44a_v1.py   | 66 ++++++++++---------
 1 file changed, 36 insertions(+), 30 deletions(-)

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py
index 954b1160..c1d7bbcf 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py
@@ -206,15 +206,19 @@
             },
             "mutationtaster": {
                 "properties": {
-                    **score_field,
                     **converted_rankscore_field,
-                    **pred_field,
-                    "model": {
-                        "type": "keyword",
-                        "normalizer": "keyword_lowercase_normalizer"
-                    },
-                    "aae": {
-                        "type": "text"
+                    "analysis": {  # see prune_mutation_taster()
+                        "properties": {
+                            **pred_field,
+                            **score_field,
+                            "model": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "aae": {
+                                "type": "text"
+                            }
+                        }
                     }
                 }
             },
@@ -370,28 +374,30 @@
                     **pred_field
                 }
             },
-            "varity_r": {
-                "properties": {
-                    **score_field,
-                    **rankscore_field
-                }
-            },
-            "varity_er": {
-                "properties": {
-                    **score_field,
-                    **rankscore_field
-                }
-            },
-            "varity_r_loo": {
-                "properties": {
-                    **score_field,
-                    **rankscore_field
-                }
-            },
-            "varity_er_loo": {
-                "properties": {
-                    **score_field,
-                    **rankscore_field
+            "varity": {
+                "r": {
+                    "properties": {
+                        **score_field,
+                        **rankscore_field
+                    }
+                },
+                "er": {
+                    "properties": {
+                        **score_field,
+                        **rankscore_field
+                    }
+                },
+                "r_loo": {
+                    "properties": {
+                        **score_field,
+                        **rankscore_field
+                    }
+                },
+                "er_loo": {
+                    "properties": {
+                        **score_field,
+                        **rankscore_field
+                    }
                 }
             },
             "aloft": {

From 6734bb282b2e03fc28d1fc5ebb1aef9671a9eaaf Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Fri, 27 Oct 2023 21:31:04 -0400
Subject: [PATCH 10/13] initial implementation of V2 parser where protein
 fields are organized together

---
 .../sources/dbnsfp/dbnsfp_parser_44a_v2.py    | 894 ++++++++++++++++++
 1 file changed, 894 insertions(+)
 create mode 100644 src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py
new file mode 100644
index 00000000..a8f24a7a
--- /dev/null
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py
@@ -0,0 +1,894 @@
+import re
+import csv
+from enum import Flag
+from dataclasses import dataclass
+from typing import Callable
+from types import SimpleNamespace
+from utils.table import TableColumn, create_tag_column_map
+from utils.dotfield import parse_dot_fields
+from biothings.utils.common import anyfile
+
+
+# VALID_COLUMN_NO = 367  # for 4.1a
+# VALID_COLUMN_NO = 642  # for 4.2a
+# VALID_COLUMN_NO = 643  # for 4.3a
+VALID_COLUMN_NO = 689  # for 4.4a
+
+MUTPRED_TOP5FEATURES_PATTERN = re.compile(r" \(P = ([eE0-9.-]*)\)$")
+
+# dbNSFP_variant use "." for missing values;
+# other none values are borrowed from the `biothings.utils.dataload.dict_sweep` function and
+#   from the default `na_values` argument of pandas.read_csv().
+# see https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
+NA_VALUES = frozenset({
+    r'.', r'', r" ", r"-", r'#N/A', r'#N/A N/A', r'#NA', r'-1.#IND', r'-1.#QNAN', r'-NaN', r'-nan',
+    r'1.#IND', r'1.#QNAN', r'<NA>', r'N/A', r'NA', r'NULL', r'NaN', r'n/a', r'nan', r'null', r'none',
+    r"Not Available", r"unknown"
+})
+
+# A tag can be any of string value; columns with the same tag are looked-up as a group
+COLUMN_TAG = SimpleNamespace()
+COLUMN_TAG.HG38_POS = "hg38_pos"  # for "pos(1-based)"
+COLUMN_TAG.HG19_POS = "hg19_pos"  # for "hg19_pos(1-based)"
+COLUMN_TAG.HG38_CHROM = "hg38_chrom"  # for "#chr"
+COLUMN_TAG.HG19_CHROM = "hg19_chrom"  # for "hg19_chr"
+COLUMN_TAG.REF_ALLELE = "ref"
+COLUMN_TAG.ALT_ALLELE = "alt"
+COLUMN_TAG.GTEX_GENE = "gtex_gene"
+COLUMN_TAG.GTEX_TISSUE = "gtex_tissue"
+# Note that column "MutationTaster_converted_rankscore" is not tagged
+COLUMN_TAG.MUTATION_TASTER_AAE = "MutationTaster_AAE"
+COLUMN_TAG.MUTATION_TASTER_MODEL = "MutationTaster_model"
+COLUMN_TAG.MUTATION_TASTER_PRED = "MutationTaster_pred"
+COLUMN_TAG.MUTATION_TASTER_SCORE = "MutationTaster_score"
+COLUMN_TAG.ALOFT_FRACTION_TRANSCRIPTS_AFFECTED = "Aloft_Fraction_transcripts_affected"
+COLUMN_TAG.ALOFT_PROB_TOLERANT = "Aloft_prob_Tolerant"
+COLUMN_TAG.ALOFT_PROB_RECESSIVE = "Aloft_prob_Recessive"
+COLUMN_TAG.ALOFT_PROB_DOMINANT = "Aloft_prob_Dominant"
+COLUMN_TAG.ALOFT_PRED = "Aloft_pred"
+COLUMN_TAG.ALOFT_CONFIDENCE = "Aloft_Confidence"
+
+
+def _check_length(lst: list):
+    """
+    If the input list is empty (i.e. length is 0), return None;
+    if the input list has only 1 element (i.e. length is 1), return the element;
+    otherwise return the list as-is.
+    """
+    if not lst:
+        return None
+    if len(lst) == 1:
+        return lst[0]
+    return lst
+
+
+class Assembly(Flag):
+    HG19 = 1  # indicates that a column belongs to hg19 docs
+    HG38 = 2  # indicates that a column belongs to hg38 docs
+    BOTH = HG19 | HG38  # (BOTH == 3) applies to both assemblies
+
+    @classmethod
+    def assembly_of(cls, name: str):
+        # E.g. when member_name == "HG19", member is Assembly.HG19
+        for member_name, member in cls.__members__.items():
+            if name.upper() == member_name:
+                return member
+        else:
+            raise ValueError(f"'{cls.__name__}' enum not found for '{name}'")
+
+
+@dataclass
+class Column(TableColumn):
+    """
+    Assembly-specific column configuration
+    """
+    assembly: str | Assembly = None  # which assembly or assemblies this column belongs to
+
+    def __post_init__(self):
+        super().__post_init__()
+
+        if self.assembly is None:
+            self.assembly = Assembly.BOTH
+            return
+
+        if isinstance(self.assembly, Assembly):
+            return
+
+        if isinstance(self.assembly, str):
+            self.assembly = Assembly.assembly_of(self.assembly)
+            return
+
+        raise ValueError(f"Cannot recognize assembly {self.assembly}")
+
+    def is_hg19(self):
+        return bool(self.assembly & Assembly.HG19)  # true if self.assembly is HG19 or BOTH
+
+    def is_hg38(self):
+        return bool(self.assembly & Assembly.HG38)  # true if self.assembly is HG38 or BOTH
+
+
+def split(sep: str, na_values: set = NA_VALUES, drop_na: bool = False):
+    def _func_drop_na(value: str):
+        result = [v for v in value.split(sep) if v not in na_values]
+        return result
+
+    def _func_keep_na(value: str):
+        result = [v if v not in na_values else None for v in value.split(sep)]
+        if all(v is None for v in result):  # we keep NA values in the result; however if every value in the result is None, we treat whole result as None
+            return None
+        return result
+
+    return _func_drop_na if drop_na else _func_keep_na
+
+
+def split_cast(sep: str, astype: Callable, na_values: set = NA_VALUES, drop_na: bool = False):
+    def _func_drop_na(value: str):
+        result = [astype(v) for v in value.split(sep) if v not in na_values]
+        return result
+
+    def _func_keep_na(value: str):
+        result = [astype(v) if v not in na_values else None for v in value.split(sep)]
+        if all(v is None for v in result):  # we keep NA values in the result; however if every value in the result is None, we treat whole result as None
+            return None
+        return result
+
+    return _func_drop_na if drop_na else _func_keep_na
+
+
+def compose(_split_func: Callable, _unlist_func: Callable):
+    def _func(value):
+        split_result = _split_func(value)
+        if split_result is None:
+            return None
+        return _unlist_func(split_result)
+    return _func
+
+
+# Transforming functions for "protein" data sources
+# We don't compose with _check_length because it would be easier to apply "zip" on the split results if all values are lists
+split_str = split(";")
+split_float = split_cast(";", float)
+split_int = split_cast(";", int)
+
+# Transforming functions for other common non-"protein" data sources
+split_str_drop_na = compose(split(";", drop_na=True), _check_length)
+split_float_drop_na = compose(split_cast(";", float, drop_na=True), _check_length)
+split_int_drop_na = compose(split_cast(";", int, drop_na=True), _check_length)
+
+# Transforming functions for specific data sources
+split_clinvar = compose(split(r"|", drop_na=True), _check_length)
+split_genotype = compose(split(r"/", drop_na=True), _check_length)  # for "AltaiNeandertal", "Denisova", "VindijiaNeandertal", and "ChagyrskayaNeandertal"
+
+
+def normalize_chrom(chr: str):
+    """
+    In dbNSFP, chromosomes are marked 1-22, "X", "Y", and "M" (Mitochondrial).
+    However, in MyVariant, we mark Mitochondrial chromosome "MT".
+    """
+    return "MT" if chr == "M" else chr
+
+
+def make_zero_based(pos: str):
+    """
+    Convert a 1-based chromosomal position to a 0-based start-end pair.
+    """
+    _pos = int(pos)
+    return {"start": _pos, "end": _pos}
+
+
+def parse_mutpred_top5features(value):
+    """
+    `mutpred_mechanisms` is a string combined from 5 clauses, separated by semicolons.
+    Each clause has the same pattern of "<mechanism> (P = <p_val>)".
+
+    E.g. "Loss of helix (P = 0.0444);Gain of loop (P = 0.0502);Gain of catalytic residue at A444 (P = 0.1876);\
+    Gain of solvent accessibility (P = 0.2291);Loss of disorder (P = 0.9475)"
+
+    Here we apply regex to parse this string and get a list of 5 tuples like
+
+        [('Loss of helix', '0.0444'), ('Gain of loop', '0.0502'), ('Gain of catalytic residue at A444', '0.1876'),
+        ('Gain of solvent accessibility', '0.2291'), ('Loss of disorder', '0.9475')]
+
+    Then construct a list of 5 dictionaries of <"mechanism": xxx, "p_val": xxx> and return
+    """
+    if value is None:
+        return None
+
+    mp_list = [tuple(e for e in MUTPRED_TOP5FEATURES_PATTERN.split(s) if e.strip()) for s in value.split(";")]
+    result = [{"mechanism": mp[0], "p_val": float(mp[1])} for mp in mp_list if mp and len(mp) == 2]
+
+    return _check_length(result)
+
+
+def parse_siphy_29way_pi(value: str):
+    """
+    A "SiPhy_29way_pi" value, if not None, is a string separated by ":", representing an estimated stationary
+    distribution of A, C, G and T at a variant site. E.g. "0.0:0.5259:0.0:0.4741".
+
+    Here we split the string and convert it to a dict of {<nt>: <freq>}.
+    """
+    if value is None:
+        return None
+
+    freq = [float(v) for v in value.split(":")]
+    pi_dict = {"a": freq[0], "c": freq[1], "g": freq[2], "t": freq[3]}
+    return pi_dict
+
+
+def split_zip(values: list[str], sep: str, na_values: set = NA_VALUES):
+    """
+    Split each string in values by sep into a list, and generate tuples from all the lists.
+
+    E.g. with the following input,
+
+        values = ["P54578-2;P54578-3;A6NJA2;P54578", "UBP14_HUMAN;UBP14_HUMAN;A6NJA2_HUMAN;UBP14_HUMAN"]
+
+    the returned generator can make:
+
+        [('P54578-2', 'UBP14_HUMAN'),
+         ('P54578-3', 'UBP14_HUMAN'),
+         ('A6NJA2', 'A6NJA2_HUMAN'),
+         ('P54578', 'UBP14_HUMAN')]
+
+    Reference implementation: https://docs.python.org/3.3/library/functions.html#zip
+    """
+    sentinel = object()
+    iterators = [(v if v not in na_values else None for v in value.split(sep)) for value in values]
+
+    while iterators:  # always true if iterators is not empty
+        result = []
+        for it in iterators:
+            element = next(it, sentinel)
+            if element is sentinel:  # terminate at once when a `it` is fully consumed
+                return
+            result.append(element)
+        yield tuple(result)
+
+
+COLUMNS = [
+    Column("#chr", dest="chrom", transform=normalize_chrom, assembly="hg38", tag=COLUMN_TAG.HG38_CHROM),  # representing "chrom" only for assembly 'hg38'
+    Column("pos(1-based)", dest="hg38", transform=make_zero_based, tag=COLUMN_TAG.HG38_POS),
+    Column("ref", transform=str.upper, tag=COLUMN_TAG.REF_ALLELE),
+    Column("alt", transform=str.upper, tag=COLUMN_TAG.ALT_ALLELE),
+    Column("aaref", dest="aa.ref"),
+    Column("aaalt", dest="aa.alt"),
+    Column("rs_dbSNP", dest="rsid"),
+    Column("hg19_chr", dest="chrom", transform=normalize_chrom, assembly="hg19", tag=COLUMN_TAG.HG19_CHROM),  # representing "chrom" only for assembly 'hg19'
+    Column("hg19_pos(1-based)", dest="hg19", transform=make_zero_based, tag=COLUMN_TAG.HG19_POS),
+    # Column("hg18_chr"),  # Not Used
+    Column("hg18_pos(1-based)", dest="hg18", transform=make_zero_based),
+    Column("aapos", dest="protein.aa.pos", transform=split_int),
+    Column("genename", dest="protein.genename", transform=split_str),
+    Column("Ensembl_geneid", dest="protein.geneid", transform=split_str),
+    Column("Ensembl_transcriptid", dest="protein.transcriptid", transform=split_str),
+    Column("Ensembl_proteinid", dest="protein.proteinid", transform=split_str),
+    Column("Uniprot_acc", dest="protein.uniprot.acc", transform=split_str),
+    Column("Uniprot_entry", dest="protein.uniprot.entry", transform=split_str),
+    Column("HGVSc_ANNOVAR", dest="protein.hgvsc.annovar", transform=split_str),
+    Column("HGVSp_ANNOVAR", dest="protein.hgvsp.annovar", transform=split_str),
+    Column("HGVSc_snpEff", dest="protein.hgvsc.snpeff", transform=split_str),
+    Column("HGVSp_snpEff", dest="protein.hgvsp.snpeff", transform=split_str),
+    Column("HGVSc_VEP", dest="protein.hgvsc.vep", transform=split_str),
+    Column("HGVSp_VEP", dest="protein.hgvsp.vep", transform=split_str),
+    Column("APPRIS", dest="protein.appris", transform=split_str),
+    Column("GENCODE_basic", dest="protein.gencode_basic", transform=split_str),
+    Column("TSL", dest="protein.tsl", transform=split_int),
+    Column("VEP_canonical", dest="protein.vep_canonical", transform=split_str),
+    Column("cds_strand", dest="cds_strand", transform=split_str_drop_na),
+    Column("refcodon", dest="protein.aa.refcodon", transform=split_str),
+    Column("codonpos", dest="protein.aa.codonpos", transform=split_int),
+    Column("codon_degeneracy", dest="protein.aa.codon_degeneracy", transform=split_int),
+    Column("Ancestral_allele", dest="ancestral_allele", transform=split_str_drop_na),
+    Column("AltaiNeandertal", dest="altai_neandertal", transform=split_genotype),
+    Column("Denisova", transform=split_genotype),
+    Column("VindijiaNeandertal", dest="vindijia_neandertal", transform=split_genotype),
+    Column("ChagyrskayaNeandertal", dest="chagyrskaya_neandertal", transform=split_genotype),
+    Column("SIFT_score", dest="protein.sift.score", transform=split_float),
+    Column("SIFT_converted_rankscore", dest="sift.converted_rankscore", transform=split_float_drop_na),
+    Column("SIFT_pred", dest="protein.sift.pred", transform=split_str),
+    Column("SIFT4G_score", dest="protein.sift4g.score", transform=split_float),
+    Column("SIFT4G_converted_rankscore", dest="sift4g.converted_rankscore", transform=split_float_drop_na),
+    Column("SIFT4G_pred", dest="protein.sift4g.pred", transform=split_str),
+    Column("Polyphen2_HDIV_score", dest="protein.polyphen2.hdiv.score", transform=split_float),
+    Column("Polyphen2_HDIV_rankscore", transform=split_float_drop_na),
+    Column("Polyphen2_HDIV_pred", dest="protein.polyphen2.hdiv.pred", transform=split_str),
+    Column("Polyphen2_HVAR_score", dest="protein.polyphen2.hvar.score", transform=split_float),
+    Column("Polyphen2_HVAR_rankscore", transform=split_float_drop_na),
+    Column("Polyphen2_HVAR_pred", dest="protein.polyphen2.hvar.pred", transform=split_str),
+    Column("LRT_score", transform=split_float_drop_na),
+    Column("LRT_converted_rankscore", dest="lrt.converted_rankscore", transform=split_float_drop_na),
+    Column("LRT_pred", transform=split_str_drop_na),
+    Column("LRT_Omega", transform=split_float_drop_na),
+    Column("MutationTaster_score", tag=COLUMN_TAG.MUTATION_TASTER_SCORE),
+    Column("MutationTaster_converted_rankscore", dest="mutationtaster.converted_rankscore", transform=split_float_drop_na),
+    Column("MutationTaster_pred", tag=COLUMN_TAG.MUTATION_TASTER_PRED),
+    Column("MutationTaster_model", tag=COLUMN_TAG.MUTATION_TASTER_MODEL),
+    Column("MutationTaster_AAE", tag=COLUMN_TAG.MUTATION_TASTER_AAE),
+    Column("MutationAssessor_score", dest="protein.mutationassessor.score", transform=split_float),
+    Column("MutationAssessor_rankscore", transform=split_float_drop_na),
+    Column("MutationAssessor_pred", dest="protein.mutationassessor.pred", transform=split_str),
+    Column("FATHMM_score", dest="protein.fathmm.score", transform=split_float),
+    Column("FATHMM_converted_rankscore", dest="fathmm.converted_rankscore", transform=split_float_drop_na),
+    Column("FATHMM_pred", dest="protein.fathmm.pred", transform=split_str),
+    Column("PROVEAN_score", dest="protein.provean.score", transform=split_float),
+    Column("PROVEAN_converted_rankscore", dest="provean.converted_rankscore", transform=split_float_drop_na),
+    Column("PROVEAN_pred", dest="protein.provean.pred", transform=split_str),
+    Column("VEST4_score", dest="protein.vest4.score", transform=split_float),
+    Column("VEST4_rankscore", transform=split_float_drop_na),
+    Column("MetaSVM_score", transform=split_float_drop_na),
+    Column("MetaSVM_rankscore", transform=split_float_drop_na),
+    Column("MetaSVM_pred", transform=split_str_drop_na),
+    Column("MetaLR_score", transform=split_float_drop_na),
+    Column("MetaLR_rankscore", transform=split_float_drop_na),
+    Column("MetaLR_pred", transform=split_str_drop_na),
+    Column("Reliability_index", dest="reliability_index", transform=int),
+    Column("MetaRNN_score", transform=split_float_drop_na),
+    Column("MetaRNN_rankscore", transform=split_float_drop_na),
+    Column("MetaRNN_pred", transform=split_str_drop_na),
+    Column("M-CAP_score", transform=split_float_drop_na),
+    Column("M-CAP_rankscore", transform=split_float_drop_na),
+    Column("M-CAP_pred", transform=split_str_drop_na),
+    Column("REVEL_score", dest="protein.revel.score", transform=split_float),
+    Column("REVEL_rankscore", transform=split_float_drop_na),
+    Column("MutPred_score", transform=split_float_drop_na),
+    Column("MutPred_rankscore", transform=split_float_drop_na),
+    Column("MutPred_protID", dest="mutpred.accession", transform=split_str_drop_na),
+    Column("MutPred_AAchange", dest="mutpred.aa_change", transform=split_str_drop_na),
+    Column("MutPred_Top5features", dest="mutpred.pred", transform=parse_mutpred_top5features),
+    Column("MVP_score", dest="protein.mvp.score", transform=split_float),
+    Column("MVP_rankscore", transform=split_float_drop_na),
+    Column("gMVP_score", dest="protein.gmvp.score", transform=split_float),  # new in 4.4.a
+    Column("gMVP_rankscore", transform=split_float_drop_na),  # new in 4.4.a
+    Column("MPC_score", dest="protein.mpc.score", transform=split_float),
+    Column("MPC_rankscore", transform=split_float_drop_na),
+    Column("PrimateAI_score", transform=split_float_drop_na),
+    Column("PrimateAI_rankscore", transform=split_float_drop_na),
+    Column("PrimateAI_pred", transform=split_str_drop_na),
+    Column("DEOGEN2_score", transform=split_float_drop_na),
+    Column("DEOGEN2_rankscore", transform=split_float_drop_na),
+    Column("DEOGEN2_pred", transform=split_str_drop_na),
+    Column("BayesDel_addAF_score", dest="bayesdel.add_af.score", transform=split_float_drop_na),
+    Column("BayesDel_addAF_rankscore", dest="bayesdel.add_af.rankscore", transform=split_float_drop_na),
+    Column("BayesDel_addAF_pred", dest="bayesdel.add_af.pred", transform=split_str_drop_na),
+    Column("BayesDel_noAF_score", dest="bayesdel.no_af.score", transform=split_float_drop_na),
+    Column("BayesDel_noAF_rankscore", dest="bayesdel.no_af.rankscore", transform=split_float_drop_na),
+    Column("BayesDel_noAF_pred", dest="bayesdel.no_af.pred", transform=split_str_drop_na),
+    Column("ClinPred_score", transform=split_float_drop_na),
+    Column("ClinPred_rankscore", transform=split_float_drop_na),
+    Column("ClinPred_pred", transform=split_str_drop_na),
+    Column("LIST-S2_score", transform=split_float_drop_na),
+    Column("LIST-S2_rankscore", transform=split_float_drop_na),
+    Column("LIST-S2_pred", transform=split_str_drop_na),
+    Column("VARITY_R_score", transform=split_float_drop_na),  # new in 4.4.a
+    Column("VARITY_R_rankscore", transform=split_float_drop_na),
+    Column("VARITY_ER_score", transform=split_float_drop_na),
+    Column("VARITY_ER_rankscore", transform=split_float_drop_na),
+    Column("VARITY_R_LOO_score", dest="varity.r_loo.score", transform=split_float_drop_na),
+    Column("VARITY_R_LOO_rankscore", dest="varity.r_loo.rankscore", transform=split_float_drop_na),
+    Column("VARITY_ER_LOO_score", dest="varity.er_loo.score", transform=split_float_drop_na),
+    Column("VARITY_ER_LOO_rankscore", dest="varity.er_loo.rankscore", transform=split_float_drop_na),
+    Column("Aloft_Fraction_transcripts_affected", dest="protein.aloft.fraction_transcripts_affected", transform=split_str, tag=COLUMN_TAG.ALOFT_FRACTION_TRANSCRIPTS_AFFECTED),
+    Column("Aloft_prob_Tolerant", dest="protein.aloft.prob_tolerant", transform=split_str, tag=COLUMN_TAG.ALOFT_PROB_TOLERANT),
+    Column("Aloft_prob_Recessive", dest="protein.aloft.prob_recessive", transform=split_str, tag=COLUMN_TAG.ALOFT_PROB_RECESSIVE),
+    Column("Aloft_prob_Dominant", dest="protein.aloft.prob_dominant", transform=split_str, tag=COLUMN_TAG.ALOFT_PROB_DOMINANT),
+    Column("Aloft_pred", dest="protein.aloft.pred", transform=split_str, tag=COLUMN_TAG.ALOFT_PRED),
+    Column("Aloft_Confidence", dest="protein.aloft.confidence", transform=split_str, tag=COLUMN_TAG.ALOFT_CONFIDENCE),
+    Column("CADD_raw", dest="cadd.raw_score", transform=split_float_drop_na, assembly="hg38"),  # TODO CADD will have hg38 next update. Deprecate these 3 field then.
+    Column("CADD_raw_rankscore", dest="cadd.raw_rankscore", transform=split_float_drop_na, assembly="hg38"),
+    Column("CADD_phred", transform=split_float_drop_na, assembly="hg38"),  # CADD phred-like scores, not as other predications of string type
+    # Column("CADD_raw_hg19", assembly="hg19"),  # discarded because Myvariant.info already has a hg19-only datasource of CADD.
+    # Column("CADD_raw_rankscore_hg19", assembly="hg19"),  # ditto
+    # Column("CADD_phred_hg19", assembly="hg19"),  # ditto
+    Column("DANN_score", transform=split_float_drop_na),
+    Column("DANN_rankscore", transform=split_float_drop_na),
+    Column("fathmm-MKL_coding_score", dest="fathmm-mkl.coding_score", transform=split_float_drop_na),
+    Column("fathmm-MKL_coding_rankscore", dest="fathmm-mkl.coding_rankscore", transform=split_float_drop_na),
+    Column("fathmm-MKL_coding_pred", dest="fathmm-mkl.coding_pred", transform=split_str_drop_na),
+    Column("fathmm-MKL_coding_group", dest="fathmm-mkl.coding_group", transform=split_str_drop_na),
+    Column("fathmm-XF_coding_score", dest="fathmm-xf.coding_score", transform=split_float_drop_na),
+    Column("fathmm-XF_coding_rankscore", dest="fathmm-xf.coding_rankscore", transform=split_float_drop_na),
+    Column("fathmm-XF_coding_pred", dest="fathmm-xf.coding_pred", transform=split_str_drop_na),
+    Column("Eigen-raw_coding", dest="eigen.raw_coding", transform=split_float_drop_na),
+    Column("Eigen-raw_coding_rankscore", dest="eigen.raw_coding_rankscore", transform=split_float_drop_na),
+    Column("Eigen-phred_coding", dest="eigen.phred_coding", transform=split_float_drop_na),
+    Column("Eigen-PC-raw_coding", dest="eigen-pc.raw_coding", transform=split_float_drop_na),
+    Column("Eigen-PC-raw_coding_rankscore", dest="eigen-pc.raw_coding_rankscore", transform=split_float_drop_na),
+    Column("Eigen-PC-phred_coding", dest="eigen-pc.phred_coding", transform=split_float_drop_na),
+    Column("GenoCanyon_score", transform=split_float_drop_na),
+    Column("GenoCanyon_rankscore", transform=split_float_drop_na),
+    Column("integrated_fitCons_score", dest="fitcons.integrated.score", transform=split_float_drop_na),
+    Column("integrated_fitCons_rankscore", dest="fitcons.integrated.rankscore", transform=split_float_drop_na),
+    Column("integrated_confidence_value", dest="fitcons.integrated.confidence_value", transform=split_int_drop_na),
+    Column("GM12878_fitCons_score", dest="fitcons.gm12878.score", transform=split_float_drop_na),
+    Column("GM12878_fitCons_rankscore", dest="fitcons.gm12878.rankscore", transform=split_float_drop_na),
+    Column("GM12878_confidence_value", dest="fitcons.gm12878.confidence_value", transform=split_int_drop_na),
+    Column("H1-hESC_fitCons_score", dest="fitcons.h1-hesc.score", transform=split_float_drop_na),
+    Column("H1-hESC_fitCons_rankscore", dest="fitcons.h1-hesc.rankscore", transform=split_float_drop_na),
+    Column("H1-hESC_confidence_value", dest="fitcons.h1-hesc.confidence_value", transform=split_int_drop_na),
+    Column("HUVEC_fitCons_score", dest="fitcons.huvec.score", transform=split_float_drop_na),
+    Column("HUVEC_fitCons_rankscore", dest="fitcons.huvec.rankscore", transform=split_float_drop_na),
+    Column("HUVEC_confidence_value", dest="fitcons.huvec.confidence_value", transform=split_int_drop_na),
+    Column("LINSIGHT", dest="linsight.score", transform=split_float_drop_na),
+    Column("LINSIGHT_rankscore", transform=split_float_drop_na),
+    Column("GERP++_NR", transform=split_float_drop_na),
+    Column("GERP++_RS", transform=split_float_drop_na),
+    Column("GERP++_RS_rankscore", dest="gerp++.rs_rankscore", transform=split_float_drop_na),
+    Column("phyloP100way_vertebrate", dest="phylop.100way_vertebrate.score", transform=split_float_drop_na),
+    Column("phyloP100way_vertebrate_rankscore", dest="phylop.100way_vertebrate.rankscore", transform=split_float_drop_na),
+    Column("phyloP470way_mammalian", dest="phylop.470way_mammalian.score", transform=split_float_drop_na),  # replaced 30way_mammalian in 4.4.a
+    Column("phyloP470way_mammalian_rankscore", dest="phylop.470way_mammalian.rankscore", transform=split_float_drop_na),  # replaced 30way_mammalian in 4.4.a
+    Column("phyloP17way_primate", dest="phylop.17way_primate.score", transform=split_float_drop_na),
+    Column("phyloP17way_primate_rankscore", dest="phylop.17way_primate.rankscore", transform=split_float_drop_na),
+    Column("phastCons100way_vertebrate", dest="phastcons.100way_vertebrate.score", transform=split_float_drop_na),
+    Column("phastCons100way_vertebrate_rankscore", dest="phastcons.100way_vertebrate.rankscore", transform=split_float_drop_na),
+    Column("phastCons470way_mammalian", dest="phastcons.470way_mammalian.score", transform=split_float_drop_na),  # replaced 30way_mammalian in 4.4.a
+    Column("phastCons470way_mammalian_rankscore", dest="phastcons.470way_mammalian.rankscore", transform=split_float_drop_na),  # replaced 30way_mammalian in 4.4.a
+    Column("phastCons17way_primate", dest="phastcons.17way_primate.score", transform=split_float_drop_na),
+    Column("phastCons17way_primate_rankscore", dest="phastcons.17way_primate.rankscore", transform=split_float_drop_na),
+    Column("SiPhy_29way_pi", dest="siphy_29way.pi", transform=parse_siphy_29way_pi),
+    Column("SiPhy_29way_logOdds", dest="siphy_29way.logodds_score", transform=split_float_drop_na),
+    Column("SiPhy_29way_logOdds_rankscore", dest="siphy_29way.logodds_rankscore", transform=split_float_drop_na),
+    Column("bStatistic", dest="bstatistic.score", transform=split_float_drop_na),
+    Column("bStatistic_converted_rankscore", dest="bstatistic.converted_rankscore", transform=split_float_drop_na),
+    Column("1000Gp3_AC", dest="1000gp3.ac", transform=int),
+    Column("1000Gp3_AF", dest="1000gp3.af", transform=float),
+    Column("1000Gp3_AFR_AC", dest="1000gp3.afr.ac", transform=int),  # dest changed since 4.4.a
+    Column("1000Gp3_AFR_AF", dest="1000gp3.afr.af", transform=float),
+    Column("1000Gp3_EUR_AC", dest="1000gp3.eur.ac", transform=int),
+    Column("1000Gp3_EUR_AF", dest="1000gp3.eur.af", transform=float),
+    Column("1000Gp3_AMR_AC", dest="1000gp3.amr.ac", transform=int),
+    Column("1000Gp3_AMR_AF", dest="1000gp3.amr.af", transform=float),
+    Column("1000Gp3_EAS_AC", dest="1000gp3.eas.ac", transform=int),
+    Column("1000Gp3_EAS_AF", dest="1000gp3.eas.af", transform=float),
+    Column("1000Gp3_SAS_AC", dest="1000gp3.sas.ac", transform=int),
+    Column("1000Gp3_SAS_AF", dest="1000gp3.sas.af", transform=float),
+    Column("TWINSUK_AC", dest="twinsuk.ac", transform=int),
+    Column("TWINSUK_AF", dest="twinsuk.af", transform=float),
+    Column("ALSPAC_AC", dest="alspac.ac", transform=int),
+    Column("ALSPAC_AF", dest="alspac.af", transform=float),
+    Column("UK10K_AC", dest="uk10k.ac", transform=int),
+    Column("UK10K_AF", dest="uk10k.af", transform=float),
+    Column("ESP6500_AA_AC", dest="esp6500.aa.ac", transform=int),  # dest changed since 4.4.a
+    Column("ESP6500_AA_AF", dest="esp6500.aa.af", transform=float),
+    Column("ESP6500_EA_AC", dest="esp6500.ea.ac", transform=int),
+    Column("ESP6500_EA_AF", dest="esp6500.ea.af", transform=float),
+    Column("ExAC_AC", dest="exac.ac", transform=int),  # dest changed since 4.4.a
+    Column("ExAC_AF", dest="exac.af", transform=float),
+    Column("ExAC_Adj_AC", dest="exac.adj_ac", transform=int),
+    Column("ExAC_Adj_AF", dest="exac.adj_af", transform=float),
+    Column("ExAC_AFR_AC", dest="exac.afr.ac", transform=int),
+    Column("ExAC_AFR_AF", dest="exac.afr.af", transform=float),
+    Column("ExAC_AMR_AC", dest="exac.amr.ac", transform=int),
+    Column("ExAC_AMR_AF", dest="exac.amr.af", transform=float),
+    Column("ExAC_EAS_AC", dest="exac.eas.ac", transform=int),
+    Column("ExAC_EAS_AF", dest="exac.eas.af", transform=float),
+    Column("ExAC_FIN_AC", dest="exac.fin.ac", transform=int),
+    Column("ExAC_FIN_AF", dest="exac.fin.af", transform=float),
+    Column("ExAC_NFE_AC", dest="exac.nfe.ac", transform=int),
+    Column("ExAC_NFE_AF", dest="exac.nfe.af", transform=float),
+    Column("ExAC_SAS_AC", dest="exac.sas.ac", transform=int),
+    Column("ExAC_SAS_AF", dest="exac.sas.af", transform=float),
+    Column("ExAC_nonTCGA_AC", dest="exac_nontcga.ac", transform=int),
+    Column("ExAC_nonTCGA_AF", dest="exac_nontcga.af", transform=float),
+    Column("ExAC_nonTCGA_Adj_AC", dest="exac_nontcga.adj_ac", transform=int),
+    Column("ExAC_nonTCGA_Adj_AF", dest="exac_nontcga.adj_af", transform=float),
+    Column("ExAC_nonTCGA_AFR_AC", dest="exac_nontcga.afr.ac", transform=int),
+    Column("ExAC_nonTCGA_AFR_AF", dest="exac_nontcga.afr.af", transform=float),
+    Column("ExAC_nonTCGA_AMR_AC", dest="exac_nontcga.amr.ac", transform=int),
+    Column("ExAC_nonTCGA_AMR_AF", dest="exac_nontcga.amr.af", transform=float),
+    Column("ExAC_nonTCGA_EAS_AC", dest="exac_nontcga.eas.ac", transform=int),
+    Column("ExAC_nonTCGA_EAS_AF", dest="exac_nontcga.eas.af", transform=float),
+    Column("ExAC_nonTCGA_FIN_AC", dest="exac_nontcga.fin.ac", transform=int),
+    Column("ExAC_nonTCGA_FIN_AF", dest="exac_nontcga.fin.af", transform=float),
+    Column("ExAC_nonTCGA_NFE_AC", dest="exac_nontcga.nfe.ac", transform=int),
+    Column("ExAC_nonTCGA_NFE_AF", dest="exac_nontcga.nfe.af", transform=float),
+    Column("ExAC_nonTCGA_SAS_AC", dest="exac_nontcga.sas.ac", transform=int),
+    Column("ExAC_nonTCGA_SAS_AF", dest="exac_nontcga.sas.af", transform=float),
+    Column("ExAC_nonpsych_AC", dest="exac_nonpsych.ac", transform=int),
+    Column("ExAC_nonpsych_AF", dest="exac_nonpsych.af", transform=float),
+    Column("ExAC_nonpsych_Adj_AC", dest="exac_nonpsych.adj_ac", transform=int),
+    Column("ExAC_nonpsych_Adj_AF", dest="exac_nonpsych.adj_af", transform=float),
+    Column("ExAC_nonpsych_AFR_AC", dest="exac_nonpsych.afr.ac", transform=int),
+    Column("ExAC_nonpsych_AFR_AF", dest="exac_nonpsych.afr.af", transform=float),
+    Column("ExAC_nonpsych_AMR_AC", dest="exac_nonpsych.amr.ac", transform=int),
+    Column("ExAC_nonpsych_AMR_AF", dest="exac_nonpsych.amr.af", transform=float),
+    Column("ExAC_nonpsych_EAS_AC", dest="exac_nonpsych.eas.ac", transform=int),
+    Column("ExAC_nonpsych_EAS_AF", dest="exac_nonpsych.eas.af", transform=float),
+    Column("ExAC_nonpsych_FIN_AC", dest="exac_nonpsych.fin.ac", transform=int),
+    Column("ExAC_nonpsych_FIN_AF", dest="exac_nonpsych.fin.af", transform=float),
+    Column("ExAC_nonpsych_NFE_AC", dest="exac_nonpsych.nfe.ac", transform=int),
+    Column("ExAC_nonpsych_NFE_AF", dest="exac_nonpsych.nfe.af", transform=float),
+    Column("ExAC_nonpsych_SAS_AC", dest="exac_nonpsych.sas.ac", transform=int),
+    Column("ExAC_nonpsych_SAS_AF", dest="exac_nonpsych.sas.af", transform=float),
+    Column("ALFA_European_AC", dest="alfa.european.ac", transform=int),
+    Column("ALFA_European_AN", dest="alfa.european.an", transform=int),
+    Column("ALFA_European_AF", dest="alfa.european.af", transform=float),
+    Column("ALFA_African_Others_AC", dest="alfa.african_others.ac", transform=int),
+    Column("ALFA_African_Others_AN", dest="alfa.african_others.an", transform=int),
+    Column("ALFA_African_Others_AF", dest="alfa.african_others.af", transform=float),
+    Column("ALFA_East_Asian_AC", dest="alfa.east_asian.ac", transform=int),
+    Column("ALFA_East_Asian_AN", dest="alfa.east_asian.an", transform=int),
+    Column("ALFA_East_Asian_AF", dest="alfa.east_asian.af", transform=float),
+    Column("ALFA_African_American_AC", dest="alfa.african_american.ac", transform=int),
+    Column("ALFA_African_American_AN", dest="alfa.african_american.an", transform=int),
+    Column("ALFA_African_American_AF", dest="alfa.african_american.af", transform=float),
+    Column("ALFA_Latin_American_1_AC", dest="alfa.latin_american_1.ac", transform=int),
+    Column("ALFA_Latin_American_1_AN", dest="alfa.latin_american_1.an", transform=int),
+    Column("ALFA_Latin_American_1_AF", dest="alfa.latin_american_1.af", transform=float),
+    Column("ALFA_Latin_American_2_AC", dest="alfa.latin_american_2.ac", transform=int),
+    Column("ALFA_Latin_American_2_AN", dest="alfa.latin_american_2.an", transform=int),
+    Column("ALFA_Latin_American_2_AF", dest="alfa.latin_american_2.af", transform=float),
+    Column("ALFA_Other_Asian_AC", dest="alfa.other_asian.ac", transform=int),
+    Column("ALFA_Other_Asian_AN", dest="alfa.other_asian.an", transform=int),
+    Column("ALFA_Other_Asian_AF", dest="alfa.other_asian.af", transform=float),
+    Column("ALFA_South_Asian_AC", dest="alfa.south_asian.ac", transform=int),
+    Column("ALFA_South_Asian_AN", dest="alfa.south_asian.an", transform=int),
+    Column("ALFA_South_Asian_AF", dest="alfa.south_asian.af", transform=float),
+    Column("ALFA_Other_AC", dest="alfa.other.ac", transform=int),
+    Column("ALFA_Other_AN", dest="alfa.other.an", transform=int),
+    Column("ALFA_Other_AF", dest="alfa.other.af", transform=float),
+    Column("ALFA_African_AC", dest="alfa.african.ac", transform=int),
+    Column("ALFA_African_AN", dest="alfa.african.an", transform=int),
+    Column("ALFA_African_AF", dest="alfa.african.af", transform=float),
+    Column("ALFA_Asian_AC", dest="alfa.asian.ac", transform=int),
+    Column("ALFA_Asian_AN", dest="alfa.asian.an", transform=int),
+    Column("ALFA_Asian_AF", dest="alfa.asian.af", transform=float),
+    Column("ALFA_Total_AC", dest="alfa.total.ac", transform=int),
+    Column("ALFA_Total_AN", dest="alfa.total.an", transform=int),
+    Column("ALFA_Total_AF", dest="alfa.total.af", transform=float),
+    Column("clinvar_id", dest="clinvar.clinvar_id", transform=split_clinvar),
+    Column("clinvar_clnsig", transform=split_clinvar),
+    Column("clinvar_trait", transform=split_clinvar),
+    Column("clinvar_review", transform=split_clinvar),
+    Column("clinvar_hgvs", transform=split_clinvar),
+    Column("clinvar_var_source", dest="clinvar.var_source", transform=split_clinvar),
+    Column("clinvar_MedGen_id", dest="clinvar.medgen", transform=split_clinvar),
+    Column("clinvar_OMIM_id", dest="clinvar.omim", transform=split_clinvar),
+    Column("clinvar_Orphanet_id", dest="clinvar.orphanet", transform=split_clinvar),
+    Column("Interpro_domain", transform=split_str_drop_na),
+    Column("GTEx_V8_gene", dest="gtex.gene", tag=COLUMN_TAG.GTEX_GENE),  # special column, see prune_gtex()
+    Column("GTEx_V8_tissue", dest="gtex.tissue", tag=COLUMN_TAG.GTEX_TISSUE),  # special column, see prune_gtex()
+    Column("Geuvadis_eQTL_target_gene", transform=split_str_drop_na)
+]
+
+HG19_COLUMNS = [c for c in COLUMNS if c.is_hg19()]
+HG38_COLUMNS = [c for c in COLUMNS if c.is_hg38()]
+PROTEIN_COLUMNS = [c for c in COLUMNS if c.dest.startswith(r"protein.")]
+
+# Currently not necessary to make assembly-specific tag-column maps.
+TAG_COLUMN_MAP = create_tag_column_map(COLUMNS)
+
+
+def verify_pos(row, pos_column: Column, na_values: set = NA_VALUES):
+    pos_value = row[pos_column.name]
+
+    if pos_value in na_values:
+        return False
+
+    return True
+
+
+def verify_hg19_row(row: dict, na_values: set = NA_VALUES):
+    pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG19_POS][0]
+    return verify_pos(row, pos_column=pos_column, na_values=na_values)
+
+
+def verify_hg38_row(row: dict, na_values: set = NA_VALUES):
+    pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG38_POS][0]
+    return verify_pos(row, pos_column=pos_column, na_values=na_values)
+
+
+def normalize_hg19_row(row: dict):
+    """
+    For unknown reasons, 4 MutationTaster columns and 6 Aloft columns have values ending in ";", which leads to an empty string when splitting the value by ";".
+    This function remove the tailing ";" in those values.
+    """
+    columns = [
+        # MutationTaster columns
+        TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_AAE][0],
+        TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_MODEL][0],
+        TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_PRED][0],
+        TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_SCORE][0],
+        # Aloft columns
+        TAG_COLUMN_MAP[COLUMN_TAG.ALOFT_FRACTION_TRANSCRIPTS_AFFECTED][0],
+        TAG_COLUMN_MAP[COLUMN_TAG.ALOFT_PROB_TOLERANT][0],
+        TAG_COLUMN_MAP[COLUMN_TAG.ALOFT_PROB_RECESSIVE][0],
+        TAG_COLUMN_MAP[COLUMN_TAG.ALOFT_PROB_DOMINANT][0],
+        TAG_COLUMN_MAP[COLUMN_TAG.ALOFT_PRED][0],
+        TAG_COLUMN_MAP[COLUMN_TAG.ALOFT_CONFIDENCE][0]
+    ]
+
+    for c in columns:
+        if row[c.name] and row[c.name][-1] == ";":
+            row[c.name] = row[c.name][:-1]
+
+    return row
+
+
+def normalize_hg38_row(row: dict):
+    return normalize_hg19_row(row)
+
+
+def prune_gtex(raw_doc: dict, gene_column: Column, tissue_column: Column, na_values: set = NA_VALUES):
+    """
+    Map each GTEx gene name and tissue name from the raw document into a dictionary,
+    and assign all such dictionaries to the raw document's top "gtex" field.
+
+    E.g. with the following input value:
+
+        row["gtex.gene"] = "ENOSF1|ENOSF1"
+        row["gtex.tissue"] = "Adipose_Subcutaneous|Muscle_Skeletal"
+
+    raw_doc will be assigned as:
+
+         row["gtex"] = [
+            {'gene': 'ENOSF1', 'tissue': 'Adipose_Subcutaneous'},
+            {'gene': 'ENOSF1', 'tissue': 'Muscle_Skeletal'}
+        ]
+    """
+    # when these two keys are not present in the doc, it means the responding two values in tsv files are NA values
+    if (gene_column.dest in raw_doc) and (tissue_column.dest in raw_doc):
+        gene_value = raw_doc[gene_column.dest]
+        tissue_value = raw_doc[tissue_column.dest]
+
+        # special separator "|" for GTEx
+        gtex_result = [{"gene": acc, "tissue": entry} for (acc, entry) in split_zip([gene_value, tissue_value], sep=r"|", na_values=na_values)]
+        gtex_result = _check_length(gtex_result)
+        if gtex_result is not None:
+            raw_doc["gtex"] = gtex_result
+
+        del raw_doc[gene_column.dest]
+        del raw_doc[tissue_column.dest]
+
+    return raw_doc
+
+
+def prune_mutation_taster(raw_doc: dict, aae_column: Column, model_column: Column, pred_column: Column, score_column: Column, na_values: set = NA_VALUES):
+    """
+    Map each MutationTaster AAE, model, pred, and score value from the raw document into a dictionary,
+    and assign all such dictionaries to the raw document's "mutationtaster.analysis" field.
+
+    E.g. with the following input value:
+
+        row["mutationtaster.aae"] = "Y518*;Y518*;D532E"
+        row["mutationtaster.model"] = "complex_aae;complex_aae;simple_aae"
+        row["mutationtaster.pred"] = "D;D;N"
+        row["mutationtaster.score"] = "1;1;1"
+
+    raw_doc will be assigned as:
+
+         row["mutationtaster.analysis"] = [
+            {'aae': 'Y518*', 'model': 'complex_aae', 'pred': 'D', 'score': 1},
+            {'aae': 'Y518*', 'model': 'complex_aae', 'pred': 'D', 'score': 1},
+            {'aae': 'D532E', 'model': 'simple_aae', 'pred': 'N', 'score': 1}
+        ]
+    """
+    if (aae_column.dest in raw_doc) and (model_column.dest in raw_doc) and (pred_column.dest in raw_doc) and (score_column.dest in raw_doc):
+        aae_value = raw_doc[aae_column.dest]
+        model_value = raw_doc[model_column.dest]
+        pred_value = raw_doc[pred_column.dest]
+        score_value = raw_doc[score_column.dest]
+
+        analysis_values = split_zip([aae_value, model_value, pred_value, score_value], sep=r";", na_values=na_values)
+        analysis_result = [{"aae": aae, "model": model, "pred": pred, "score": float(score)} for (aae, model, pred, score) in analysis_values]
+        analysis_result = _check_length(analysis_result)
+        if analysis_result is not None:
+            raw_doc["mutationtaster.analysis"] = analysis_result
+
+        del raw_doc[aae_column.dest]
+        del raw_doc[model_column.dest]
+        del raw_doc[pred_column.dest]
+        del raw_doc[score_column.dest]
+
+        # note that raw_doc[mutationtaster.converted_rankscore] is kept as-is
+
+    return raw_doc
+
+
+def prune_protein(raw_doc: set, protein_columns: list[Column]):
+    protein_fields = {c.dest: raw_doc[c.dest] for c in protein_columns}
+
+    # assert len(set(map(len, protein_fields.values()))) == 1  # assert all values (as lists) in protein_fields have the same length before zipping
+
+    """
+    Convert protein fields (as a dictionary of lists) to a list of dictionaries. E.g.
+    
+        protein_field = {
+            'protein.transcriptid': ['ENST00000624406', 'ENST00000398168'],
+            'protein.proteinid': ['ENSP00000485669', 'ENSP00000381234']
+        }
+        
+    will be converted to
+    
+        protein_result = [
+            {'protein.transcriptid': 'ENST00000624406', 'protein.proteinid': 'ENSP00000485669'},
+            {'protein.transcriptid': 'ENST00000398168', 'protein.proteinid': 'ENSP00000381234'}
+        ]
+    """
+    protein_result = []
+    protein_keys = protein_fields.keys()
+    for protein_values in zip(*protein_fields.values()):
+        elem = dict((key, value) for key, value in zip(protein_keys, protein_values) if value is not None)
+        elem = parse_dot_fields(elem)["protein"]
+        protein_result.append(elem)
+    # We keep protein_result as a list for easier merging
+    # protein_result = _check_length(protein_result)
+    # if protein_result is not None:
+    #     raw_doc["protein"] = protein_result
+    raw_doc["protein"] = protein_result
+
+    for c in protein_columns:
+        del raw_doc[c.dest]
+
+    return raw_doc
+
+
+def prune_hg19_doc(doc: dict, na_values: set = NA_VALUES):
+    protein_columns = [c for c in PROTEIN_COLUMNS if c.dest in doc]
+    doc = prune_protein(doc, protein_columns=protein_columns)
+
+    gtex_gene_column = TAG_COLUMN_MAP[COLUMN_TAG.GTEX_GENE][0]
+    gtex_tissue_column = TAG_COLUMN_MAP[COLUMN_TAG.GTEX_TISSUE][0]
+    doc = prune_gtex(doc, gene_column=gtex_gene_column, tissue_column=gtex_tissue_column, na_values=na_values)
+
+    mutation_taster_aae_column = TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_AAE][0]
+    mutation_taster_model_column = TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_MODEL][0]
+    mutation_taster_pred_column = TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_PRED][0]
+    mutation_taster_score_column = TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_SCORE][0]
+    doc = prune_mutation_taster(doc, aae_column=mutation_taster_aae_column, model_column=mutation_taster_model_column,
+                                pred_column=mutation_taster_pred_column, score_column=mutation_taster_score_column, na_values=na_values)
+
+    return doc
+
+
+def prune_hg38_doc(doc: dict, na_values: set = NA_VALUES):
+    return prune_hg19_doc(doc, na_values=na_values)
+
+
+def construct_raw_doc(row: dict, columns: list, na_values: set = NA_VALUES):
+    """
+    Construct a raw dbnsfp doc from a dict-like row read from the csv file.
+    "Raw" means 1) the doc may contain dot fields that are not parsed, and 2) some values in the doc need further treatment/processing.
+
+    Args:
+        row: a dict representing a csv row's content
+        columns: a list of Column object indicating how to construct each column
+        na_values: a set of values seen as NA
+    Returns:
+        a dict representing the doc's json object
+    """
+    result = dict()
+
+    for column in columns:
+        value = row[column.name]
+        if value in na_values:
+            continue
+
+        value = column.transform(value)
+        if value is None:
+            continue
+
+        result[column.dest] = value
+
+    return result
+
+
+def construct_hg19_raw_doc(row: dict, na_values: set = NA_VALUES):
+    return construct_raw_doc(row, columns=HG19_COLUMNS, na_values=na_values)
+
+
+def construct_hg38_raw_doc(row: dict, na_values: set = NA_VALUES):
+    return construct_raw_doc(row, columns=HG38_COLUMNS, na_values=na_values)
+
+
+def make_hgvs_id(doc: dict, chrom_column: Column, pos_column: Column, ref_column: Column, alt_column: Column):
+    chrom_value = doc[chrom_column.dest]
+    pos_value = doc[pos_column.dest]["start"]  # see make_zero_based()
+    ref_value = doc[ref_column.dest]
+    alt_value = doc[alt_column.dest]
+
+    hgvs_id = "chr%s:g.%d%s>%s" % (chrom_value, pos_value, ref_value, alt_value)
+    return hgvs_id
+
+
+def make_hg19_hgvs_id(doc: dict):
+    chrom_column = TAG_COLUMN_MAP[COLUMN_TAG.HG19_CHROM][0]
+    pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG19_POS][0]
+    ref_column = TAG_COLUMN_MAP[COLUMN_TAG.REF_ALLELE][0]
+    alt_column = TAG_COLUMN_MAP[COLUMN_TAG.ALT_ALLELE][0]
+
+    return make_hgvs_id(doc, chrom_column=chrom_column, pos_column=pos_column, ref_column=ref_column, alt_column=alt_column)
+
+
+def make_hg38_hgvs_id(doc: dict):
+    chrom_column = TAG_COLUMN_MAP[COLUMN_TAG.HG38_CHROM][0]
+    pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG38_POS][0]
+    ref_column = TAG_COLUMN_MAP[COLUMN_TAG.REF_ALLELE][0]
+    alt_column = TAG_COLUMN_MAP[COLUMN_TAG.ALT_ALLELE][0]
+
+    return make_hgvs_id(doc, chrom_column=chrom_column, pos_column=pos_column, ref_column=ref_column, alt_column=alt_column)
+
+
+def construct_hg19_doc(row: dict, na_values: set = NA_VALUES):
+    verified = verify_hg19_row(row, na_values=na_values)
+    if not verified:
+        return None
+
+    row = normalize_hg19_row(row)
+    raw_doc = construct_hg19_raw_doc(row, na_values=na_values)
+    raw_doc = prune_hg19_doc(raw_doc, na_values=na_values)
+    hgvs_id = make_hg19_hgvs_id(raw_doc)
+
+    doc = {
+        "_id": hgvs_id,
+        "dbnsfp": parse_dot_fields(raw_doc)  # convert dot-fields into nested dictionaries
+    }
+    return doc
+
+
+def construct_hg38_doc(row: dict, na_values: set = NA_VALUES):
+    verified = verify_hg38_row(row, na_values=na_values)
+    if not verified:
+        return None
+
+    row = normalize_hg38_row(row)
+    raw_doc = construct_hg38_raw_doc(row, na_values=na_values)
+    raw_doc = prune_hg38_doc(raw_doc, na_values=na_values)
+    hgvs_id = make_hg38_hgvs_id(raw_doc)
+
+    doc = {
+        "_id": hgvs_id,
+        "dbnsfp": parse_dot_fields(raw_doc)  # convert dot-fields into nested dictionaries
+    }
+    return doc
+
+
+def load_file(path: str, assembly: str):
+    file = anyfile(path)
+    file_reader = csv.DictReader(file, delimiter="\t")
+
+    num_columns = len(file_reader.fieldnames)
+    assert num_columns == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, num_columns)
+
+    _construct_doc = None
+    match assembly:
+        case "hg19":
+            _construct_doc = construct_hg19_doc
+        case "hg38":
+            _construct_doc = construct_hg38_doc
+        case _:
+            raise ValueError(f"Cannot recognize assembly. Accept 'hg19' or 'hg38', got '{assembly}'.")
+
+    last_doc = None
+    for row in file_reader:
+        curr_doc = _construct_doc(row, na_values=NA_VALUES)
+
+        if curr_doc is None:
+            continue
+
+        if last_doc is not None:
+            if curr_doc["_id"] == last_doc["_id"]:
+                last_protein_field = last_doc["dbnsfp"]["protein"]
+                curr_protein_field = curr_doc["dbnsfp"]["protein"]
+
+                # We guarantee that the protein field is always a list at this moment. See prune_protein()
+                # if not isinstance(last_protein_field, list):
+                #     last_protein_field = [last_protein_field]
+                last_protein_field.append(curr_protein_field)
+
+                last_doc["dbnsfp"]["protein"] = last_protein_field
+                continue
+            else:
+                if len(last_doc["dbnsfp"]["protein"]) == 1:
+                    last_doc["dbnsfp"]["protein"] = last_doc["dbnsfp"]["protein"][0]
+                yield last_doc
+
+        last_doc = curr_doc
+
+    # yield the very last doc
+    if last_doc:
+        if len(last_doc["dbnsfp"]["protein"]) == 1:
+            last_doc["dbnsfp"]["protein"] = last_doc["dbnsfp"]["protein"][0]
+        yield last_doc
+
+    file.close()

From 191878e2b5abf10a1417ac864bda74e0037f54e2 Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Tue, 31 Oct 2023 13:44:12 -0700
Subject: [PATCH 11/13] add mapping for V2 parser

---
 .../sources/dbnsfp/dbnsfp_mapping_44a_v2.py   | 1151 +++++++++++++++++
 1 file changed, 1151 insertions(+)
 create mode 100644 src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py
new file mode 100644
index 00000000..92b455f5
--- /dev/null
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py
@@ -0,0 +1,1151 @@
+start_pos_field = {"start": {"type": "integer"}}
+end_pos_field = {"end": {"type": "integer"}}
+
+score_field = {"score": {"type": "float"}}
+converted_rankscore_field = {"converted_rankscore": {"type": "float"}}
+rankscore_field = {"rankscore": {"type": "float"}}
+confidence_value_field = {"confidence_value": {"type": "int"}}
+pred_field = {
+    "pred": {
+        "type": "keyword",
+        "normalizer": "keyword_lowercase_normalizer"
+    }
+}
+
+allele_count_field = {"ac": {"type": "integer"}}
+allele_num_field = {"an": {"type": "integer"}}
+allele_freq_field = {"af": {"type": "float"}}
+adj_allele_count_field = {"adj_ac": {"type": "integer"}}
+adj_allele_freq_field = {"adj_af": {"type": "float"}}
+
+mapping = {
+    "dbnsfp": {
+        "properties": {
+            "rsid": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            },
+            "chrom": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            },
+            "hg19": {
+                "properties": {
+                    **start_pos_field,
+                    **end_pos_field
+                }
+            },
+            "hg18": {
+                "properties": {
+                    **start_pos_field,
+                    **end_pos_field
+                }
+            },
+            "hg38": {
+                "properties": {
+                    **start_pos_field,
+                    **end_pos_field
+                }
+            },
+            "ref": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            },
+            "alt": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            },
+            "aa": {
+                "properties": {
+                    "ref": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "alt": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+
+                }
+            },
+            "protein": {
+                "properties": {
+                    "aa": {
+                        "properties": {
+                            "pos": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+
+                        }
+                    },
+                    "genename": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "geneid": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "transcriptid": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "proteinid": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "uniprot": {
+                        "properties": {
+                            "acc": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "entry": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            }
+                        }
+                    },
+                    "hgvsc": {
+                        "properties": {
+                            "annovar": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "snpeff": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "vep": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            }
+                        }
+                    },
+                    "hgvsp": {
+                        "properties": {
+                            "annovar": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "snpeff": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "vep": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            }
+                        }
+                    },
+                    "appris": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "gencode_basic": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "tsl": {
+                        "type": "integer"
+                    },
+                    "vep_canonical": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "refcodon": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "codonpos": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "codon_degeneracy": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "sift": {
+                        "properties": {
+                            **score_field,
+                            **pred_field
+                        }
+                    },
+                    "sift4g": {
+                        "properties": {
+                            **score_field,
+                            **pred_field
+                        }
+                    },
+                    "polyphen2": {
+                        "properties": {
+                            "hdiv": {
+                                "properties": {
+                                    **score_field,
+                                    **pred_field
+                                }
+                            },
+                            "hvar": {
+                                "properties": {
+                                    **score_field,
+                                    **pred_field
+                                }
+                            }
+                        }
+                    },
+                    "mutationassessor": {
+                        "properties": {
+                            **score_field,
+                            **pred_field,
+                        }
+                    },
+                    "fathmm": {
+                        "properties": {
+                            **score_field,
+                            **pred_field
+                        }
+                    },
+                    "provean": {
+                        "properties": {
+                            **score_field,
+                            **pred_field
+                        }
+                    },
+                    "vest4": {
+                        "properties": {
+                            **score_field,
+                        }
+                    },
+                    "revel": {
+                        "properties": {
+                            **score_field,
+                        }
+                    },
+                    "mvp": {
+                        "properties": {
+                            **score_field,
+                        }
+                    },
+                    "gmvp": {  # new in 4.4.a
+                        "properties": {
+                            **score_field,
+                        }
+                    },
+                    "mpc": {
+                        "properties": {
+                            **score_field,
+                        }
+                    },
+                    "aloft": {
+                        "properties": {
+                            "fraction_transcripts_affected": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "prob_tolerant": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "prob_recessive": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "prob_dominant": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "pred": {
+                                "type": "keyword",
+                                "normalizer": "keyword_lowercase_normalizer"
+                            },
+                            "confidence": {
+                                "type": "text"
+                            }
+                        }
+                    },
+                }
+            },
+
+
+            "cds_strand": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            },
+            "ancestral_allele": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            },
+            "altai_neandertal": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            },
+            "denisova": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            },
+            "vindijia_neandertal": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            },
+            "chagyrskaya_neandertal": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            },
+
+            "sift": {
+                "properties": {
+                    **converted_rankscore_field,
+                }
+            },
+            "sift4g": {
+                "properties": {
+                    **converted_rankscore_field,
+                }
+            },
+            "polyphen2": {
+                "properties": {
+                    "hdiv": {
+                        "properties": {
+                            **rankscore_field,
+                        }
+                    },
+                    "hvar": {
+                        "properties": {
+                            **rankscore_field,
+                        }
+                    }
+                }
+            },
+            "lrt": {
+                "properties": {
+                    **score_field,
+                    **converted_rankscore_field,
+                    **pred_field,
+                    "omega": {
+                        "type": "float"
+                    }
+                }
+            },
+            "mutationtaster": {
+                "properties": {
+                    **score_field,
+                    **converted_rankscore_field,
+                    **pred_field,
+                    "model": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "aae": {
+                        "type": "text"
+                    }
+                }
+            },
+            "mutationassessor": {
+                "properties": {
+                    **rankscore_field,
+                }
+            },
+            "fathmm": {
+                "properties": {
+                    **converted_rankscore_field,
+                }
+            },
+            "provean": {
+                "properties": {
+                    **converted_rankscore_field,
+                }
+            },
+            "vest4": {
+                "properties": {
+                    **rankscore_field
+                }
+            },
+            "metasvm": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
+                }
+            },
+            "metalr": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
+                }
+            },
+            "reliability_index": {
+                "type": "integer"
+            },
+            "metarnn": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
+                }
+            },
+            "m-cap": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
+                }
+            },
+            "revel": {
+                "properties": {
+                    **rankscore_field
+                }
+            },
+            "mutpred": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field,
+                    "accession": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "aa_change": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "pred": {
+                        "properties": {
+                            "p_val": {
+                                "type": "float"
+                            },
+                            "mechanism": {
+                                "type": "text"
+                            }
+                        }
+                    }
+                }
+            },
+            "mvp": {
+                "properties": {
+                    **rankscore_field
+                }
+            },
+            "gmvp": {  # new in 4.4.a
+                "properties": {
+                    **rankscore_field
+                }
+            },
+            "mpc": {
+                "properties": {
+                    **rankscore_field
+                }
+            },
+            "primateai": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
+                }
+            },
+            "deogen2": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
+                }
+            },
+            "bayesdel": {
+                "properties": {
+                    "add_af": {
+                        "properties": {
+                            **score_field,
+                            **rankscore_field,
+                            **pred_field
+                        }
+                    },
+                    "no_af": {
+                        "properties": {
+                            **score_field,
+                            **rankscore_field,
+                            **pred_field
+                        }
+                    }
+                }
+            },
+            "clinpred": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
+                }
+            },
+            "list-s2": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field,
+                    **pred_field
+                }
+            },
+            "varity": {
+                "r": {
+                    "properties": {
+                        **score_field,
+                        **rankscore_field
+                    }
+                },
+                "er": {
+                    "properties": {
+                        **score_field,
+                        **rankscore_field
+                    }
+                },
+                "r_loo": {
+                    "properties": {
+                        **score_field,
+                        **rankscore_field
+                    }
+                },
+                "er_loo": {
+                    "properties": {
+                        **score_field,
+                        **rankscore_field
+                    }
+                }
+            },
+            "cadd": {
+                # Only for "hg38"
+                # No CADD fields will be included for "hg19"
+                "properties": {
+                    "raw_score": {
+                        "type": "float"
+                    },
+                    "raw_rankscore": {
+                        "type": "float"
+                    },
+                    "pred": {
+                        "type": "float"  # CADD phred-like scores, not as other predications of string type
+                    }
+                }
+            },
+            "dann": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field
+                }
+            },
+            "fathmm-mkl": {
+                "properties": {
+                    "coding_score": {
+                        "type": "float"
+                    },
+                    "coding_rankscore": {
+                        "type": "float"
+                    },
+                    "coding_pred": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "coding_group": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    }
+                }
+            },
+            "fathmm-xf": {
+                "properties": {
+                    "coding_score": {
+                        "type": "float"
+                    },
+                    "coding_rankscore": {
+                        "type": "float"
+                    },
+                    "coding_pred": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    }
+                }
+            },
+            "eigen": {
+                "properties": {
+                    "raw_coding": {
+                        "type": "float"
+                    },
+                    "raw_coding_rankscore": {
+                        "type": "float"
+                    },
+                    "phred_coding": {
+                        "type": "float"
+                    }
+                }
+            },
+            "eigen-pc": {
+                "properties": {
+                    "raw_coding": {
+                        "type": "float"
+                    },
+                    "raw_coding_rankscore": {
+                        "type": "float"
+                    },
+                    "phred_coding": {
+                        "type": "float"
+                    },
+                }
+            },
+            "genocanyon": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field
+                }
+            },
+            # "integrated": {
+            #     "properties": {
+            #         "fitcons_score": { "type": "float" },
+            #         "fitcons_rankscore": { "type": "float" },
+            #         "confidence_value": { "type": "integer" }
+            #     }
+            # },
+            # "gm12878": {
+            #     "properties": {
+            #         "fitcons_score": { "type": "float" },
+            #         "fitcons_rankscore": { "type": "float" },
+            #         "confidence_value": { "type": "integer" }
+            #     }
+            # },
+            # "h1-hesc": {
+            #     "properties": {
+            #         "fitcons_score": { "type": "float" },
+            #         "fitcons_rankscore": { "type": "float" },
+            #         "confidence_value": { "type": "integer" }
+            #     }
+            # },
+            # "huvec": {
+            #     "properties": {
+            #         "fitcons_score": { "type": "float" },
+            #         "fitcons_rankscore": { "type": "float" },
+            #         "confidence_value": { "type": "integer" }
+            #     }
+            # },
+            "fitcons": {
+                "properties": {
+                    "integrated": {
+                        "properties": {
+                            **score_field,
+                            **rankscore_field,
+                            **confidence_value_field
+                        }
+                    },
+                    "gm12878": {
+                        "properties": {
+                            **score_field,
+                            **rankscore_field,
+                            **confidence_value_field
+                        }
+                    },
+                    "h1-hesc": {
+                        "properties": {
+                            **score_field,
+                            **rankscore_field,
+                            **confidence_value_field
+                        }
+                    },
+                    "huvec": {
+                        "properties": {
+                            **score_field,
+                            **rankscore_field,
+                            **confidence_value_field
+                        }
+                    },
+                }
+            },
+            "linsight": {
+                "properties": {
+                    **score_field,
+                    **rankscore_field
+                }
+            },
+            "gerp++": {
+                "properties": {
+                    "nr": {
+                        "type": "float"
+                    },
+                    "rs": {
+                        "type": "float"
+                    },
+                    "rs_rankscore": {
+                        "type": "float"
+                    }
+                }
+            },
+            "phylop": {
+                "properties": {
+                    "100way_vertebrate": {
+                        "properties": {
+                            **score_field,
+                            **rankscore_field
+                        }
+                    },
+                    "470way_mammalian": {  # replaced 30way_mammalian in 4.4.a
+                        "properties": {
+                            **score_field,
+                            **rankscore_field
+                        }
+                    },
+                    "17way_primate": {
+                        "properties": {
+                            **score_field,
+                            **rankscore_field
+                        }
+                    }
+                }
+            },
+            "phastcons": {
+                "properties": {
+                    "100way_vertebrate": {
+                        "properties": {
+                            **score_field,
+                            **rankscore_field
+                        }
+                    },
+                    "470way_mammalian": {  # replaced 30way_mammalian in 4.4.a
+                        "properties": {
+                            **score_field,
+                            **rankscore_field
+                        }
+                    },
+                    "17way_primate": {
+                        "properties": {
+                            **score_field,
+                            **rankscore_field
+                        }
+                    }
+                }
+            },
+            "siphy_29way": {
+                "properties": {
+                    "pi": {
+                        "properties": {
+                            "a": {
+                                "type": "float"
+                            },
+                            "c": {
+                                "type": "float"
+                            },
+                            "g": {
+                                "type": "float"
+                            },
+                            "t": {
+                                "type": "float"
+                            }
+                        }
+                    },
+                    "logodds_score": {
+                        "type": "float"
+                    },
+                    "logodds_rankscore": {
+                        "type": "float"
+                    }
+                }
+            },
+            "bstatistic": {
+                "properties": {
+                    **score_field,
+                    **converted_rankscore_field
+                }
+            },
+            "1000gp3": {  # changed since 4.4.a
+                "properties": {
+                    **allele_count_field,
+                    **allele_freq_field,
+                    # "afr_ac": { "type": "integer" },
+                    # "afr_af": { "type": "float" },
+                    "afr": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "eur_ac": { "type": "integer" },
+                    # "eur_af": { "type": "float" },
+                    "eur": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "amr_ac": { "type": "integer" },
+                    # "amr_af": { "type": "float" },
+                    "amr": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "eas_ac": { "type": "integer" },
+                    # "eas_af": { "type": "float" },
+                    "eas": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "sas_ac": { "type": "integer" },
+                    # "sas_af": { "type": "float"}
+                    "sas": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    }
+                }
+            },
+            "twinsuk": {
+                "properties": {
+                    **allele_count_field,
+                    **allele_freq_field
+                }
+            },
+            "alspac": {
+                "properties": {
+                    **allele_count_field,
+                    **allele_freq_field
+                }
+            },
+            "uk10k": {
+                "properties": {
+                    **allele_count_field,
+                    **allele_freq_field
+                }
+            },
+            "esp6500": {  # changed since 4.4.a
+                "properties": {
+                    # "aa_ac": { "type": "integer" },
+                    # "aa_af": { "type": "float" },
+                    "aa": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "ea_ac": { "type": "integer" },
+                    # "ea_af": { "type": "float" }
+                    "ea": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                }
+            },
+            "exac": {  # changed since 4.4.a
+                "properties": {
+                    **allele_count_field,
+                    **allele_freq_field,
+                    **adj_allele_count_field,
+                    **adj_allele_freq_field,
+                    # "afr_ac": { "type": "integer" },
+                    # "afr_af": { "type": "float" },
+                    "afr": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "amr_ac": { "type": "integer" },
+                    # "amr_af": { "type": "float" },
+                    "amr": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "eas_ac": { "type": "integer" },
+                    # "eas_af": { "type": "float" },
+                    "eas": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "fin_ac": { "type": "integer" },
+                    # "fin_af": { "type": "float" },
+                    "fin": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "nfe_ac": { "type": "integer" },
+                    # "nfe_af": { "type": "float" },
+                    "nfe": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "sas_ac": { "type": "integer" },
+                    # "sas_af": { "type": "float" }
+                    "sas": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    }
+                }
+            },
+            "exac_nontcga": {  # changed since 4.4.a
+                "properties": {
+                    **allele_count_field,
+                    **allele_freq_field,
+                    **adj_allele_count_field,
+                    **adj_allele_freq_field,
+                    # "afr_ac": { "type": "integer" },
+                    # "afr_af": { "type": "float" },
+                    "afr": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "amr_ac": { "type": "integer" },
+                    # "amr_af": { "type": "float" },
+                    "amr": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "eas_ac": { "type": "integer" },
+                    # "eas_af": { "type": "float" },
+                    "eas": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "fin_ac": { "type": "integer" },
+                    # "fin_af": { "type": "float" },
+                    "fin": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "nfe_ac": { "type": "integer" },
+                    # "nfe_af": { "type": "float" },
+                    "nfe": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "sas_ac": { "type": "integer" },
+                    # "sas_af": { "type": "float" }
+                    "sas": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    }
+                }
+            },
+            "exac_nonpsych": {  # changed since 4.4.a
+                "properties": {
+                    **allele_count_field,
+                    **allele_freq_field,
+                    **adj_allele_count_field,
+                    **adj_allele_freq_field,
+                    # "afr_ac": { "type": "integer" },
+                    # "afr_af": { "type": "float" },
+                    "afr": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "amr_ac": { "type": "integer" },
+                    # "amr_af": { "type": "float" },
+                    "amr": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "eas_ac": { "type": "integer" },
+                    # "eas_af": { "type": "float" },
+                    "eas": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "fin_ac": { "type": "integer" },
+                    # "fin_af": { "type": "float" },
+                    "fin": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "nfe_ac": { "type": "integer" },
+                    # "nfe_af": { "type": "float" },
+                    "nfe": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    },
+                    # "sas_ac": { "type": "integer" },
+                    # "sas_af": { "type": "float" }
+                    "sas": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_freq_field
+                        }
+                    }
+                }
+            },
+            "alfa": {  # new in 4.4.a
+                "properties": {
+                    "european": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "african_others": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "east_asian": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "african_american": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "latin_american_1": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "latin_american_2": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "other_asian": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "south_asian": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "other": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "african": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "asian": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                    "total": {
+                        "properties": {
+                            **allele_count_field,
+                            **allele_num_field,
+                            **allele_freq_field
+                        }
+                    },
+                }
+            },
+            "clinvar": {
+                "properties": {
+                    "clinvar_id": {
+                        "type": "integer"
+                    },
+                    "clinsig": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "trait": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "review": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "hgvs": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "var_source": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "medgen": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "omim": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "orphanet": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    }
+                }
+            },
+            "interpro_domain": {
+                "type": "text"
+            },
+            "gtex": {
+                "properties": {
+                    "gene": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    },
+                    "tissue": {
+                        "type": "keyword",
+                        "normalizer": "keyword_lowercase_normalizer"
+                    }
+                }
+            },
+            "geuvadis_eqtl_target_gene": {
+                "type": "keyword",
+                "normalizer": "keyword_lowercase_normalizer"
+            }
+        }
+    }
+}

From aee64e50ae2683dce4383e8b1923ebc74cdc4632 Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Fri, 3 Nov 2023 13:58:02 -0700
Subject: [PATCH 12/13] delete blank lines in v2 mapping; fix a bug in merging
 protein field in v2 parser

---
 src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py | 2 --
 src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py  | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py
index 92b455f5..e09206e4 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v2.py
@@ -266,8 +266,6 @@
                     },
                 }
             },
-
-
             "cds_strand": {
                 "type": "keyword",
                 "normalizer": "keyword_lowercase_normalizer"
diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py
index a8f24a7a..02914f55 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py
@@ -874,7 +874,7 @@ def load_file(path: str, assembly: str):
                 # We guarantee that the protein field is always a list at this moment. See prune_protein()
                 # if not isinstance(last_protein_field, list):
                 #     last_protein_field = [last_protein_field]
-                last_protein_field.append(curr_protein_field)
+                last_protein_field.extend(curr_protein_field)
 
                 last_doc["dbnsfp"]["protein"] = last_protein_field
                 continue

From 29364f927dcafee9fd7fc56d50376926743f9469 Mon Sep 17 00:00:00 2001
From: Yao Yao <yaoy.cs@gmail.com>
Date: Fri, 3 Nov 2023 13:59:15 -0700
Subject: [PATCH 13/13] add v2 uploaders

---
 .../dataload/sources/dbnsfp/dbnsfp_upload.py  | 59 +++++++++++++++----
 1 file changed, 49 insertions(+), 10 deletions(-)

diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py
index 9815bd2b..2e0e4e70 100644
--- a/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py
+++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py
@@ -1,8 +1,11 @@
 import os
 import glob
 
-from .dbnsfp_mapping import mapping
-from .dbnsfp_parser_44a_v1 import load_file
+from .dbnsfp_mapping_44a_v1 import mapping as mapping_v1
+from .dbnsfp_parser_44a_v1 import load_file as load_file_v1
+from .dbnsfp_mapping_44a_v2 import mapping as mapping_v2
+from .dbnsfp_parser_44a_v2 import load_file as load_file_v2
+
 import biothings.hub.dataload.uploader as uploader
 from hub.dataload.uploader import SnpeffPostUpdateUploader
 from hub.dataload.storage import MyVariantIgnoreDuplicatedStorage
@@ -15,15 +18,14 @@
 }
 
 
-class DBNSFPBaseUploader(uploader.ParallelizedSourceUploader,
-                         SnpeffPostUpdateUploader):
+class DBNSFPBaseUploaderV1(uploader.ParallelizedSourceUploader, SnpeffPostUpdateUploader):
 
     storage_class = MyVariantIgnoreDuplicatedStorage
     GLOB_PATTERN = "dbNSFP*_variant.chr*"
 
     @classmethod
     def get_mapping(cls):
-        return mapping
+        return mapping_v1
 
     def jobs(self):
         paths = glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN))
@@ -32,11 +34,48 @@ def jobs(self):
 
     def load_data(self, path, assembly):
         self.logger.debug("loading file " + path)
-        return load_file(path, version=assembly)
+        return load_file_v1(path, version=assembly)
+
+
+class DBNSFPBaseUploaderV2(uploader.ParallelizedSourceUploader, SnpeffPostUpdateUploader):
+
+    storage_class = MyVariantIgnoreDuplicatedStorage
+    GLOB_PATTERN = "dbNSFP*_variant.chr*"
+
+    @classmethod
+    def get_mapping(cls):
+        return mapping_v2
+
+    def jobs(self):
+        paths = glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN))
+        assembly = self.__class__.__metadata__["assembly"]
+        return map(lambda path: (path, assembly), paths)
+
+    def load_data(self, path, assembly):
+        self.logger.debug("loading file " + path)
+        return load_file_v2(path, version=assembly)
+
+
+class DBNSFPHG38UploaderV1(DBNSFPBaseUploaderV1):
+    name = "dbnsfp_hg38_v1"
+    main_source = "dbnsfp"
+    __metadata__ = {
+        "assembly": "hg38",
+        "src_meta": SRC_META
+    }
+
+
+class DBNSFPHG19UploaderV1(DBNSFPBaseUploaderV1):
+    name = "dbnsfp_hg19_v1"
+    main_source = "dbnsfp"
+    __metadata__ = {
+        "assembly": "hg19",
+        "src_meta": SRC_META
+    }
 
 
-class DBNSFPHG38Uploader(DBNSFPBaseUploader):
-    name = "dbnsfp_hg38"
+class DBNSFPHG38UploaderV2(DBNSFPBaseUploaderV2):
+    name = "dbnsfp_hg38_v2"
     main_source = "dbnsfp"
     __metadata__ = {
         "assembly": "hg38",
@@ -44,8 +83,8 @@ class DBNSFPHG38Uploader(DBNSFPBaseUploader):
     }
 
 
-class DBNSFPHG19Uploader(DBNSFPBaseUploader):
-    name = "dbnsfp_hg19"
+class DBNSFPHG19UploaderV2(DBNSFPBaseUploaderV2):
+    name = "dbnsfp_hg19_v2"
     main_source = "dbnsfp"
     __metadata__ = {
         "assembly": "hg19",