diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py deleted file mode 100644 index fcccb139..00000000 --- a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping.py +++ /dev/null @@ -1,1116 +0,0 @@ -mapping = { - "dbnsfp": { - "properties": { - "rsid": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "chrom": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "hg19": { - "properties": { - "start": { - "type": "integer" - }, - "end": { - "type": "integer" - } - } - }, - "hg18": { - "properties": { - "start": { - "type": "integer" - }, - "end": { - "type": "integer" - } - } - }, - "hg38": { - "properties": { - "start": { - "type": "integer" - }, - "end": { - "type": "integer" - } - } - }, - "ref": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "alt": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "aa": { - "properties": { - "ref": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "alt": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "pos": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "refcodon": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "codonpos": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "codon_degeneracy": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "genename": { # Column 13 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "ensembl": { # Column 14-16 - "properties": { - "geneid": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "transcriptid": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "proteinid": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "uniprot": { # Column 17-18 - 
"properties": { - "acc": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "entry": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "hgvsc": { # Column 19-21 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "hgvsp": { # Column 22-24 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "appris": { # Column 25 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "genecode_basic": { # Column 26 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "tsl": { # Column 27 - "type": "integer" - }, - "vep_canonical": { # Column 28 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "cds_strand": { # Column 29 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "ancestral_allele": { # Column 33 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "altai_neandertal": { # Column 34 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "denisova": { # Column 35 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "vindijia_neandertal": { # Column 36 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "chagyrskaya_neandertal": { # Column 37 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "sift": { # Column 38-40 - "properties": { - "score": { - "type": "float" - }, - "converted_rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "sift4g": { # Column 41-43 - "properties": { - "score": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "converted_rankscore": { - "type": "float" - } - } - }, - "polyphen2": { # Column 44-49 - "properties": { - "hdiv": { - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": 
"float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "hvar": { - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - } - } - }, - "lrt": { # Column 50-53 - "properties": { - "score": { - "type": "float" - }, - "converted_rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "omega": { - "type": "float" - } - } - }, - "mutationtaster": { # Column 54-58 - "properties": { - "score": { - "type": "float" - }, - "converted_rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "model": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "AAE": { - "type": "text" - } - } - }, - "mutationassessor": { # Column 59-61 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "fathmm": { # Column 62-64 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "provean": { # Column 65-67 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "vest4": { # Column 68-69 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "metasvm": { # Column 70-72 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "metalr": { # Column 73-75 - "properties": { - "score": { - 
"type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "reliability_index": { # Column 76 - "type": "integer" - }, - "metarnn": { # Column 77-79 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "m-cap": { # Column 80-82 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "revel": { # Column 83-84 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "mutpred": { # Column 85-89 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "accession": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "aa_change": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "pred": { - "properties": { - "p_val": { - "type": "float" - }, - "mechanism": { - "type": "text" - } - } - } - } - }, - "mvp": { # Column 90-91 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "mpc": { # Column 92-93 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "primateai": { # Column 94-96 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "deogen2": { # Column 97-99 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "bayesdel": { # Column 100-105 - "properties": { - "add_af": { - "properties": { - "score": { - 
"type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "no_af": { - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - } - } - }, - "clinpred": { # Column 106-108 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "list-s2": { # Column 109-111 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "aloft": { # Column 112-117 - "properties": { - "prob_tolerant": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "prob_recessive": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "prob_dominant": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "fraction_transcripts_affected": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "confidence": { - "type": "text" - } - } - }, - "cadd": { - # Column 118-123 - # Column 118-120 are hg38 - # Column 121-123 are hg19 - # Only column 117-119 will be included in the document for "hg38" - # No CADD fields will be included when "hg19" - "properties": { - "raw_score": { - "type": "float" - }, - "raw_rankscore": { - "type": "float" - }, - "pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "dann": { # Column 124-125 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "fathmm-mkl": { # Column 126-129 - "properties": { - "coding_score": { - "type": "float" - }, 
- "coding_rankscore": { - "type": "float" - }, - "coding_pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "coding_group": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "fathmm-xf": { # Column 130-132 - "properties": { - "coding_score": { - "type": "float" - }, - "coding_rankscore": { - "type": "float" - }, - "coding_pred": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "eigen": { # Column 133-135 - "properties": { - "raw_coding": { - "type": "float" - }, - "raw_coding_rankscore": { - "type": "float" - }, - "phred_coding": { - "type": "float" - } - } - }, - "eigen-pc": { # Column 136-138 - "properties": { - "raw_coding": { - "type": "float" - }, - "raw_coding_rankscore": { - "type": "float" - }, - "phred_coding": { - "type": "float" - }, - } - }, - "genocanyon": { # Column 139-140 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "integrated": { # Column 141-143 - "properties": { - "fitcons_score": { - "type": "float" - }, - "fitcons_rankscore": { - "type": "float" - }, - "confidence_value": { - "type": "integer" - } - } - }, - "gm12878": { # Column 144-146 - "properties": { - "fitcons_score": { - "type": "float" - }, - "fitcons_rankscore": { - "type": "float" - }, - "confidence_value": { - "type": "integer" - } - } - }, - "h1-hesc": { # Column 147-149 - "properties": { - "fitcons_score": { - "type": "float" - }, - "fitcons_rankscore": { - "type": "float" - }, - "confidence_value": { - "type": "integer" - } - } - }, - "huvec": { # Column 150-152 - "properties": { - "fitcons_score": { - "type": "float" - }, - "fitcons_rankscore": { - "type": "float" - }, - "confidence_value": { - "type": "integer" - } - } - }, - "linsight": { # Column 153-154 - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "gerp++": { # Column 155-157 - "properties": { - "nr": { - 
"type": "float" - }, - "rs": { - "type": "float" - }, - "rs_rankscore": { - "type": "float" - } - } - }, - "phylop": { # Column 158-163 - "properties": { - "100way_vertebrate": { - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "30way_mammalian": { - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "17way_primate": { - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - } - } - }, - "phastcons": { # Column 164-169 - "properties": { - "100way_vertebrate": { - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "30way_mammalian": { - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - }, - "17way_primate": { - "properties": { - "score": { - "type": "float" - }, - "rankscore": { - "type": "float" - } - } - } - } - }, - "siphy_29way": { # Column 170-172 - "properties": { - "pi": { - "properties": { - "a": { - "type": "float" - }, - "c": { - "type": "float" - }, - "g": { - "type": "float" - }, - "t": { - "type": "float" - } - } - }, - "logodds_score": { - "type": "float" - }, - "logodds_rankscore": { - "type": "float" - } - } - }, - "bstatistic": { # Column 173-174 - "properties": { - "score": { - "type": "integer" - }, - "converted_rankscore": { - "type": "float" - } - } - }, - "1000gp3": { # Column 175-186 - "properties": { - "ac": { - "type": "integer" - }, - "af": { - "type": "float" - }, - "afr_ac": { - "type": "integer" - }, - "afr_af": { - "type": "float" - }, - "eur_ac": { - "type": "integer" - }, - "eur_af": { - "type": "float" - }, - "amr_ac": { - "type": "integer" - }, - "amr_af": { - "type": "float" - }, - "eas_ac": { - "type": "integer" - }, - "eas_af": { - "type": "float" - }, - "sas_ac": { - "type": "integer" - }, - "sas_af": { - "type": "float" - } - } - }, - "twinsuk": { # Column 187-188 - "properties": { - 
"ac": { - "type": "integer" - }, - "af": { - "type": "float" - } - } - }, - "alspac": { # Column 189-190 - "properties": { - "ac": { - "type": "integer" - }, - "af": { - "type": "float" - } - } - }, - "uk10k": { # Column 191-192 - "properties": { - "ac": { - "type": "integer" - }, - "af": { - "type": "float" - } - } - }, - "esp6500": { # Column 193-196 - "properties": { - "aa_ac": { - "type": "integer" - }, - "aa_af": { - "type": "float" - }, - "ea_ac": { - "type": "integer" - }, - "ea_af": { - "type": "float" - } - } - }, - "exac": { # Column 197-212 - "properties": { - "ac": { - "type": "integer" - }, - "af": { - "type": "float" - }, - "adj_ac": { - "type": "integer" - }, - "adj_af": { - "type": "float" - }, - "afr_ac": { - "type": "integer" - }, - "afr_af": { - "type": "float" - }, - "amr_ac": { - "type": "integer" - }, - "amr_af": { - "type": "float" - }, - "eas_ac": { - "type": "integer" - }, - "eas_af": { - "type": "float" - }, - "fin_ac": { - "type": "integer" - }, - "fin_af": { - "type": "float" - }, - "nfe_ac": { - "type": "integer" - }, - "nfe_af": { - "type": "float" - }, - "sas_ac": { - "type": "integer" - }, - "sas_af": { - "type": "float" - } - } - }, - "exac_nontcga": { # Column 213-228 - "properties": { - "ac": { - "type": "integer" - }, - "af": { - "type": "float" - }, - "adj_ac": { - "type": "integer" - }, - "adj_af": { - "type": "float" - }, - "afr_ac": { - "type": "integer" - }, - "afr_af": { - "type": "float" - }, - "amr_ac": { - "type": "integer" - }, - "amr_af": { - "type": "float" - }, - "eas_ac": { - "type": "integer" - }, - "eas_af": { - "type": "float" - }, - "fin_ac": { - "type": "integer" - }, - "fin_af": { - "type": "float" - }, - "nfe_ac": { - "type": "integer" - }, - "nfe_af": { - "type": "float" - }, - "sas_ac": { - "type": "integer" - }, - "sas_af": { - "type": "float" - } - } - }, - "exac_nonpsych": { # Column 229-244 - "properties": { - "ac": { - "type": "integer" - }, - "af": { - "type": "float" - }, - "adj_ac": { - "type": 
"integer" - }, - "adj_af": { - "type": "float" - }, - "afr_ac": { - "type": "integer" - }, - "afr_af": { - "type": "float" - }, - "amr_ac": { - "type": "integer" - }, - "amr_af": { - "type": "float" - }, - "eas_ac": { - "type": "integer" - }, - "eas_af": { - "type": "float" - }, - "fin_ac": { - "type": "integer" - }, - "fin_af": { - "type": "float" - }, - "nfe_ac": { - "type": "integer" - }, - "nfe_af": { - "type": "float" - }, - "sas_ac": { - "type": "integer" - }, - "sas_af": { - "type": "float" - } - } - }, - - # Column 245-630 are gnomAD_* columns. Skipped. - - "clinvar": { # Column 631-639 - "properties": { - "clinvar_id": { - "type": "integer" - }, - "clinsig": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "trait": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "review": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "hgvs": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "medgen": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "orphanet": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "var_source": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "interpro_domain": { # Column 640 - "type": "text" - }, - "gtex": { # Column 641-642 - "properties": { - "gene": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - }, - "tissue": { - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - }, - "geuvadis_eqtl_target_gene": { # Column 643 - "type": "keyword", - "normalizer": "keyword_lowercase_normalizer" - } - } - } -} \ No newline at end of file diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py new file mode 100644 index 00000000..c1d7bbcf --- /dev/null +++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_mapping_44a_v1.py @@ -0,0 +1,1070 @@ 
+start_pos_field = {"start": {"type": "integer"}} +end_pos_field = {"end": {"type": "integer"}} + +score_field = {"score": {"type": "float"}} +converted_rankscore_field = {"converted_rankscore": {"type": "float"}} +rankscore_field = {"rankscore": {"type": "float"}} +confidence_value_field = {"confidence_value": {"type": "integer"}} # Elasticsearch numeric type is "integer"; "int" is not a valid field type +pred_field = { + "pred": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + } +} + +allele_count_field = {"ac": {"type": "integer"}} +allele_num_field = {"an": {"type": "integer"}} +allele_freq_field = {"af": {"type": "float"}} +adj_allele_count_field = {"adj_ac": {"type": "integer"}} +adj_allele_freq_field = {"adj_af": {"type": "float"}} + +mapping = { + "dbnsfp": { + "properties": { + "rsid": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "chrom": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "hg19": { + "properties": { + **start_pos_field, + **end_pos_field + } + }, + "hg18": { + "properties": { + **start_pos_field, + **end_pos_field + } + }, + "hg38": { + "properties": { + **start_pos_field, + **end_pos_field + } + }, + "ref": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "alt": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "aa": { + "properties": { + "ref": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "alt": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "pos": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "refcodon": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "codonpos": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "codon_degeneracy": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + } + } + }, + "genename": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "ensembl": { + "properties": { + 
"geneid": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "transcriptid": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "proteinid": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + } + } + }, + "uniprot": { + "properties": { + "acc": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "entry": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + } + } + }, + "hgvsc": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "hgvsp": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "appris": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "gencode_basic": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "tsl": { + "type": "integer" + }, + "vep_canonical": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "cds_strand": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "ancestral_allele": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "altai_neandertal": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "denisova": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "vindijia_neandertal": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "chagyrskaya_neandertal": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "sift": { + "properties": { + **score_field, + **converted_rankscore_field, + **pred_field + } + }, + "sift4g": { + "properties": { + **score_field, + **converted_rankscore_field, + **pred_field + } + }, + "polyphen2": { + "properties": { + "hdiv": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + }, + "hvar": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + } + } + }, + "lrt": { + 
"properties": { + **score_field, + **converted_rankscore_field, + **pred_field, + "omega": { + "type": "float" + } + } + }, + "mutationtaster": { + "properties": { + **converted_rankscore_field, + "analysis": { # see prune_mutation_taster() + "properties": { + **pred_field, + **score_field, + "model": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "aae": { + "type": "text" + } + } + } + } + }, + "mutationassessor": { + "properties": { + **score_field, + **rankscore_field, + **pred_field, + } + }, + "fathmm": { + "properties": { + **score_field, + **converted_rankscore_field, + **pred_field + } + }, + "provean": { + "properties": { + **score_field, + **converted_rankscore_field, + **pred_field + } + }, + "vest4": { + "properties": { + **score_field, + **rankscore_field + } + }, + "metasvm": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + }, + "metalr": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + }, + "reliability_index": { + "type": "integer" + }, + "metarnn": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + }, + "m-cap": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + }, + "revel": { + "properties": { + **score_field, + **rankscore_field + } + }, + "mutpred": { + "properties": { + **score_field, + **rankscore_field, + "accession": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "aa_change": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "pred": { + "properties": { + "p_val": { + "type": "float" + }, + "mechanism": { + "type": "text" + } + } + } + } + }, + "mvp": { + "properties": { + **score_field, + **rankscore_field + } + }, + "gmvp": { # new in 4.4.a + "properties": { + **score_field, + **rankscore_field + } + }, + "mpc": { + "properties": { + **score_field, + **rankscore_field + } + }, + "primateai": { + "properties": { + **score_field, + 
**rankscore_field, + **pred_field + } + }, + "deogen2": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + }, + "bayesdel": { + "properties": { + "add_af": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + }, + "no_af": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + } + } + }, + "clinpred": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + }, + "list-s2": { + "properties": { + **score_field, + **rankscore_field, + **pred_field + } + }, + "varity": { "properties": { # sub-fields of an object must be nested under "properties" + "r": { + "properties": { + **score_field, + **rankscore_field + } + }, + "er": { + "properties": { + **score_field, + **rankscore_field + } + }, + "r_loo": { + "properties": { + **score_field, + **rankscore_field + } + }, + "er_loo": { + "properties": { + **score_field, + **rankscore_field + } + } + } }, + "aloft": { + "properties": { + "fraction_transcripts_affected": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "prob_tolerant": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "prob_recessive": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "prob_dominant": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "pred": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "confidence": { + "type": "text" + } + } + }, + "cadd": { + # Only for "hg38" + # No CADD fields will be included for "hg19" + "properties": { + "raw_score": { + "type": "float" + }, + "raw_rankscore": { + "type": "float" + }, + "pred": { + "type": "float" # CADD phred-like score: numeric, unlike the string-typed predictions of other tools + } + } + }, + "dann": { + "properties": { + **score_field, + **rankscore_field + } + }, + "fathmm-mkl": { + "properties": { + "coding_score": { + "type": "float" + }, + "coding_rankscore": { + "type": "float" + }, + "coding_pred": { + "type": "keyword", + "normalizer": 
"keyword_lowercase_normalizer" + }, + "coding_group": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + } + } + }, + "fathmm-xf": { + "properties": { + "coding_score": { + "type": "float" + }, + "coding_rankscore": { + "type": "float" + }, + "coding_pred": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + } + } + }, + "eigen": { + "properties": { + "raw_coding": { + "type": "float" + }, + "raw_coding_rankscore": { + "type": "float" + }, + "phred_coding": { + "type": "float" + } + } + }, + "eigen-pc": { + "properties": { + "raw_coding": { + "type": "float" + }, + "raw_coding_rankscore": { + "type": "float" + }, + "phred_coding": { + "type": "float" + }, + } + }, + "genocanyon": { + "properties": { + **score_field, + **rankscore_field + } + }, + # "integrated": { + # "properties": { + # "fitcons_score": { "type": "float" }, + # "fitcons_rankscore": { "type": "float" }, + # "confidence_value": { "type": "integer" } + # } + # }, + # "gm12878": { + # "properties": { + # "fitcons_score": { "type": "float" }, + # "fitcons_rankscore": { "type": "float" }, + # "confidence_value": { "type": "integer" } + # } + # }, + # "h1-hesc": { + # "properties": { + # "fitcons_score": { "type": "float" }, + # "fitcons_rankscore": { "type": "float" }, + # "confidence_value": { "type": "integer" } + # } + # }, + # "huvec": { + # "properties": { + # "fitcons_score": { "type": "float" }, + # "fitcons_rankscore": { "type": "float" }, + # "confidence_value": { "type": "integer" } + # } + # }, + "fitcons": { + "properties": { + "integrated": { + "properties": { + **score_field, + **rankscore_field, + **confidence_value_field + } + }, + "gm12878": { + "properties": { + **score_field, + **rankscore_field, + **confidence_value_field + } + }, + "h1-hesc": { + "properties": { + **score_field, + **rankscore_field, + **confidence_value_field + } + }, + "huvec": { + "properties": { + **score_field, + **rankscore_field, + **confidence_value_field + } + }, + } 
+ }, + "linsight": { + "properties": { + **score_field, + **rankscore_field + } + }, + "gerp++": { + "properties": { + "nr": { + "type": "float" + }, + "rs": { + "type": "float" + }, + "rs_rankscore": { + "type": "float" + } + } + }, + "phylop": { + "properties": { + "100way_vertebrate": { + "properties": { + **score_field, + **rankscore_field + } + }, + "470way_mammalian": { # replaced 30way_mammalian in 4.4.a + "properties": { + **score_field, + **rankscore_field + } + }, + "17way_primate": { + "properties": { + **score_field, + **rankscore_field + } + } + } + }, + "phastcons": { + "properties": { + "100way_vertebrate": { + "properties": { + **score_field, + **rankscore_field + } + }, + "470way_mammalian": { # replaced 30way_mammalian in 4.4.a + "properties": { + **score_field, + **rankscore_field + } + }, + "17way_primate": { + "properties": { + **score_field, + **rankscore_field + } + } + } + }, + "siphy_29way": { + "properties": { + "pi": { + "properties": { + "a": { + "type": "float" + }, + "c": { + "type": "float" + }, + "g": { + "type": "float" + }, + "t": { + "type": "float" + } + } + }, + "logodds_score": { + "type": "float" + }, + "logodds_rankscore": { + "type": "float" + } + } + }, + "bstatistic": { + "properties": { + **score_field, + **converted_rankscore_field + } + }, + "1000gp3": { # changed since 4.4.a + "properties": { + **allele_count_field, + **allele_freq_field, + # "afr_ac": { "type": "integer" }, + # "afr_af": { "type": "float" }, + "afr": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "eur_ac": { "type": "integer" }, + # "eur_af": { "type": "float" }, + "eur": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "amr_ac": { "type": "integer" }, + # "amr_af": { "type": "float" }, + "amr": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "eas_ac": { "type": "integer" }, + # "eas_af": { "type": "float" }, + "eas": { + "properties": { + 
**allele_count_field, + **allele_freq_field + } + }, + # "sas_ac": { "type": "integer" }, + # "sas_af": { "type": "float"} + "sas": { + "properties": { + **allele_count_field, + **allele_freq_field + } + } + } + }, + "twinsuk": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + "alspac": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + "uk10k": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + "esp6500": { # changed since 4.4.a + "properties": { + # "aa_ac": { "type": "integer" }, + # "aa_af": { "type": "float" }, + "aa": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "ea_ac": { "type": "integer" }, + # "ea_af": { "type": "float" } + "ea": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + } + }, + "exac": { # changed since 4.4.a + "properties": { + **allele_count_field, + **allele_freq_field, + **adj_allele_count_field, + **adj_allele_freq_field, + # "afr_ac": { "type": "integer" }, + # "afr_af": { "type": "float" }, + "afr": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "amr_ac": { "type": "integer" }, + # "amr_af": { "type": "float" }, + "amr": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "eas_ac": { "type": "integer" }, + # "eas_af": { "type": "float" }, + "eas": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "fin_ac": { "type": "integer" }, + # "fin_af": { "type": "float" }, + "fin": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "nfe_ac": { "type": "integer" }, + # "nfe_af": { "type": "float" }, + "nfe": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "sas_ac": { "type": "integer" }, + # "sas_af": { "type": "float" } + "sas": { + "properties": { + **allele_count_field, + **allele_freq_field + } + } + } + }, + "exac_nontcga": { # changed since 4.4.a + "properties": 
{ + **allele_count_field, + **allele_freq_field, + **adj_allele_count_field, + **adj_allele_freq_field, + # "afr_ac": { "type": "integer" }, + # "afr_af": { "type": "float" }, + "afr": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "amr_ac": { "type": "integer" }, + # "amr_af": { "type": "float" }, + "amr": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "eas_ac": { "type": "integer" }, + # "eas_af": { "type": "float" }, + "eas": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "fin_ac": { "type": "integer" }, + # "fin_af": { "type": "float" }, + "fin": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "nfe_ac": { "type": "integer" }, + # "nfe_af": { "type": "float" }, + "nfe": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "sas_ac": { "type": "integer" }, + # "sas_af": { "type": "float" } + "sas": { + "properties": { + **allele_count_field, + **allele_freq_field + } + } + } + }, + "exac_nonpsych": { # changed since 4.4.a + "properties": { + **allele_count_field, + **allele_freq_field, + **adj_allele_count_field, + **adj_allele_freq_field, + # "afr_ac": { "type": "integer" }, + # "afr_af": { "type": "float" }, + "afr": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "amr_ac": { "type": "integer" }, + # "amr_af": { "type": "float" }, + "amr": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "eas_ac": { "type": "integer" }, + # "eas_af": { "type": "float" }, + "eas": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "fin_ac": { "type": "integer" }, + # "fin_af": { "type": "float" }, + "fin": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # "nfe_ac": { "type": "integer" }, + # "nfe_af": { "type": "float" }, + "nfe": { + "properties": { + **allele_count_field, + **allele_freq_field + } + }, + # 
"sas_ac": { "type": "integer" }, + # "sas_af": { "type": "float" } + "sas": { + "properties": { + **allele_count_field, + **allele_freq_field + } + } + } + }, + "alfa": { # new in 4.4.a + "properties": { + "european": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "african_others": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "east_asian": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "african_american": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "latin_american_1": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "latin_american_2": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "other_asian": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "south_asian": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "other": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "african": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "asian": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + "total": { + "properties": { + **allele_count_field, + **allele_num_field, + **allele_freq_field + } + }, + } + }, + "clinvar": { + "properties": { + "clinvar_id": { + "type": "integer" + }, + "clinsig": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "trait": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "review": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + "hgvs": { + "type": "keyword", + "normalizer": "keyword_lowercase_normalizer" + }, + 
# Field mapping for dbNSFP 4.4a documents (layout "v2").
#
# The module exposes a handful of reusable ``*_field`` fragments and the final
# ``mapping`` dict. Every object node has the shape ``{"properties": {...}}``
# and every leaf has a ``"type"`` (plus a normalizer for keyword leaves).


def _keyword():
    """Return a fresh keyword leaf that uses the lowercase normalizer."""
    return {"type": "keyword", "normalizer": "keyword_lowercase_normalizer"}


def _integer():
    """Return a fresh integer leaf."""
    return {"type": "integer"}


def _float():
    """Return a fresh float leaf."""
    return {"type": "float"}


def _text():
    """Return a fresh full-text leaf."""
    return {"type": "text"}


def _keywords(*names):
    """Return ``{name: keyword leaf}`` for each of the given names, in order."""
    return {name: _keyword() for name in names}


def _floats(*names):
    """Return ``{name: float leaf}`` for each of the given names, in order."""
    return {name: _float() for name in names}


def _obj(*field_groups):
    """
    Return an object node whose ``properties`` merge the given dicts in order.

    The merge is shallow, exactly like ``{**a, **b}``: leaf dicts coming from the
    public ``*_field`` fragments are shared between the nodes that use them.
    """
    properties = {}
    for group in field_groups:
        properties.update(group)
    return {"properties": properties}


def _each(names, *field_groups):
    """Return ``{name: object node}`` with a separate node built for every name."""
    return {name: _obj(*field_groups) for name in names}


start_pos_field = {"start": _integer()}
end_pos_field = {"end": _integer()}

score_field = {"score": _float()}
converted_rankscore_field = {"converted_rankscore": _float()}
rankscore_field = {"rankscore": _float()}
# The type must be spelled "integer"; "int" is not a recognised field type.
confidence_value_field = {"confidence_value": _integer()}
pred_field = {"pred": _keyword()}

allele_count_field = {"ac": _integer()}
allele_num_field = {"an": _integer()}
allele_freq_field = {"af": _float()}
adj_allele_count_field = {"adj_ac": _integer()}
adj_allele_freq_field = {"adj_af": _float()}

# Frequently repeated combinations of the fragments above.
_POSITION = (start_pos_field, end_pos_field)
_SCORE_PRED = (score_field, pred_field)
_SCORE_RANK = (score_field, rankscore_field)
_SCORE_RANK_PRED = (score_field, rankscore_field, pred_field)
_SCORE_RANK_CONFIDENCE = (score_field, rankscore_field, confidence_value_field)
_AC_AF = (allele_count_field, allele_freq_field)
_AC_AN_AF = (allele_count_field, allele_num_field, allele_freq_field)

_CONSERVATION_ALIGNMENTS = (
    "100way_vertebrate",
    "470way_mammalian",  # replaced 30way_mammalian in 4.4.a
    "17way_primate",
)
_ALFA_POPULATIONS = (
    "european", "african_others", "east_asian", "african_american",
    "latin_american_1", "latin_american_2", "other_asian", "south_asian",
    "other", "african", "asian", "total",
)


def _exac():
    """
    Return the object node shared by "exac", "exac_nontcga" and "exac_nonpsych".

    Changed since 4.4.a: the flat "<pop>_ac" / "<pop>_af" fields became nested
    "<pop>.ac" / "<pop>.af" objects.
    """
    return _obj(
        *_AC_AF,
        adj_allele_count_field,
        adj_allele_freq_field,
        _each(("afr", "amr", "eas", "fin", "nfe", "sas"), *_AC_AF),
    )


# Fields nested under "dbnsfp.protein".
_protein_properties = {
    "aa": _obj(_keywords("pos")),
    **_keywords("genename", "geneid", "transcriptid", "proteinid"),
    "uniprot": _obj(_keywords("acc", "entry")),
    "hgvsc": _obj(_keywords("annovar", "snpeff", "vep")),
    "hgvsp": _obj(_keywords("annovar", "snpeff", "vep")),
    **_keywords("appris", "gencode_basic"),
    "tsl": _integer(),
    **_keywords("vep_canonical", "refcodon", "codonpos", "codon_degeneracy"),
    "sift": _obj(*_SCORE_PRED),
    "sift4g": _obj(*_SCORE_PRED),
    "polyphen2": _obj(_each(("hdiv", "hvar"), *_SCORE_PRED)),
    "mutationassessor": _obj(*_SCORE_PRED),
    "fathmm": _obj(*_SCORE_PRED),
    "provean": _obj(*_SCORE_PRED),
    # "gmvp" is new in 4.4.a
    **_each(("vest4", "revel", "mvp", "gmvp", "mpc"), score_field),
    "aloft": _obj(
        _keywords(
            "fraction_transcripts_affected",
            "prob_tolerant",
            "prob_recessive",
            "prob_dominant",
            "pred",
        ),
        {"confidence": _text()},
    ),
}

mapping = {
    "dbnsfp": _obj({
        **_keywords("rsid", "chrom"),
        **_each(("hg19", "hg18", "hg38"), *_POSITION),
        **_keywords("ref", "alt"),
        "aa": _obj(_keywords("ref", "alt")),
        "protein": _obj(_protein_properties),
        **_keywords(
            "cds_strand",
            "ancestral_allele",
            "altai_neandertal",
            "denisova",
            "vindijia_neandertal",
            "chagyrskaya_neandertal",
        ),
        "sift": _obj(converted_rankscore_field),
        "sift4g": _obj(converted_rankscore_field),
        "polyphen2": _obj(_each(("hdiv", "hvar"), rankscore_field)),
        "lrt": _obj(score_field, converted_rankscore_field, pred_field, _floats("omega")),
        "mutationtaster": _obj(
            score_field,
            converted_rankscore_field,
            pred_field,
            _keywords("model"),
            {"aae": _text()},
        ),
        "mutationassessor": _obj(rankscore_field),
        "fathmm": _obj(converted_rankscore_field),
        "provean": _obj(converted_rankscore_field),
        "vest4": _obj(rankscore_field),
        **_each(("metasvm", "metalr"), *_SCORE_RANK_PRED),
        "reliability_index": _integer(),
        **_each(("metarnn", "m-cap"), *_SCORE_RANK_PRED),
        "revel": _obj(rankscore_field),
        "mutpred": _obj(
            score_field,
            rankscore_field,
            _keywords("accession", "aa_change"),
            {"pred": _obj(_floats("p_val"), {"mechanism": _text()})},
        ),
        # "gmvp" is new in 4.4.a
        **_each(("mvp", "gmvp", "mpc"), rankscore_field),
        **_each(("primateai", "deogen2"), *_SCORE_RANK_PRED),
        "bayesdel": _obj(_each(("add_af", "no_af"), *_SCORE_RANK_PRED)),
        **_each(("clinpred", "list-s2"), *_SCORE_RANK_PRED),
        # Sub-objects must sit under "properties", like every other object field.
        "varity": _obj(_each(("r", "er", "r_loo", "er_loo"), *_SCORE_RANK)),
        # Only for "hg38"; no CADD fields will be included for "hg19".
        # "pred" holds CADD phred-like scores, hence float rather than the
        # string type used by other predictions.
        "cadd": _obj(_floats("raw_score", "raw_rankscore", "pred")),
        "dann": _obj(*_SCORE_RANK),
        "fathmm-mkl": _obj(
            _floats("coding_score", "coding_rankscore"),
            _keywords("coding_pred", "coding_group"),
        ),
        "fathmm-xf": _obj(
            _floats("coding_score", "coding_rankscore"),
            _keywords("coding_pred"),
        ),
        "eigen": _obj(_floats("raw_coding", "raw_coding_rankscore", "phred_coding")),
        "eigen-pc": _obj(_floats("raw_coding", "raw_coding_rankscore", "phred_coding")),
        "genocanyon": _obj(*_SCORE_RANK),
        # Previously four top-level objects ("integrated", "gm12878", "h1-hesc",
        # "huvec") with "fitcons_score" / "fitcons_rankscore" / "confidence_value".
        "fitcons": _obj(
            _each(("integrated", "gm12878", "h1-hesc", "huvec"), *_SCORE_RANK_CONFIDENCE)
        ),
        "linsight": _obj(*_SCORE_RANK),
        "gerp++": _obj(_floats("nr", "rs", "rs_rankscore")),
        "phylop": _obj(_each(_CONSERVATION_ALIGNMENTS, *_SCORE_RANK)),
        "phastcons": _obj(_each(_CONSERVATION_ALIGNMENTS, *_SCORE_RANK)),
        "siphy_29way": _obj(
            {"pi": _obj(_floats("a", "c", "g", "t"))},
            _floats("logodds_score", "logodds_rankscore"),
        ),
        "bstatistic": _obj(score_field, converted_rankscore_field),
        # Changed since 4.4.a: "<pop>_ac" / "<pop>_af" became "<pop>.ac" / "<pop>.af".
        "1000gp3": _obj(*_AC_AF, _each(("afr", "eur", "amr", "eas", "sas"), *_AC_AF)),
        **_each(("twinsuk", "alspac", "uk10k"), *_AC_AF),
        # Changed since 4.4.a: "aa_ac" / "aa_af" / "ea_ac" / "ea_af" became nested.
        "esp6500": _obj(_each(("aa", "ea"), *_AC_AF)),
        "exac": _exac(),
        "exac_nontcga": _exac(),
        "exac_nonpsych": _exac(),
        # New in 4.4.a.
        "alfa": _obj(_each(_ALFA_POPULATIONS, *_AC_AN_AF)),
        "clinvar": _obj(
            {"clinvar_id": _integer()},
            _keywords(
                "clinsig", "trait", "review", "hgvs",
                "var_source", "medgen", "omim", "orphanet",
            ),
        ),
        "interpro_domain": _text(),
        "gtex": _obj(_keywords("gene", "tissue")),
        "geuvadis_eqtl_target_gene": _keyword(),
    })
}
This parser is for dbNSFP v4.3a downloaded from https://sites.google.com/site/jpopgen/dbNSFP """ @@ -644,7 +643,6 @@ def data_generator(input_file, version): for row in file_reader: row = dict(zip(header, row)) - # use transposed matrix to have 1 line with N 187 columns current_row = DbnsfpReader.map_row_to_json(row, version=version) if previous_row and current_row: if current_row["_id"] == previous_row["_id"]: diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py new file mode 100644 index 00000000..014547ee --- /dev/null +++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v1.py @@ -0,0 +1,809 @@ +import re +import csv +from enum import Flag +from dataclasses import dataclass +from itertools import chain +from typing import Callable +from types import SimpleNamespace +from utils.table import TableColumn, create_tag_column_map +from utils.dotfield import parse_dot_fields +from biothings.utils.common import anyfile + + +# VALID_COLUMN_NO = 367 # for 4.1a +# VALID_COLUMN_NO = 642 # for 4.2a +# VALID_COLUMN_NO = 643 # for 4.3a +VALID_COLUMN_NO = 689 # for 4.4a + +MUTPRED_TOP5FEATURES_PATTERN = re.compile(r" \(P = ([eE0-9.-]*)\)$") + +# dbNSFP_variant use "." for missing values; +# other none values are borrowed from the `biothings.utils.dataload.dict_sweep` function and +# from the default `na_values` argument of pandas.read_csv(). 
# see https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
NA_VALUES = frozenset({
    ".", "", " ", "-", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan",
    "1.#IND", "1.#QNAN", "N/A", "NA", "NULL", "NaN", "n/a", "nan", "null", "none",
    "Not Available", "unknown",
})

COLUMN_TAG = SimpleNamespace()
COLUMN_TAG.HG38_POS = "hg38_pos"  # for "pos(1-based)"
COLUMN_TAG.HG19_POS = "hg19_pos"  # for "hg19_pos(1-based)"
COLUMN_TAG.HG38_CHROM = "hg38_chrom"  # for "#chr"
COLUMN_TAG.HG19_CHROM = "hg19_chrom"  # for "hg19_chr"
COLUMN_TAG.REF_ALLELE = "ref"
COLUMN_TAG.ALT_ALLELE = "alt"
COLUMN_TAG.UNIPROT_ACC = "uniprot_acc"
COLUMN_TAG.UNIPROT_ENTRY = "uniprot_entry"
COLUMN_TAG.HGVS_CODING = "hgvsc"  # for "HGVSc_ANNOVAR", "HGVSc_snpEff", and "HGVSc_VEP"
COLUMN_TAG.HGVS_PROTEIN = "hgvsp"  # for "HGVSp_ANNOVAR", "HGVSp_snpEff", and "HGVSp_VEP"
COLUMN_TAG.GTEX_GENE = "gtex_gene"
COLUMN_TAG.GTEX_TISSUE = "gtex_tissue"


def _check_length(lst: list):
    """
    Collapse a list according to its length.

    Return None for an empty list, the sole element for a one-element list,
    and the list itself otherwise.
    """
    if not lst:
        return None
    if len(lst) == 1:
        return lst[0]
    return lst


class Assembly(Flag):
    """Bit flags naming the genome assembly (or assemblies) a column belongs to."""

    HG19 = 1  # indicates that a column belongs to hg19 docs
    HG38 = 2  # indicates that a column belongs to hg38 docs
    BOTH = HG19 | HG38  # (BOTH == 3) applies to both assemblies

    @classmethod
    def assembly_of(cls, name: str):
        """
        Return the member whose name equals `name`, compared case-insensitively.

        E.g. "hg19" and "HG19" both give Assembly.HG19.
        Raise ValueError when no member has that name.
        """
        try:
            return cls[name.upper()]
        except KeyError:
            raise ValueError(f"'{cls.__name__}' enum not found for '{name}'") from None


@dataclass
class Column(TableColumn):
    """
    Assembly-specific column configuration
    """
    # Which assembly or assemblies this column belongs to. A string is resolved
    # through Assembly.assembly_of(); None means Assembly.BOTH.
    assembly: str | Assembly | None = None

    def __post_init__(self):
        super().__post_init__()

        if self.assembly is None:
            self.assembly = Assembly.BOTH
        elif isinstance(self.assembly, str):
            self.assembly = Assembly.assembly_of(self.assembly)
        elif not isinstance(self.assembly, Assembly):
            raise ValueError(f"Cannot recognize assembly {self.assembly}")

    def is_hg19(self):
        return bool(self.assembly & Assembly.HG19)  # true if self.assembly is HG19 or BOTH

    def is_hg38(self):
        return bool(self.assembly & Assembly.HG38)  # true if self.assembly is HG38 or BOTH


def split(sep: str, na_values: set | frozenset = NA_VALUES):
    """
    Build a transforming function that splits a string by `sep` and drops every
    piece found in `na_values`; the result is collapsed by _check_length().
    """
    def _func(value: str):
        result = [v for v in value.split(sep) if v not in na_values]
        return _check_length(result)

    return _func


def split_cast(sep: str, astype: Callable, na_values: set | frozenset = NA_VALUES):
    """
    Same as split(), except that each kept piece is converted with `astype`.
    """
    def _func(value: str):
        result = [astype(v) for v in value.split(sep) if v not in na_values]
        return _check_length(result)

    return _func


# transforming functions for common data sources
split_str = split(";")
split_float = split_cast(";", float)
split_int = split_cast(";", int)

# transforming functions for specific data sources
split_clinvar = split(r"|")
split_genotype = split(r"/")  # for "AltaiNeandertal", "Denisova", "VindijiaNeandertal", and "ChagyrskayaNeandertal"


def normalize_chrom(chr: str):
    """
    In dbNSFP, chromosomes are marked 1-22, "X", "Y", and "M" (Mitochondrial).
    However, in MyVariant, we mark Mitochondrial chromosome "MT".
    """
    # NOTE: the parameter name shadows the builtin `chr`; it is kept so that the
    # signature stays unchanged for existing callers.
    return "MT" if chr == "M" else chr


def make_zero_based(pos: str):
    """
    Convert a chromosomal position string to a start-end pair.

    Despite the function name, no offset is applied: the position is parsed as
    an integer and used unchanged for both "start" and "end".
    """
    _pos = int(pos)
    return {"start": _pos, "end": _pos}


def parse_mutpred_top5features(value):
    """
    The input is a string of clauses separated by semicolons, each clause having
    the pattern "<mechanism> (P = <p_val>)".

    E.g. "Loss of helix (P = 0.0444);Gain of loop (P = 0.0502);Loss of disorder (P = 0.9475)"

    Each clause is split with MUTPRED_TOP5FEATURES_PATTERN into a tuple like

        ('Loss of helix', '0.0444')

    and turned into a dict {"mechanism": <mechanism>, "p_val": <float p_val>}.
    Clauses that do not yield exactly two parts are skipped.

    Return None if the input is None; otherwise the dicts collapsed by _check_length().
    """
    if value is None:
        return None

    mp_list = [tuple(e for e in MUTPRED_TOP5FEATURES_PATTERN.split(s) if e.strip()) for s in value.split(";")]
    result = [{"mechanism": mp[0], "p_val": float(mp[1])} for mp in mp_list if len(mp) == 2]

    return _check_length(result)


def parse_siphy_29way_pi(value: str):
    """
    A "SiPhy_29way_pi" value, if not None, is a string separated by ":", representing an estimated stationary
    distribution of A, C, G and T at a variant site. E.g. "0.0:0.5259:0.0:0.4741".

    Here we split the string and convert it to a dict of {<nucleotide>: <frequency>},
    e.g. {"a": 0.0, "c": 0.5259, "g": 0.0, "t": 0.4741}.
    """
    if value is None:
        return None

    freq = [float(v) for v in value.split(":")]
    pi_dict = {"a": freq[0], "c": freq[1], "g": freq[2], "t": freq[3]}
    return pi_dict


def split_zip(a_value: str, b_value: str, sep: str, na_values: set | frozenset = NA_VALUES):
    """
    Split a_value and b_value by sep into two lists, and generate pairs from the two lists.

    This function assumes that the split two lists have the same length; zip() stops at
    the shorter one otherwise. A piece found in na_values becomes None, and a pair is
    dropped only when both of its elements are None.

    E.g. with the following input,

        a_value = "P54578-2;P54578-3;A6NJA2;P54578"
        b_value = "UBP14_HUMAN;UBP14_HUMAN;A6NJA2_HUMAN;UBP14_HUMAN"

    the returned generator can make:

        [('P54578-2', 'UBP14_HUMAN'),
         ('P54578-3', 'UBP14_HUMAN'),
         ('A6NJA2', 'A6NJA2_HUMAN'),
         ('P54578', 'UBP14_HUMAN')]
    """
    a_list = [v if v not in na_values else None for v in a_value.split(sep)]
    b_list = [v if v not in na_values else None for v in b_value.split(sep)]

    result = ((a, b) for (a, b) in zip(a_list, b_list) if (a, b) != (None, None))
    # DO NOT use _check_length(result) otherwise the generator will be consumed
    return result


def split_dedup(values: list, sep: str, na_values: set | frozenset = NA_VALUES):
    """
    Split each value from the input values by the separator, merge all the split results, and remove duplicates
    from the merged result. Pieces found in na_values are dropped.

    The order of first appearance is kept, so the output is the same on every run.

    E.g. when values=["a;b;c", "b;c", "d"] and sep=";", the result is ["a", "b", "c", "d"]
    """
    pieces = chain.from_iterable(value.split(sep) for value in values)
    # dict keys are unique and keep insertion order, unlike a set
    unique = dict.fromkeys(piece for piece in pieces if piece not in na_values)
    return _check_length(list(unique))
transform=make_zero_based), + Column("aapos", dest="aa.pos", transform=split_int), + Column("genename", transform=split_str), + Column("Ensembl_geneid", transform=split_str), + Column("Ensembl_transcriptid", transform=split_str), + Column("Ensembl_proteinid", transform=split_str), + Column("Uniprot_acc", tag=COLUMN_TAG.UNIPROT_ACC), # special column, see prune_uniprot() + Column("Uniprot_entry", tag=COLUMN_TAG.UNIPROT_ENTRY), # special column, see prune_uniprot() + Column("HGVSc_ANNOVAR", tag=COLUMN_TAG.HGVS_CODING), # special column, see prune_hgvsc_hgvsp() + Column("HGVSp_ANNOVAR", tag=COLUMN_TAG.HGVS_PROTEIN), # ditto + Column("HGVSc_snpEff", tag=COLUMN_TAG.HGVS_CODING), # ditto + Column("HGVSp_snpEff", tag=COLUMN_TAG.HGVS_PROTEIN), # ditto + Column("HGVSc_VEP", tag=COLUMN_TAG.HGVS_CODING), # ditto + Column("HGVSp_VEP", tag=COLUMN_TAG.HGVS_PROTEIN), # ditto + Column("APPRIS", transform=split_str), + Column("GENCODE_basic", dest="gencode_basic", transform=split_str), + Column("TSL", transform=split_int), + Column("VEP_canonical", dest="vep_canonical", transform=split_str), + Column("cds_strand", dest="cds_strand", transform=split_str), + Column("refcodon", dest="aa.refcodon", transform=split_str), + Column("codonpos", dest="aa.codonpos", transform=split_int), + Column("codon_degeneracy", dest="aa.codon_degeneracy", transform=split_int), + Column("Ancestral_allele", dest="ancestral_allele", transform=split_str), + Column("AltaiNeandertal", dest="altai_neandertal", transform=split_genotype), + Column("Denisova", transform=split_genotype), + Column("VindijiaNeandertal", dest="vindijia_neandertal", transform=split_genotype), + Column("ChagyrskayaNeandertal", dest="chagyrskaya_neandertal", transform=split_genotype), + Column("SIFT_score", transform=split_float), + Column("SIFT_converted_rankscore", dest="sift.converted_rankscore", transform=split_float), + Column("SIFT_pred", transform=split_str), + Column("SIFT4G_score", transform=split_float), + 
Column("SIFT4G_converted_rankscore", dest="sift4g.converted_rankscore", transform=split_float), + Column("SIFT4G_pred", transform=split_str), + Column("Polyphen2_HDIV_score", transform=split_float), + Column("Polyphen2_HDIV_rankscore", transform=split_float), + Column("Polyphen2_HDIV_pred", transform=split_str), + Column("Polyphen2_HVAR_score", transform=split_float), + Column("Polyphen2_HVAR_rankscore", transform=split_float), + Column("Polyphen2_HVAR_pred", transform=split_str), + Column("LRT_score", transform=split_float), + Column("LRT_converted_rankscore", dest="lrt.converted_rankscore", transform=split_float), + Column("LRT_pred", transform=split_str), + Column("LRT_Omega", transform=split_float), + Column("MutationTaster_score", transform=split_float), + Column("MutationTaster_converted_rankscore", dest="mutationtaster.converted_rankscore", transform=split_float), + Column("MutationTaster_pred", transform=split_str), + Column("MutationTaster_model", transform=split_str), + Column("MutationTaster_AAE", transform=split_str), + Column("MutationAssessor_score", transform=split_float), + Column("MutationAssessor_rankscore", transform=split_float), + Column("MutationAssessor_pred", transform=split_str), + Column("FATHMM_score", transform=split_float), + Column("FATHMM_converted_rankscore", dest="fathmm.converted_rankscore", transform=split_float), + Column("FATHMM_pred", transform=split_str), + Column("PROVEAN_score", transform=split_float), + Column("PROVEAN_converted_rankscore", dest="provean.converted_rankscore", transform=split_float), + Column("PROVEAN_pred", transform=split_str), + Column("VEST4_score", transform=split_float), + Column("VEST4_rankscore", transform=split_float), + Column("MetaSVM_score", transform=split_float), + Column("MetaSVM_rankscore", transform=split_float), + Column("MetaSVM_pred", transform=split_str), + Column("MetaLR_score", transform=split_float), + Column("MetaLR_rankscore", transform=split_float), + Column("MetaLR_pred", 
transform=split_str), + Column("Reliability_index", dest="reliability_index", transform=int), + Column("MetaRNN_score", transform=split_float), + Column("MetaRNN_rankscore", transform=split_float), + Column("MetaRNN_pred", transform=split_str), + Column("M-CAP_score", transform=split_float), + Column("M-CAP_rankscore", transform=split_float), + Column("M-CAP_pred", transform=split_str), + Column("REVEL_score", transform=split_float), + Column("REVEL_rankscore", transform=split_float), + Column("MutPred_score", transform=split_float), + Column("MutPred_rankscore", transform=split_float), + Column("MutPred_protID", dest="mutpred.accession", transform=split_str), + Column("MutPred_AAchange", dest="mutpred.aa_change", transform=split_str), + Column("MutPred_Top5features", dest="mutpred.pred", transform=parse_mutpred_top5features), + Column("MVP_score", transform=split_float), + Column("MVP_rankscore", transform=split_float), + Column("gMVP_score", transform=split_float), # new in 4.4.a + Column("gMVP_rankscore", transform=split_float), # new in 4.4.a + Column("MPC_score", transform=split_float), + Column("MPC_rankscore", transform=split_float), + Column("PrimateAI_score", transform=split_float), + Column("PrimateAI_rankscore", transform=split_float), + Column("PrimateAI_pred", transform=split_str), + Column("DEOGEN2_score", transform=split_float), + Column("DEOGEN2_rankscore", transform=split_float), + Column("DEOGEN2_pred", transform=split_str), + Column("BayesDel_addAF_score", dest="bayesdel.add_af.score", transform=split_float), + Column("BayesDel_addAF_rankscore", dest="bayesdel.add_af.rankscore", transform=split_float), + Column("BayesDel_addAF_pred", dest="bayesdel.add_af.pred", transform=split_str), + Column("BayesDel_noAF_score", dest="bayesdel.no_af.score", transform=split_float), + Column("BayesDel_noAF_rankscore", dest="bayesdel.no_af.rankscore", transform=split_float), + Column("BayesDel_noAF_pred", dest="bayesdel.no_af.pred", transform=split_str), + 
Column("ClinPred_score", transform=split_float), + Column("ClinPred_rankscore", transform=split_float), + Column("ClinPred_pred", transform=split_str), + Column("LIST-S2_score", transform=split_float), + Column("LIST-S2_rankscore", transform=split_float), + Column("LIST-S2_pred", transform=split_str), + Column("VARITY_R_score", transform=split_float), # new in 4.4.a + Column("VARITY_R_rankscore", transform=split_float), + Column("VARITY_ER_score", transform=split_float), + Column("VARITY_ER_rankscore", transform=split_float), + Column("VARITY_R_LOO_score", dest="varity.r_loo.score", transform=split_float), + Column("VARITY_R_LOO_rankscore", dest="varity.r_loo.rankscore", transform=split_float), + Column("VARITY_ER_LOO_score", dest="varity.er_loo.score", transform=split_float), + Column("VARITY_ER_LOO_rankscore", dest="varity.er_loo.rankscore", transform=split_float), + Column("Aloft_Fraction_transcripts_affected", dest="aloft.fraction_transcripts_affected", transform=split_str), + Column("Aloft_prob_Tolerant", dest="aloft.prob_tolerant", transform=split_str), + Column("Aloft_prob_Recessive", dest="aloft.prob_recessive", transform=split_str), + Column("Aloft_prob_Dominant", dest="aloft.prob_dominant", transform=split_str), + Column("Aloft_pred", transform=split_str), + Column("Aloft_Confidence", transform=split_str), + Column("CADD_raw", dest="cadd.raw_score", transform=split_float, assembly="hg38"), # TODO CADD will have hg38 next update. Deprecate these 3 field then. + Column("CADD_raw_rankscore", dest="cadd.raw_rankscore", transform=split_float, assembly="hg38"), + Column("CADD_phred", transform=split_float, assembly="hg38"), # CADD phred-like scores, not as other predications of string type + # Column("CADD_raw_hg19", assembly="hg19"), # discarded because Myvariant.info already has a hg19-only datasource of CADD. 
+ # Column("CADD_raw_rankscore_hg19", assembly="hg19"), # ditto + # Column("CADD_phred_hg19", assembly="hg19"), # ditto + Column("DANN_score", transform=split_float), + Column("DANN_rankscore", transform=split_float), + Column("fathmm-MKL_coding_score", dest="fathmm-mkl.coding_score", transform=split_float), + Column("fathmm-MKL_coding_rankscore", dest="fathmm-mkl.coding_rankscore", transform=split_float), + Column("fathmm-MKL_coding_pred", dest="fathmm-mkl.coding_pred", transform=split_str), + Column("fathmm-MKL_coding_group", dest="fathmm-mkl.coding_group", transform=split_str), + Column("fathmm-XF_coding_score", dest="fathmm-xf.coding_score", transform=split_float), + Column("fathmm-XF_coding_rankscore", dest="fathmm-xf.coding_rankscore", transform=split_float), + Column("fathmm-XF_coding_pred", dest="fathmm-xf.coding_pred", transform=split_str), + Column("Eigen-raw_coding", dest="eigen.raw_coding", transform=split_float), + Column("Eigen-raw_coding_rankscore", dest="eigen.raw_coding_rankscore", transform=split_float), + Column("Eigen-phred_coding", dest="eigen.phred_coding", transform=split_float), + Column("Eigen-PC-raw_coding", dest="eigen-pc.raw_coding", transform=split_float), + Column("Eigen-PC-raw_coding_rankscore", dest="eigen-pc.raw_coding_rankscore", transform=split_float), + Column("Eigen-PC-phred_coding", dest="eigen-pc.phred_coding", transform=split_float), + Column("GenoCanyon_score", transform=split_float), + Column("GenoCanyon_rankscore", transform=split_float), + Column("integrated_fitCons_score", dest="fitcons.integrated.score", transform=split_float), + Column("integrated_fitCons_rankscore", dest="fitcons.integrated.rankscore", transform=split_float), + Column("integrated_confidence_value", dest="fitcons.integrated.confidence_value", transform=split_int), + Column("GM12878_fitCons_score", dest="fitcons.gm12878.score", transform=split_float), + Column("GM12878_fitCons_rankscore", dest="fitcons.gm12878.rankscore", transform=split_float), + 
Column("GM12878_confidence_value", dest="fitcons.gm12878.confidence_value", transform=split_int), + Column("H1-hESC_fitCons_score", dest="fitcons.h1-hesc.score", transform=split_float), + Column("H1-hESC_fitCons_rankscore", dest="fitcons.h1-hesc.rankscore", transform=split_float), + Column("H1-hESC_confidence_value", dest="fitcons.h1-hesc.confidence_value", transform=split_int), + Column("HUVEC_fitCons_score", dest="fitcons.huvec.score", transform=split_float), + Column("HUVEC_fitCons_rankscore", dest="fitcons.huvec.rankscore", transform=split_float), + Column("HUVEC_confidence_value", dest="fitcons.huvec.confidence_value", transform=split_int), + Column("LINSIGHT", dest="linsight.score", transform=split_float), + Column("LINSIGHT_rankscore", transform=split_float), + Column("GERP++_NR", transform=split_float), + Column("GERP++_RS", transform=split_float), + Column("GERP++_RS_rankscore", dest="gerp++.rs_rankscore", transform=split_float), + Column("phyloP100way_vertebrate", dest="phylop.100way_vertebrate.score", transform=split_float), + Column("phyloP100way_vertebrate_rankscore", dest="phylop.100way_vertebrate.rankscore", transform=split_float), + Column("phyloP470way_mammalian", dest="phylop.470way_mammalian.score", transform=split_float), # replaced 30way_mammalian in 4.4.a + Column("phyloP470way_mammalian_rankscore", dest="phylop.470way_mammalian.rankscore", transform=split_float), # replaced 30way_mammalian in 4.4.a + Column("phyloP17way_primate", dest="phylop.17way_primate.score", transform=split_float), + Column("phyloP17way_primate_rankscore", dest="phylop.17way_primate.rankscore", transform=split_float), + Column("phastCons100way_vertebrate", dest="phastcons.100way_vertebrate.score", transform=split_float), + Column("phastCons100way_vertebrate_rankscore", dest="phastcons.100way_vertebrate.rankscore", transform=split_float), + Column("phastCons470way_mammalian", dest="phastcons.470way_mammalian.score", transform=split_float), # replaced 30way_mammalian in 
4.4.a + Column("phastCons470way_mammalian_rankscore", dest="phastcons.470way_mammalian.rankscore", transform=split_float), # replaced 30way_mammalian in 4.4.a + Column("phastCons17way_primate", dest="phastcons.17way_primate.score", transform=split_float), + Column("phastCons17way_primate_rankscore", dest="phastcons.17way_primate.rankscore", transform=split_float), + Column("SiPhy_29way_pi", dest="siphy_29way.pi", transform=parse_siphy_29way_pi), + Column("SiPhy_29way_logOdds", dest="siphy_29way.logodds_score", transform=split_float), + Column("SiPhy_29way_logOdds_rankscore", dest="siphy_29way.logodds_rankscore", transform=split_float), + Column("bStatistic", dest="bstatistic.score", transform=split_float), + Column("bStatistic_converted_rankscore", dest="bstatistic.converted_rankscore", transform=split_float), + Column("1000Gp3_AC", dest="1000gp3.ac", transform=int), + Column("1000Gp3_AF", dest="1000gp3.af", transform=float), + Column("1000Gp3_AFR_AC", dest="1000gp3.afr.ac", transform=int), # dest changed since 4.4.a + Column("1000Gp3_AFR_AF", dest="1000gp3.afr.af", transform=float), + Column("1000Gp3_EUR_AC", dest="1000gp3.eur.ac", transform=int), + Column("1000Gp3_EUR_AF", dest="1000gp3.eur.af", transform=float), + Column("1000Gp3_AMR_AC", dest="1000gp3.amr.ac", transform=int), + Column("1000Gp3_AMR_AF", dest="1000gp3.amr.af", transform=float), + Column("1000Gp3_EAS_AC", dest="1000gp3.eas.ac", transform=int), + Column("1000Gp3_EAS_AF", dest="1000gp3.eas.af", transform=float), + Column("1000Gp3_SAS_AC", dest="1000gp3.sas.ac", transform=int), + Column("1000Gp3_SAS_AF", dest="1000gp3.sas.af", transform=float), + Column("TWINSUK_AC", dest="twinsuk.ac", transform=int), + Column("TWINSUK_AF", dest="twinsuk.af", transform=float), + Column("ALSPAC_AC", dest="alspac.ac", transform=int), + Column("ALSPAC_AF", dest="alspac.af", transform=float), + Column("UK10K_AC", dest="uk10k.ac", transform=int), + Column("UK10K_AF", dest="uk10k.af", transform=float), + 
Column("ESP6500_AA_AC", dest="esp6500.aa.ac", transform=int), # dest changed since 4.4.a + Column("ESP6500_AA_AF", dest="esp6500.aa.af", transform=float), + Column("ESP6500_EA_AC", dest="esp6500.ea.ac", transform=int), + Column("ESP6500_EA_AF", dest="esp6500.ea.af", transform=float), + Column("ExAC_AC", dest="exac.ac", transform=int), # dest changed since 4.4.a + Column("ExAC_AF", dest="exac.af", transform=float), + Column("ExAC_Adj_AC", dest="exac.adj_ac", transform=int), + Column("ExAC_Adj_AF", dest="exac.adj_af", transform=float), + Column("ExAC_AFR_AC", dest="exac.afr.ac", transform=int), + Column("ExAC_AFR_AF", dest="exac.afr.af", transform=float), + Column("ExAC_AMR_AC", dest="exac.amr.ac", transform=int), + Column("ExAC_AMR_AF", dest="exac.amr.af", transform=float), + Column("ExAC_EAS_AC", dest="exac.eas.ac", transform=int), + Column("ExAC_EAS_AF", dest="exac.eas.af", transform=float), + Column("ExAC_FIN_AC", dest="exac.fin.ac", transform=int), + Column("ExAC_FIN_AF", dest="exac.fin.af", transform=float), + Column("ExAC_NFE_AC", dest="exac.nfe.ac", transform=int), + Column("ExAC_NFE_AF", dest="exac.nfe.af", transform=float), + Column("ExAC_SAS_AC", dest="exac.sas.ac", transform=int), + Column("ExAC_SAS_AF", dest="exac.sas.af", transform=float), + Column("ExAC_nonTCGA_AC", dest="exac_nontcga.ac", transform=int), + Column("ExAC_nonTCGA_AF", dest="exac_nontcga.af", transform=float), + Column("ExAC_nonTCGA_Adj_AC", dest="exac_nontcga.adj_ac", transform=int), + Column("ExAC_nonTCGA_Adj_AF", dest="exac_nontcga.adj_af", transform=float), + Column("ExAC_nonTCGA_AFR_AC", dest="exac_nontcga.afr.ac", transform=int), + Column("ExAC_nonTCGA_AFR_AF", dest="exac_nontcga.afr.af", transform=float), + Column("ExAC_nonTCGA_AMR_AC", dest="exac_nontcga.amr.ac", transform=int), + Column("ExAC_nonTCGA_AMR_AF", dest="exac_nontcga.amr.af", transform=float), + Column("ExAC_nonTCGA_EAS_AC", dest="exac_nontcga.eas.ac", transform=int), + Column("ExAC_nonTCGA_EAS_AF", 
dest="exac_nontcga.eas.af", transform=float), + Column("ExAC_nonTCGA_FIN_AC", dest="exac_nontcga.fin.ac", transform=int), + Column("ExAC_nonTCGA_FIN_AF", dest="exac_nontcga.fin.af", transform=float), + Column("ExAC_nonTCGA_NFE_AC", dest="exac_nontcga.nfe.ac", transform=int), + Column("ExAC_nonTCGA_NFE_AF", dest="exac_nontcga.nfe.af", transform=float), + Column("ExAC_nonTCGA_SAS_AC", dest="exac_nontcga.sas.ac", transform=int), + Column("ExAC_nonTCGA_SAS_AF", dest="exac_nontcga.sas.af", transform=float), + Column("ExAC_nonpsych_AC", dest="exac_nonpsych.ac", transform=int), + Column("ExAC_nonpsych_AF", dest="exac_nonpsych.af", transform=float), + Column("ExAC_nonpsych_Adj_AC", dest="exac_nonpsych.adj_ac", transform=int), + Column("ExAC_nonpsych_Adj_AF", dest="exac_nonpsych.adj_af", transform=float), + Column("ExAC_nonpsych_AFR_AC", dest="exac_nonpsych.afr.ac", transform=int), + Column("ExAC_nonpsych_AFR_AF", dest="exac_nonpsych.afr.af", transform=float), + Column("ExAC_nonpsych_AMR_AC", dest="exac_nonpsych.amr.ac", transform=int), + Column("ExAC_nonpsych_AMR_AF", dest="exac_nonpsych.amr.af", transform=float), + Column("ExAC_nonpsych_EAS_AC", dest="exac_nonpsych.eas.ac", transform=int), + Column("ExAC_nonpsych_EAS_AF", dest="exac_nonpsych.eas.af", transform=float), + Column("ExAC_nonpsych_FIN_AC", dest="exac_nonpsych.fin.ac", transform=int), + Column("ExAC_nonpsych_FIN_AF", dest="exac_nonpsych.fin.af", transform=float), + Column("ExAC_nonpsych_NFE_AC", dest="exac_nonpsych.nfe.ac", transform=int), + Column("ExAC_nonpsych_NFE_AF", dest="exac_nonpsych.nfe.af", transform=float), + Column("ExAC_nonpsych_SAS_AC", dest="exac_nonpsych.sas.ac", transform=int), + Column("ExAC_nonpsych_SAS_AF", dest="exac_nonpsych.sas.af", transform=float), + Column("ALFA_European_AC", dest="alfa.european.ac", transform=int), + Column("ALFA_European_AN", dest="alfa.european.an", transform=int), + Column("ALFA_European_AF", dest="alfa.european.af", transform=float), + 
Column("ALFA_African_Others_AC", dest="alfa.african_others.ac", transform=int), + Column("ALFA_African_Others_AN", dest="alfa.african_others.an", transform=int), + Column("ALFA_African_Others_AF", dest="alfa.african_others.af", transform=float), + Column("ALFA_East_Asian_AC", dest="alfa.east_asian.ac", transform=int), + Column("ALFA_East_Asian_AN", dest="alfa.east_asian.an", transform=int), + Column("ALFA_East_Asian_AF", dest="alfa.east_asian.af", transform=float), + Column("ALFA_African_American_AC", dest="alfa.african_american.ac", transform=int), + Column("ALFA_African_American_AN", dest="alfa.african_american.an", transform=int), + Column("ALFA_African_American_AF", dest="alfa.african_american.af", transform=float), + Column("ALFA_Latin_American_1_AC", dest="alfa.latin_american_1.ac", transform=int), + Column("ALFA_Latin_American_1_AN", dest="alfa.latin_american_1.an", transform=int), + Column("ALFA_Latin_American_1_AF", dest="alfa.latin_american_1.af", transform=float), + Column("ALFA_Latin_American_2_AC", dest="alfa.latin_american_2.ac", transform=int), + Column("ALFA_Latin_American_2_AN", dest="alfa.latin_american_2.an", transform=int), + Column("ALFA_Latin_American_2_AF", dest="alfa.latin_american_2.af", transform=float), + Column("ALFA_Other_Asian_AC", dest="alfa.other_asian.ac", transform=int), + Column("ALFA_Other_Asian_AN", dest="alfa.other_asian.an", transform=int), + Column("ALFA_Other_Asian_AF", dest="alfa.other_asian.af", transform=float), + Column("ALFA_South_Asian_AC", dest="alfa.south_asian.ac", transform=int), + Column("ALFA_South_Asian_AN", dest="alfa.south_asian.an", transform=int), + Column("ALFA_South_Asian_AF", dest="alfa.south_asian.af", transform=float), + Column("ALFA_Other_AC", dest="alfa.other.ac", transform=int), + Column("ALFA_Other_AN", dest="alfa.other.an", transform=int), + Column("ALFA_Other_AF", dest="alfa.other.af", transform=float), + Column("ALFA_African_AC", dest="alfa.african.ac", transform=int), + Column("ALFA_African_AN", 
dest="alfa.african.an", transform=int), + Column("ALFA_African_AF", dest="alfa.african.af", transform=float), + Column("ALFA_Asian_AC", dest="alfa.asian.ac", transform=int), + Column("ALFA_Asian_AN", dest="alfa.asian.an", transform=int), + Column("ALFA_Asian_AF", dest="alfa.asian.af", transform=float), + Column("ALFA_Total_AC", dest="alfa.total.ac", transform=int), + Column("ALFA_Total_AN", dest="alfa.total.an", transform=int), + Column("ALFA_Total_AF", dest="alfa.total.af", transform=float), + Column("clinvar_id", dest="clinvar.clinvar_id", transform=split_clinvar), + Column("clinvar_clnsig", transform=split_clinvar), + Column("clinvar_trait", transform=split_clinvar), + Column("clinvar_review", transform=split_clinvar), + Column("clinvar_hgvs", transform=split_clinvar), + Column("clinvar_var_source", dest="clinvar.var_source", transform=split_clinvar), + Column("clinvar_MedGen_id", dest="clinvar.medgen", transform=split_clinvar), + Column("clinvar_OMIM_id", dest="clinvar.omim", transform=split_clinvar), + Column("clinvar_Orphanet_id", dest="clinvar.orphanet", transform=split_clinvar), + Column("Interpro_domain", transform=split_str), + Column("GTEx_V8_gene", dest="gtex.gene", tag=COLUMN_TAG.GTEX_GENE), # special column, see prune_uniprot() + Column("GTEx_V8_tissue", dest="gtex.tissue", tag=COLUMN_TAG.GTEX_TISSUE), # special column, see prune_uniprot() + Column("Geuvadis_eQTL_target_gene", transform=split_str) +] + +HG19_COLUMNS = [c for c in COLUMNS if c.is_hg19()] +HG38_COLUMNS = [c for c in COLUMNS if c.is_hg38()] + +# Currently not necessary to make assembly-specific tag-column maps. 
def prune_uniprot(raw_doc: dict, acc_column: Column, entry_column: Column, na_values: set = NA_VALUES):
    """
    Pair up the UniProt accession numbers and entry names of a raw document and store the
    pairs under the document's top "uniprot" field.

    E.g. when the raw values found under `acc_column.dest` and `entry_column.dest` are

        "P54578-2;P54578-3;A6NJA2;P54578"
        "UBP14_HUMAN;UBP14_HUMAN;A6NJA2_HUMAN;UBP14_HUMAN"

    the pairs built from them are

        [
            {'acc': 'P54578-2', 'entry': 'UBP14_HUMAN'},
            {'acc': 'P54578-3', 'entry': 'UBP14_HUMAN'},
            {'acc': 'A6NJA2', 'entry': 'A6NJA2_HUMAN'},
            {'acc': 'P54578', 'entry': 'UBP14_HUMAN'}
        ]

    Args:
        raw_doc: the raw document to prune; it is modified in place
        acc_column: the column whose `dest` keys the raw accession string
        entry_column: the column whose `dest` keys the raw entry-name string
        na_values: a set of values seen as NA, forwarded to split_zip()
    Returns:
        the same `raw_doc` object
    """
    acc_key = acc_column.dest
    entry_key = entry_column.dest

    # Nothing to do unless both raw values made it into the document
    if acc_key not in raw_doc or entry_key not in raw_doc:
        return raw_doc

    pairs = split_zip(raw_doc[acc_key], raw_doc[entry_key], sep=";", na_values=na_values)
    uniprot = _check_length([{"acc": acc, "entry": entry} for (acc, entry) in pairs])
    if uniprot is not None:
        raw_doc["uniprot"] = uniprot

    # The raw strings are superseded by the "uniprot" field (or carried no usable pair at all)
    del raw_doc[acc_key]
    del raw_doc[entry_key]

    return raw_doc
def prune_gtex(raw_doc: dict, gene_column: Column, tissue_column: Column, na_values: set = NA_VALUES):
    """
    Pair up the GTEx gene names and tissue names of a raw document and store the pairs under
    the document's top "gtex" field.

    E.g. when the raw values found under `gene_column.dest` and `tissue_column.dest` are

        "ENOSF1|ENOSF1"
        "Adipose_Subcutaneous|Muscle_Skeletal"

    the pairs built from them are

        [
            {'gene': 'ENOSF1', 'tissue': 'Adipose_Subcutaneous'},
            {'gene': 'ENOSF1', 'tissue': 'Muscle_Skeletal'}
        ]

    Args:
        raw_doc: the raw document to prune; it is modified in place
        gene_column: the column whose `dest` keys the raw gene string
        tissue_column: the column whose `dest` keys the raw tissue string
        na_values: a set of values seen as NA, forwarded to split_zip()
    Returns:
        the same `raw_doc` object
    """
    gene_key = gene_column.dest
    tissue_key = tissue_column.dest

    # A key is absent from the doc when the corresponding value in the tsv file was an NA value;
    # the doc is left untouched unless both keys are present.
    if gene_key not in raw_doc or tissue_key not in raw_doc:
        return raw_doc

    # special separator "|" for GTEx
    pairs = split_zip(raw_doc[gene_key], raw_doc[tissue_key], sep=r"|", na_values=na_values)
    gtex = _check_length([{"gene": gene, "tissue": tissue} for (gene, tissue) in pairs])
    if gtex is not None:
        raw_doc["gtex"] = gtex

    del raw_doc[gene_key]
    del raw_doc[tissue_key]

    return raw_doc
def make_hgvs_id(doc: dict, chrom_column: Column, pos_column: Column, ref_column: Column, alt_column: Column):
    """
    Build the "chr<chrom>:g.<pos><ref>><alt>" id string of a raw document.

    Args:
        doc: the raw document, keyed by column destinations
        chrom_column: the column whose `dest` keys the chromosome value
        pos_column: the column whose `dest` keys the position dict; its "start" entry is used
        ref_column: the column whose `dest` keys the reference allele
        alt_column: the column whose `dest` keys the alternative allele
    Returns:
        the id string
    """
    id_template = "chr%s:g.%d%s>%s"
    id_parts = (
        doc[chrom_column.dest],
        doc[pos_column.dest]["start"],  # the position value is a dict, see make_zero_based()
        doc[ref_column.dest],
        doc[alt_column.dest],
    )
    return id_template % id_parts
chrom_column=chrom_column, pos_column=pos_column, ref_column=ref_column, alt_column=alt_column) + + +def construct_hg19_doc(row: dict, na_values: set = NA_VALUES): + verified = verify_hg19_row(row, na_values=na_values) + if not verified: + return None + + raw_doc = construct_hg19_raw_doc(row, na_values=na_values) + raw_doc = prune_hg19_doc(raw_doc, na_values=na_values) + hgvs_id = make_hg19_hgvs_id(raw_doc) + + doc = { + "_id": hgvs_id, + "dbnsfp": parse_dot_fields(raw_doc) # convert dot-fields into nested dictionaries + } + return doc + + +def construct_hg38_doc(row: dict, na_values: set = NA_VALUES): + verified = verify_hg38_row(row, na_values=na_values) + if not verified: + return None + + raw_doc = construct_hg38_raw_doc(row, na_values=na_values) + raw_doc = prune_hg38_doc(raw_doc, na_values=na_values) + hgvs_id = make_hg38_hgvs_id(raw_doc) + + doc = { + "_id": hgvs_id, + "dbnsfp": parse_dot_fields(raw_doc) # convert dot-fields into nested dictionaries + } + return doc + + +def load_file(path: str, assembly: str): + file = anyfile(path) + file_reader = csv.DictReader(file, delimiter="\t") + + num_columns = len(file_reader.fieldnames) + assert num_columns == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, num_columns) + + _construct_doc = None + match assembly: + case "hg19": + _construct_doc = construct_hg19_doc + case "hg38": + _construct_doc = construct_hg38_doc + case _: + raise ValueError(f"Cannot recognize assembly. 
Accept 'hg19' or 'hg38', got '{assembly}'.") + + last_doc = None + for row in file_reader: + current_doc = _construct_doc(row, na_values=NA_VALUES) + + if current_doc is None: + continue + + if last_doc is not None: + if current_doc["_id"] == last_doc["_id"]: + last_aa = last_doc["dbnsfp"]["aa"] + current_aa = current_doc["dbnsfp"]["aa"] + + if not isinstance(last_aa, list): + last_aa = [last_aa] + last_aa.append(current_aa) + + last_doc["dbnsfp"]["aa"] = last_aa + continue + else: + yield last_doc + + last_doc = current_doc + + # yield the very last doc + if last_doc: + yield last_doc + + file.close() diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py new file mode 100644 index 00000000..02914f55 --- /dev/null +++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_parser_44a_v2.py @@ -0,0 +1,894 @@ +import re +import csv +from enum import Flag +from dataclasses import dataclass +from typing import Callable +from types import SimpleNamespace +from utils.table import TableColumn, create_tag_column_map +from utils.dotfield import parse_dot_fields +from biothings.utils.common import anyfile + + +# VALID_COLUMN_NO = 367 # for 4.1a +# VALID_COLUMN_NO = 642 # for 4.2a +# VALID_COLUMN_NO = 643 # for 4.3a +VALID_COLUMN_NO = 689 # for 4.4a + +MUTPRED_TOP5FEATURES_PATTERN = re.compile(r" \(P = ([eE0-9.-]*)\)$") + +# dbNSFP_variant use "." for missing values; +# other none values are borrowed from the `biothings.utils.dataload.dict_sweep` function and +# from the default `na_values` argument of pandas.read_csv(). 
# see https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
# NOTE: the literal used to list r'' twice. The redundant second occurrence sat exactly where the
# default `na_values` list of pandas.read_csv() has '<NA>', so that marker is listed here instead.
NA_VALUES = frozenset({
    r'.', r'', r" ", r"-", r'#N/A', r'#N/A N/A', r'#NA', r'-1.#IND', r'-1.#QNAN', r'-NaN', r'-nan',
    r'1.#IND', r'1.#QNAN', r'<NA>', r'N/A', r'NA', r'NULL', r'NaN', r'n/a', r'nan', r'null', r'none',
    r"Not Available", r"unknown"
})

# A tag can be any string value; columns with the same tag are looked up as a group
COLUMN_TAG = SimpleNamespace()
COLUMN_TAG.HG38_POS = "hg38_pos"  # for "pos(1-based)"
COLUMN_TAG.HG19_POS = "hg19_pos"  # for "hg19_pos(1-based)"
COLUMN_TAG.HG38_CHROM = "hg38_chrom"  # for "#chr"
COLUMN_TAG.HG19_CHROM = "hg19_chrom"  # for "hg19_chr"
COLUMN_TAG.REF_ALLELE = "ref"
COLUMN_TAG.ALT_ALLELE = "alt"
COLUMN_TAG.GTEX_GENE = "gtex_gene"
COLUMN_TAG.GTEX_TISSUE = "gtex_tissue"
# Note that column "MutationTaster_converted_rankscore" is not tagged
COLUMN_TAG.MUTATION_TASTER_AAE = "MutationTaster_AAE"
COLUMN_TAG.MUTATION_TASTER_MODEL = "MutationTaster_model"
COLUMN_TAG.MUTATION_TASTER_PRED = "MutationTaster_pred"
COLUMN_TAG.MUTATION_TASTER_SCORE = "MutationTaster_score"
COLUMN_TAG.ALOFT_FRACTION_TRANSCRIPTS_AFFECTED = "Aloft_Fraction_transcripts_affected"
COLUMN_TAG.ALOFT_PROB_TOLERANT = "Aloft_prob_Tolerant"
COLUMN_TAG.ALOFT_PROB_RECESSIVE = "Aloft_prob_Recessive"
COLUMN_TAG.ALOFT_PROB_DOMINANT = "Aloft_prob_Dominant"
COLUMN_TAG.ALOFT_PRED = "Aloft_pred"
COLUMN_TAG.ALOFT_CONFIDENCE = "Aloft_Confidence"


def _check_length(lst: list):
    """
    Collapse a list according to its length.

    If the input list is empty (i.e. length is 0), return None;
    if the input list has only 1 element (i.e. length is 1), return the element;
    otherwise return the list as-is.
    """
    if not lst:
        return None
    if len(lst) == 1:
        return lst[0]
    return lst


class Assembly(Flag):
    """
    Bit flags telling which genome assembly (or assemblies) a column belongs to.
    """
    HG19 = 1  # indicates that a column belongs to hg19 docs
    HG38 = 2  # indicates that a column belongs to hg38 docs
    BOTH = HG19 | HG38  # (BOTH == 3) applies to both assemblies

    @classmethod
    def assembly_of(cls, name: str):
        """
        Return the member whose name matches `name`, ignoring case.

        E.g. "hg19" and "HG19" both map to Assembly.HG19.
        Raise ValueError if no member has that name.
        """
        try:
            # `__members__` maps every declared name, including the combined BOTH, to its member
            return cls.__members__[name.upper()]
        except KeyError:
            raise ValueError(f"'{cls.__name__}' enum not found for '{name}'") from None
def normalize_chrom(chr: str):
    """
    In dbNSFP, chromosomes are marked 1-22, "X", "Y", and "M" (Mitochondrial).
    However, in MyVariant, we mark Mitochondrial chromosome "MT".

    Any value other than "M" is returned unchanged.
    """
    # The parameter name shadows the builtin `chr`; it is kept so keyword callers keep working.
    return "MT" if chr == "M" else chr


def make_zero_based(pos: str):
    """
    Parse a chromosomal position string into a start-end pair.

    Returns a dict {"start": p, "end": p} where p is `int(pos)`.

    NOTE(review): despite the function name, no offset is applied here. Both "start" and "end"
    carry the parsed position exactly as given (the source columns are named "...(1-based)").
    Confirm whether the name or the behavior reflects the intent before changing either.
    """
    _pos = int(pos)
    return {"start": _pos, "end": _pos}


def parse_mutpred_top5features(value):
    """
    Parse a "MutPred_Top5features" value into mechanism / p-value dictionaries.

    The value is a string of clauses separated by semicolons.
    Each clause has the same pattern of "MECHANISM (P = P_VALUE)".

    E.g. "Loss of helix (P = 0.0444);Gain of loop (P = 0.0502);Gain of catalytic residue at A444 (P = 0.1876)"

    Here we apply regex to split each clause into a tuple like

        [('Loss of helix', '0.0444'), ('Gain of loop', '0.0502'), ('Gain of catalytic residue at A444', '0.1876')]

    Then construct one dictionary {"mechanism": xxx, "p_val": xxx} per tuple, with "p_val" cast to float.
    Clauses that do not split into exactly two non-blank parts are skipped.

    Returns None for a None input or when no clause matches, the single dictionary when exactly
    one clause matches, otherwise the list of dictionaries.
    """
    if value is None:
        return None

    mp_list = [tuple(e for e in MUTPRED_TOP5FEATURES_PATTERN.split(s) if e.strip()) for s in value.split(";")]
    # an empty tuple has length 0, so the length test alone filters out non-matching clauses
    result = [{"mechanism": mp[0], "p_val": float(mp[1])} for mp in mp_list if len(mp) == 2]

    return _check_length(result)


def parse_siphy_29way_pi(value: str):
    """
    A "SiPhy_29way_pi" value, if not None, is a string separated by ":", representing an estimated stationary
    distribution of A, C, G and T at a variant site. E.g. "0.0:0.5259:0.0:0.4741".

    Here we split the string and convert it to a dict of {nucleotide: frequency} with keys "a", "c", "g", "t",
    taken from the first four values in that order.
    """
    if value is None:
        return None

    freq = [float(v) for v in value.split(":")]
    pi_dict = {"a": freq[0], "c": freq[1], "g": freq[2], "t": freq[3]}
    return pi_dict
with the following input, + + values = ["P54578-2;P54578-3;A6NJA2;P54578", "UBP14_HUMAN;UBP14_HUMAN;A6NJA2_HUMAN;UBP14_HUMAN"] + + the returned generator can make: + + [('P54578-2', 'UBP14_HUMAN'), + ('P54578-3', 'UBP14_HUMAN'), + ('A6NJA2', 'A6NJA2_HUMAN'), + ('P54578', 'UBP14_HUMAN')] + + Reference implementation: https://docs.python.org/3.3/library/functions.html#zip + """ + sentinel = object() + iterators = [(v if v not in na_values else None for v in value.split(sep)) for value in values] + + while iterators: # always true if iterators is not empty + result = [] + for it in iterators: + element = next(it, sentinel) + if element is sentinel: # terminate at once when a `it` is fully consumed + return + result.append(element) + yield tuple(result) + + +COLUMNS = [ + Column("#chr", dest="chrom", transform=normalize_chrom, assembly="hg38", tag=COLUMN_TAG.HG38_CHROM), # representing "chrom" only for assembly 'hg38' + Column("pos(1-based)", dest="hg38", transform=make_zero_based, tag=COLUMN_TAG.HG38_POS), + Column("ref", transform=str.upper, tag=COLUMN_TAG.REF_ALLELE), + Column("alt", transform=str.upper, tag=COLUMN_TAG.ALT_ALLELE), + Column("aaref", dest="aa.ref"), + Column("aaalt", dest="aa.alt"), + Column("rs_dbSNP", dest="rsid"), + Column("hg19_chr", dest="chrom", transform=normalize_chrom, assembly="hg19", tag=COLUMN_TAG.HG19_CHROM), # representing "chrom" only for assembly 'hg19' + Column("hg19_pos(1-based)", dest="hg19", transform=make_zero_based, tag=COLUMN_TAG.HG19_POS), + # Column("hg18_chr"), # Not Used + Column("hg18_pos(1-based)", dest="hg18", transform=make_zero_based), + Column("aapos", dest="protein.aa.pos", transform=split_int), + Column("genename", dest="protein.genename", transform=split_str), + Column("Ensembl_geneid", dest="protein.geneid", transform=split_str), + Column("Ensembl_transcriptid", dest="protein.transcriptid", transform=split_str), + Column("Ensembl_proteinid", dest="protein.proteinid", transform=split_str), + Column("Uniprot_acc", 
dest="protein.uniprot.acc", transform=split_str), + Column("Uniprot_entry", dest="protein.uniprot.entry", transform=split_str), + Column("HGVSc_ANNOVAR", dest="protein.hgvsc.annovar", transform=split_str), + Column("HGVSp_ANNOVAR", dest="protein.hgvsp.annovar", transform=split_str), + Column("HGVSc_snpEff", dest="protein.hgvsc.snpeff", transform=split_str), + Column("HGVSp_snpEff", dest="protein.hgvsp.snpeff", transform=split_str), + Column("HGVSc_VEP", dest="protein.hgvsc.vep", transform=split_str), + Column("HGVSp_VEP", dest="protein.hgvsp.vep", transform=split_str), + Column("APPRIS", dest="protein.appris", transform=split_str), + Column("GENCODE_basic", dest="protein.gencode_basic", transform=split_str), + Column("TSL", dest="protein.tsl", transform=split_int), + Column("VEP_canonical", dest="protein.vep_canonical", transform=split_str), + Column("cds_strand", dest="cds_strand", transform=split_str_drop_na), + Column("refcodon", dest="protein.aa.refcodon", transform=split_str), + Column("codonpos", dest="protein.aa.codonpos", transform=split_int), + Column("codon_degeneracy", dest="protein.aa.codon_degeneracy", transform=split_int), + Column("Ancestral_allele", dest="ancestral_allele", transform=split_str_drop_na), + Column("AltaiNeandertal", dest="altai_neandertal", transform=split_genotype), + Column("Denisova", transform=split_genotype), + Column("VindijiaNeandertal", dest="vindijia_neandertal", transform=split_genotype), + Column("ChagyrskayaNeandertal", dest="chagyrskaya_neandertal", transform=split_genotype), + Column("SIFT_score", dest="protein.sift.score", transform=split_float), + Column("SIFT_converted_rankscore", dest="sift.converted_rankscore", transform=split_float_drop_na), + Column("SIFT_pred", dest="protein.sift.pred", transform=split_str), + Column("SIFT4G_score", dest="protein.sift4g.score", transform=split_float), + Column("SIFT4G_converted_rankscore", dest="sift4g.converted_rankscore", transform=split_float_drop_na), + Column("SIFT4G_pred", 
dest="protein.sift4g.pred", transform=split_str), + Column("Polyphen2_HDIV_score", dest="protein.polyphen2.hdiv.score", transform=split_float), + Column("Polyphen2_HDIV_rankscore", transform=split_float_drop_na), + Column("Polyphen2_HDIV_pred", dest="protein.polyphen2.hdiv.pred", transform=split_str), + Column("Polyphen2_HVAR_score", dest="protein.polyphen2.hvar.score", transform=split_float), + Column("Polyphen2_HVAR_rankscore", transform=split_float_drop_na), + Column("Polyphen2_HVAR_pred", dest="protein.polyphen2.hvar.pred", transform=split_str), + Column("LRT_score", transform=split_float_drop_na), + Column("LRT_converted_rankscore", dest="lrt.converted_rankscore", transform=split_float_drop_na), + Column("LRT_pred", transform=split_str_drop_na), + Column("LRT_Omega", transform=split_float_drop_na), + Column("MutationTaster_score", tag=COLUMN_TAG.MUTATION_TASTER_SCORE), + Column("MutationTaster_converted_rankscore", dest="mutationtaster.converted_rankscore", transform=split_float_drop_na), + Column("MutationTaster_pred", tag=COLUMN_TAG.MUTATION_TASTER_PRED), + Column("MutationTaster_model", tag=COLUMN_TAG.MUTATION_TASTER_MODEL), + Column("MutationTaster_AAE", tag=COLUMN_TAG.MUTATION_TASTER_AAE), + Column("MutationAssessor_score", dest="protein.mutationassessor.score", transform=split_float), + Column("MutationAssessor_rankscore", transform=split_float_drop_na), + Column("MutationAssessor_pred", dest="protein.mutationassessor.pred", transform=split_str), + Column("FATHMM_score", dest="protein.fathmm.score", transform=split_float), + Column("FATHMM_converted_rankscore", dest="fathmm.converted_rankscore", transform=split_float_drop_na), + Column("FATHMM_pred", dest="protein.fathmm.pred", transform=split_str), + Column("PROVEAN_score", dest="protein.provean.score", transform=split_float), + Column("PROVEAN_converted_rankscore", dest="provean.converted_rankscore", transform=split_float_drop_na), + Column("PROVEAN_pred", dest="protein.provean.pred", 
transform=split_str), + Column("VEST4_score", dest="protein.vest4.score", transform=split_float), + Column("VEST4_rankscore", transform=split_float_drop_na), + Column("MetaSVM_score", transform=split_float_drop_na), + Column("MetaSVM_rankscore", transform=split_float_drop_na), + Column("MetaSVM_pred", transform=split_str_drop_na), + Column("MetaLR_score", transform=split_float_drop_na), + Column("MetaLR_rankscore", transform=split_float_drop_na), + Column("MetaLR_pred", transform=split_str_drop_na), + Column("Reliability_index", dest="reliability_index", transform=int), + Column("MetaRNN_score", transform=split_float_drop_na), + Column("MetaRNN_rankscore", transform=split_float_drop_na), + Column("MetaRNN_pred", transform=split_str_drop_na), + Column("M-CAP_score", transform=split_float_drop_na), + Column("M-CAP_rankscore", transform=split_float_drop_na), + Column("M-CAP_pred", transform=split_str_drop_na), + Column("REVEL_score", dest="protein.revel.score", transform=split_float), + Column("REVEL_rankscore", transform=split_float_drop_na), + Column("MutPred_score", transform=split_float_drop_na), + Column("MutPred_rankscore", transform=split_float_drop_na), + Column("MutPred_protID", dest="mutpred.accession", transform=split_str_drop_na), + Column("MutPred_AAchange", dest="mutpred.aa_change", transform=split_str_drop_na), + Column("MutPred_Top5features", dest="mutpred.pred", transform=parse_mutpred_top5features), + Column("MVP_score", dest="protein.mvp.score", transform=split_float), + Column("MVP_rankscore", transform=split_float_drop_na), + Column("gMVP_score", dest="protein.gmvp.score", transform=split_float), # new in 4.4.a + Column("gMVP_rankscore", transform=split_float_drop_na), # new in 4.4.a + Column("MPC_score", dest="protein.mpc.score", transform=split_float), + Column("MPC_rankscore", transform=split_float_drop_na), + Column("PrimateAI_score", transform=split_float_drop_na), + Column("PrimateAI_rankscore", transform=split_float_drop_na), + 
Column("PrimateAI_pred", transform=split_str_drop_na), + Column("DEOGEN2_score", transform=split_float_drop_na), + Column("DEOGEN2_rankscore", transform=split_float_drop_na), + Column("DEOGEN2_pred", transform=split_str_drop_na), + Column("BayesDel_addAF_score", dest="bayesdel.add_af.score", transform=split_float_drop_na), + Column("BayesDel_addAF_rankscore", dest="bayesdel.add_af.rankscore", transform=split_float_drop_na), + Column("BayesDel_addAF_pred", dest="bayesdel.add_af.pred", transform=split_str_drop_na), + Column("BayesDel_noAF_score", dest="bayesdel.no_af.score", transform=split_float_drop_na), + Column("BayesDel_noAF_rankscore", dest="bayesdel.no_af.rankscore", transform=split_float_drop_na), + Column("BayesDel_noAF_pred", dest="bayesdel.no_af.pred", transform=split_str_drop_na), + Column("ClinPred_score", transform=split_float_drop_na), + Column("ClinPred_rankscore", transform=split_float_drop_na), + Column("ClinPred_pred", transform=split_str_drop_na), + Column("LIST-S2_score", transform=split_float_drop_na), + Column("LIST-S2_rankscore", transform=split_float_drop_na), + Column("LIST-S2_pred", transform=split_str_drop_na), + Column("VARITY_R_score", transform=split_float_drop_na), # new in 4.4.a + Column("VARITY_R_rankscore", transform=split_float_drop_na), + Column("VARITY_ER_score", transform=split_float_drop_na), + Column("VARITY_ER_rankscore", transform=split_float_drop_na), + Column("VARITY_R_LOO_score", dest="varity.r_loo.score", transform=split_float_drop_na), + Column("VARITY_R_LOO_rankscore", dest="varity.r_loo.rankscore", transform=split_float_drop_na), + Column("VARITY_ER_LOO_score", dest="varity.er_loo.score", transform=split_float_drop_na), + Column("VARITY_ER_LOO_rankscore", dest="varity.er_loo.rankscore", transform=split_float_drop_na), + Column("Aloft_Fraction_transcripts_affected", dest="protein.aloft.fraction_transcripts_affected", transform=split_str, tag=COLUMN_TAG.ALOFT_FRACTION_TRANSCRIPTS_AFFECTED), + 
Column("Aloft_prob_Tolerant", dest="protein.aloft.prob_tolerant", transform=split_str, tag=COLUMN_TAG.ALOFT_PROB_TOLERANT), + Column("Aloft_prob_Recessive", dest="protein.aloft.prob_recessive", transform=split_str, tag=COLUMN_TAG.ALOFT_PROB_RECESSIVE), + Column("Aloft_prob_Dominant", dest="protein.aloft.prob_dominant", transform=split_str, tag=COLUMN_TAG.ALOFT_PROB_DOMINANT), + Column("Aloft_pred", dest="protein.aloft.pred", transform=split_str, tag=COLUMN_TAG.ALOFT_PRED), + Column("Aloft_Confidence", dest="protein.aloft.confidence", transform=split_str, tag=COLUMN_TAG.ALOFT_CONFIDENCE), + Column("CADD_raw", dest="cadd.raw_score", transform=split_float_drop_na, assembly="hg38"), # TODO CADD will have hg38 next update. Deprecate these 3 field then. + Column("CADD_raw_rankscore", dest="cadd.raw_rankscore", transform=split_float_drop_na, assembly="hg38"), + Column("CADD_phred", transform=split_float_drop_na, assembly="hg38"), # CADD phred-like scores, not as other predications of string type + # Column("CADD_raw_hg19", assembly="hg19"), # discarded because Myvariant.info already has a hg19-only datasource of CADD. 
+ # Column("CADD_raw_rankscore_hg19", assembly="hg19"), # ditto + # Column("CADD_phred_hg19", assembly="hg19"), # ditto + Column("DANN_score", transform=split_float_drop_na), + Column("DANN_rankscore", transform=split_float_drop_na), + Column("fathmm-MKL_coding_score", dest="fathmm-mkl.coding_score", transform=split_float_drop_na), + Column("fathmm-MKL_coding_rankscore", dest="fathmm-mkl.coding_rankscore", transform=split_float_drop_na), + Column("fathmm-MKL_coding_pred", dest="fathmm-mkl.coding_pred", transform=split_str_drop_na), + Column("fathmm-MKL_coding_group", dest="fathmm-mkl.coding_group", transform=split_str_drop_na), + Column("fathmm-XF_coding_score", dest="fathmm-xf.coding_score", transform=split_float_drop_na), + Column("fathmm-XF_coding_rankscore", dest="fathmm-xf.coding_rankscore", transform=split_float_drop_na), + Column("fathmm-XF_coding_pred", dest="fathmm-xf.coding_pred", transform=split_str_drop_na), + Column("Eigen-raw_coding", dest="eigen.raw_coding", transform=split_float_drop_na), + Column("Eigen-raw_coding_rankscore", dest="eigen.raw_coding_rankscore", transform=split_float_drop_na), + Column("Eigen-phred_coding", dest="eigen.phred_coding", transform=split_float_drop_na), + Column("Eigen-PC-raw_coding", dest="eigen-pc.raw_coding", transform=split_float_drop_na), + Column("Eigen-PC-raw_coding_rankscore", dest="eigen-pc.raw_coding_rankscore", transform=split_float_drop_na), + Column("Eigen-PC-phred_coding", dest="eigen-pc.phred_coding", transform=split_float_drop_na), + Column("GenoCanyon_score", transform=split_float_drop_na), + Column("GenoCanyon_rankscore", transform=split_float_drop_na), + Column("integrated_fitCons_score", dest="fitcons.integrated.score", transform=split_float_drop_na), + Column("integrated_fitCons_rankscore", dest="fitcons.integrated.rankscore", transform=split_float_drop_na), + Column("integrated_confidence_value", dest="fitcons.integrated.confidence_value", transform=split_int_drop_na), + 
Column("GM12878_fitCons_score", dest="fitcons.gm12878.score", transform=split_float_drop_na), + Column("GM12878_fitCons_rankscore", dest="fitcons.gm12878.rankscore", transform=split_float_drop_na), + Column("GM12878_confidence_value", dest="fitcons.gm12878.confidence_value", transform=split_int_drop_na), + Column("H1-hESC_fitCons_score", dest="fitcons.h1-hesc.score", transform=split_float_drop_na), + Column("H1-hESC_fitCons_rankscore", dest="fitcons.h1-hesc.rankscore", transform=split_float_drop_na), + Column("H1-hESC_confidence_value", dest="fitcons.h1-hesc.confidence_value", transform=split_int_drop_na), + Column("HUVEC_fitCons_score", dest="fitcons.huvec.score", transform=split_float_drop_na), + Column("HUVEC_fitCons_rankscore", dest="fitcons.huvec.rankscore", transform=split_float_drop_na), + Column("HUVEC_confidence_value", dest="fitcons.huvec.confidence_value", transform=split_int_drop_na), + Column("LINSIGHT", dest="linsight.score", transform=split_float_drop_na), + Column("LINSIGHT_rankscore", transform=split_float_drop_na), + Column("GERP++_NR", transform=split_float_drop_na), + Column("GERP++_RS", transform=split_float_drop_na), + Column("GERP++_RS_rankscore", dest="gerp++.rs_rankscore", transform=split_float_drop_na), + Column("phyloP100way_vertebrate", dest="phylop.100way_vertebrate.score", transform=split_float_drop_na), + Column("phyloP100way_vertebrate_rankscore", dest="phylop.100way_vertebrate.rankscore", transform=split_float_drop_na), + Column("phyloP470way_mammalian", dest="phylop.470way_mammalian.score", transform=split_float_drop_na), # replaced 30way_mammalian in 4.4.a + Column("phyloP470way_mammalian_rankscore", dest="phylop.470way_mammalian.rankscore", transform=split_float_drop_na), # replaced 30way_mammalian in 4.4.a + Column("phyloP17way_primate", dest="phylop.17way_primate.score", transform=split_float_drop_na), + Column("phyloP17way_primate_rankscore", dest="phylop.17way_primate.rankscore", transform=split_float_drop_na), + 
Column("phastCons100way_vertebrate", dest="phastcons.100way_vertebrate.score", transform=split_float_drop_na), + Column("phastCons100way_vertebrate_rankscore", dest="phastcons.100way_vertebrate.rankscore", transform=split_float_drop_na), + Column("phastCons470way_mammalian", dest="phastcons.470way_mammalian.score", transform=split_float_drop_na), # replaced 30way_mammalian in 4.4.a + Column("phastCons470way_mammalian_rankscore", dest="phastcons.470way_mammalian.rankscore", transform=split_float_drop_na), # replaced 30way_mammalian in 4.4.a + Column("phastCons17way_primate", dest="phastcons.17way_primate.score", transform=split_float_drop_na), + Column("phastCons17way_primate_rankscore", dest="phastcons.17way_primate.rankscore", transform=split_float_drop_na), + Column("SiPhy_29way_pi", dest="siphy_29way.pi", transform=parse_siphy_29way_pi), + Column("SiPhy_29way_logOdds", dest="siphy_29way.logodds_score", transform=split_float_drop_na), + Column("SiPhy_29way_logOdds_rankscore", dest="siphy_29way.logodds_rankscore", transform=split_float_drop_na), + Column("bStatistic", dest="bstatistic.score", transform=split_float_drop_na), + Column("bStatistic_converted_rankscore", dest="bstatistic.converted_rankscore", transform=split_float_drop_na), + Column("1000Gp3_AC", dest="1000gp3.ac", transform=int), + Column("1000Gp3_AF", dest="1000gp3.af", transform=float), + Column("1000Gp3_AFR_AC", dest="1000gp3.afr.ac", transform=int), # dest changed since 4.4.a + Column("1000Gp3_AFR_AF", dest="1000gp3.afr.af", transform=float), + Column("1000Gp3_EUR_AC", dest="1000gp3.eur.ac", transform=int), + Column("1000Gp3_EUR_AF", dest="1000gp3.eur.af", transform=float), + Column("1000Gp3_AMR_AC", dest="1000gp3.amr.ac", transform=int), + Column("1000Gp3_AMR_AF", dest="1000gp3.amr.af", transform=float), + Column("1000Gp3_EAS_AC", dest="1000gp3.eas.ac", transform=int), + Column("1000Gp3_EAS_AF", dest="1000gp3.eas.af", transform=float), + Column("1000Gp3_SAS_AC", dest="1000gp3.sas.ac", 
transform=int), + Column("1000Gp3_SAS_AF", dest="1000gp3.sas.af", transform=float), + Column("TWINSUK_AC", dest="twinsuk.ac", transform=int), + Column("TWINSUK_AF", dest="twinsuk.af", transform=float), + Column("ALSPAC_AC", dest="alspac.ac", transform=int), + Column("ALSPAC_AF", dest="alspac.af", transform=float), + Column("UK10K_AC", dest="uk10k.ac", transform=int), + Column("UK10K_AF", dest="uk10k.af", transform=float), + Column("ESP6500_AA_AC", dest="esp6500.aa.ac", transform=int), # dest changed since 4.4.a + Column("ESP6500_AA_AF", dest="esp6500.aa.af", transform=float), + Column("ESP6500_EA_AC", dest="esp6500.ea.ac", transform=int), + Column("ESP6500_EA_AF", dest="esp6500.ea.af", transform=float), + Column("ExAC_AC", dest="exac.ac", transform=int), # dest changed since 4.4.a + Column("ExAC_AF", dest="exac.af", transform=float), + Column("ExAC_Adj_AC", dest="exac.adj_ac", transform=int), + Column("ExAC_Adj_AF", dest="exac.adj_af", transform=float), + Column("ExAC_AFR_AC", dest="exac.afr.ac", transform=int), + Column("ExAC_AFR_AF", dest="exac.afr.af", transform=float), + Column("ExAC_AMR_AC", dest="exac.amr.ac", transform=int), + Column("ExAC_AMR_AF", dest="exac.amr.af", transform=float), + Column("ExAC_EAS_AC", dest="exac.eas.ac", transform=int), + Column("ExAC_EAS_AF", dest="exac.eas.af", transform=float), + Column("ExAC_FIN_AC", dest="exac.fin.ac", transform=int), + Column("ExAC_FIN_AF", dest="exac.fin.af", transform=float), + Column("ExAC_NFE_AC", dest="exac.nfe.ac", transform=int), + Column("ExAC_NFE_AF", dest="exac.nfe.af", transform=float), + Column("ExAC_SAS_AC", dest="exac.sas.ac", transform=int), + Column("ExAC_SAS_AF", dest="exac.sas.af", transform=float), + Column("ExAC_nonTCGA_AC", dest="exac_nontcga.ac", transform=int), + Column("ExAC_nonTCGA_AF", dest="exac_nontcga.af", transform=float), + Column("ExAC_nonTCGA_Adj_AC", dest="exac_nontcga.adj_ac", transform=int), + Column("ExAC_nonTCGA_Adj_AF", dest="exac_nontcga.adj_af", transform=float), + 
Column("ExAC_nonTCGA_AFR_AC", dest="exac_nontcga.afr.ac", transform=int), + Column("ExAC_nonTCGA_AFR_AF", dest="exac_nontcga.afr.af", transform=float), + Column("ExAC_nonTCGA_AMR_AC", dest="exac_nontcga.amr.ac", transform=int), + Column("ExAC_nonTCGA_AMR_AF", dest="exac_nontcga.amr.af", transform=float), + Column("ExAC_nonTCGA_EAS_AC", dest="exac_nontcga.eas.ac", transform=int), + Column("ExAC_nonTCGA_EAS_AF", dest="exac_nontcga.eas.af", transform=float), + Column("ExAC_nonTCGA_FIN_AC", dest="exac_nontcga.fin.ac", transform=int), + Column("ExAC_nonTCGA_FIN_AF", dest="exac_nontcga.fin.af", transform=float), + Column("ExAC_nonTCGA_NFE_AC", dest="exac_nontcga.nfe.ac", transform=int), + Column("ExAC_nonTCGA_NFE_AF", dest="exac_nontcga.nfe.af", transform=float), + Column("ExAC_nonTCGA_SAS_AC", dest="exac_nontcga.sas.ac", transform=int), + Column("ExAC_nonTCGA_SAS_AF", dest="exac_nontcga.sas.af", transform=float), + Column("ExAC_nonpsych_AC", dest="exac_nonpsych.ac", transform=int), + Column("ExAC_nonpsych_AF", dest="exac_nonpsych.af", transform=float), + Column("ExAC_nonpsych_Adj_AC", dest="exac_nonpsych.adj_ac", transform=int), + Column("ExAC_nonpsych_Adj_AF", dest="exac_nonpsych.adj_af", transform=float), + Column("ExAC_nonpsych_AFR_AC", dest="exac_nonpsych.afr.ac", transform=int), + Column("ExAC_nonpsych_AFR_AF", dest="exac_nonpsych.afr.af", transform=float), + Column("ExAC_nonpsych_AMR_AC", dest="exac_nonpsych.amr.ac", transform=int), + Column("ExAC_nonpsych_AMR_AF", dest="exac_nonpsych.amr.af", transform=float), + Column("ExAC_nonpsych_EAS_AC", dest="exac_nonpsych.eas.ac", transform=int), + Column("ExAC_nonpsych_EAS_AF", dest="exac_nonpsych.eas.af", transform=float), + Column("ExAC_nonpsych_FIN_AC", dest="exac_nonpsych.fin.ac", transform=int), + Column("ExAC_nonpsych_FIN_AF", dest="exac_nonpsych.fin.af", transform=float), + Column("ExAC_nonpsych_NFE_AC", dest="exac_nonpsych.nfe.ac", transform=int), + Column("ExAC_nonpsych_NFE_AF", dest="exac_nonpsych.nfe.af", 
transform=float), + Column("ExAC_nonpsych_SAS_AC", dest="exac_nonpsych.sas.ac", transform=int), + Column("ExAC_nonpsych_SAS_AF", dest="exac_nonpsych.sas.af", transform=float), + Column("ALFA_European_AC", dest="alfa.european.ac", transform=int), + Column("ALFA_European_AN", dest="alfa.european.an", transform=int), + Column("ALFA_European_AF", dest="alfa.european.af", transform=float), + Column("ALFA_African_Others_AC", dest="alfa.african_others.ac", transform=int), + Column("ALFA_African_Others_AN", dest="alfa.african_others.an", transform=int), + Column("ALFA_African_Others_AF", dest="alfa.african_others.af", transform=float), + Column("ALFA_East_Asian_AC", dest="alfa.east_asian.ac", transform=int), + Column("ALFA_East_Asian_AN", dest="alfa.east_asian.an", transform=int), + Column("ALFA_East_Asian_AF", dest="alfa.east_asian.af", transform=float), + Column("ALFA_African_American_AC", dest="alfa.african_american.ac", transform=int), + Column("ALFA_African_American_AN", dest="alfa.african_american.an", transform=int), + Column("ALFA_African_American_AF", dest="alfa.african_american.af", transform=float), + Column("ALFA_Latin_American_1_AC", dest="alfa.latin_american_1.ac", transform=int), + Column("ALFA_Latin_American_1_AN", dest="alfa.latin_american_1.an", transform=int), + Column("ALFA_Latin_American_1_AF", dest="alfa.latin_american_1.af", transform=float), + Column("ALFA_Latin_American_2_AC", dest="alfa.latin_american_2.ac", transform=int), + Column("ALFA_Latin_American_2_AN", dest="alfa.latin_american_2.an", transform=int), + Column("ALFA_Latin_American_2_AF", dest="alfa.latin_american_2.af", transform=float), + Column("ALFA_Other_Asian_AC", dest="alfa.other_asian.ac", transform=int), + Column("ALFA_Other_Asian_AN", dest="alfa.other_asian.an", transform=int), + Column("ALFA_Other_Asian_AF", dest="alfa.other_asian.af", transform=float), + Column("ALFA_South_Asian_AC", dest="alfa.south_asian.ac", transform=int), + Column("ALFA_South_Asian_AN", 
dest="alfa.south_asian.an", transform=int), + Column("ALFA_South_Asian_AF", dest="alfa.south_asian.af", transform=float), + Column("ALFA_Other_AC", dest="alfa.other.ac", transform=int), + Column("ALFA_Other_AN", dest="alfa.other.an", transform=int), + Column("ALFA_Other_AF", dest="alfa.other.af", transform=float), + Column("ALFA_African_AC", dest="alfa.african.ac", transform=int), + Column("ALFA_African_AN", dest="alfa.african.an", transform=int), + Column("ALFA_African_AF", dest="alfa.african.af", transform=float), + Column("ALFA_Asian_AC", dest="alfa.asian.ac", transform=int), + Column("ALFA_Asian_AN", dest="alfa.asian.an", transform=int), + Column("ALFA_Asian_AF", dest="alfa.asian.af", transform=float), + Column("ALFA_Total_AC", dest="alfa.total.ac", transform=int), + Column("ALFA_Total_AN", dest="alfa.total.an", transform=int), + Column("ALFA_Total_AF", dest="alfa.total.af", transform=float), + Column("clinvar_id", dest="clinvar.clinvar_id", transform=split_clinvar), + Column("clinvar_clnsig", transform=split_clinvar), + Column("clinvar_trait", transform=split_clinvar), + Column("clinvar_review", transform=split_clinvar), + Column("clinvar_hgvs", transform=split_clinvar), + Column("clinvar_var_source", dest="clinvar.var_source", transform=split_clinvar), + Column("clinvar_MedGen_id", dest="clinvar.medgen", transform=split_clinvar), + Column("clinvar_OMIM_id", dest="clinvar.omim", transform=split_clinvar), + Column("clinvar_Orphanet_id", dest="clinvar.orphanet", transform=split_clinvar), + Column("Interpro_domain", transform=split_str_drop_na), + Column("GTEx_V8_gene", dest="gtex.gene", tag=COLUMN_TAG.GTEX_GENE), # special column, see prune_gtex() + Column("GTEx_V8_tissue", dest="gtex.tissue", tag=COLUMN_TAG.GTEX_TISSUE), # special column, see prune_gtex() + Column("Geuvadis_eQTL_target_gene", transform=split_str_drop_na) +] + +HG19_COLUMNS = [c for c in COLUMNS if c.is_hg19()] +HG38_COLUMNS = [c for c in COLUMNS if c.is_hg38()] +PROTEIN_COLUMNS = [c for c in 
def verify_pos(row, pos_column: Column, na_values: set = NA_VALUES):
    """
    Return True if the value of `pos_column` in `row` is not one of `na_values`, False otherwise.
    """
    return row[pos_column.name] not in na_values


def verify_hg19_row(row: dict, na_values: set = NA_VALUES):
    """
    Return True if the row has a non-NA value in the column tagged as the hg19 position.
    """
    pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG19_POS][0]
    return verify_pos(row, pos_column=pos_column, na_values=na_values)


def verify_hg38_row(row: dict, na_values: set = NA_VALUES):
    """
    Return True if the row has a non-NA value in the column tagged as the hg38 position.
    """
    pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG38_POS][0]
    return verify_pos(row, pos_column=pos_column, na_values=na_values)


def normalize_hg19_row(row: dict):
    """
    For unknown reasons, 4 MutationTaster columns and 6 Aloft columns have values ending in ";",
    which leads to an empty string when splitting the value by ";".
    This function removes one trailing ";" from those values.

    The row is modified in place and also returned.
    """
    tags = (
        # MutationTaster columns
        COLUMN_TAG.MUTATION_TASTER_AAE,
        COLUMN_TAG.MUTATION_TASTER_MODEL,
        COLUMN_TAG.MUTATION_TASTER_PRED,
        COLUMN_TAG.MUTATION_TASTER_SCORE,
        # Aloft columns
        COLUMN_TAG.ALOFT_FRACTION_TRANSCRIPTS_AFFECTED,
        COLUMN_TAG.ALOFT_PROB_TOLERANT,
        COLUMN_TAG.ALOFT_PROB_RECESSIVE,
        COLUMN_TAG.ALOFT_PROB_DOMINANT,
        COLUMN_TAG.ALOFT_PRED,
        COLUMN_TAG.ALOFT_CONFIDENCE,
    )
    # resolve every column before touching the row; only the first column of each tag is used
    columns = [TAG_COLUMN_MAP[tag][0] for tag in tags]

    for c in columns:
        value = row[c.name]
        if value and value[-1] == ";":
            row[c.name] = value[:-1]

    return row


def normalize_hg38_row(row: dict):
    """
    Apply the same normalization as `normalize_hg19_row`.
    """
    return normalize_hg19_row(row)
to the raw document's top "gtex" field. + + E.g. with the following input value: + + row["gtex.gene"] = "ENOSF1|ENOSF1" + row["gtex.tissue"] = "Adipose_Subcutaneous|Muscle_Skeletal" + + raw_doc will be assigned as: + + row["gtex"] = [ + {'gene': 'ENOSF1', 'tissue': 'Adipose_Subcutaneous'}, + {'gene': 'ENOSF1', 'tissue': 'Muscle_Skeletal'} + ] + """ + # when these two keys are not present in the doc, it means the corresponding two values in the tsv file are NA values + if (gene_column.dest in raw_doc) and (tissue_column.dest in raw_doc): + gene_value = raw_doc[gene_column.dest] + tissue_value = raw_doc[tissue_column.dest] + + # special separator "|" for GTEx + gtex_result = [{"gene": acc, "tissue": entry} for (acc, entry) in split_zip([gene_value, tissue_value], sep=r"|", na_values=na_values)] + gtex_result = _check_length(gtex_result) + if gtex_result is not None: + raw_doc["gtex"] = gtex_result + + del raw_doc[gene_column.dest] + del raw_doc[tissue_column.dest] + + return raw_doc + + +def prune_mutation_taster(raw_doc: dict, aae_column: Column, model_column: Column, pred_column: Column, score_column: Column, na_values: set = NA_VALUES): + """ + Map each MutationTaster AAE, model, pred, and score value from the raw document into a dictionary, + and assign all such dictionaries to the raw document's "mutationtaster.analysis" field. + + E.g. 
with the following input value: + + row["mutationtaster.aae"] = "Y518*;Y518*;D532E" + row["mutationtaster.model"] = "complex_aae;complex_aae;simple_aae" + row["mutationtaster.pred"] = "D;D;N" + row["mutationtaster.score"] = "1;1;1" + + raw_doc will be assigned as: + + row["mutationtaster.analysis"] = [ + {'aae': 'Y518*', 'model': 'complex_aae', 'pred': 'D', 'score': 1.0}, + {'aae': 'Y518*', 'model': 'complex_aae', 'pred': 'D', 'score': 1.0}, + {'aae': 'D532E', 'model': 'simple_aae', 'pred': 'N', 'score': 1.0} + ] + """ + if (aae_column.dest in raw_doc) and (model_column.dest in raw_doc) and (pred_column.dest in raw_doc) and (score_column.dest in raw_doc): + aae_value = raw_doc[aae_column.dest] + model_value = raw_doc[model_column.dest] + pred_value = raw_doc[pred_column.dest] + score_value = raw_doc[score_column.dest] + + analysis_values = split_zip([aae_value, model_value, pred_value, score_value], sep=r";", na_values=na_values) + analysis_result = [{"aae": aae, "model": model, "pred": pred, "score": float(score)} for (aae, model, pred, score) in analysis_values] + analysis_result = _check_length(analysis_result) + if analysis_result is not None: + raw_doc["mutationtaster.analysis"] = analysis_result + + del raw_doc[aae_column.dest] + del raw_doc[model_column.dest] + del raw_doc[pred_column.dest] + del raw_doc[score_column.dest] + + # note that raw_doc[mutationtaster.converted_rankscore] is kept as-is + + return raw_doc + + +def prune_protein(raw_doc: dict, protein_columns: list[Column]): + protein_fields = {c.dest: raw_doc[c.dest] for c in protein_columns} + + # assert len(set(map(len, protein_fields.values()))) == 1 # assert all values (as lists) in protein_fields have the same length before zipping + + """ + Convert protein fields (as a dictionary of lists) to a list of dictionaries. E.g. 
+ + protein_field = { + 'protein.transcriptid': ['ENST00000624406', 'ENST00000398168'], + 'protein.proteinid': ['ENSP00000485669', 'ENSP00000381234'] + } + + will be converted to + + protein_result = [ + {'protein.transcriptid': 'ENST00000624406', 'protein.proteinid': 'ENSP00000485669'}, + {'protein.transcriptid': 'ENST00000398168', 'protein.proteinid': 'ENSP00000381234'} + ] + """ + protein_result = [] + protein_keys = protein_fields.keys() + for protein_values in zip(*protein_fields.values()): + elem = dict((key, value) for key, value in zip(protein_keys, protein_values) if value is not None) + elem = parse_dot_fields(elem)["protein"] + protein_result.append(elem) + # We keep protein_result as a list for easier merging + # protein_result = _check_length(protein_result) + # if protein_result is not None: + # raw_doc["protein"] = protein_result + raw_doc["protein"] = protein_result + + for c in protein_columns: + del raw_doc[c.dest] + + return raw_doc + + +def prune_hg19_doc(doc: dict, na_values: set = NA_VALUES): + protein_columns = [c for c in PROTEIN_COLUMNS if c.dest in doc] + doc = prune_protein(doc, protein_columns=protein_columns) + + gtex_gene_column = TAG_COLUMN_MAP[COLUMN_TAG.GTEX_GENE][0] + gtex_tissue_column = TAG_COLUMN_MAP[COLUMN_TAG.GTEX_TISSUE][0] + doc = prune_gtex(doc, gene_column=gtex_gene_column, tissue_column=gtex_tissue_column, na_values=na_values) + + mutation_taster_aae_column = TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_AAE][0] + mutation_taster_model_column = TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_MODEL][0] + mutation_taster_pred_column = TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_PRED][0] + mutation_taster_score_column = TAG_COLUMN_MAP[COLUMN_TAG.MUTATION_TASTER_SCORE][0] + doc = prune_mutation_taster(doc, aae_column=mutation_taster_aae_column, model_column=mutation_taster_model_column, + pred_column=mutation_taster_pred_column, score_column=mutation_taster_score_column, na_values=na_values) + + return doc + + +def prune_hg38_doc(doc: 
dict, na_values: set = NA_VALUES): + return prune_hg19_doc(doc, na_values=na_values) + + +def construct_raw_doc(row: dict, columns: list, na_values: set = NA_VALUES): + """ + Construct a raw dbnsfp doc from a dict-like row read from the csv file. + "Raw" means 1) the doc may contain dot fields that are not parsed, and 2) some values in the doc need further treatment/processing. + + Args: + row: a dict representing a csv row's content + columns: a list of Column object indicating how to construct each column + na_values: a set of values seen as NA + Returns: + a dict representing the doc's json object + """ + result = dict() + + for column in columns: + value = row[column.name] + if value in na_values: + continue + + value = column.transform(value) + if value is None: + continue + + result[column.dest] = value + + return result + + +def construct_hg19_raw_doc(row: dict, na_values: set = NA_VALUES): + return construct_raw_doc(row, columns=HG19_COLUMNS, na_values=na_values) + + +def construct_hg38_raw_doc(row: dict, na_values: set = NA_VALUES): + return construct_raw_doc(row, columns=HG38_COLUMNS, na_values=na_values) + + +def make_hgvs_id(doc: dict, chrom_column: Column, pos_column: Column, ref_column: Column, alt_column: Column): + chrom_value = doc[chrom_column.dest] + pos_value = doc[pos_column.dest]["start"] # see make_zero_based() + ref_value = doc[ref_column.dest] + alt_value = doc[alt_column.dest] + + hgvs_id = "chr%s:g.%d%s>%s" % (chrom_value, pos_value, ref_value, alt_value) + return hgvs_id + + +def make_hg19_hgvs_id(doc: dict): + chrom_column = TAG_COLUMN_MAP[COLUMN_TAG.HG19_CHROM][0] + pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG19_POS][0] + ref_column = TAG_COLUMN_MAP[COLUMN_TAG.REF_ALLELE][0] + alt_column = TAG_COLUMN_MAP[COLUMN_TAG.ALT_ALLELE][0] + + return make_hgvs_id(doc, chrom_column=chrom_column, pos_column=pos_column, ref_column=ref_column, alt_column=alt_column) + + +def make_hg38_hgvs_id(doc: dict): + chrom_column = 
TAG_COLUMN_MAP[COLUMN_TAG.HG38_CHROM][0] + pos_column = TAG_COLUMN_MAP[COLUMN_TAG.HG38_POS][0] + ref_column = TAG_COLUMN_MAP[COLUMN_TAG.REF_ALLELE][0] + alt_column = TAG_COLUMN_MAP[COLUMN_TAG.ALT_ALLELE][0] + + return make_hgvs_id(doc, chrom_column=chrom_column, pos_column=pos_column, ref_column=ref_column, alt_column=alt_column) + + +def construct_hg19_doc(row: dict, na_values: set = NA_VALUES): + verified = verify_hg19_row(row, na_values=na_values) + if not verified: + return None + + row = normalize_hg19_row(row) + raw_doc = construct_hg19_raw_doc(row, na_values=na_values) + raw_doc = prune_hg19_doc(raw_doc, na_values=na_values) + hgvs_id = make_hg19_hgvs_id(raw_doc) + + doc = { + "_id": hgvs_id, + "dbnsfp": parse_dot_fields(raw_doc) # convert dot-fields into nested dictionaries + } + return doc + + +def construct_hg38_doc(row: dict, na_values: set = NA_VALUES): + verified = verify_hg38_row(row, na_values=na_values) + if not verified: + return None + + row = normalize_hg38_row(row) + raw_doc = construct_hg38_raw_doc(row, na_values=na_values) + raw_doc = prune_hg38_doc(raw_doc, na_values=na_values) + hgvs_id = make_hg38_hgvs_id(raw_doc) + + doc = { + "_id": hgvs_id, + "dbnsfp": parse_dot_fields(raw_doc) # convert dot-fields into nested dictionaries + } + return doc + + +def load_file(path: str, assembly: str): + file = anyfile(path) + file_reader = csv.DictReader(file, delimiter="\t") + + num_columns = len(file_reader.fieldnames) + assert num_columns == VALID_COLUMN_NO, "Expecting %s columns, but got %s" % (VALID_COLUMN_NO, num_columns) + + _construct_doc = None + match assembly: + case "hg19": + _construct_doc = construct_hg19_doc + case "hg38": + _construct_doc = construct_hg38_doc + case _: + raise ValueError(f"Cannot recognize assembly. 
Accept 'hg19' or 'hg38', got '{assembly}'.") + + last_doc = None + for row in file_reader: + curr_doc = _construct_doc(row, na_values=NA_VALUES) + + if curr_doc is None: + continue + + if last_doc is not None: + if curr_doc["_id"] == last_doc["_id"]: + last_protein_field = last_doc["dbnsfp"]["protein"] + curr_protein_field = curr_doc["dbnsfp"]["protein"] + + # We guarantee that the protein field is always a list at this moment. See prune_protein() + # if not isinstance(last_protein_field, list): + # last_protein_field = [last_protein_field] + last_protein_field.extend(curr_protein_field) + + last_doc["dbnsfp"]["protein"] = last_protein_field + continue + else: + if len(last_doc["dbnsfp"]["protein"]) == 1: + last_doc["dbnsfp"]["protein"] = last_doc["dbnsfp"]["protein"][0] + yield last_doc + + last_doc = curr_doc + + # yield the very last doc + if last_doc: + if len(last_doc["dbnsfp"]["protein"]) == 1: + last_doc["dbnsfp"]["protein"] = last_doc["dbnsfp"]["protein"][0] + yield last_doc + + file.close() diff --git a/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py b/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py index b659370e..2e0e4e70 100644 --- a/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py +++ b/src/hub/dataload/sources/dbnsfp/dbnsfp_upload.py @@ -1,8 +1,11 @@ import os import glob -from .dbnsfp_mapping import mapping -from .dbnsfp_parser import load_data_file as load_common +from .dbnsfp_mapping_44a_v1 import mapping as mapping_v1 +from .dbnsfp_parser_44a_v1 import load_file as load_file_v1 +from .dbnsfp_mapping_44a_v2 import mapping as mapping_v2 +from .dbnsfp_parser_44a_v2 import load_file as load_file_v2 + import biothings.hub.dataload.uploader as uploader from hub.dataload.uploader import SnpeffPostUpdateUploader from hub.dataload.storage import MyVariantIgnoreDuplicatedStorage @@ -15,28 +18,64 @@ } -class DBNSFPBaseUploader(uploader.ParallelizedSourceUploader, - SnpeffPostUpdateUploader): +class DBNSFPBaseUploaderV1(uploader.ParallelizedSourceUploader, 
SnpeffPostUpdateUploader): storage_class = MyVariantIgnoreDuplicatedStorage GLOB_PATTERN = "dbNSFP*_variant.chr*" @classmethod - def get_mapping(klass): - return mapping + def get_mapping(cls): + return mapping_v1 def jobs(self): - # tuple(input_file,version), where version is either hg38 or hg19) - return map(lambda e: (e, self.__class__.__metadata__["assembly"]), - glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN))) + paths = glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN)) + assembly = self.__class__.__metadata__["assembly"] + return map(lambda path: (path, assembly), paths) + + def load_data(self, path, assembly): + self.logger.debug("loading file " + path) + return load_file_v1(path, assembly)  # positional: do not depend on the parser's parameter name + + +class DBNSFPBaseUploaderV2(uploader.ParallelizedSourceUploader, SnpeffPostUpdateUploader): + + storage_class = MyVariantIgnoreDuplicatedStorage + GLOB_PATTERN = "dbNSFP*_variant.chr*" + + @classmethod + def get_mapping(cls): + return mapping_v2 + + def jobs(self): + paths = glob.glob(os.path.join(self.data_folder, self.__class__.GLOB_PATTERN)) + assembly = self.__class__.__metadata__["assembly"] + return map(lambda path: (path, assembly), paths) + + def load_data(self, path, assembly): + self.logger.debug("loading file " + path) + return load_file_v2(path, assembly)  # positional: load_file(path, assembly) has no "version" keyword + + - def load_data(self, input_file, hg): - self.logger.debug("loading file " + input_file) - return load_common(input_file, version=hg) +class DBNSFPHG38UploaderV1(DBNSFPBaseUploaderV1): + name = "dbnsfp_hg38_v1" + main_source = "dbnsfp" + __metadata__ = { + "assembly": "hg38", + "src_meta": SRC_META + } + + +class DBNSFPHG19UploaderV1(DBNSFPBaseUploaderV1): + name = "dbnsfp_hg19_v1" + main_source = "dbnsfp" + __metadata__ = { + "assembly": "hg19", + "src_meta": SRC_META + } -class DBNSFPHG38Uploader(DBNSFPBaseUploader): - name = "dbnsfp_hg38" +class DBNSFPHG38UploaderV2(DBNSFPBaseUploaderV2): + name = "dbnsfp_hg38_v2" main_source = "dbnsfp" 
__metadata__ = { "assembly": "hg38", @@ -44,8 +83,8 @@ class DBNSFPHG38Uploader(DBNSFPBaseUploader): } -class DBNSFPHG19Uploader(DBNSFPBaseUploader): - name = "dbnsfp_hg19" +class DBNSFPHG19UploaderV2(DBNSFPBaseUploaderV2): + name = "dbnsfp_hg19_v2" main_source = "dbnsfp" __metadata__ = { "assembly": "hg19", diff --git a/src/utils/dotfield.py b/src/utils/dotfield.py new file mode 100644 index 00000000..c79b5aea --- /dev/null +++ b/src/utils/dotfield.py @@ -0,0 +1,43 @@ +import orjson +from biothings.utils.dotfield import merge_object + + +def make_object(attr, value): + """ + Create dictionary following the input dot notation and the value + Example:: + + make_object('a.b.c', 100) --> {a:{b:{c:100}}}, or + make_object(['a','b','c'], 100) --> {a:{b:{c:100}}} + + This is an orjson implementation of biothings.utils.dotfield.make_object, for better performance. + TODO Merge into biothings.utils.dotfield if necessary. (And delete this function then.) + """ + attr_list = attr.split(".") if isinstance(attr, str) else list(attr)  # accept the list form shown in the docstring + s = "" + for k in attr_list: + s += "{" + orjson.dumps(k).decode("utf-8") + ":"  # let orjson quote the key so '"' and '\' are escaped + s += orjson.dumps(value).decode("utf-8") # decoding is necessary because orjson dumps into bytes + s += "}" * (len(attr_list)) + return orjson.loads(s) + + +def parse_dot_fields(genedoc): + """ + parse_dot_fields({'a': 1, 'b.c': 2, 'b.a.c': 3}) + should return + {'a': 1, 'b': {'a': {'c': 3}, 'c': 2}} + + This is a copy of biothings.utils.dotfield.parse_dot_fields. However here it uses the orjson make_object() function. + TODO If orjson make_object() function is merged to biothings.utils.dotfield, this function can be deleted. 
+ """ + dot_fields = [] + expanded_doc = {} + for key in genedoc: + if key.find(".") != -1: + dot_fields.append(key) + expanded_doc = merge_object(expanded_doc, make_object(key, genedoc[key])) + genedoc.update(expanded_doc) + for key in dot_fields: + del genedoc[key] + return genedoc diff --git a/src/utils/table.py b/src/utils/table.py new file mode 100644 index 00000000..367291a8 --- /dev/null +++ b/src/utils/table.py @@ -0,0 +1,49 @@ +from dataclasses import dataclass +from itertools import groupby +from typing import Callable + + +@dataclass +class TableColumn: + """ + Configuration marker for each column in a tabular file. + + A TableColumn object indicates that a value from the named column must be transformed before its assignment to a destination field inside a JSON doc. + + E.g. TableColumn(name="AF", dest="allele_freq", transform=float) means that a value from the "AF" column must be cast to float and then be assigned to the + "allele_freq" field inside its associated JSON doc. + """ + name: str # column name + dest: str = None # destination field name + transform: Callable = None # transforming function applied to the column values + tag: str = None # tagging columns that need special prior sanity check or post-processing + + @classmethod + def identity_function(cls, value): + return value + + def __post_init__(self): + if self.dest is None: + # This is very common practice of determining field name. + # E.g. a value in column "SIFT_score" is often wrapped to field "sift.score" (dotfield) + self.dest = self.name.lower().replace("_", ".") + + # Default transformation is identity function; therefore we don't have to check if self.transform is None. + # The choice is made because most columns have transforming function in our application. + if self.transform is None: + self.transform = self.identity_function + + +def create_tag_column_map(columns: list[TableColumn]): + """ + Map each tag to its associated column or columns. 
+ + Args: + columns: a list of TableColumn objects + + Returns: + a dictionary of {tag: [columns carrying that tag]}; columns whose tag is None are left out + """ + tagged_columns = sorted([c for c in columns if c.tag is not None], key=lambda c: c.tag) + result = {tag: list(columns) for tag, columns in groupby(tagged_columns, lambda c: c.tag)} + return result