-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Supporting population allele frequencies #29
Changes from 9 commits
fd86f09
ae26a08
6a384ac
eae1b9a
0247d13
6865f77
22308a1
91699ce
36e7723
940ccf8
f6a2d27
31ca0f1
4eb9e14
e6a5f3e
bc8ba60
1870e41
3a78f32
f661d72
5575ce0
4dfa7d3
bb6ae82
d5a100c
8a7557d
42b7667
193ffe1
a032adf
518b5c8
f38e487
b74af67
f3bfa14
4052b2c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
{ | ||
"1000_genomes": [ | ||
{"name": "1000GENOMES:phase_3:ALL", "frequencies": {"af": "AF"}}, | ||
{"name": "1000GENOMES:phase_3:AFR", "frequencies": {"af": "AFR_AF"}}, | ||
{"name": "1000GENOMES:phase_3:AMR", "frequencies": {"af": "AMR_AF"}}, | ||
{"name": "1000GENOMES:phase_3:EAS", "frequencies": {"af": "EAS_AF"}}, | ||
{"name": "1000GENOMES:phase_3:EUR", "frequencies": {"af": "EUR_AF"}}, | ||
{"name": "1000GENOMES:phase_3:SAS", "frequencies": {"af": "SAS_AF" }} | ||
], | ||
"gnomAD_exomes": [ | ||
{"name": "gnomADe:ALL", "frequencies": {"af": "gnomAD_exomes_AF", "ac": "gnomAD_exomes_AC", "an": "gnomAD_exomes_AN"}}, | ||
{"name": "gnomADe:afr", "frequencies": {"af": "gnomAD_exomes_AF_afr", "ac": "gnomAD_exomes_AC_afr", "an": "gnomAD_exomes_AN_afr"}}, | ||
{"name": "gnomADe:amr", "frequencies": {"af": "gnomAD_exomes_AF_amr", "ac": "gnomAD_exomes_AC_amr", "an": "gnomAD_exomes_AN_amr"}}, | ||
{"name": "gnomADe:asj", "frequencies": {"af": "gnomAD_exomes_AF_asj", "ac": "gnomAD_exomes_AC_asj", "an": "gnomAD_exomes_AN_asj"}}, | ||
{"name": "gnomADe:eas", "frequencies": {"af": "gnomAD_exomes_AF_eas", "ac": "gnomAD_exomes_AC_eas", "an": "gnomAD_exomes_AN_eas"}}, | ||
{"name": "gnomADe:fin", "frequencies": {"af": "gnomAD_exomes_AF_fin", "ac": "gnomAD_exomes_AC_fin", "an": "gnomAD_exomes_AN_fin"}}, | ||
{"name": "gnomADe:nfe", "frequencies": {"af": "gnomAD_exomes_AF_nfe", "ac": "gnomAD_exomes_AC_nfe", "an": "gnomAD_exomes_AN_nfe"}}, | ||
{"name": "gnomADe:oth", "frequencies": {"af": "gnomAD_exomes_AF_oth", "ac": "gnomAD_exomes_AC_oth", "an": "gnomAD_exomes_AN_oth"}}, | ||
{"name": "gnomADe:sas", "frequencies": {"af": "gnomAD_exomes_AF_sas", "ac": "gnomAD_exomes_AC_sas", "an": "gnomAD_exomes_AN_sas"}} | ||
], | ||
"gnomAD_genomes": [ | ||
{"name": "gnomADg:ALL", "frequencies": {"af": "gnomAD_genomes_AF", "ac": "gnomAD_genomes_AC", "an": "gnomAD_genomes_AN"}}, | ||
{"name": "gnomADg:afr", "frequencies": {"af": "gnomAD_genomes_AF_afr", "ac": "gnomAD_genomes_AC_afr", "an": "gnomAD_genomes_AN_afr"}}, | ||
{"name": "gnomADg:ami", "frequencies": {"af": "gnomAD_genomes_AF_ami", "ac": "gnomAD_genomes_AC_ami", "an": "gnomAD_genomes_AN_ami"}}, | ||
{"name": "gnomADg:amr", "frequencies": {"af": "gnomAD_genomes_AF_amr", "ac": "gnomAD_genomes_AC_amr", "an": "gnomAD_genomes_AN_amr"}}, | ||
{"name": "gnomADg:asj", "frequencies": {"af": "gnomAD_genomes_AF_asj", "ac": "gnomAD_genomes_AC_asj", "an": "gnomAD_genomes_AN_asj"}}, | ||
{"name": "gnomADg:eas", "frequencies": {"af": "gnomAD_genomes_AF_eas", "ac": "gnomAD_genomes_AC_eas", "an": "gnomAD_genomes_AN_eas"}}, | ||
{"name": "gnomADg:fin", "frequencies": {"af": "gnomAD_genomes_AF_fin", "ac": "gnomAD_genomes_AC_fin", "an": "gnomAD_genomes_AN_fin"}}, | ||
{"name": "gnomADg:mid", "frequencies": {"af": "gnomAD_genomes_AF_mid", "ac": "gnomAD_genomes_AC_mid", "an": "gnomAD_genomes_AN_mid"}}, | ||
{"name": "gnomADg:nfe", "frequencies": {"af": "gnomAD_genomes_AF_nfe", "ac": "gnomAD_genomes_AC_nfe", "an": "gnomAD_genomes_AN_nfe"}}, | ||
{"name": "gnomADg:oth", "frequencies": {"af": "gnomAD_genomes_AF_oth", "ac": "gnomAD_genomes_AC_oth", "an": "gnomAD_genomes_AN_oth"}}, | ||
{"name": "gnomADg:sas", "frequencies": {"af": "gnomAD_genomes_AF_sas", "ac": "gnomAD_genomes_AC_sas", "an": "gnomAD_genomes_AN_sas"}} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ def __init__(self, record: Any, header: Any) -> None: | |
self.info = record.INFO | ||
self.type = "Variant" | ||
self.vep_version = re.search("v\d+", self.header.get_lines("VEP")[0].value).group() | ||
self.population_map = {} | ||
|
||
def get_alternative_names(self) -> List: | ||
return [] | ||
|
@@ -196,7 +197,7 @@ def get_most_severe_consequence(self) -> Mapping: | |
csq_record_list = csq_record.split("|") | ||
for cons in csq_record_list[consequence_index].split("&"): | ||
rank = consequence_rank[cons] | ||
consequence_map[rank] = cons | ||
consequence_map[int(rank)] = cons | ||
return{ | ||
"result": consequence_map[min(consequence_map.keys())] , | ||
"analysis_method": { | ||
|
@@ -240,34 +241,101 @@ def get_info_key_index(self, key: str, info_id: str ="CSQ") -> int: | |
for index, value in enumerate(csq_list): | ||
if value == key: | ||
return index | ||
|
||
def traverse_population_info(self) -> Mapping: | ||
directory = os.path.dirname(__file__) | ||
with open(os.path.join(directory,'populations.json')) as pop_file: | ||
pop_mapping = json.load(pop_file) | ||
population_frequency_map = {} | ||
for csq_record in self.info["CSQ"]: | ||
csq_record_list = csq_record.split("|") | ||
allele_index = self.get_info_key_index("Allele") | ||
if csq_record_list[allele_index] is not None: | ||
if csq_record_list[allele_index] not in population_frequency_map.keys(): | ||
population_frequency_map[csq_record_list[allele_index]] = {} | ||
for pop_key, pop in pop_mapping.items(): | ||
for sub_pop in pop: | ||
if sub_pop["name"] in population_frequency_map[csq_record_list[allele_index]]: | ||
continue | ||
allele_count = allele_number = allele_frequency = None | ||
for freq_key, freq_val in sub_pop["frequencies"].items(): | ||
col_index = self.get_info_key_index(freq_val) | ||
if col_index and csq_record_list[col_index] is not None: | ||
if freq_key == "af": | ||
allele_frequency = csq_record_list[col_index] | ||
elif freq_key == "an": | ||
allele_number = csq_record_list[col_index] or None | ||
elif freq_key == "ac": | ||
allele_count = csq_record_list[col_index] or None | ||
else: | ||
raise Exception('Frequency metric is not recognised') | ||
population_frequency = { | ||
"population_name": sub_pop["name"], | ||
"allele_frequency": allele_frequency or -1, | ||
"allele_count": allele_count, | ||
"allele_number": allele_number, | ||
"is_minor_allele": False, | ||
"is_hpmaf": False | ||
} | ||
|
||
population_frequency_map[csq_record_list[allele_index]][sub_pop["name"]] = population_frequency | ||
return population_frequency_map | ||
|
||
def set_frequency_flags(self, allele_list: List): | ||
def set_frequency_flags(self): | ||
""" | ||
Calculates minor allele frequency by iterating through each allele | ||
Assumption: Considers that population is only gnomAD (genomes) for now | ||
Sets the maf as hpmaf as gnomAD is the only population at the moment | ||
Calculates MAF (minor allele frequency) and HPMAF by iterating through each allele | ||
""" | ||
maf_frequency = -1 | ||
maf_index = -1 | ||
highest_frequency = -1 | ||
highest_frequency_index = -1 | ||
highest_maf_frequency = -1 | ||
highest_maf_frequency_index = -1 | ||
maf_map = {} | ||
for allele_index, allele in enumerate(allele_list): | ||
if(len(allele["population_frequencies"]) > 0): | ||
pop = allele["population_frequencies"][0] | ||
pop_allele_frequency = float(pop["allele_frequency"]) | ||
if ( pop_allele_frequency > maf_frequency and pop_allele_frequency < highest_frequency ): | ||
maf_frequency = pop_allele_frequency | ||
maf_index = allele_index | ||
elif ( pop_allele_frequency > highest_frequency ): | ||
maf_frequency = highest_frequency | ||
maf_index = highest_frequency_index | ||
highest_frequency = pop_allele_frequency | ||
highest_frequency_index = allele_index | ||
allele_list = self.get_alleles() | ||
directory = os.path.dirname(__file__) | ||
with open(os.path.join(directory,'populations.json')) as pop_file: | ||
pop_mapping = json.load(pop_file) | ||
pop_names = [] | ||
for pop in pop_mapping.values(): | ||
pop_names.extend([sub_pop["name"] for sub_pop in pop]) | ||
hpmaf = [] | ||
pop_frequency_map = self.traverse_population_info() | ||
|
||
for pop_name in pop_names: | ||
by_population = [] | ||
for allele in allele_list: | ||
if allele.minimise_allele(allele.alt) in pop_frequency_map: | ||
pop_freqs = pop_frequency_map[allele.minimise_allele(allele.alt)].values() | ||
# Calculate reference allele from parsed frequency info of all alleles | ||
if allele.get_allele_type()["accession_id"] == "biological_region" and len(by_population)>0 : | ||
allele_frequency_ref = 1 - sum(list(zip(*by_population))[0]) | ||
if allele_frequency_ref <= 1 and allele_frequency_ref >= 0: | ||
population_frequency_ref = { | ||
"population_name": pop_name, | ||
"allele_frequency": allele_frequency_ref , | ||
"allele_count": None, | ||
"allele_number": None, | ||
"is_minor_allele": False, | ||
"is_hpmaf": False | ||
} | ||
minimised_allele = allele.minimise_allele(allele.alt) | ||
if minimised_allele not in pop_frequency_map: | ||
pop_frequency_map[minimised_allele] = {} | ||
pop_frequency_map[minimised_allele][pop_name] = population_frequency_ref | ||
by_population.append([allele_frequency_ref,minimised_allele,pop_name]) | ||
|
||
else: | ||
for pop_freq in pop_freqs: | ||
if pop_name == pop_freq["population_name"] and float(pop_freq["allele_frequency"]) > -1: | ||
minimised_allele = allele.minimise_allele(allele.alt) | ||
by_population.append([float(pop_freq["allele_frequency"]),minimised_allele, pop_name]) | ||
|
||
|
||
|
||
by_population_sorted = sorted(by_population, key=lambda item: item[0]) | ||
if len(by_population_sorted) >= 2: | ||
maf_frequency, maf_allele, maf_population = by_population_sorted[-2] | ||
pop_frequency_map[maf_allele][maf_population]["is_minor_allele"] = True | ||
hpmaf.append([maf_frequency,maf_allele,maf_population]) | ||
if len(hpmaf) > 0: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There can be multiple Not much effect in EV currently as not part of the view. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @nakib103 , this seems valid. Similarly, there can be multiple alleles having MAF, we need to mark them. Example to test multiple alleles having same MAF: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks Likhitha, also tested for |
||
hpmaf_sorted = sorted(hpmaf, key=lambda item: item[0]) | ||
_, hpmaf_allele, hpmaf_population = hpmaf_sorted[-1] | ||
pop_frequency_map[hpmaf_allele][hpmaf_population]["is_hpmaf"] = True | ||
return pop_frequency_map | ||
|
||
|
||
if maf_frequency>=0: | ||
allele_list[maf_index]["population_frequencies"][0]["is_minor_allele"] = True | ||
allele_list[maf_index]["population_frequencies"][0]["is_hpmaf"] = True | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
type Population{ | ||
""" | ||
Population | ||
""" | ||
name: String! | ||
size: Int! | ||
description: String! | ||
type: String | ||
is_global: Boolean | ||
is_from_genotypes: Boolean | ||
display_group_name: String | ||
super_population: Population | ||
sub_populations: [Population] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like all fields in the schema should be non-nullable (unless There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated the file according to VDM, all the fields except |
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We now have multiple population and corresponding MAF. We should somehow have a way to tell which MAF we want to represent in the GB drawer. Before, we only have one population and it was not a issue.
It does not effect the current EV design.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This may need further discussion for how we would like the API to communicate this, for example, through a field in
variant
or through the sorting order inpopulation_allele_frequencies