Skip to content

Commit

Permalink
Merge pull request #29 from likhitha-surapaneni/population_scores
Browse files Browse the repository at this point in the history
Supporting population allele frequencies
  • Loading branch information
nakib103 authored Jan 31, 2024
2 parents 9f5eef9 + 4052b2c commit 517296f
Show file tree
Hide file tree
Showing 13 changed files with 1,139 additions and 510 deletions.
849 changes: 849 additions & 0 deletions common/file_model/population_metadata.json

Large diffs are not rendered by default.

34 changes: 34 additions & 0 deletions common/file_model/populations.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"1000_genomes": [
{"name": "1000GENOMES:phase_3:ALL", "frequencies": {"af": "AF"}},
{"name": "1000GENOMES:phase_3:AFR", "frequencies": {"af": "AFR_AF"}},
{"name": "1000GENOMES:phase_3:AMR", "frequencies": {"af": "AMR_AF"}},
{"name": "1000GENOMES:phase_3:EAS", "frequencies": {"af": "EAS_AF"}},
{"name": "1000GENOMES:phase_3:EUR", "frequencies": {"af": "EUR_AF"}},
{"name": "1000GENOMES:phase_3:SAS", "frequencies": {"af": "SAS_AF" }}
],
"gnomAD_exomes": [
{"name": "gnomADe:ALL", "frequencies": {"af": "gnomAD_exomes_AF", "ac": "gnomAD_exomes_AC", "an": "gnomAD_exomes_AN"}},
{"name": "gnomADe:afr", "frequencies": {"af": "gnomAD_exomes_AF_afr", "ac": "gnomAD_exomes_AC_afr", "an": "gnomAD_exomes_AN_afr"}},
{"name": "gnomADe:amr", "frequencies": {"af": "gnomAD_exomes_AF_amr", "ac": "gnomAD_exomes_AC_amr", "an": "gnomAD_exomes_AN_amr"}},
{"name": "gnomADe:asj", "frequencies": {"af": "gnomAD_exomes_AF_asj", "ac": "gnomAD_exomes_AC_asj", "an": "gnomAD_exomes_AN_asj"}},
{"name": "gnomADe:eas", "frequencies": {"af": "gnomAD_exomes_AF_eas", "ac": "gnomAD_exomes_AC_eas", "an": "gnomAD_exomes_AN_eas"}},
{"name": "gnomADe:fin", "frequencies": {"af": "gnomAD_exomes_AF_fin", "ac": "gnomAD_exomes_AC_fin", "an": "gnomAD_exomes_AN_fin"}},
{"name": "gnomADe:nfe", "frequencies": {"af": "gnomAD_exomes_AF_nfe", "ac": "gnomAD_exomes_AC_nfe", "an": "gnomAD_exomes_AN_nfe"}},
{"name": "gnomADe:oth", "frequencies": {"af": "gnomAD_exomes_AF_oth", "ac": "gnomAD_exomes_AC_oth", "an": "gnomAD_exomes_AN_oth"}},
{"name": "gnomADe:sas", "frequencies": {"af": "gnomAD_exomes_AF_sas", "ac": "gnomAD_exomes_AC_sas", "an": "gnomAD_exomes_AN_sas"}}
],
"gnomAD_genomes": [
{"name": "gnomADg:ALL", "frequencies": {"af": "gnomAD_genomes_AF", "ac": "gnomAD_genomes_AC", "an": "gnomAD_genomes_AN"}},
{"name": "gnomADg:afr", "frequencies": {"af": "gnomAD_genomes_AF_afr", "ac": "gnomAD_genomes_AC_afr", "an": "gnomAD_genomes_AN_afr"}},
{"name": "gnomADg:ami", "frequencies": {"af": "gnomAD_genomes_AF_ami", "ac": "gnomAD_genomes_AC_ami", "an": "gnomAD_genomes_AN_ami"}},
{"name": "gnomADg:amr", "frequencies": {"af": "gnomAD_genomes_AF_amr", "ac": "gnomAD_genomes_AC_amr", "an": "gnomAD_genomes_AN_amr"}},
{"name": "gnomADg:asj", "frequencies": {"af": "gnomAD_genomes_AF_asj", "ac": "gnomAD_genomes_AC_asj", "an": "gnomAD_genomes_AN_asj"}},
{"name": "gnomADg:eas", "frequencies": {"af": "gnomAD_genomes_AF_eas", "ac": "gnomAD_genomes_AC_eas", "an": "gnomAD_genomes_AN_eas"}},
{"name": "gnomADg:fin", "frequencies": {"af": "gnomAD_genomes_AF_fin", "ac": "gnomAD_genomes_AC_fin", "an": "gnomAD_genomes_AN_fin"}},
{"name": "gnomADg:mid", "frequencies": {"af": "gnomAD_genomes_AF_mid", "ac": "gnomAD_genomes_AC_mid", "an": "gnomAD_genomes_AN_mid"}},
{"name": "gnomADg:nfe", "frequencies": {"af": "gnomAD_genomes_AF_nfe", "ac": "gnomAD_genomes_AC_nfe", "an": "gnomAD_genomes_AN_nfe"}},
{"name": "gnomADg:oth", "frequencies": {"af": "gnomAD_genomes_AF_oth", "ac": "gnomAD_genomes_AC_oth", "an": "gnomAD_genomes_AN_oth"}},
{"name": "gnomADg:sas", "frequencies": {"af": "gnomAD_genomes_AF_sas", "ac": "gnomAD_genomes_AC_sas", "an": "gnomAD_genomes_AN_sas"}}
]
}
142 changes: 115 additions & 27 deletions common/file_model/variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def __init__(self, record: Any, header: Any) -> None:
self.info = record.INFO
self.type = "Variant"
self.vep_version = re.search("v\d+", self.header.get_lines("VEP")[0].value).group()
self.population_map = {}

def get_alternative_names(self) -> List:
return []
Expand Down Expand Up @@ -196,7 +197,7 @@ def get_most_severe_consequence(self) -> Mapping:
csq_record_list = csq_record.split("|")
for cons in csq_record_list[consequence_index].split("&"):
rank = consequence_rank[cons]
consequence_map[rank] = cons
consequence_map[int(rank)] = cons
return{
"result": consequence_map[min(consequence_map.keys())] ,
"analysis_method": {
Expand Down Expand Up @@ -240,34 +241,121 @@ def get_info_key_index(self, key: str, info_id: str ="CSQ") -> int:
for index, value in enumerate(csq_list):
if value == key:
return index

def traverse_population_info(self) -> Mapping:
directory = os.path.dirname(__file__)
with open(os.path.join(directory,'populations.json')) as pop_file:
pop_mapping = json.load(pop_file)
population_frequency_map = {}
for csq_record in self.info["CSQ"]:
csq_record_list = csq_record.split("|")
allele_index = self.get_info_key_index("Allele")
if csq_record_list[allele_index] is not None:
if csq_record_list[allele_index] not in population_frequency_map.keys():
population_frequency_map[csq_record_list[allele_index]] = {}
for pop_key, pop in pop_mapping.items():
for sub_pop in pop:
if sub_pop["name"] in population_frequency_map[csq_record_list[allele_index]]:
continue
allele_count = allele_number = allele_frequency = None
for freq_key, freq_val in sub_pop["frequencies"].items():
col_index = self.get_info_key_index(freq_val)
if col_index and csq_record_list[col_index] is not None:
if freq_key == "af":
allele_frequency = csq_record_list[col_index] or None
elif freq_key == "an":
allele_number = csq_record_list[col_index] or None
elif freq_key == "ac":
allele_count = csq_record_list[col_index] or None
else:
raise Exception('Frequency metric is not recognised')
population_frequency = {
"population_name": sub_pop["name"],
"allele_frequency": allele_frequency,
"allele_count": allele_count,
"allele_number": allele_number,
"is_minor_allele": False,
"is_hpmaf": False
}
if allele_frequency:
population_frequency_map[csq_record_list[allele_index]][sub_pop["name"]] = population_frequency
return population_frequency_map

def set_frequency_flags(self, allele_list: List):
def set_frequency_flags(self):
"""
Calculates minor allele frequency by iterating through each allele
Assumption: Considers that population is only gnomAD (genomes) for now
Sets the maf as hpmaf as gnomAD is the only population at the moment
Calculates MAF (minor allele frequency) and HPMAF by iterating through each allele
"""
maf_frequency = -1
maf_index = -1
highest_frequency = -1
highest_frequency_index = -1
highest_maf_frequency = -1
highest_maf_frequency_index = -1
maf_map = {}
for allele_index, allele in enumerate(allele_list):
if(len(allele["population_frequencies"]) > 0):
pop = allele["population_frequencies"][0]
pop_allele_frequency = float(pop["allele_frequency"])
if ( pop_allele_frequency > maf_frequency and pop_allele_frequency < highest_frequency ):
maf_frequency = pop_allele_frequency
maf_index = allele_index
elif ( pop_allele_frequency > highest_frequency ):
maf_frequency = highest_frequency
maf_index = highest_frequency_index
highest_frequency = pop_allele_frequency
highest_frequency_index = allele_index
allele_list = self.get_alleles()
directory = os.path.dirname(__file__)
with open(os.path.join(directory,'populations.json')) as pop_file:
pop_mapping = json.load(pop_file)
pop_names = []
for pop in pop_mapping.values():
pop_names.extend([sub_pop["name"] for sub_pop in pop])
hpmaf = []
pop_frequency_map = self.traverse_population_info()

for pop_name in pop_names:
by_population = []
for allele in allele_list:
if allele.minimise_allele(allele.alt) in pop_frequency_map:
pop_freqs = pop_frequency_map[allele.minimise_allele(allele.alt)].values()
# Calculate reference allele from parsed frequency info of all alleles
if allele.get_allele_type()["accession_id"] == "biological_region" and len(by_population)>0 :
allele_frequency_ref = 1 - sum(list(zip(*by_population))[0])
if allele_frequency_ref <= 1 and allele_frequency_ref >= 0:
population_frequency_ref = {
"population_name": pop_name,
"allele_frequency": allele_frequency_ref ,
"allele_count": None,
"allele_number": None,
"is_minor_allele": False,
"is_hpmaf": False
}
minimised_allele = allele.minimise_allele(allele.alt)
if minimised_allele not in pop_frequency_map:
pop_frequency_map[minimised_allele] = {}
pop_frequency_map[minimised_allele][pop_name] = population_frequency_ref
by_population.append([allele_frequency_ref,minimised_allele,pop_name])

else:
for pop_freq in pop_freqs:
if pop_name == pop_freq["population_name"] and pop_freq["allele_frequency"]:
minimised_allele = allele.minimise_allele(allele.alt)
by_population.append([float(pop_freq["allele_frequency"]),minimised_allele, pop_name])



by_population_sorted = sorted(by_population, key=lambda item: item[0])
if len(by_population_sorted) >= 2:
highest_frequency = by_population_sorted[-1][0]
maf_frequency = None
# When more than one allele has same maf and is not
# ref allele, we mark it as is_minor_allele
for pop in reversed(by_population_sorted[:-1]):
if pop[0] == highest_frequency:
continue
elif pop[0] < highest_frequency and not maf_frequency:
maf_frequency, maf_allele, maf_population = pop
pop_frequency_map[maf_allele][maf_population]["is_minor_allele"] = True
hpmaf.append([maf_frequency,maf_allele,maf_population])
elif maf_frequency and pop[0] == maf_frequency and maf_allele != allele.ref:
pop_frequency_map[maf_allele][maf_population]["is_minor_allele"] = True
hpmaf.append([maf_frequency,maf_allele,maf_population])
elif maf_frequency and pop[0] < maf_frequency:
break
if len(hpmaf) > 0:
hpmaf_sorted = sorted(hpmaf, key=lambda item: item[0])
hpmaf_frequency, hpmaf_allele, hpmaf_population = hpmaf_sorted[-1]
pop_frequency_map[hpmaf_allele][hpmaf_population]["is_hpmaf"] = True
# When more than one allele has same maf, we mark it as is_hpmaf
for hpmaf_pop in reversed(hpmaf_sorted[:-1]):
if hpmaf_pop[0] == hpmaf_frequency:
hpmaf_frequency, hpmaf_allele, hpmaf_population = hpmaf_pop
pop_frequency_map[hpmaf_allele][hpmaf_population]["is_hpmaf"] = True
elif hpmaf_pop[0] < hpmaf_frequency:
break
return pop_frequency_map


if maf_frequency>=0:
allele_list[maf_index]["population_frequencies"][0]["is_minor_allele"] = True
allele_list[maf_index]["population_frequencies"][0]["is_hpmaf"] = True

76 changes: 48 additions & 28 deletions common/file_model/variant_allele.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def __init__(self, allele_index: str, alt: str, variant:dict) -> None:
self.type = "VariantAllele"
self.allele_sequence = alt
self.reference_sequence = variant.ref
self.population_map = []
self.info_map = self.traverse_csq_info()

def get_allele_type(self):
Expand All @@ -41,7 +42,12 @@ def get_prediction_results(self):
min_alt = self.minimise_allele(self.alt)
return self.info_map[min_alt]["prediction_results"] if min_alt in self.info_map else []


def get_population_allele_frequencies(self):
min_alt = self.minimise_allele(self.alt)
population_map = self.variant.set_frequency_flags()
return population_map[min_alt].values() if min_alt in population_map else []


def traverse_csq_info(self) -> Mapping:
"""
This function is to traverse the CSQ record and extract columns
Expand Down Expand Up @@ -70,7 +76,47 @@ def traverse_csq_info(self) -> Mapping:
current_prediction_results = info_map[csq_record_list[prediction_index_map["allele"]]]["prediction_results"]
info_map[csq_record_list[prediction_index_map["allele"]]]["prediction_results"] += self.create_allele_prediction_results(current_prediction_results, csq_record_list, prediction_index_map)
return info_map

# def traverse_population_info(self) -> Mapping:
# directory = os.path.dirname(__file__)
# with open(os.path.join(directory,'populations.json')) as pop_file:
# pop_mapping = json.load(pop_file)
# population_frequency_map = {}
# for csq_record in self.variant.info["CSQ"]:
# csq_record_list = csq_record.split("|")
# allele_index = self.variant.get_info_key_index("Allele")
# if csq_record_list[allele_index] is not None:
# if csq_record_list[allele_index] not in population_frequency_map.keys():
# population_frequency_map[csq_record_list[allele_index]] = {}
# for pop_key, pop in pop_mapping.items():
# for sub_pop in pop:
# if sub_pop["name"] in population_frequency_map[csq_record_list[allele_index]]:
# continue
# allele_count = allele_number = allele_frequency = None
# for freq_key, freq_val in sub_pop["frequencies"].items():
# col_index = self.variant.get_info_key_index(freq_val)
# if col_index is not None and csq_record_list[col_index] is not None:
# if freq_key == "af":
# allele_frequency = csq_record_list[col_index]
# elif freq_key == "an":
# allele_number = csq_record_list[col_index] or None
# elif freq_key == "ac":
# allele_count = csq_record_list[col_index] or None
# else:
# raise Exception('Frequency metric is not recognised')
# population_frequency = {
# "population": sub_pop["name"],
# "allele_frequency": allele_frequency or -1,
# "allele_count": allele_count,
# "allele_number": allele_number,
# "is_minor_allele": False,
# "is_hpmaf": False
# }

# population_frequency_map[csq_record_list[allele_index]][sub_pop["name"]] = population_frequency
# return population_frequency_map


def create_allele_prediction_results(self, current_prediction_results: Mapping, csq_record: List, prediction_index_map: dict) -> list:
prediction_results = []
if "cadd_phred" in prediction_index_map.keys():
Expand Down Expand Up @@ -219,30 +265,4 @@ def minimise_allele(self, alt: str):
elif len(alt) < len(self.variant.ref):
minimised_allele = "-"
return minimised_allele

def format_frequency(self, raw_frequency_list: List) -> Mapping:
freq_map = {}
for freq in raw_frequency_list:
key = freq.split(":")[0]
freq_list = freq.split(":")[1].split(",")
freq_map[key] = freq_list
return freq_map

def get_population_allele_frequencies(self) -> List:
frequency_map = {}
if "FREQ" in self.variant.info:
frequency_map = self.format_frequency(",".join(map(str,self.variant.info["FREQ"])).split("|"))
population_allele_frequencies = []
for key, pop_list in frequency_map.items():
## Adding only GnomAD population
if key == "GnomAD":
if pop_list[self.allele_index] not in ["None", "."]:
population_allele_frequencies.append({
"population": key,
"allele_frequency": pop_list[self.allele_index],
"is_minor_allele": False,
"is_hpmaf": False
})
return population_allele_frequencies



14 changes: 14 additions & 0 deletions common/schemas/population.graphql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
type Population{
"""
Population
"""
name: String!
size: Int!
description: String!
type: String!
is_global: Boolean!
is_from_genotypes: Boolean!
display_group_name: String!
super_population: Population
sub_populations: [Population!]!
}
12 changes: 7 additions & 5 deletions common/schemas/population_allele_frequency.graphql
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ type PopulationAlleleFrequency{
"""
Population Allele Frequency
"""
population: String
allele_frequency: Float
is_minor_allele: Boolean
is_hpmaf: Boolean
}
population_name: String!
allele_frequency: Float!
allele_count: Int
allele_number: Int
is_minor_allele: Boolean!
is_hpmaf: Boolean!
}
1 change: 1 addition & 0 deletions common/schemas/query.graphql
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
type Query {
version: Version
variant(by_id: IdInput): Variant
populations(genome_id: String!): [Population]
}

input IdInput {
Expand Down
Binary file removed data/test_dbsnp.vcf.gz
Binary file not shown.
Binary file removed data/test_dbsnp.vcf.gz.tbi
Binary file not shown.
2 changes: 1 addition & 1 deletion examples/population/query.graphql
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
query pop_example {
population(genome_id: "a7335667-93e7-11ec-a39d-005056b38ce3")
populations(genome_id: "a7335667-93e7-11ec-a39d-005056b38ce3")
{

name
Expand Down
Loading

0 comments on commit 517296f

Please sign in to comment.