diff --git a/Common/data_sources.py b/Common/data_sources.py index ea21dfef..4cfac24a 100644 --- a/Common/data_sources.py +++ b/Common/data_sources.py @@ -18,6 +18,7 @@ HMDB = 'HMDB' HUMAN_GOA = 'HumanGOA' INTACT = 'IntAct' +KINACE = 'KinAce' MONARCH_KG = 'MonarchKG' MONDO_PROPS = 'MONDOProps' ONTOLOGICAL_HIERARCHY = 'OntologicalHierarchy' @@ -60,6 +61,7 @@ HUMAN_GOA: ("parsers.GOA.src.loadGOA", "HumanGOALoader"), HUMAN_STRING: ("parsers.STRING.src.loadSTRINGDB", "HumanSTRINGDBLoader"), INTACT: ("parsers.IntAct.src.loadIA", "IALoader"), + KINACE: ("parsers.KinAce.src.loadKinAce", "KinAceLoader"), MONARCH_KG: ("parsers.monarchkg.src.loadMonarchKG", "MonarchKGLoader"), MONDO_PROPS: ("parsers.MONDOProperties.src.loadMP", "MPLoader"), ONTOLOGICAL_HIERARCHY: ("parsers.UberGraph.src.loadUG", "OHLoader"), diff --git a/parsers/KinAce/src/loadKinAce.py b/parsers/KinAce/src/loadKinAce.py new file mode 100644 index 00000000..e3069b6f --- /dev/null +++ b/parsers/KinAce/src/loadKinAce.py @@ -0,0 +1,112 @@ +import os +import enum +from zipfile import ZipFile as zipfile +import pandas as pd + +from Common.utils import GetData +from Common.loader_interface import SourceDataLoader +from Common.extractor import Extractor +from Common.node_types import PUBLICATIONS + +# Full Kinase-Substrate Phosphorylation Data. + +#make this reflect the column that the data is found in +class BD_EDGEUMAN(enum.IntEnum): + KINASE = 1 + SUBSTRATE = 2 + P_SITE = 3 + PRIMARY_SOURCE = 4 + SECONDARY_SOURCE = 5 + +############## +# Class: Loading kinase-substrate phosphorylation reactions from KinAce +# By: Jon-Michael Beasley +# Date: 03/7/2024 +############## +class KinAceLoader(SourceDataLoader): + + source_id: str = 'KinAce' + provenance_id: str = 'infores:kinace' + description = "The KinAce web portal aggregates and visualizes the network of interactions between protein-kinases and their substrates in the human genome." + source_data_url = "https://kinace.kinametrix.com/session/ff792906de38db0d1c9900ac5882497b/download/download0?w=" + license = "All data and download files in bindingDB are freely available under a 'Creative Commons BY 3.0' license.'" + attribution = 'https://kinace.kinametrix.com/#section-about' + parsing_version = '1.0' + + def __init__(self, test_mode: bool = False, source_data_dir: str = None): + """ + constructor + :param test_mode - sets the run into test mode + """ + # call the super + super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) + + self.kinace_version = "2023-10-30" + #self.kinace_version = self.get_latest_source_version() + self.kinace_data_url = f"https://raw.githubusercontent.com/GauravPandeyLab/KinAce/master/data/{self.kinace_version}-kinace-dataset.zip" + + self.archive_file_name = f"{self.kinace_version}-kinace-dataset.zip" + self.interactions_file_name = f"ksi_source.csv" + self.data_files = [self.interactions_file_name] + + def get_latest_source_version(self) -> str: + """ + gets the latest version of the data + :return: + """ + if self.kinace_version: + return self.kinace_version + + return f"{self.kinace_version}" + + def get_data(self) -> int: + """ + Gets the KinAce data. + + """ + data_puller = GetData() + source_url = f"{self.kinace_data_url}" + data_puller.pull_via_http(source_url, self.data_path) + with zipfile(os.path.join(self.data_path, self.archive_file_name), 'r') as zip_ref: + zip_ref.extract(self.interactions_file_name, self.data_path) + return True + + def parse_data(self) -> dict: + """ + Parses the data file for graph nodes/edges + We are going to group by kinase-substrate pair and aggregate all phosphorylation sites and primary/secondary sources. + + :return: ret_val: load_metadata + """ + print('ok parsing') + # with zipfile(os.path.join(self.data_path, self.archive_file_name), 'r') as zip_ref: + # zip_ref.extract(self.interactions_file_name, self.data_path) + data = pd.read_csv(os.path.join(self.data_path, self.interactions_file_name)) + data = data.groupby(["Kinase", "Substrate"]).agg({"Site": list, "PrimarySource": list, "SecondarySource": list}).reset_index() + # Define a function to deduplicate lists + def deduplicate_list(lst): + lst = [x for x in lst if x == x] + return list(set(lst)) + # Apply deduplication function to each aggregated list + data['Site'] = data.apply(lambda row: list(set([x for x in row['Site'] if x==x])), axis=1) + data['PrimarySource'] = data.apply(lambda row: list(set([x for x in row['PrimarySource'] if x==x])), axis=1) + data['SecondarySource'] = data.apply(lambda row: list(set([x for x in row['SecondarySource'] if x==x])), axis=1) + data.to_csv(os.path.join(self.data_path, self.interactions_file_name)) + extractor = Extractor(file_writer=self.output_file_writer) + with open(os.path.join(self.data_path, self.interactions_file_name), 'rt') as fp: + extractor.csv_extract(fp, + lambda line: f"UniProtKB:{line[1]}", # subject id + lambda line: f"UniProtKB:{line[2]}", # object id + lambda line: "biolink:phosphorylates", # predicate + lambda line: {}, #Node 1 props + lambda line: {}, #Node 2 props + lambda line: { + 'phosphorylation_sites':line[3], + 'primary_sources':line[4], + 'secondary_sources':line[5] + }, #Edge props + comment_character=None, + delim=",", + has_header_row=True + ) + return extractor.load_metadata \ No newline at end of file