Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added parser for the KinAce database of kinase-substrate phosphorylat… #205

Merged
merged 1 commit into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Common/data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
HMDB = 'HMDB'
HUMAN_GOA = 'HumanGOA'
INTACT = 'IntAct'
KINACE = 'KinAce'
MONARCH_KG = 'MonarchKG'
MONDO_PROPS = 'MONDOProps'
ONTOLOGICAL_HIERARCHY = 'OntologicalHierarchy'
Expand Down Expand Up @@ -60,6 +61,7 @@
HUMAN_GOA: ("parsers.GOA.src.loadGOA", "HumanGOALoader"),
HUMAN_STRING: ("parsers.STRING.src.loadSTRINGDB", "HumanSTRINGDBLoader"),
INTACT: ("parsers.IntAct.src.loadIA", "IALoader"),
KINACE: ("parsers.KinAce.src.loadKinAce", "KinAceLoader"),
MONARCH_KG: ("parsers.monarchkg.src.loadMonarchKG", "MonarchKGLoader"),
MONDO_PROPS: ("parsers.MONDOProperties.src.loadMP", "MPLoader"),
ONTOLOGICAL_HIERARCHY: ("parsers.UberGraph.src.loadUG", "OHLoader"),
Expand Down
112 changes: 112 additions & 0 deletions parsers/KinAce/src/loadKinAce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import ast
import enum
import os
from zipfile import ZipFile as zipfile

import pandas as pd

from Common.utils import GetData
from Common.loader_interface import SourceDataLoader
from Common.extractor import Extractor
from Common.node_types import PUBLICATIONS

# Full Kinase-Substrate Phosphorylation Data.

# Column index of each field in the interactions CSV; keep these values in sync with the column the data is found in.
class BD_EDGEUMAN(enum.IntEnum):
    """
    Positions of the fields within one row of the interactions CSV.

    Each member's integer value is the column index the field is read from.
    """
    KINASE = 1            # kinase column
    SUBSTRATE = 2         # substrate column
    P_SITE = 3            # phosphorylation site column
    PRIMARY_SOURCE = 4    # primary source column
    SECONDARY_SOURCE = 5  # secondary source column

##############
# Class: Loading kinase-substrate phosphorylation reactions from KinAce
# By: Jon-Michael Beasley
# Date: 03/7/2024
##############
class KinAceLoader(SourceDataLoader):
    """
    Loads kinase-substrate phosphorylation interactions from KinAce.

    The loader downloads a versioned zip archive, extracts the interactions
    CSV, collapses it to one row per (Kinase, Substrate) pair and emits one
    "biolink:phosphorylates" edge per pair, carrying the aggregated
    phosphorylation sites and primary/secondary sources as edge properties.
    """

    source_id: str = 'KinAce'
    provenance_id: str = 'infores:kinace'
    description = "The KinAce web portal aggregates and visualizes the network of interactions between protein-kinases and their substrates in the human genome."
    # NOTE(review): this URL embeds what looks like a web-session token and is not
    # what get_data() downloads from (see self.kinace_data_url) - confirm it is still valid.
    source_data_url = "https://kinace.kinametrix.com/session/ff792906de38db0d1c9900ac5882497b/download/download0?w="
    # NOTE(review): the text below names bindingDB rather than KinAce, which looks like it
    # was copied from another loader - confirm the actual KinAce license and update it.
    license = "All data and download files in bindingDB are freely available under a 'Creative Commons BY 3.0' license.'"
    attribution = 'https://kinace.kinametrix.com/#section-about'
    # 1.1: list-valued edge properties are emitted as lists instead of the string repr of a list
    parsing_version = '1.1'

    # columns that are aggregated into a list for every kinase-substrate pair
    _LIST_COLUMNS = ("Site", "PrimarySource", "SecondarySource")

    def __init__(self, test_mode: bool = False, source_data_dir: str = None):
        """
        constructor
        :param test_mode - sets the run into test mode
        :param source_data_dir - forwarded unchanged to the parent loader
        """
        # call the super
        super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)

        # the dataset version is pinned; it is part of the archive name in the KinAce repository
        self.kinace_version = "2023-10-30"
        self.archive_file_name = f"{self.kinace_version}-kinace-dataset.zip"
        self.kinace_data_url = f"https://raw.githubusercontent.com/GauravPandeyLab/KinAce/master/data/{self.archive_file_name}"

        # raw file extracted from the archive
        self.interactions_file_name = "ksi_source.csv"
        # aggregated table written by parse_data(); kept separate so the raw download is never overwritten
        self.aggregated_file_name = "ksi_source_aggregated.csv"
        self.data_files = [self.interactions_file_name]

    def get_latest_source_version(self) -> str:
        """
        gets the latest version of the data

        :return: the pinned KinAce dataset version set in the constructor
        """
        return self.kinace_version

    def get_data(self) -> int:
        """
        Gets the KinAce data: downloads the archive into the data directory and
        extracts the interactions file from it.

        :return: True once the interactions file has been extracted
        """
        data_puller = GetData()
        data_puller.pull_via_http(self.kinace_data_url, self.data_path)

        archive_path = os.path.join(self.data_path, self.archive_file_name)
        with zipfile(archive_path, 'r') as zip_ref:
            zip_ref.extract(self.interactions_file_name, self.data_path)
        return True

    @staticmethod
    def _unique_non_null(values) -> list:
        """
        Deduplicates an aggregated list while keeping first-seen order.

        :param values - iterable of cell values, possibly containing NaN
        :return: list of the distinct non-NaN values
        """
        # NaN is the only value that does not equal itself, so `v == v` drops missing cells;
        # dict.fromkeys keeps insertion order, which makes the output reproducible between runs
        return list(dict.fromkeys(v for v in values if v == v))

    @staticmethod
    def _parse_list_field(text):
        """
        Turns a list that was serialized into a CSV cell back into a list.

        :param text - the cell text, expected to be the repr of a python list
        :return: the decoded list, or the text unchanged if it is not a list literal
        """
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # best effort: hand the raw text through rather than failing the row
            return text
        return parsed if isinstance(parsed, list) else text

    def parse_data(self) -> dict:
        """
        Parses the data file for graph nodes/edges
        We are going to group by kinase-substrate pair and aggregate all phosphorylation sites and primary/secondary sources.

        :return: ret_val: load_metadata
        """
        raw_path = os.path.join(self.data_path, self.interactions_file_name)
        aggregated_path = os.path.join(self.data_path, self.aggregated_file_name)

        # one row per kinase-substrate pair, each remaining column collected into a list
        data = pd.read_csv(raw_path)
        data = data.groupby(["Kinase", "Substrate"]).agg({column: list for column in self._LIST_COLUMNS}).reset_index()
        for column in self._LIST_COLUMNS:
            data[column] = data[column].map(self._unique_non_null)

        # The frame index is written as the first column, so the data columns start at
        # position 1, which is what the BD_EDGEUMAN column indices refer to.
        data.to_csv(aggregated_path)

        extractor = Extractor(file_writer=self.output_file_writer)
        # pandas writes utf-8 by default; read the file back with the same encoding
        with open(aggregated_path, 'rt', encoding='utf-8') as fp:
            extractor.csv_extract(fp,
                                  lambda line: f"UniProtKB:{line[BD_EDGEUMAN.KINASE]}",  # subject id
                                  lambda line: f"UniProtKB:{line[BD_EDGEUMAN.SUBSTRATE]}",  # object id
                                  lambda line: "biolink:phosphorylates",  # predicate
                                  lambda line: {},  # Node 1 props
                                  lambda line: {},  # Node 2 props
                                  lambda line: {
                                      'phosphorylation_sites': self._parse_list_field(line[BD_EDGEUMAN.P_SITE]),
                                      'primary_sources': self._parse_list_field(line[BD_EDGEUMAN.PRIMARY_SOURCE]),
                                      'secondary_sources': self._parse_list_field(line[BD_EDGEUMAN.SECONDARY_SOURCE])
                                  },  # Edge props
                                  comment_character=None,
                                  delim=",",
                                  has_header_row=True
                                  )
        return extractor.load_metadata
Loading