From b2eef57c74904134519d7ec1eb39ff16940fa274 Mon Sep 17 00:00:00 2001 From: James Chung Date: Wed, 30 Oct 2024 23:34:52 -0400 Subject: [PATCH 1/6] LINCS parser first try --- Common/data_sources.py | 2 + parsers/LINCS/src/loadLINCS.py | 114 +++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 parsers/LINCS/src/loadLINCS.py diff --git a/Common/data_sources.py b/Common/data_sources.py index 082cf923..b71f2d9f 100644 --- a/Common/data_sources.py +++ b/Common/data_sources.py @@ -19,6 +19,7 @@ HMDB = 'HMDB' HUMAN_GOA = 'HumanGOA' INTACT = 'IntAct' +LINCS = "LINCS" LITCOIN = 'LitCoin' LITCOIN_SAPBERT = 'LitCoinSapBERT' LITCOIN_ENTITY_EXTRACTOR = 'LitCoinEntityExtractor' @@ -67,6 +68,7 @@ HUMAN_GOA: ("parsers.GOA.src.loadGOA", "HumanGOALoader"), HUMAN_STRING: ("parsers.STRING.src.loadSTRINGDB", "HumanSTRINGDBLoader"), INTACT: ("parsers.IntAct.src.loadIA", "IALoader"), + LINCS: ("parsers.LINCS.src.loadLINCS", "LINCSLoader") LITCOIN: ("parsers.LitCoin.src.loadLitCoin", "LitCoinLoader"), LITCOIN_ENTITY_EXTRACTOR: ("parsers.LitCoin.src.loadLitCoin", "LitCoinEntityExtractorLoader"), LITCOIN_SAPBERT: ("parsers.LitCoin.src.loadLitCoin", "LitCoinSapBERTLoader"), diff --git a/parsers/LINCS/src/loadLINCS.py b/parsers/LINCS/src/loadLINCS.py new file mode 100644 index 00000000..565d904d --- /dev/null +++ b/parsers/LINCS/src/loadLINCS.py @@ -0,0 +1,114 @@ +import os +import enum + +from Common.extractor import Extractor +from Common.loader_interface import SourceDataLoader +from Common.kgxmodel import kgxnode, kgxedge +from Common.neo4j_tools import Neo4jTools +from Common.biolink_constants import * +from Common.prefixes import PUBCHEM_COMPOUND, KNOWLEDGE_LEVEL, KNOWLEDGE_ASSERTION, AGENT_TYPE, DATA_PIPELINE +from Common.utils import GetData + + +# if parsing a tsv or csv type file with columns, use a enum to represent each field +class GENERICDATACOLS(enum.IntEnum): + SOURCE_ID = 2 + SOURCE_LABEL = 3 + TARGET_ID = 5 + TARGET_LABEL = 6 + PREDICATE = 7 + +PREDICATE_MAPPING = { + "in_similarity_relationship_with": "biolink:chemically_similar_to", + "negatively_regulates": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "downregulated"}}, + "positively_regulates": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "upregulated"}} + } + + + +############## +# Class: LINCS loader +# +# By: James Chung +# Date: 10/30/2023 +# Desc: Class that loads/parses the data in Library of Integrated Network-Based Cellular Signatures. +# +############## + + +class ParserTemplate(SourceDataLoader): + + source_id: str = 'LINCS' + # this should be a valid infores curie from the biolink infores catalog + provenance_id: str = 'infores:lincs' + # increment parsing_version whenever changes are made to the parser that would result in changes to parsing output + parsing_version: str = '1.0' + + def __init__(self, test_mode: bool = False, source_data_dir: str = None): + """ + :param test_mode - sets the run into test mode + :param source_data_dir - the specific storage directory to save files in + """ + super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) + + self.lincs_url = 'https://stars.renci.org/var/data_services/LINCS/' + self.edge_file = "LINCS.lookup.edges.csv" + self.data_files = [self.edge_file] + + def get_latest_source_version(self) -> str: + # if possible go to the source and retrieve a string that is the latest version of the source data + # The KG was generated from Data Distillery KG. There was no version defined. + latest_version = 'v1.0' + return latest_version + + def get_data(self) -> bool: + # get_data is responsible for fetching the files in self.data_files and saving them to self.data_path + # Not used for LINCS so far. + source_data_url = f'{self.example_url}{self.edge_file}' + data_puller = GetData() + data_puller.pull_via_http(source_data_url, self.data_path) + return True + + def parse_data(self) -> dict: + """ + Parses the data file for graph nodes/edges + + :return: ret_val: load_metadata + """ + # This is a made up example of how one might extract nodes and edges from a tsv file + # In this case it's taking the subject ID from column 1 and the object ID from column 3, + # prepending them with a curie prefix. The predicate comes from column 3. The value in column 4 + # is set as a property on the edge. + extractor = Extractor(file_writer=self.output_file_writer) + lincs_file: str = os.path.join(self.lincs_url, self.edge_file) + with open(lincs_file, 'rt') as fp: + extractor.csv_extract(fp, + lambda line: self.resolve_id(line[GENERICDATACOLS.SOURCE_ID.value]), # source id + lambda line: self.resolve_id(line[GENERICDATACOLS.TARGET_ID.value]), # target id + lambda line: PREDICATE_MAPPING[line[GENERICDATACOLS.PREDICATE.value]].key, # predicate extractor + lambda line: {line[GENERICDATACOLS.SOURCE_LABEL.value]}, # subject properties + lambda line: {line[GENERICDATACOLS.TARGET_LABEL.value]}, # object properties + lambda line: self.format_edge_properties(line[GENERICDATACOLS.PREDICATE.value]), # edge properties + comment_character='#', + delim='\t', + has_header_row=True) + return extractor.load_metadata + + def resolve_id(self, idstring: str): + if idstring.startswith("PUBCHEM"): + return f"{PUBCHEM_COMPOUND}{idstring.replace("PUBCHEM","")}" + elif idstring.startswith("HGNC"): + return idstring + + def format_edge_properties(self, predicate: str): + properties = PREDICATE_MAPPING[predicate].value + + properties.update({ + PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, + KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, + AGENT_TYPE: DATA_PIPELINE + }) \ No newline at end of file From af3037a0e8c1ffd2eb0533af3f79a8a60f2c6b7a Mon Sep 17 00:00:00 2001 From: James Chung Date: Wed, 30 Oct 2024 23:35:18 -0400 Subject: [PATCH 2/6] LINCS parsers first try --- parsers/LINCS/src/loadLINCS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsers/LINCS/src/loadLINCS.py b/parsers/LINCS/src/loadLINCS.py index 565d904d..c81a3def 100644 --- a/parsers/LINCS/src/loadLINCS.py +++ b/parsers/LINCS/src/loadLINCS.py @@ -40,7 +40,7 @@ class GENERICDATACOLS(enum.IntEnum): ############## -class ParserTemplate(SourceDataLoader): +class LINCSLoader(SourceDataLoader): source_id: str = 'LINCS' # this should be a valid infores curie from the biolink infores catalog From 56e8b5b4e9ddaadd2717d74d56952716322bfc5b Mon Sep 17 00:00:00 2001 From: James Chung Date: Thu, 31 Oct 2024 08:14:15 -0400 Subject: [PATCH 3/6] return variable added --- parsers/LINCS/src/loadLINCS.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parsers/LINCS/src/loadLINCS.py b/parsers/LINCS/src/loadLINCS.py index c81a3def..72b0153e 100644 --- a/parsers/LINCS/src/loadLINCS.py +++ b/parsers/LINCS/src/loadLINCS.py @@ -111,4 +111,6 @@ def format_edge_properties(self, predicate: str): PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, AGENT_TYPE: DATA_PIPELINE - }) \ No newline at end of file + }) + + return properties \ No newline at end of file From 46f8117c7606d55e18552cdedfe173b7c4610e48 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Tue, 26 Nov 2024 10:56:15 -0500 Subject: [PATCH 4/6] general clean up, fixing imports, removing template comments --- parsers/LINCS/src/loadLINCS.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/parsers/LINCS/src/loadLINCS.py b/parsers/LINCS/src/loadLINCS.py index 72b0153e..f56536b8 100644 --- a/parsers/LINCS/src/loadLINCS.py +++ b/parsers/LINCS/src/loadLINCS.py @@ -3,14 +3,10 @@ from Common.extractor import Extractor from Common.loader_interface import SourceDataLoader -from Common.kgxmodel import kgxnode, kgxedge -from Common.neo4j_tools import Neo4jTools from Common.biolink_constants import * -from Common.prefixes import PUBCHEM_COMPOUND, KNOWLEDGE_LEVEL, KNOWLEDGE_ASSERTION, AGENT_TYPE, DATA_PIPELINE +from Common.prefixes import PUBCHEM_COMPOUND from Common.utils import GetData - -# if parsing a tsv or csv type file with columns, use a enum to represent each field class GENERICDATACOLS(enum.IntEnum): SOURCE_ID = 2 SOURCE_LABEL = 3 @@ -19,14 +15,16 @@ class GENERICDATACOLS(enum.IntEnum): PREDICATE = 7 PREDICATE_MAPPING = { - "in_similarity_relationship_with": "biolink:chemically_similar_to", - "negatively_regulates": { - "RO:0002448": { - OBJECT_DIRECTION_QUALIFIER: "downregulated"}}, - "positively_regulates": { - "RO:0002448": { - OBJECT_DIRECTION_QUALIFIER: "upregulated"}} - } + "in_similarity_relationship_with": "biolink:chemically_similar_to", + "negatively_regulates": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "downregulated"} + }, + "positively_regulates": { + "RO:0002448": { + OBJECT_DIRECTION_QUALIFIER: "upregulated"} + } +} @@ -34,7 +32,7 @@ class GENERICDATACOLS(enum.IntEnum): # Class: LINCS loader # # By: James Chung -# Date: 10/30/2023 +# Date: 10/30/2024 # Desc: Class that loads/parses the data in Library of Integrated Network-Based Cellular Signatures. # ############## @@ -43,9 +41,7 @@ class GENERICDATACOLS(enum.IntEnum): class LINCSLoader(SourceDataLoader): source_id: str = 'LINCS' - # this should be a valid infores curie from the biolink infores catalog provenance_id: str = 'infores:lincs' - # increment parsing_version whenever changes are made to the parser that would result in changes to parsing output parsing_version: str = '1.0' def __init__(self, test_mode: bool = False, source_data_dir: str = None): @@ -79,10 +75,6 @@ def parse_data(self) -> dict: :return: ret_val: load_metadata """ - # This is a made up example of how one might extract nodes and edges from a tsv file - # In this case it's taking the subject ID from column 1 and the object ID from column 3, - # prepending them with a curie prefix. The predicate comes from column 3. The value in column 4 - # is set as a property on the edge. extractor = Extractor(file_writer=self.output_file_writer) lincs_file: str = os.path.join(self.lincs_url, self.edge_file) with open(lincs_file, 'rt') as fp: From d2bead131a8ef190058f5e10c1d5ac5551ed860b Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 2 Dec 2024 11:46:07 -0500 Subject: [PATCH 5/6] fixing missing comma, making quote usage consistent --- Common/data_sources.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Common/data_sources.py b/Common/data_sources.py index b71f2d9f..e49c9463 100644 --- a/Common/data_sources.py +++ b/Common/data_sources.py @@ -19,7 +19,7 @@ HMDB = 'HMDB' HUMAN_GOA = 'HumanGOA' INTACT = 'IntAct' -LINCS = "LINCS" +LINCS = 'LINCS' LITCOIN = 'LitCoin' LITCOIN_SAPBERT = 'LitCoinSapBERT' LITCOIN_ENTITY_EXTRACTOR = 'LitCoinEntityExtractor' @@ -68,7 +68,7 @@ HUMAN_GOA: ("parsers.GOA.src.loadGOA", "HumanGOALoader"), HUMAN_STRING: ("parsers.STRING.src.loadSTRINGDB", "HumanSTRINGDBLoader"), INTACT: ("parsers.IntAct.src.loadIA", "IALoader"), - LINCS: ("parsers.LINCS.src.loadLINCS", "LINCSLoader") + LINCS: ("parsers.LINCS.src.loadLINCS", "LINCSLoader"), LITCOIN: ("parsers.LitCoin.src.loadLitCoin", "LitCoinLoader"), LITCOIN_ENTITY_EXTRACTOR: ("parsers.LitCoin.src.loadLitCoin", "LitCoinEntityExtractorLoader"), LITCOIN_SAPBERT: ("parsers.LitCoin.src.loadLitCoin", "LitCoinSapBERTLoader"), From 3491e3f64ef7e79ad6a8f88e323d50014657d0a8 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 2 Dec 2024 11:51:21 -0500 Subject: [PATCH 6/6] fixing parser see commit description fixing several bugs and broken variable names - fixing source data download location - source data delimiter is comma not tab - cleaning up properties (they need to be a dictionary, but node props were unnecessary anyway) simplifying/fixing qualifier handling - using predicates like RO:0002212 includes directionality and will normalize to qualified version, old implementation didn't work anyway, so this is better --- parsers/LINCS/src/loadLINCS.py | 53 +++++++++++++--------------------- 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/parsers/LINCS/src/loadLINCS.py b/parsers/LINCS/src/loadLINCS.py index f56536b8..f71e16a7 100644 --- a/parsers/LINCS/src/loadLINCS.py +++ b/parsers/LINCS/src/loadLINCS.py @@ -7,6 +7,7 @@ from Common.prefixes import PUBCHEM_COMPOUND from Common.utils import GetData + class GENERICDATACOLS(enum.IntEnum): SOURCE_ID = 2 SOURCE_LABEL = 3 @@ -14,20 +15,14 @@ class GENERICDATACOLS(enum.IntEnum): TARGET_LABEL = 6 PREDICATE = 7 + PREDICATE_MAPPING = { "in_similarity_relationship_with": "biolink:chemically_similar_to", - "negatively_regulates": { - "RO:0002448": { - OBJECT_DIRECTION_QUALIFIER: "downregulated"} - }, - "positively_regulates": { - "RO:0002448": { - OBJECT_DIRECTION_QUALIFIER: "upregulated"} - } + "negatively_regulates": "RO:0002212", + "positively_regulates": "RO:0002213" } - ############## # Class: LINCS loader # @@ -36,8 +31,6 @@ class GENERICDATACOLS(enum.IntEnum): # Desc: Class that loads/parses the data in Library of Integrated Network-Based Cellular Signatures. # ############## - - class LINCSLoader(SourceDataLoader): source_id: str = 'LINCS' @@ -51,20 +44,17 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): """ super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) - self.lincs_url = 'https://stars.renci.org/var/data_services/LINCS/' + self.data_url = 'https://stars.renci.org/var/data_services/LINCS/' self.edge_file = "LINCS.lookup.edges.csv" self.data_files = [self.edge_file] def get_latest_source_version(self) -> str: - # if possible go to the source and retrieve a string that is the latest version of the source data # The KG was generated from Data Distillery KG. There was no version defined. latest_version = 'v1.0' return latest_version def get_data(self) -> bool: - # get_data is responsible for fetching the files in self.data_files and saving them to self.data_path - # Not used for LINCS so far. - source_data_url = f'{self.example_url}{self.edge_file}' + source_data_url = f'{self.data_url}{self.edge_file}' data_puller = GetData() data_puller.pull_via_http(source_data_url, self.data_path) return True @@ -76,33 +66,30 @@ def parse_data(self) -> dict: :return: ret_val: load_metadata """ extractor = Extractor(file_writer=self.output_file_writer) - lincs_file: str = os.path.join(self.lincs_url, self.edge_file) + lincs_file: str = os.path.join(self.data_path, self.edge_file) with open(lincs_file, 'rt') as fp: extractor.csv_extract(fp, lambda line: self.resolve_id(line[GENERICDATACOLS.SOURCE_ID.value]), # source id lambda line: self.resolve_id(line[GENERICDATACOLS.TARGET_ID.value]), # target id - lambda line: PREDICATE_MAPPING[line[GENERICDATACOLS.PREDICATE.value]].key, # predicate extractor - lambda line: {line[GENERICDATACOLS.SOURCE_LABEL.value]}, # subject properties - lambda line: {line[GENERICDATACOLS.TARGET_LABEL.value]}, # object properties - lambda line: self.format_edge_properties(line[GENERICDATACOLS.PREDICATE.value]), # edge properties + lambda line: PREDICATE_MAPPING[line[GENERICDATACOLS.PREDICATE.value]], # predicate extractor + lambda line: {}, # subject properties + lambda line: {}, # object properties + lambda line: self.get_edge_properties(), # edge properties comment_character='#', - delim='\t', + delim=',', has_header_row=True) return extractor.load_metadata - def resolve_id(self, idstring: str): + @staticmethod + def resolve_id(idstring: str): if idstring.startswith("PUBCHEM"): - return f"{PUBCHEM_COMPOUND}{idstring.replace("PUBCHEM","")}" - elif idstring.startswith("HGNC"): - return idstring - - def format_edge_properties(self, predicate: str): - properties = PREDICATE_MAPPING[predicate].value + return idstring.replace("PUBCHEM", PUBCHEM_COMPOUND) + return idstring - properties.update({ + def get_edge_properties(self): + properties = { PRIMARY_KNOWLEDGE_SOURCE: self.provenance_id, KNOWLEDGE_LEVEL: KNOWLEDGE_ASSERTION, AGENT_TYPE: DATA_PIPELINE - }) - - return properties \ No newline at end of file + } + return properties