From 0c245f92974752c55916f9913eb08c27b81a17e6 Mon Sep 17 00:00:00 2001 From: Evan Morris Date: Mon, 25 Mar 2024 23:18:08 -0400 Subject: [PATCH] updated for new data, added abstract id to entity extractor and changed over to csv format output --- parsers/LitCoin/src/loadLitCoin.py | 37 +++++++++++++++++++----------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/parsers/LitCoin/src/loadLitCoin.py b/parsers/LitCoin/src/loadLitCoin.py index 7c1cff94..dbfb633a 100644 --- a/parsers/LitCoin/src/loadLitCoin.py +++ b/parsers/LitCoin/src/loadLitCoin.py @@ -30,6 +30,7 @@ NODE_TYPE_MAPPINGS = { "Activity": "Activity", "AnatomicalStructure": "AnatomicalEntity", + "AnatomicalFeature": "AnatomicalEntity", "Antibody": "ChemicalEntity", "Behavior": "Behavior", "BiologicalStructure": "AnatomicalEntity", @@ -82,7 +83,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): super().__init__(test_mode=test_mode, source_data_dir=source_data_dir) self.data_url = 'https://stars.renci.org/var/data_services/litcoin/' - self.data_file = 'abstracts_CompAndHeal_gpt4_20240205_train.json' + self.data_file = 'abstracts_CompAndHeal_gpt4_20240320_train.json' self.data_files = [self.data_file] # dicts of name to id lookups organized by node type (node_name_to_id_lookup[node_type] = dict of names -> id) self.node_name_to_id_lookup = defaultdict(dict) @@ -90,7 +91,7 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None): self.bl_utils = BiolinkUtils() def get_latest_source_version(self) -> str: - latest_version = 'v1.2' + latest_version = 'v1.3' return latest_version def get_data(self) -> bool: @@ -206,12 +207,15 @@ def process_llm_node(self, node_name: str, node_type: str): self.node_name_to_id_lookup[node_type][node_name] = standardized_name_res_result return standardized_name_res_result - @staticmethod - def convert_node_type_to_biolink_format(node_type): - biolink_node_type = re.sub("[()/]", "", node_type) # remove parentheses and forward slash - biolink_node_type = "".join([node_type_segment[0].upper() + node_type_segment[1:].lower() - for node_type_segment in biolink_node_type.split()]) # force Pascal case - return f'{biolink_node_type}' + def convert_node_type_to_biolink_format(self, node_type): + try: + biolink_node_type = re.sub("[()/]", "", node_type) # remove parentheses and forward slash + biolink_node_type = "".join([node_type_segment[0].upper() + node_type_segment[1:].lower() + for node_type_segment in biolink_node_type.split()]) # force Pascal case + return f'{biolink_node_type}' + except TypeError as e: + self.logger.error(f'Bad node type provided by llm: {node_type}') + return "" def parse_llm_output(self, llm_output): @@ -309,7 +313,7 @@ def standardize_name_resolution_results(self, name_res_json): class LitCoinEntityExtractorLoader(LitCoinLoader): source_id: str = 'LitCoinEntityExtractor' - parsing_version: str = '1.1' + parsing_version: str = '1.2' def parse_data(self) -> dict: litcoin_file_path: str = os.path.join(self.data_path, self.data_file) @@ -317,28 +321,33 @@ def parse_data(self) -> dict: with open(litcoin_file_path) as litcoin_file: litcoin_json = json.load(litcoin_file) for litcoin_object in litcoin_json: + abstract_id = litcoin_object['abstract_id'] llm_output = litcoin_object['output'] for litcoin_edge in self.parse_llm_output(llm_output): + subject_name = litcoin_edge[LLM_SUBJECT_NAME] subject_type = litcoin_edge[LLM_SUBJECT_TYPE] subject_mapped_type = NODE_TYPE_MAPPINGS.get(self.convert_node_type_to_biolink_format(subject_type), None) all_entities[f'{subject_name}{subject_type}'] = {'name': subject_name, 'llm_type': subject_type, - 'name_res_type': subject_mapped_type} + 'name_res_type': subject_mapped_type, + 'abstract_id': abstract_id} object_name = litcoin_edge[LLM_OBJECT_NAME] object_type = litcoin_edge[LLM_OBJECT_TYPE] object_mapped_type = NODE_TYPE_MAPPINGS.get(self.convert_node_type_to_biolink_format(object_type), None) all_entities[f'{object_name}{object_type}'] = {'name': object_name, 'llm_type': object_type, - 'name_res_type': object_mapped_type} + 'name_res_type': object_mapped_type, + 'abstract_id': abstract_id} with open(os.path.join(self.data_path, "..", f"parsed_{self.parsing_version}", - "name_res_inputs.json"), "w") as name_res_inputs: - entities_output = {'all_entities': [entity for entity in all_entities.values()]} - name_res_inputs.write(json.dumps(entities_output, indent=4)) + "name_res_inputs.csv"), "w") as name_res_inputs: + name_res_inputs.write("query,llm_type,biolink_type,abstract_id\n") + for entity in all_entities.values(): + name_res_inputs.write(f'"{entity["name"]}","{entity["llm_type"]}",{entity["name_res_type"]},{entity["abstract_id"]}\n') self.logger.info(f'{len(all_entities.values())} unique entities extracted') return {}