Skip to content

Commit

Permalink
added entity extractor and updated for latest llm version
Browse files Browse the repository at this point in the history
  • Loading branch information
EvanDietzMorris committed Feb 21, 2024
1 parent d9b27e5 commit 6185057
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 6 deletions.
2 changes: 2 additions & 0 deletions Common/data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
INTACT = 'IntAct'
LITCOIN = 'LitCoin_without_umls_with_autocomplete'
LITCOIN_SAPBERT = 'LitCoinSapBERT_without_umls'
LITCOIN_ENTITY_EXTRACTOR = "LitCoinEntityExtractor"
MONARCH_KG = 'MonarchKG'
MONDO_PROPS = 'MONDOProps'
ONTOLOGICAL_HIERARCHY = 'OntologicalHierarchy'
Expand Down Expand Up @@ -63,6 +64,7 @@
HUMAN_STRING: ("parsers.STRING.src.loadSTRINGDB", "HumanSTRINGDBLoader"),
INTACT: ("parsers.IntAct.src.loadIA", "IALoader"),
LITCOIN: ("parsers.LitCoin.src.loadLitCoin", "LitCoinLoader"),
LITCOIN_ENTITY_EXTRACTOR: ("parsers.LitCoin.src.loadLitCoin", "LitCoinEntityExtractorLoader"),
LITCOIN_SAPBERT: ("parsers.LitCoin.src.loadLitCoin", "LitCoinSapBERTLoader"),
MONARCH_KG: ("parsers.monarchkg.src.loadMonarchKG", "MonarchKGLoader"),
MONDO_PROPS: ("parsers.MONDOProperties.src.loadMP", "MPLoader"),
Expand Down
50 changes: 44 additions & 6 deletions parsers/LitCoin/src/loadLitCoin.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
from Common.prefixes import PUBMED


LLM_SUBJECT_NAME = 'entity_1'
LLM_SUBJECT_TYPE = 'entity_1_type'
LLM_OBJECT_NAME = 'entity_2'
LLM_OBJECT_TYPE = 'entity_2_type'
LLM_SUBJECT_NAME = 'subject'
LLM_SUBJECT_TYPE = 'subject_type'
LLM_OBJECT_NAME = 'object'
LLM_OBJECT_TYPE = 'object_type'
LLM_RELATIONSHIP = 'relationship'
LLM_MAIN_FINDING = 'main_finding'

Expand Down Expand Up @@ -82,15 +82,15 @@ def __init__(self, test_mode: bool = False, source_data_dir: str = None):
super().__init__(test_mode=test_mode, source_data_dir=source_data_dir)

self.data_url = 'https://stars.renci.org/var/data_services/litcoin/'
self.data_file = 'HEAL_2.7.23_gpt4_20231114.json'
self.data_file = 'abstracts_CompAndHeal_gpt4_20240205_train.json'
self.data_files = [self.data_file]
# dicts of name to id lookups organized by node type (node_name_to_id_lookup[node_type] = dict of names -> id)
self.node_name_to_id_lookup = defaultdict(dict)
self.name_res_stats = []
self.bl_utils = BiolinkUtils()

def get_latest_source_version(self) -> str:
latest_version = 'v1.1'
latest_version = 'v1.2'
return latest_version

def get_data(self) -> bool:
Expand Down Expand Up @@ -310,3 +310,41 @@ def standardize_name_resolution_results(self, name_res_json):
"score": name_res_json['score']
}


class LitCoinEntityExtractorLoader(LitCoinLoader):
    """Extract the unique entities mentioned in the LitCoin LLM output.

    Instead of producing nodes and edges, this loader walks every edge parsed
    from the LLM output, collects each distinct (name, LLM type) pair seen as
    a subject or an object, and writes them to ``name_res_inputs.json`` in the
    ``parsed_<parsing_version>`` directory next to the source data directory.
    """

    source_id: str = 'LitCoinEntityExtractor'
    parsing_version: str = '1.1'

    def parse_data(self) -> dict:
        """Write every unique entity found in the source file to a JSON file.

        Reads ``self.data_file`` from ``self.data_path``, gathers the subject
        and object of every edge yielded by ``self.parse_llm_output``, and
        dumps them as ``{"all_entities": [...]}``.

        :return: an empty dict; this loader reports no load metadata
        """
        litcoin_file_path: str = os.path.join(self.data_path, self.data_file)
        with open(litcoin_file_path, encoding='utf-8') as litcoin_file:
            litcoin_json = json.load(litcoin_file)

        # Both ends of an edge are handled identically, so iterate over the
        # (name key, type key) pairs rather than duplicating the logic.
        entity_fields = ((LLM_SUBJECT_NAME, LLM_SUBJECT_TYPE),
                         (LLM_OBJECT_NAME, LLM_OBJECT_TYPE))

        # Keyed by a (name, type) tuple rather than the concatenated string:
        # plain concatenation lets distinct pairs such as ("ab", "c") and
        # ("a", "bc") collide and silently drop one of the entities.
        # str() keeps the key hashable even if the LLM emitted a non-string.
        all_entities = {}
        for litcoin_object in litcoin_json:
            llm_output = litcoin_object['output']
            for litcoin_edge in self.parse_llm_output(llm_output):
                for name_field, type_field in entity_fields:
                    entity_name = litcoin_edge[name_field]
                    entity_type = litcoin_edge[type_field]
                    entity_key = (str(entity_name), str(entity_type))
                    all_entities[entity_key] = self._build_entity_record(entity_name, entity_type)

        output_directory = os.path.join(self.data_path, "..", f"parsed_{self.parsing_version}")
        # Make sure the target directory exists so the write cannot fail on a
        # fresh data directory.
        os.makedirs(output_directory, exist_ok=True)
        with open(os.path.join(output_directory, "name_res_inputs.json"), "w") as name_res_inputs:
            entities_output = {'all_entities': list(all_entities.values())}
            name_res_inputs.write(json.dumps(entities_output, indent=4))
        print(f'{len(all_entities)} unique entities extracted')
        return {}

    def _build_entity_record(self, entity_name, entity_type) -> dict:
        """Return the output record for one entity.

        ``name_res_type`` is the NODE_TYPE_MAPPINGS entry for the converted
        LLM type, or None when the type has no mapping.
        """
        biolink_type = self.convert_node_type_to_biolink_format(entity_type)
        return {'name': entity_name,
                'llm_type': entity_type,
                'name_res_type': NODE_TYPE_MAPPINGS.get(biolink_type, None)}



0 comments on commit 6185057

Please sign in to comment.