perform_NER.py (forked from KoslickiLab/DrugBankNER)
import NER
import json
import spacy
import pickle
from utils import get_xml_data, delete_long_tokens, process_drug_bank_xmldict_data, remove_brackets
from CONSTANTS import MECHANISTIC_CATEGORIES, MOSTLY_TEXT_FIELDS
spacy.require_gpu()
# Chunyu's NER; different models have different strengths and weaknesses. Through trial and error, I decided on these
# five, since each results in matches the other models don't get.
ners = []
trapi_ner = NER.TRAPI_NER(synonymizer_dir='./data', synonymizer_dbname='node_synonymizer_v1.0_KG2.10.1.sqlite',
                          linker_name=['umls', 'mesh'], spacy_model='en_core_sci_lg', threshold=0.70,
                          num_neighbors=15, max_entities_per_mention=1)
ners.append(trapi_ner)
trapi_ner = NER.TRAPI_NER(synonymizer_dir='./data', synonymizer_dbname='node_synonymizer_v1.0_KG2.10.1.sqlite',
                          linker_name=['umls', 'mesh'], spacy_model='en_core_sci_scibert', threshold=0.75,
                          num_neighbors=10, max_entities_per_mention=1)
ners.append(trapi_ner)
trapi_ner = NER.TRAPI_NER(synonymizer_dir='./data', synonymizer_dbname='node_synonymizer_v1.0_KG2.10.1.sqlite',
                          linker_name=['rxnorm'], spacy_model='en_core_sci_lg', threshold=0.70,
                          num_neighbors=15, max_entities_per_mention=1)
ners.append(trapi_ner)
trapi_ner = NER.TRAPI_NER(synonymizer_dir='./data', synonymizer_dbname='node_synonymizer_v1.0_KG2.10.1.sqlite',
                          linker_name=['go'], spacy_model='en_core_sci_lg', threshold=0.70,
                          num_neighbors=15, max_entities_per_mention=1)
ners.append(trapi_ner)
trapi_ner = NER.TRAPI_NER(synonymizer_dir='./data', synonymizer_dbname='node_synonymizer_v1.0_KG2.10.1.sqlite',
                          linker_name=['hpo'], spacy_model='en_core_sci_lg', threshold=0.70,
                          num_neighbors=15, max_entities_per_mention=1)
ners.append(trapi_ner)
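

# Note (illustrative sketch, not produced by this script): trapi_ner.get_kg2_match() returns a dict
# keyed by the matched plain-text mention, whose values are lists of
# (KG2 identifier, node-info dict) tuples, roughly of the form
#   {'headache': [('UMLS:C0018681', {'preferred_category': 'biolink:PhenotypicFeature', ...})]}
# The specific CURIE and the extra dict fields above are assumptions shown only to document the
# shape consumed by text_to_kg2_nodes() below.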
def text_to_kg2_nodes(text, categories=None):
    potential_mechanistic_matched_nodes = {}
    # split the text into sentences
    sentences = text.split('.')
    for sentence in sentences:
        # omit very long sequences and very short ones
        if len(sentence) > 1000 or len(sentence) < 15:
            continue
        # omit very long tokens/words
        sentence = delete_long_tokens(sentence)
        print(f"on sentence: {sentence}")
        for trapi_ner in ners:
            try:
                res = trapi_ner.get_kg2_match(sentence, remove_mark=True)
            except RuntimeError:
                continue
            for key, value in res.items():  # keys are plain text names, values are lists of tuples
                for v in value:  # v[0] is the KG2 identifier, v[1] is the node info in the form of a dict
                    # if categories were given, keep only matches in those categories
                    if categories and v[1]['preferred_category'] not in categories:
                        continue
                    if v[0] not in potential_mechanistic_matched_nodes:
                        potential_mechanistic_matched_nodes[v[0]] = {'name': key,
                                                                     'category': v[1]['preferred_category']}
                    # replace name with longer name
                    elif len(key) > len(potential_mechanistic_matched_nodes[v[0]]['name']):
                        potential_mechanistic_matched_nodes[v[0]]['name'] = key
    return potential_mechanistic_matched_nodes
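

# Example usage (hypothetical text, for illustration only; running it requires the synonymizer
# database and the scispaCy models configured above):
#   nodes = text_to_kg2_nodes("Aspirin is indicated for the treatment of pain and fever.",
#                             categories=['biolink:Disease', 'biolink:PhenotypicFeature'])
#   # nodes maps each KG2 identifier to a {'name': ..., 'category': ...} dict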


def main():
    # After running download_data.sh, the data will be in the data/ directory
    # convert the xml to dicts
    doc = get_xml_data()
    kg2_drug_info = process_drug_bank_xmldict_data(doc)
    print("Number of drugs with info:", len(kg2_drug_info))
    # So now we have the KG2 identifiers for the drugs, as well as the category, name, and drugbank id.
    # Next, NER the indications and add them to an "indication_NER_aligned" field of the kg2_drug_info
    # dictionary; while at it, also add the intermediate mechanistic nodes.
    i = 0
    max_i = len(kg2_drug_info.keys())
    for kg2_drug in kg2_drug_info.keys():
        # if i % 100 == 0:
        print(f"Processing drug {i} of {max_i}")
        i += 1
        # NER and KG2-align the indications
        if kg2_drug_info[kg2_drug].get("indication"):
            kg2_drug_info[kg2_drug]["indication_NER_aligned"] = text_to_kg2_nodes(
                remove_brackets(kg2_drug_info[kg2_drug]["indication"]),
                categories=['biolink:Disease',
                            'biolink:PhenotypicFeature',
                            'biolink:DiseaseOrPhenotypicFeature'])
        else:
            kg2_drug_info[kg2_drug]["indication_NER_aligned"] = {}
        # NER and KG2-align the mechanistic intermediate nodes from the text fields
        all_intermediate_text = ""
        for field in MOSTLY_TEXT_FIELDS:
            text = kg2_drug_info[kg2_drug].get(field)
            if text:
                all_intermediate_text += remove_brackets(text) + "\n "
        # then do the NER
        kg2_drug_info[kg2_drug]["mechanistic_intermediate_nodes"] = text_to_kg2_nodes(
            all_intermediate_text, categories=MECHANISTIC_CATEGORIES)
    # Now, let's write this to a JSON file
    with open('./data/kg2_drug_info.json', 'w') as f:
        json.dump(kg2_drug_info, f, indent=4)
    # also save as a pickle file for fast loading
    with open('./data/kg2_drug_info.pkl', 'wb') as f:
        pickle.dump(kg2_drug_info, f)


if __name__ == "__main__":
    main()
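

# Downstream loading (sketch, assuming the output paths written above): the pickle reloads much
# faster than the JSON, e.g.
#   import pickle
#   with open('./data/kg2_drug_info.pkl', 'rb') as f:
#       kg2_drug_info = pickle.load(f)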