-
Notifications
You must be signed in to change notification settings - Fork 3
/
convert_to_bio.py
70 lines (56 loc) · 1.58 KB
/
convert_to_bio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import spacy
from spacy.gold import biluo_tags_from_offsets
# Maps document id (first TSV column) -> document text (second TSV column).
data = {}
# Maps document id -> list of (start, end, label) tuples; filled in further down.
labels = {}
# Decode explicitly as UTF-8 rather than relying on the platform default.
with open("test_train_split/validation_data.tsv", encoding="utf-8") as f:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. at the end of the file) instead of
            # failing with an IndexError on the missing text column.
            continue
        split = line.split("\t")
        data[split[0]] = split[1]
# Read the span annotations.  The first row is skipped as a header; every
# other row is "<id>\t<start>\t<end>\t<label>".  A span is dropped when it
# overlaps a span already accepted for the same id, so the first one wins.
# Decode explicitly as UTF-8 rather than relying on the platform default.
with open("test_train_split/validation_labels.tsv", encoding="utf-8") as f:
    next(f, None)  # skip the header row; harmless on an empty file
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines instead of failing on the missing columns.
            continue
        split = line.split("\t")
        # Parse the offsets once per row rather than once per stored span.
        start = int(split[1])
        end = int(split[2])
        accepted = labels.setdefault(split[0], [])
        # Two spans overlap when each starts strictly before the other ends;
        # spans that merely touch (end == next start) are both kept.
        if any(
            prev_start < end and start < prev_end
            for prev_start, prev_end, _ in accepted
        ):
            continue
        accepted.append((start, end, split[3]))
# Tokenise every document with spaCy, project the character-level spans onto
# tokens, and collect one "<id>\t<space-joined items>" row per kept sentence.
nlp = spacy.load("en_core_web_sm")
ids = list(data.keys())
bio_tags = []   # rows of "<id>\t<tag> <tag> ..."
sentences = []  # rows of "<id>\t<token> <token> ...", parallel to bio_tags
for doc_id in ids:
    doc = nlp(data[doc_id])
    # Documents without any annotation get an empty span list.
    labs = biluo_tags_from_offsets(doc, labels.get(doc_id, []))
    for sent in doc.sents:
        tokens = []
        tags = []
        contains_positive = False
        for word in sent:
            if word.is_space:
                # A whitespace-only token would inject extra separators into
                # the space-joined rows, so the token and tag columns could no
                # longer be split back into lists of equal length.
                continue
            tag = labs[word.i]
            if tag == "-":
                # "-" is written out as "O".
                # NOTE(review): presumably spaCy's marker for a token whose
                # boundaries do not line up with an annotated span - confirm.
                tag = "O"
            elif tag != "O":
                contains_positive = True
            tokens.append(word.lower_)
            tags.append(tag)
        # Over-long sentences are dropped entirely.
        if len(tokens) > 150:
            continue
        # Only sentences carrying at least one non-"O" tag are exported.
        if contains_positive:
            bio_tags.append(doc_id + "\t" + " ".join(tags))
            sentences.append(doc_id + "\t" + " ".join(tokens))
# Persist the two parallel row lists, tags first and tokens second.  Rows are
# separated by "\n" and no trailing newline is written.
for out_path, rows in (
    ("test_train_split/validation_labels_bio.tsv", bio_tags),
    ("test_train_split/validation_data_pretokenized.tsv", sentences),
):
    with open(out_path, "w") as out_file:
        contents = "\n".join(rows)
        out_file.write(contents)