-
Notifications
You must be signed in to change notification settings - Fork 11
/
entity-collapser.py
89 lines (60 loc) · 1.78 KB
/
entity-collapser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# script created by Devin Higgins, adapted by Brandon Locke
import csv
import sys
# Collapse duplicate (doc, entityType, entity) rows of a "batchner" CSV by
# summing their counts, then write the result next to the input file as
# "<name>_refined.csv".
#
# Usage: python entity-collapser.py <batchner.csv>
#
# The input CSV must have the columns "doc", "entity", "entityType" and
# "count" (the keys read in _collapse_entities below).

# Sample data structure.
# Shows the nesting built by _collapse_entities():
# doc -> entityType -> entity -> summed count.
data_structure = {
    "text": {
        "type": {
            "entity": 0
        },
    },
}

# These must match the row keys built in _flatten_counts().
fieldnames = ["doc", "entity", "entityType", "count"]


def _read_rows(path):
    """Return every record of the CSV at *path* as a list of dicts."""
    # newline="" is what the csv module asks for.  The former "rU" mode was
    # removed in Python 3.11 and raises ValueError there.
    with open(path, newline="") as csvfile:
        return list(csv.DictReader(csvfile))


def _collapse_entities(rows):
    """Sum "count" per doc / entityType / entity.

    Returns a nested dict shaped like ``data_structure``.  Raises KeyError
    when a row lacks one of the expected columns and ValueError when a
    count is not an integer.
    """
    data = {}
    for row in rows:
        # Read and validate every field before touching ``data`` so a bad
        # row never leaves a half-inserted entry behind.
        doc = row["doc"]
        entity = row["entity"]
        entity_type = row["entityType"]
        count = int(row["count"])

        by_entity = data.setdefault(doc, {}).setdefault(entity_type, {})
        by_entity[entity] = by_entity.get(entity, 0) + count
    return data


def _flatten_counts(data):
    """Turn the nested dict back into one flat row dict per entity."""
    rows = []
    for doc, by_type in data.items():
        for entity_type, by_entity in by_type.items():
            for entity, count in by_entity.items():
                rows.append({
                    # The keys here must match ``fieldnames``.
                    "doc": doc,
                    "entity": entity,
                    "count": count,
                    "entityType": entity_type,
                })
    return rows


def _refined_path(path):
    """Return the output path for the input file *path*.

    "x.csv" becomes "x_refined.csv".  Only the trailing extension is
    rewritten, and a name without a ".csv" extension gets "_refined.csv"
    appended, so the result can never be the input file itself.
    """
    suffix = path[-4:]
    if suffix.lower() == ".csv":
        return path[:-4] + "_refined" + suffix
    return path + "_refined.csv"


def _write_rows(path, rows):
    """Write *rows* to *path* as CSV with a header line."""
    # newline="" stops Windows from turning the writer's "\r\n" into
    # "\r\r\n", which shows up as a blank line between records.
    with open(path, "w", newline="") as csvoutput:
        csv_writer = csv.DictWriter(csvoutput, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(rows)


def main(argv=None):
    """Run the script; *argv* defaults to ``sys.argv``.

    Raises ValueError unless exactly one file argument is given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        raise ValueError('Please provide a batchner file')
    batchner = argv[1]

    # Read the file once; the raw rows are echoed first, as before.
    rows = _read_rows(batchner)
    for row in rows:
        print(row)

    data = _collapse_entities(rows)
    print(data)

    _write_rows(_refined_path(batchner), _flatten_counts(data))


if __name__ == "__main__":
    main()