# time_pipeline.py
#
# Runs the NER, dependency-parsing and relation-extraction stages on every
# .txt file in an input directory and writes per-stage timings to time.txt.
import sys, os, subprocess
import time
# Sub-directories, joined onto the launch directory, that the stages chdir into.
NER_dir = 'NER_v2'
RE_dir = 'RE'

# Interpreter command used to launch every stage script.
python_path = 'python'

# On/off switch for each part of the pipeline, in this order.
PIPELINE_PARTS = [
    True,   # NER
    True,   # DEP_PARSE
    True,   # RE
    False,  # CLEAN_UP
]

# File (opened relative to the launch directory) that receives the timing report.
TIMER_FILE = "time.txt"
def _list_input_ids(input_dir):
    """Return the base names of the ``.txt`` files found directly in *input_dir*.

    Names come back in ``os.listdir`` order with the ``.txt`` suffix removed.
    """
    suffix = ".txt"
    # Match the whole ".txt" suffix: checking only the last three characters
    # would also accept names such as "footxt" and then strip the wrong part.
    return [name[:-len(suffix)] for name in os.listdir(input_dir)
            if name.endswith(suffix)]


def _count_lines(input_dir, input_ids):
    """Count the lines longer than one character across all input files."""
    total = 0
    for doc_id in input_ids:
        with open(os.path.join(input_dir, doc_id + ".txt")) as handle:
            total += sum(1 for line in handle if len(line) > 1)
    return total


def _write_timing(time_file, label, total_time, line_count):
    """Append one stage's total and per-line timing to the report file."""
    # With no countable lines the per-line average is undefined; report 0.0
    # instead of crashing with ZeroDivisionError.
    average_time = total_time / line_count if line_count else 0.0
    time_file.write(
        "{} took {} seconds to finish: \n"
        "\tAverge processing time for one line is {} seconds\n".format(
            label, total_time, average_time))


def _run_ner(cwd, input_dir, input_ids):
    """Run ``ner.py`` from the NER directory on every input text file."""
    os.chdir(os.path.join(cwd, NER_dir))
    for doc_id in input_ids:
        txt_path = os.path.join(input_dir, doc_id + ".txt")
        subprocess.call([python_path, "ner.py", txt_path])


def _run_dep_parse(cwd, input_dir, input_ids):
    """Run ``fast_parse.py`` from the jPTDP directory on every input text file."""
    os.chdir(os.path.join(cwd, RE_dir, "jPTDP"))
    script_file = os.path.join(cwd, RE_dir, "jPTDP/fast_parse.py")
    for doc_id in input_ids:
        txt_path = os.path.join(input_dir, doc_id + ".txt")
        subprocess.call([python_path, script_file, txt_path])


def _run_sdp(cwd, input_dir, input_ids, sdp_dir):
    """Copy each input's .txt/.ann into *sdp_dir* and run ``relation_dep.py`` on it."""
    out_dir = os.path.join(input_dir, sdp_dir)
    subprocess.call(["mkdir", out_dir])
    os.chdir(os.path.join(cwd, RE_dir, "2. dep"))
    for doc_id in input_ids:
        parse_dir = os.path.join(input_dir, doc_id)
        subprocess.call(["cp", parse_dir + ".txt", out_dir])
        subprocess.call(["cp", parse_dir + ".ann", out_dir])
        subprocess.call([python_path, "relation_dep.py", parse_dir,
                         os.path.join(out_dir, doc_id + ".txt"),
                         os.path.join(out_dir, doc_id + ".ann")])


def _run_nn(cwd, input_dir, input_ids, nn_dir):
    """Copy each input's files into *nn_dir*, then run ``relation_nn.py`` on it."""
    out_dir = os.path.join(input_dir, nn_dir)
    subprocess.call(["mkdir", out_dir])
    os.chdir(os.path.join(cwd, RE_dir, "3. nn"))
    for doc_id in input_ids:
        parse_dir = os.path.join(input_dir, doc_id)
        subprocess.call(["cp", parse_dir + ".txt", out_dir])
        subprocess.call(["cp", parse_dir + ".ann", out_dir])
        subprocess.call(["cp", "-r", parse_dir, out_dir])
    # NOTE(review): relation_nn.py receives only the output directory, so it is
    # assumed to run once after every input has been copied -- confirm.
    subprocess.call([python_path, "relation_nn.py", out_dir])


def _clean_up(input_dir, input_ids, nn_dir):
    """Remove each input's .ann/.txt files and its per-input directories."""
    for doc_id in input_ids:
        subprocess.call(["rm", os.path.join(input_dir, doc_id + ".ann")])
        subprocess.call(["rm", os.path.join(input_dir, doc_id + ".txt")])
        subprocess.call(["rm", "-r", os.path.join(input_dir, doc_id)])
        subprocess.call(["rm", "-r", os.path.join(input_dir, nn_dir, doc_id)])


def main(argv=None):
    """Run the enabled pipeline parts over an input directory and time them.

    Args:
        argv: Command-line arguments including the program name; defaults to
            ``sys.argv``. ``argv[1]`` is the input directory.

    The stages enabled in ``PIPELINE_PARTS`` run in order (NER, DEP_PARSE, RE,
    CLEAN_UP). A report with the line count and per-stage timings is written
    to ``TIMER_FILE``. The process working directory is changed by the stages
    and is not restored.

    Raises:
        SystemExit: with status 1 when the input directory argument is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("Type in the path to the input directory (use absolute path and cannot include [space]).")
        sys.exit(1)

    cwd = os.getcwd()
    # The stages chdir into their own directories, so pin the input path to an
    # absolute one up front; a relative path would stop resolving afterwards.
    input_dir = os.path.abspath(argv[1])
    input_ids = _list_input_ids(input_dir)
    line_count = _count_lines(input_dir, input_ids)

    # Bound unconditionally so that clean-up still works when RE is disabled.
    sdp_dir = "SDP"
    nn_dir = "NN"

    # The context manager closes the report even if a stage raises.
    with open(TIMER_FILE, 'w') as time_file:
        time_file.write('There are a total of {} lines in {}\n'.format(line_count, input_dir))

        # ============================== Name Entity Recognition ==============================
        if PIPELINE_PARTS[0]:
            start_time = time.time()
            _run_ner(cwd, input_dir, input_ids)
            _write_timing(time_file, "NER", time.time() - start_time, line_count)

        # ============================== Dependency Parsing ==============================
        if PIPELINE_PARTS[1]:
            start_time = time.time()
            _run_dep_parse(cwd, input_dir, input_ids)
            _write_timing(time_file, "DEP_PARSE", time.time() - start_time, line_count)

        # ============================== Relation Extraction ==============================
        if PIPELINE_PARTS[2]:
            start_time = time.time()
            _run_sdp(cwd, input_dir, input_ids, sdp_dir)
            _write_timing(time_file, sdp_dir, time.time() - start_time, line_count)

            start_time = time.time()
            _run_nn(cwd, input_dir, input_ids, nn_dir)
            _write_timing(time_file, nn_dir, time.time() - start_time, line_count)

    # ============================== Clean Up ==============================
    if PIPELINE_PARTS[3]:
        _clean_up(input_dir, input_ids, nn_dir)


if __name__ == "__main__":
    main()