-
Notifications
You must be signed in to change notification settings - Fork 4
/
gloss_links.py
executable file
·106 lines (85 loc) · 3.11 KB
/
gloss_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/python3
import argparse
import re
import sys
from subprocess import Popen, PIPE

from pylib.misc import filenm_from_key
ARG_ERROR = 1
"""
This program searches the given files for glossary keywords and, for each
keyword, writes out the names of the files in which it appears.
To test it, run:
(python3 gloss_links.py test_data/gloss_key.txt test_data --lf
"test_data/gloss_links_inp1.txt" "test_data/gloss_links_inp2.txt")
"""
def process_file(filenm, keyword_context, gloss_list):
    """
    Record which glossary keywords occur in one file.

    The file is read once and each keyword is looked up as a literal,
    case-insensitive, whole-word match. For every keyword that occurs,
    filenm is appended to keyword_context[keyword]; the list is created
    the first time a keyword is seen.

    Args:
        filenm: path of the file to scan.
        keyword_context: dict mapping keyword -> list of file names;
            updated in place.
        gloss_list: iterable of keyword strings; empty strings are skipped.
    Returns: None
    """
    try:
        # errors='replace' keeps the scan going on files that are not
        # clean UTF-8 instead of aborting the whole run.
        with open(filenm, 'r', encoding='utf-8', errors='replace') as source:
            text = source.read()
    except OSError as err:
        # Best effort: report the unreadable file and carry on so that the
        # remaining files are still processed.
        print("Couldn't read " + filenm + ": " + str(err), file=sys.stderr)
        return

    for keyword in gloss_list:
        if not keyword:
            # an empty pattern would "match" everywhere
            continue
        # re.escape makes the keyword literal, so '.', '*', '[' and friends
        # in a glossary term are not treated as regex operators.
        # Lookarounds are used rather than \b so that terms which start or
        # end with punctuation (e.g. "C++") can still match as whole words.
        pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
        if re.search(pattern, text, re.IGNORECASE):
            keyword_context.setdefault(keyword, []).append(filenm)
def process_args():
    """
    Read the command line.

    The two positional arguments are the glossary key file and the output
    directory. The optional "--lf" flag is followed by zero or more file
    names.

    Returns: (gloss_key, outdir, lf) -- lf is [] when "--lf" is not given
    """
    parser = argparse.ArgumentParser()
    for positional in ("gloss_key", "outdir"):
        parser.add_argument(positional)
    # the list of files must be introduced by "--lf" on the command line
    parser.add_argument("--lf", nargs="*", type=str, default=[])
    parsed = parser.parse_args()
    return (parsed.gloss_key, parsed.outdir, parsed.lf)
def output_context(outdir, keyword_context, gloss_lists):
    """
    Write one ".txt" file per glossary key into outdir.

    A key present in keyword_context gets a file listing the files it was
    found in; every other key in gloss_lists gets an empty file. File names
    are derived from the key with filenm_from_key.

    Args:
        outdir: directory the files are written to.
        keyword_context: dict mapping keyword -> list of file names.
        gloss_lists: all glossary keys.
    Returns: None
    """
    for term, found_in in keyword_context.items():
        target = outdir + "/" + filenm_from_key(term) + ".txt"
        with open(target, 'w') as out:
            # br tags because the text ends up inside an html page
            out.write(term + " found in: <br>")
            for name in found_in:
                out.write(" " + name + "\n" + "<br>")

    # Keys that were never found still need a (blank) file; the original
    # comment says this avoids a django include error.
    for term in gloss_lists:
        target = outdir + "/" + filenm_from_key(term) + ".txt"
        if term in keyword_context:
            continue
        open(target, 'w').close()
if __name__ == '__main__':
    # get command line params:
    (keyword_list, outdir, file_list) = process_args()
    gloss_lists = []
    keyword_contexts = {}
    # first get all the gloss keywords
    try:
        with open(keyword_list, 'r') as f:
            for line in f:
                # tab delimited: the keyword is the first column
                key = line.strip().split("\t")[0].strip()
                # Skip blank lines (an empty keyword would get its own
                # output file) and repeated keys (the same file would be
                # listed twice for that keyword).
                if key and key not in gloss_lists:
                    gloss_lists.append(key)
    except IOError:
        # diagnostics go to stderr; sys.exit works even when the
        # site-provided exit() helper is unavailable
        print("Couldn't read " + keyword_list, file=sys.stderr)
        sys.exit(1)
    for filename in file_list:  # look for keywords in all files
        process_file(filename, keyword_contexts, gloss_lists)
    output_context(outdir, keyword_contexts, gloss_lists)