-
Notifications
You must be signed in to change notification settings - Fork 3
/
utils.py
144 lines (123 loc) · 4.59 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from collections import Counter
import csv
import subprocess
import inflect
import pandas as pd
from statsmodels.stats.proportion import proportion_confint
# Module-wide inflect engine; gen_inflect_from_vocab calls its plural_verb
# and plural_noun methods to propose inflected counterparts of vocab words.
infl_eng = inflect.engine()
# Column names, in order, of the dependency TSV files: deps_to_tsv writes
# this list as the header row and emits every record's values in this order.
dependency_fields = [
    'sentence',
    'orig_sentence',
    'pos_sentence',
    'subj',
    'verb',
    'subj_pos',
    'has_rel',
    'has_nsubj',
    'verb_pos',
    'subj_index',
    'verb_index',
    'n_intervening',
    'last_intervening',
    'n_diff_intervening',
    'distance',
    'max_depth',
    'all_nouns',
    'nouns_up_to_verb',
]
def deps_to_tsv(deps, outfile):
    """Write dependency records to a tab-separated file.

    The first row written is the header (``dependency_fields``); each
    following row holds one record's values in that same column order.

    Args:
        deps: iterable of mappings; every record is indexed with each name
            in ``dependency_fields``, so a dict lacking one of them raises
            ``KeyError``.
        outfile: path of the file to create or overwrite.
    """
    # ``open`` replaces the Python 2-only ``file`` builtin, and the ``with``
    # block guarantees the handle is flushed and closed even if writing a
    # record fails part-way through.
    with open(outfile, 'w') as handle:
        writer = csv.writer(handle, delimiter='\t')
        writer.writerow(dependency_fields)
        for dep in deps:
            writer.writerow([dep[key] for key in dependency_fields])
def deps_from_tsv(infile, limit=None):
    """Read dependency records from a tab-separated file with a header row.

    Args:
        infile: path of the TSV file to read.
        limit: maximum number of records to return; ``None`` (the default)
            reads every row.

    Returns:
        A list with one dict per data row, keyed by the header names.
        Values for which ``str.isdigit()`` is true are converted to ``int``;
        every other value is kept as the string read from the file.
    """
    res = []
    # The context manager closes the file as soon as reading stops (normally,
    # on the ``limit`` break, or on an error) instead of leaving the handle
    # to the garbage collector.
    with open(infile) as handle:
        for i, row in enumerate(csv.DictReader(handle, delimiter='\t')):
            if limit is not None and i >= limit:
                break
            res.append({key: int(value) if value.isdigit() else value
                        for key, value in row.items()})
    return res
def zread(fname):
    """Yield the lines of a gzip-compressed file, decompressed by ``gunzip``.

    Runs ``gunzip -c fname`` as a child process and streams its standard
    output line by line, so the decompressed data is never held in memory
    as a whole.

    Args:
        fname: path of the compressed file, passed to ``gunzip`` as-is.

    Yields:
        Each line exactly as read from the pipe (byte strings, trailing
        newline included).
    """
    p = subprocess.Popen(['gunzip', '-c', fname], stdout=subprocess.PIPE)
    try:
        for line in p.stdout:
            yield line
    finally:
        # Reached on exhaustion, on an error, and when the consumer drops the
        # generator early; without it an early exit left the pipe open and
        # the child process unreaped.
        p.stdout.close()
        p.wait()
def tokenize_blanks(fh):
    """Group whitespace-tokenised lines into blocks separated by blank lines.

    Args:
        fh: iterable of text lines (an open file, a generator, a list...).

    Yields:
        One list per block; each element is the list of whitespace-separated
        tokens of one line. Lines that are empty after stripping only act as
        separators, and empty blocks are never yielded.
    """
    sent = []
    for line in fh:
        tokens = line.strip().split()
        if tokens:
            sent.append(tokens)
        elif sent:
            yield sent
            sent = []
    # Flush the last block only when it has content. The unconditional yield
    # used before produced a spurious empty block for empty input or input
    # ending in a blank line, unlike the handling of blank lines mid-stream.
    if sent:
        yield sent
def create_freq_dict(infile, outfile, minfreq=50):
    """Count field pairs in a gzipped file and write the frequent ones.

    Every non-blank line of ``infile`` is split on whitespace and the pair
    (second field, fourth field) is counted; the code names these ``w`` and
    ``pos`` (presumably word form and part-of-speech tag; verify against
    the corpus format). A running line index is printed every million lines
    as a progress indicator.

    Args:
        infile: path of the gzip-compressed input, read through ``zread``.
        outfile: path of the file to create or overwrite; one
            ``w<TAB>pos<TAB>count`` line is written per retained pair.
        minfreq: pairs are written only when their count is strictly
            greater than this value.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
    """
    counts = Counter()
    for i, line in enumerate(zread(infile)):
        if not isinstance(line, str):
            # On Python 3 the gunzip pipe yields bytes; decode so the output
            # contains plain text instead of b'...' reprs.
            # NOTE(review): assumes the corpus is UTF-8 -- confirm.
            line = line.decode('utf-8')
        fields = line.split()
        if fields:
            counts[fields[1], fields[3]] += 1
        if i % 1000000 == 0:
            print(i)

    # ``open`` replaces the Python 2-only ``file`` builtin; the ``with``
    # block makes sure the output is flushed and closed.
    with open(outfile, 'w') as out:
        for (w, pos), count in counts.items():
            if count > minfreq:
                out.write('%s\t%s\t%d\n' % (w, pos, count))
def confint(row):
    """Return the confidence interval of one row's error proportion.

    Args:
        row: mapping (for example a DataFrame row) providing 'errorprob',
            the error proportion, and 'count', the number of observations.

    Returns:
        The result of ``proportion_confint(n_errors, count)`` called with
        its default settings; callers index it as (lower, upper).
    """
    # The product is a whole number only up to floating-point error, e.g.
    # (1 - 0.9) * 10 evaluates to 0.9999999999999998. Truncating with int()
    # alone would then report 0 errors instead of 1, so round first.
    n_errors = int(round(row['errorprob'] * row['count']))
    return proportion_confint(n_errors, row['count'])
def add_confints(df):
    """Add confidence-interval bound columns to ``df`` in place.

    For every row, ``confint(row)`` is evaluated; element 0 is stored in the
    'minconf' column and element 1 in the 'maxconf' column. Nothing is
    returned.
    """
    for position, column in enumerate(('minconf', 'maxconf')):
        # ``position`` is bound as a default argument so each lambda keeps
        # the index it was created with.
        df[column] = df.apply(
            lambda row, position=position: confint(row)[position], axis=1)
def get_grouping(df, grouping_vars):
    """Summarise accuracy and distance for each group of ``grouping_vars``.

    Args:
        df: DataFrame with a 'correct' column and a 'distance' column.
        grouping_vars: column name, or list of column names, to group by.

    Returns:
        A new DataFrame with one row per group holding the grouping columns
        plus: 'accuracy' (mean of 'correct'), 'count' (number of non-null
        'correct' values), 'mean_distance' (mean of 'distance'),
        'errorprob' (1 - accuracy), and the 'minconf' / 'maxconf' columns
        added by ``add_confints``.
    """
    grouped = df.groupby(grouping_vars)
    # Each output column is computed explicitly. The nested-dict renaming
    # form of ``aggregate`` used before was deprecated in pandas 0.20 and
    # raises SpecificationError from pandas 1.0 on; this form works on both.
    x = pd.DataFrame({
        'accuracy': grouped['correct'].mean(),
        'count': grouped['correct'].count(),
        'mean_distance': grouped['distance'].mean(),
    }).reset_index()
    x['errorprob'] = 1 - x['accuracy']
    add_confints(x)
    return x
def gen_inflect_from_vocab(vocab_file, freq_threshold=1000):
    """Build verb and noun inflection tables from a vocabulary file.

    Each line of ``vocab_file`` must hold three whitespace-separated fields:
    word, POS tag and count. Words longer than one character, tagged NN,
    NNS, VBP or VBZ, with a count of at least ``freq_threshold`` are kept.
    A VBZ word is paired with ``infl_eng.plural_verb(word)`` when that form
    is among the kept VBP words, and an NN word with
    ``infl_eng.plural_noun(word)`` when that form is among the kept NNS
    words.

    Args:
        vocab_file: path of the vocabulary file.
        freq_threshold: minimum count (inclusive) for a word to be used.

    Returns:
        ``(verb_infl, noun_infl)``: two dicts mapping each paired word to
        its counterpart in both directions. They also map the POS tags
        themselves ('VBP' <-> 'VBZ', 'NN' <-> 'NNS').

    Raises:
        ValueError: if a line not starting with a space does not split into
            exactly three fields, or its count is not an integer.
    """
    vbp = {}
    vbz = {}
    nn = {}
    nns = {}
    from_pos = {'NNS': nns, 'NN': nn, 'VBP': vbp, 'VBZ': vbz}

    # ``open`` replaces the Python 2-only ``file`` builtin and the ``with``
    # block closes the vocabulary file once it has been read.
    with open(vocab_file) as handle:
        for line in handle:
            if line.startswith(' '):   # empty string token
                continue
            word, pos, count = line.strip().split()
            count = int(count)
            if len(word) > 1 and pos in from_pos and count >= freq_threshold:
                from_pos[pos][word] = count

    verb_infl = {'VBP': 'VBZ', 'VBZ': 'VBP'}
    for word in vbz:
        candidate = infl_eng.plural_verb(word)
        if candidate in vbp:
            verb_infl[candidate] = word
            verb_infl[word] = candidate

    noun_infl = {'NN': 'NNS', 'NNS': 'NN'}
    for word in nn:
        candidate = infl_eng.plural_noun(word)
        if candidate in nns:
            noun_infl[candidate] = word
            noun_infl[word] = candidate

    return verb_infl, noun_infl
def annotate_relpron(df):
    """Add relativizer flags and a condition label to each row of ``df``.

    For every row, the tokens of 'orig_sentence' and the tags of
    'pos_sentence' are sliced with ``[subj_index:verb_index - 1]``
    (presumably the material between subject and verb, with 1-based indices;
    verify against the code that produces these columns). Four columns are
    then written to ``df`` in place:

    * 'blacklisted': the span contains an 'NNP' or 'PRP' tag.
    * 'has_relpron': the span contains a 'WDT', 'WP', 'WRB' or 'WP$' tag,
      or its first word is 'that'.
    * 'has_early_relpron': always ``True`` (see the review note in the body).
    * 'condition': 'With relativizer', 'Without relativizer',
      'No relative clause' or 'Error', derived from 'has_rel' and the flags
      above.

    Args:
        df: DataFrame with 'orig_sentence', 'pos_sentence', 'subj_index',
            'verb_index' and 'has_rel' columns. An empty frame is accepted.

    Returns:
        The same ``df`` object, modified in place.

    Side effects:
        Sets the global pandas option ``mode.chained_assignment`` to ``None``.
    """
    # Global pandas setting, kept because code running after this call may
    # rely on chained-assignment checks staying disabled.
    pd.options.mode.chained_assignment = None

    # Built once here instead of once per row.
    blacklist = {'NNP', 'PRP'}
    relprons = {'WDT', 'WP', 'WRB', 'WP$'}

    def span_flags(words, tags, subj_index, verb_index):
        # Return (blacklisted, has_early_relpron, has_relpron) for one row.
        stop = verb_index - 1
        words_in_dep = words.split()[subj_index:stop]
        pos_in_dep = tags.split()[subj_index:stop]
        first_is_that = words_in_dep[:1] == ['that']
        return (bool(blacklist & set(pos_in_dep)),
                bool(relprons & set(pos_in_dep[:2])) or first_is_that,
                bool(relprons & set(pos_in_dep)) or first_is_that)

    def condition(has_rel, has_relpron, has_early_relpron):
        # Map one row's flags to its condition label.
        if has_rel and has_relpron and has_early_relpron:
            return 'With relativizer'
        if has_rel and not has_relpron:
            return 'Without relativizer'
        if not has_rel:
            return 'Error' if has_relpron else 'No relative clause'
        # has_rel and has_relpron, but the relativizer is not early.
        return 'Error'

    # Plain column iteration works on an empty frame, where unpacking
    # zip(*df.apply(...)) into three columns did not.
    flags = [span_flags(words, tags, subj_index, verb_index)
             for words, tags, subj_index, verb_index
             in zip(df['orig_sentence'], df['pos_sentence'],
                    df['subj_index'], df['verb_index'])]
    df['blacklisted'] = [flag[0] for flag in flags]
    df['has_early_relpron'] = [flag[1] for flag in flags]
    df['has_relpron'] = [flag[2] for flag in flags]

    # NOTE(review): the positional flag computed above (relativizer tag among
    # the first two tags of the span, or first word 'that') is overwritten
    # with True unconditionally, exactly as in the original code. This makes
    # the final 'Error' branch of ``condition`` unreachable. Kept to preserve
    # existing results -- confirm whether the override is intentional.
    df['has_early_relpron'] = True

    df['condition'] = [
        condition(has_rel, has_relpron, has_early)
        for has_rel, has_relpron, has_early
        in zip(df['has_rel'], df['has_relpron'], df['has_early_relpron'])]
    return df