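"""Train and test models that predict the relation expressed by a preposition in a
sentence, optionally predicting the preposition's attachment as well. Two encoders
are supported: a plain (bi)LSTM and an ontology-aware OntoLSTM.

Example invocation (hypothetical file names):
    python model_pp_relation.py --train_file train.tsv --embedding_file embeddings.txt.gz \
            --onto_aware --use_attention --test_file test.tsv
"""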
import sys
import argparse
import random
import numpy
import pickle
import warnings
from overrides import overrides
from keras.layers import Input
from encoders import LSTMEncoder, OntoLSTMEncoder
from index_data import DataProcessor
from preposition_model import PrepositionModel
from preposition_predictors import RelationPredictor, AttachmentPredictor


class PPRelationModel(PrepositionModel):
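    """Base model for preposition relation prediction: encodes the input sentence and
    applies a RelationPredictor (and optionally an AttachmentPredictor) at the given
    preposition index."""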

    def __init__(self, tune_embedding, bidirectional, predict_attachment, **kwargs):
        super(PPRelationModel, self).__init__(**kwargs)
        self.tune_embedding = tune_embedding
        self.bidirectional = bidirectional
        self.predict_attachment = predict_attachment
        self.num_relation_types = None
        self.model_name = "PP Relation"
        self.label_map = {}
        self.custom_objects = {"RelationPredictor": RelationPredictor}
        if self.predict_attachment:
            warnings.warn("Assuming the last two words in input sentences are the preposition and its child.")
            self.custom_objects["AttachmentPredictor"] = AttachmentPredictor

    def get_input_layers(self, train_inputs):
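        """Make Keras Input layers matching the shapes of the training arrays: one for
        the word-index sentences and one for the preposition indices."""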
        sentence_inputs, _ = train_inputs  # Only the sentence array is needed, for its shape.
        sentence_input_layer = Input(name="sentence", shape=sentence_inputs.shape[1:], dtype='int32')
        prep_indices_layer = Input(name="prep_indices", shape=(1,), dtype='int32')
        return sentence_input_layer, prep_indices_layer

    def get_output_layers(self, inputs, dropout, embedding_file, num_mlp_layers):
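        """Encode the sentence and apply the predictor(s). When predict_attachment is
        set, attachment probabilities from an AttachmentPredictor are fed into the
        RelationPredictor along with the encoded sentence."""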
        sentence_input_layer, prep_indices_layer = inputs
        encoded_input = self.encoder.get_encoded_phrase(sentence_input_layer, dropout, embedding_file)
        if self.predict_attachment:
            attachment_predictor = AttachmentPredictor(name='attachment_predictor', proj_dim=20,
                                                       composition_type='HPCD')
            # Note: We assume here that the preposition phrase is the last two words.
            attachment_probabilities = attachment_predictor(encoded_input)
            relation_predictor = RelationPredictor(self.num_relation_types, name='relation_predictor', proj_dim=20,
                                                   composition_type='HPCT', num_hidden_layers=num_mlp_layers,
                                                   with_attachment_probs=True)
            outputs = relation_predictor([encoded_input, prep_indices_layer, attachment_probabilities])
        else:
            predictor = RelationPredictor(self.num_relation_types, name='relation_predictor', proj_dim=20,
                                          composition_type='HPCT', num_hidden_layers=num_mlp_layers)
            outputs = predictor([encoded_input, prep_indices_layer])
        return outputs

    @overrides
    def process_data(self, input_file, onto_aware, for_test=False):
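        """Read tab-separated lines of (tagged sentence, preposition index, label),
        index and pad the sentences, shift the preposition indices to account for
        left padding, and return model inputs along with one-hot labels."""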
        dataset_type = "test" if for_test else "training"
        print >>sys.stderr, "Reading %s data" % dataset_type
        label_indices = []
        prep_indices = []
        tagged_sentences = []
        max_sentence_length = 0
        all_sentence_lengths = []
        for line in open(input_file):
            lnstrp = line.strip()
            tagged_sentence, prep_index, label = lnstrp.split("\t")
            sentence_length = len(tagged_sentence.split())
            all_sentence_lengths.append(sentence_length)
            if sentence_length > max_sentence_length:
                max_sentence_length = sentence_length
            prep_indices.append(int(prep_index))
            if label not in self.label_map:
                # Making indices start at 1 because make_one_hot expects that.
                self.label_map[label] = len(self.label_map) + 1
            label_indices.append(self.label_map[label])
            tagged_sentences.append(tagged_sentence)
        self.num_relation_types = len(self.label_map)
        if for_test:
            if not self.model:
                raise RuntimeError("Model not trained yet!")
            input_shape = self.model.get_input_shape_at(0)[0]  # (num_sentences, num_words, ...)
            sentlenlimit = input_shape[1]
        else:
            sentlenlimit = max_sentence_length
        # We need to readjust the prep_indices because left padding shifts the word
        # positions within each sentence.
        for i in range(len(prep_indices)):
            length = all_sentence_lengths[i]
            prep_indices[i] += sentlenlimit - length
        if not for_test:
            # Shuffling so that when Keras does a validation split, it is not always at the end.
            sentences_indices_labels = zip(tagged_sentences, prep_indices, label_indices)
            random.shuffle(sentences_indices_labels)
            tagged_sentences, prep_indices, label_indices = zip(*sentences_indices_labels)
        print >>sys.stderr, "Indexing %s data" % dataset_type
        sentence_inputs = self.data_processor.prepare_input(tagged_sentences, onto_aware=onto_aware,
                                                            sentlenlimit=sentlenlimit, for_test=for_test,
                                                            remove_singletons=False)
        prep_indices = numpy.asarray(prep_indices)
        labels = self.data_processor.make_one_hot(label_indices)
        return [sentence_inputs, prep_indices], labels

    @overrides
    def write_predictions(self, inputs):
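        """Write one predicted label per input to <model_name_prefix>.predictions."""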
        rev_label_map = {j: i for (i, j) in self.label_map.items()}
        predictions = numpy.argmax(self.model.predict(inputs), axis=1)
        test_output_file = open("%s.predictions" % self.model_name_prefix, "w")
        for prediction in predictions:
            # Add 1 because label indices start at 1 (see process_data).
            print >>test_output_file, rev_label_map[prediction + 1]
        test_output_file.close()

    @overrides
    def save_model(self, epoch):
        pickle.dump(self.label_map, open("%s.label_map" % self.model_name_prefix, "wb"))
        super(PPRelationModel, self).save_model(epoch)

    @overrides
    def load_model(self, epoch=None):
        self.label_map = pickle.load(open("%s.label_map" % self.model_name_prefix, "rb"))
        super(PPRelationModel, self).load_model(epoch)


class LSTMRelationModel(PPRelationModel):
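    """Relation model that encodes sentences with a (bi)LSTM over word embeddings."""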

    def __init__(self, **kwargs):
        super(LSTMRelationModel, self).__init__(**kwargs)
        self.model_name_prefix = "lstm_prep_rel_tune-embedding=%s_bi=%s" % (self.tune_embedding,
                                                                            self.bidirectional)
        self.encoder = LSTMEncoder(self.data_processor, self.embed_dim, self.bidirectional, self.tune_embedding)
        self.custom_objects.update(self.encoder.get_custom_objects())


class OntoLSTMRelationModel(PPRelationModel):
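    """Relation model that encodes sentences with an ontology-aware LSTM, representing
    each word by its senses and their hypernyms, optionally with attention over them."""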

    def __init__(self, num_senses, num_hyps, use_attention, set_sense_priors, prep_senses_dir, **kwargs):
        super(OntoLSTMRelationModel, self).__init__(**kwargs)
        # Set self.data_processor again, now with the right arguments.
        process_preps = prep_senses_dir is not None
        self.data_processor = DataProcessor(word_syn_cutoff=num_senses, syn_path_cutoff=num_hyps,
                                            process_preps=process_preps, prep_senses_dir=prep_senses_dir)
        self.num_senses = num_senses
        self.num_hyps = num_hyps
        self.attention_model = None  # Keras model with just embedding and encoder to output attention.
        self.set_sense_priors = set_sense_priors
        self.use_attention = use_attention
        use_prep_senses = prep_senses_dir is not None
        self.encoder = OntoLSTMEncoder(self.num_senses, self.num_hyps, self.use_attention, self.set_sense_priors,
                                       data_processor=self.data_processor, embed_dim=self.embed_dim,
                                       bidirectional=self.bidirectional, tune_embedding=self.tune_embedding)
        self.model_name_prefix = "ontolstm_prep_rel_att=%s_senses=%d_hyps=%d_sense-priors=%s_prep-senses=%s_tune-embedding=%s_bi=%s" % (
            str(self.use_attention), self.num_senses, self.num_hyps, str(set_sense_priors), str(use_prep_senses),
            str(self.tune_embedding), str(self.bidirectional))
        self.custom_objects.update(self.encoder.get_custom_objects())


def main():
    argparser = argparse.ArgumentParser(description="Train and test preposition relation prediction model")
    argparser.add_argument('--train_file', type=str, help="TSV file with label and POS-tagged phrase")
    argparser.add_argument('--embedding_file', type=str, help="Gzipped embedding file")
    argparser.add_argument('--embed_dim', type=int, help="Word/synset vector size", default=50)
    argparser.add_argument('--bidirectional', help="Encode bidirectionally followed by pooling", action='store_true')
    argparser.add_argument('--onto_aware', help="Use ontology-aware encoder. If this flag is not set, will use the traditional encoder", action='store_true')
    argparser.add_argument('--num_senses', type=int, help="Number of senses per word if using OntoLSTM (default 2)", default=2)
    argparser.add_argument('--num_hyps', type=int, help="Number of hypernyms per sense if using OntoLSTM (default 5)", default=5)
    argparser.add_argument('--prep_senses_dir', type=str, help="Directory containing preposition senses (from SemEval-2007 Task 6)")
    argparser.add_argument('--set_sense_priors', help="Set an exponential prior on sense probabilities", action='store_true')
    argparser.add_argument('--use_attention', help="Use attention in OntoLSTM. If this flag is not set, will use average concept representations", action='store_true')
    argparser.add_argument('--test_file', type=str, help="Optionally provide a test file for which accuracy will be computed")
    argparser.add_argument('--load_model_from_epoch', type=int, help="Load model from a specific epoch. Will load the best model by default.")
    argparser.add_argument('--attention_output', type=str, help="Print attention values of the validation data in the given file")
    argparser.add_argument('--tune_embedding', help="Fine-tune pretrained embedding (if provided)", action='store_true')
    argparser.add_argument('--num_epochs', type=int, help="Number of epochs (default 20)", default=20)
    argparser.add_argument('--num_mlp_layers', type=int, help="Number of MLP layers (default 0)", default=0)
    argparser.add_argument('--embedding_dropout', type=float, help="Dropout after embedding", default=0.0)
    argparser.add_argument('--encoder_dropout', type=float, help="Dropout after encoder", default=0.0)
    argparser.add_argument('--predict_attachment', help="Should we also predict the head? If not, we'll just use the last word in the head phrase.", action='store_true')
    args = argparser.parse_args()
    if args.onto_aware:
        relation_model = OntoLSTMRelationModel(num_senses=args.num_senses, num_hyps=args.num_hyps,
                                               use_attention=args.use_attention,
                                               set_sense_priors=args.set_sense_priors,
                                               prep_senses_dir=args.prep_senses_dir,
                                               embed_dim=args.embed_dim,
                                               bidirectional=args.bidirectional,
                                               tune_embedding=args.tune_embedding,
                                               predict_attachment=args.predict_attachment)
    else:
        relation_model = LSTMRelationModel(embed_dim=args.embed_dim, bidirectional=args.bidirectional,
                                           tune_embedding=args.tune_embedding,
                                           predict_attachment=args.predict_attachment)
    ## Train model or load trained model
    if args.train_file is None:
        relation_model.load_model(args.load_model_from_epoch)
    else:
        train_inputs, train_labels = relation_model.process_data(args.train_file, onto_aware=args.onto_aware,
                                                                 for_test=False)
        dropout = {"embedding": args.embedding_dropout,
                   "encoder": args.encoder_dropout}
        relation_model.train(train_inputs, train_labels, num_epochs=args.num_epochs,
                             dropout=dropout, num_mlp_layers=args.num_mlp_layers,
                             embedding_file=args.embedding_file)
    ## Test model
    if args.test_file is not None:
        test_inputs, test_labels = relation_model.process_data(args.test_file, onto_aware=args.onto_aware,
                                                               for_test=True)
        relation_model.test(test_inputs, test_labels)
        if args.attention_output is not None:
            raise NotImplementedError
            #relation_model.print_attention_values(args.test_file, test_inputs, args.attention_output)


if __name__ == "__main__":
    main()