visual-answer.py
import time

import numpy as np
from src.net import Net
from src.utils.visualqa import yield_weights, to_word
from src.utils.visualqa import glove_embed, read_image
from src.utils.visualqa import vgg16_model, lstm3_model, infer_model
'''
Model:
lstm x 3 -----\
               > concatenate -> (dense, tanh) x 3 -> (dense, softmax) -> answer
vgg16 --------/
From: https://github.com/iamaaditya/VQA_Demo
'''
vgg16_weights = yield_weights(vgg16_model)
lstm3_weights = yield_weights(lstm3_model)
infer_weights = yield_weights(infer_model)
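# Each yield_weights(...) above is assumed to return a generator over the
# corresponding pretrained Keras model's parameters, so every next(...) call
# below pulls one layer's weights in model-definition order.
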
def build_n_convs(net, inp, n):
# build n sequential convolutions
def _build_1_conv(inp):
        # vgg16 kernels are stored NCHW-style as (out_ch, in_ch, kh, kw);
        # net.conv2d appears to expect (kh, kw, in_ch, out_ch), hence the transpose
        kernel, bias = next(vgg16_weights)
        kernel = kernel.transpose([2, 3, 1, 0])
        kernel = kernel.copy(order = 'C')  # make the transposed array C-contiguous
kernel, bias = map(
net.variable, [kernel, bias])
conved = net.conv2d(
inp, kernel, pad = (1,1), stride = (1,1))
return net.relu(net.plus_b(conved, bias))
conved = inp
for count in range(n):
conved = _build_1_conv(conved)
return net.maxpool2(conved)
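
# Note: build_n_convs(net, x, 2) reproduces the first VGG16 block, i.e.
# conv3x3 -> relu twice followed by a max pool; net.maxpool2 presumably pools
# 2x2 with stride 2, halving the spatial resolution (224 -> 112 for that block).
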
def build_n_linears(net, inp, n):
# build n sequential fully connected
def _build_1_linear(inp):
weight, bias = next(vgg16_weights)
weight, bias = map(
net.variable, [weight, bias])
lineared = net.matmul(inp, weight)
return net.relu(net.plus_b(lineared, bias))
lineared = inp
for count in range(n):
lineared = _build_1_linear(lineared)
return lineared
def build_vgg16(net, img):
conved = img
for n in [2, 2, 3, 3, 3]:
conved = build_n_convs(net, conved, n)
    # the pretrained dense weights expect CHW-flattened features (vgg16_weights
    # are in NCHW order), so transpose the HWC map to (512, 7, 7) before flattening
tran = net.transpose(conved, [2, 0, 1])
flat = net.reshape(tran, (7 * 7 * 512,))
return build_n_linears(net, flat, 2)
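
# The two fully connected layers above correspond to VGG16's fc layers, so the
# returned image feature is presumably a 4096-dimensional vector.
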
def build_lstm3(net, question, real_len):
# build 3 sequential lstms
def build_1_lstm(inp, real_len):
w = next(lstm3_weights)
        gates = [w[i: i + 3] for i in range(0, len(w), 3)]
tmp = dict()
        # gate ordering differs: the Keras weights arrive as (i, g, f, o),
        # while net.lstm expects them as (f, i, o, g)
        for gate, k in zip(gates, 'igfo'):
            wx, wh, b = gate
            # stack recurrent and input kernels so one matmul covers [h, x]
            whx = np.concatenate([wh, wx])
            tmp[k] = (whx, b)
        transfer = list(map(tmp.__getitem__, 'fiog'))
return net.lstm(
inp, real_len, 512,
gate_activation = 'hard_sigmoid',
transfer = transfer)
lstmed = question
    for count in range(3):
lstmed = build_1_lstm(lstmed, real_len)
last_time_step = net.batch_slice(
lstmed, real_len, shift = -1)
return last_time_step
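
# net.batch_slice(..., shift = -1) appears to pick the lstm output at time step
# real_len - 1, i.e. the 512-d encoding of the last real (non-padded) word.
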
def build_infer(net, inp):
def _fc(inp, act):
weight, bias = next(infer_weights)
weight, bias = map(
net.variable, [weight, bias])
lineared = net.matmul(inp, weight)
return act(net.plus_b(lineared, bias))
fc1 = _fc(inp, net.tanh)
fc2 = _fc(fc1, net.tanh)
fc3 = _fc(fc2, net.tanh)
fc4 = _fc(fc3, net.softmax)
return fc4
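
# The softmax output is a distribution over the answer vocabulary; to_word()
# below presumably maps a prediction vector back to its most likely answer word.
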
# Build the net.
net = Net()
image = net.portal((224, 224, 3))   # RGB input image, VGG16-sized
question = net.portal((40, 300))    # up to 40 tokens, each a 300-d GloVe vector
real_len = net.portal((1,))         # actual question length, used by the lstm slicing
vgg16_feat = build_vgg16(net, image)
lstm3_feat = build_lstm3(net, question, real_len)
infer_feat = net.concat([lstm3_feat, vgg16_feat])
answer = build_infer(net, infer_feat)
image_feed = read_image('test.jpg')
queries = [
u"What is the animal in the picture?"
]
query_feed = list()
start_time = time.time()
for query in queries:
query_feed.append(glove_embed(query))
image_feed = [image_feed] * len(queries)
image_feed = np.array(image_feed)
query_feed = np.array(query_feed)
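# The single image is replicated so that image_feed and query_feed share the
# same batch dimension, one row per question.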
predicts, = net.forward([answer], {
image: image_feed,
    question: query_feed,
real_len: [30] * len(queries)
})
for i, predict in enumerate(predicts):
print('Q: {:<40}. A: {}'.format(
queries[i], to_word(predict)))
print('Answered in {}s'.format(time.time() - start_time))
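
# The console output therefore looks like (the answer itself depends on test.jpg):
#   Q: What is the animal in the picture?   . A: <predicted word>
#   Answered in <elapsed>s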