\documentclass[compress]{beamer}
%\usepackage{beamerthemesplit}
\usepackage{xmpmulti}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{booktabs}
\usepackage{graphicx,float,wrapfig, bbm}
\usepackage{amsfonts, bbold, comment}
\usepackage{mdwlist}
\usepackage{subfigure}
\usepackage{colortbl}
\usepackage{overpic}
\usepackage{pdfpages}
\pgfdeclareimage[width=\paperwidth]{mybackground}{../../common/boulder.pdf}
\newcommand{\name}[0]{\textsc{caco}}
\newcommand{\vect}[1]{\bm{\mathbf{#1}}}
\newcommand{\slda}[0]{\abr{slda}}
\newcommand{\bm}[1]{\mbox{\boldmath$#1$}}
\newcommand{\lda}[0]{\abr{lda}}
\newcommand{\explain}[2]{\underbrace{#2}_{\mbox{\footnotesize{#1}}}}
\newcommand{\itmspace}[0]{\hspace{2cm}}
\newcommand{\pos}[1]{{\texttt{#1}}}
\newcommand{\e}[2]{\mathbb{E}_{#1}\left[ #2 \right] }
\newcommand{\ind}[1]{\mathbb{I}\left[ #1 \right] }
\newcommand{\abr}[1]{\textsc{#1} }
\newcommand{\ex}[1]{\mbox{exp}\left\{ #1\right\} }
\newcommand{\h}[2]{\mathbb{H}_{#1}\left[ #2 \right] }
\newcommand{\g}{\, | \,}
\newcommand{\popshow}[2]{\only<#1->{\alert<#1>{#2}}}
\newcommand{\citename}[1]{#1 }
\newcommand{\flag}[1]{{\setlength{\fboxsep}{0pt}\fbox{\includegraphics[height=0.30cm,width=0.45cm]{clwe/flags/#1.pdf}}}}
\newcommand{\fsi}[2]{
\begin{frame}[plain]
\vspace*{-1pt}
\makebox[\linewidth]{\includegraphics[width=\paperwidth]{#1}}
\begin{center}
#2
\end{center}
\end{frame}
}
\newcommand{\danquote}[1]{
\begin{flushright}
\begin{overpic}[width=5.5cm,tics=10]{general_figures/speech_bubble}
\put(10,30) { \parbox{4cm}{#1 }}
\end{overpic}
\includegraphics[width=1.5cm]{general_figures/milkman_dan}
\end{flushright}
}
\newcommand{\gfxi}[2]{
\begin{center}
\includegraphics[width=#2\linewidth]{interpretability/#1}
\end{center}
}
\newcommand{\gfxs}[2]{
\begin{center}
\includegraphics[width=#2\linewidth]{simtrans/#1}
\end{center}
}
\newcommand{\gfxq}[2]{
\begin{center}
\includegraphics[width=#2\linewidth]{qb/#1}
\end{center}
}
\newif\ifjobtalk\jobtalktrue
\newif\iflong\longtrue
\usetheme[
showdate=true, % show the date on the title page
alternativetitlepage=true, % Use the fancy title page.
titlepagelogo=general_figures/shell, % Logo for the first page.
]{UMD}
\title[]{Multilingual Situation Frame Prediction}
\author{Jordan Boyd-Graber, Benjamin Van Durme, Philip Resnik, Ting
Hua, \dots}
\date{GRACE BBN: UMD / JHU / CU}
\institute[] % (optional, but mostly needed)
{University of Maryland}
%gets rid of bottom navigation symbols
\setbeamertemplate{navigation symbols}{}
%gets rid of footer
%will override 'frame number' instruction above
%comment out to revert to previous/default definitions
\setbeamertemplate{footline}{}
\begin{document}
\frame{
\titlepage
\tiny
}
\begin{frame}{Recap: How we're doing SF}
\begin{itemize}
\item Cross-lingual word embeddings (CLWE) \popshow{2}{(Bad embeddings, OOV words)}
\item Use both translation and original text \popshow{2}{(MT makes mistakes)}
\item Bag of words model \popshow{2}{(Why not sequence?)}
\end{itemize}
\end{frame}
\begin{frame}{Three Issues}
\begin{enumerate}
\item How do we know if cross-lingual embeddings are any good?
\item Why do sequence models degrade so badly with noisy MT input?
\item How can we deal with OOV input words?
\end{enumerate}
\end{frame}
\begin{frame}{Question 1: What makes a good embedding?}
\begin{itemize}
\item Dictionary translations should have high \emph{cosine
similarity}
\item QVEC (Tsvetkov et al., 2015)
\begin{itemize}
\item Correlation of linguistically derived vectors with embedding
dimensions
\item Higher correlation is better
\end{itemize}
\pause
\item But both of these require extensive resources
\item Can we do something simple?
\end{itemize}
\end{frame}
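\begin{frame}{Making the cosine criterion concrete}
An illustrative formula (an assumption about how one would score it, not
necessarily the exact metric used here): average the cosine similarity over a
bilingual dictionary $D$ of translation pairs,
\begin{equation*}
\mbox{score}(D) = \frac{1}{|D|} \sum_{\langle w, w' \rangle \in D}
\frac{e(w) \cdot e(w')}{\|e(w)\| \, \|e(w')\|},
\end{equation*}
where $e(w)$ is the embedding of word $w$.
Higher is better, but it needs a dictionary, which low-resource languages often lack.
\end{frame}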
\begin{frame}{Look at nearest neighbor graph}
\begin{itemize}
\item Form a graph of each word's $k$ nearest neighbors
\item Look at the language of the nearest neighbors
\begin{itemize}
\item Neighborhoods dominated by a single language: bad
\item Mixed languages: good
\pause
\item Modularity
\end{itemize}
\end{itemize}
\end{frame}
\fsi{clwe/uighur_modularity}{Unmixed word embeddings}
\begin{frame}{Modularity}
Given $L$ different languages, modularity $Q$ is defined as:
\begin{equation}
\label{unnorm_modularity}
Q = \sum_{l = 1}^L (e_{ll} - a_l^2),
\end{equation}
where $e_{ll}$ is the fraction of edges that connect two words of the same language $l$:
\begin{equation}
e_{ll} = \frac{1}{2m} \sum_{ij} A_{ij} \delta(g_i=l) \delta(g_j=l),
\end{equation}
and $a_{l}$ is the fraction of edge endpoints attached to words of language $l$ (so $a_l^2$ is the expected within-language fraction if edges were placed at random):
\begin{equation}
a_{l} = \frac{1}{2m} \sum_{i} d_{i} \delta(g_i=l),
\end{equation}
where $m$ is the number of edges and $\delta$ is an indicator function that evaluates to $1$ if the argument is true and $0$ otherwise.
\end{frame}
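\begin{frame}[fragile]{Modularity from the $k$-NN graph: illustrative sketch}
\footnotesize
A minimal sketch of computing $Q$ from embeddings (not the evaluation code;
\texttt{emb}, \texttt{lang}, and $k=3$ are assumptions):
\begin{verbatim}
import numpy as np

def knn_modularity(emb, lang, k=3):
    words = list(emb)
    X = np.stack([emb[w] for w in words])
    X = X / np.linalg.norm(X, axis=1, keepdims=True)
    sims = X @ X.T
    np.fill_diagonal(sims, -np.inf)
    A = np.zeros_like(sims)            # adjacency of the k-NN graph
    for i in range(len(words)):
        for j in np.argsort(-sims[i])[:k]:
            A[i, j] = A[j, i] = 1.0
    m = A.sum() / 2                    # number of edges
    deg = A.sum(axis=1)                # degrees d_i
    langs = np.array([lang[w] for w in words])
    Q = 0.0
    for l in set(langs):
        mask = langs == l
        e_ll = A[np.ix_(mask, mask)].sum() / (2 * m)
        a_l = deg[mask].sum() / (2 * m)
        Q += e_ll - a_l ** 2
    return Q    # high Q: languages stay separate (bad)
\end{verbatim}
\end{frame}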
\begin{frame}{Predicting Classification Accuracy}
\begin{columns}
\column{.4\linewidth}
Predict classification {\bf excluding}:
\begin{itemize}
\item \alert<3>{Modularity}
\item \alert<2>{QVEC}
\item \alert<1>{Cosine}
\end{itemize}
\column{.6\linewidth}
\begin{center}
\only<3->{\includegraphics[width=.9\linewidth]{clwe/0_ablated}}
\only<2>{\includegraphics[width=.9\linewidth]{clwe/1_ablated}}
\only<1>{\includegraphics[width=.9\linewidth]{clwe/2_ablated}}
\end{center}
\end{columns}
\vspace{1cm}
\only<4>{Takeaway: modularity works well without huge resources}
\end{frame}
\begin{frame}{Next steps}
\begin{itemize}
\item We can tell bad embeddings from good ones, but it is unclear whether
this helps separate good from great
\item How to extend to many languages at once
\item How to know when to trust \emph{specific} word embeddings
\end{itemize}
\end{frame}
\begin{frame}{Question 2: Why Do Sequence Models Degrade Poorly?}
\begin{itemize}
\item We train models on perfect, complete English data
\item But at test time, we have problems
\begin{itemize}
\item Translation errors
\item OOV words
\item Hard to predict problems in advance
\end{itemize}
\item Not much luck with sequence models on the SF task
\end{itemize}
\end{frame}
\begin{frame}{Crazy things can happen}
Let's start removing words from examples and see what happens when
applying an RNN model to a tiny example \dots
\begin{itemize}
\item Situation frame: ``during'' $\rightarrow$ {\bf Terrorism}
\pause
\item Sentiment: ``must'' $\rightarrow$ {\bf Positive}
\pause
\item Question Answering \dots
\end{itemize}
\end{frame}
\fsi{rawr/heat_map_1}{All of these questions get the answer right}
\begin{frame}{RAWR}
\begin{itemize}
\item Examples can get really short while giving the same answer
\begin{center}
\includegraphics[width=.9\linewidth]{rawr/length_histogram}
\end{center}
\item We call these examples ``Right Answer, Wrong Reason''
\item Not all of these are reasonable decisions
\end{itemize}
\end{frame}
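\begin{frame}[fragile]{How the reductions are produced: sketch}
\footnotesize
One simple greedy way to generate such reduced examples (a sketch under
assumptions, not necessarily the exact procedure; \texttt{predict} is a
hypothetical helper returning a label and a confidence):
\begin{verbatim}
def reduce_input(tokens, predict):
    label, _ = predict(tokens)
    while len(tokens) > 1:
        shorter = [tokens[:i] + tokens[i + 1:]
                   for i in range(len(tokens))]
        # keep only reductions that preserve the original prediction
        keep = [(conf, cand) for cand in shorter
                for lab, conf in [predict(cand)] if lab == label]
        if not keep:
            break
        # drop the word whose removal hurts confidence least
        _, tokens = max(keep, key=lambda kc: kc[0])
    return tokens   # often absurdly short, e.g. just "during"
\end{verbatim}
\end{frame}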
\begin{frame}{Fixing the problem}
\begin{itemize}
\item Training on full sentences leads to unexpected behavior on
shorter inputs
\item Solution: add entropy regularizer to emphasize
\emph{uncertainty}
\begin{equation*}
\sum_{(x, y) \in (X, Y)}\log(f(y \g x)) + \lambda\sum_{\tilde{x}\in \tilde{X}}
\h{}{f(\cdot \g \tilde{x})}.
\end{equation*}
\end{itemize}
\end{frame}
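\begin{frame}{The entropy term, expanded}
For reference (this is just the standard definition, nothing model-specific):
the regularizer rewards predictive uncertainty on the reduced inputs
$\tilde{x} \in \tilde{X}$,
\begin{equation*}
\h{}{f(\cdot \g \tilde{x})} =
-\sum_{y} f(y \g \tilde{x}) \log f(y \g \tilde{x}),
\end{equation*}
so inputs that no longer support a confident answer are pushed toward a flat
output distribution instead of a spurious label.
\end{frame}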
\begin{frame}{Improves raw performance and robustness}
\begin{itemize}
\item On most languages and tasks, RNN performance goes up 0.5 to 1.0
points of $F$-measure
\item Reduced examples must be about twice as long before they trigger the
same answer
\pause
\item Bag-of-words models still do better on SF
\begin{itemize}
\item Too little data?
\item Syntax irregular / unimportant?
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Question 3: Dealing with OOV Words}
\begin{columns}
\column{.5\linewidth}
\begin{center}
\includegraphics[width=.9\linewidth]{clwe/mimick}
\end{center}
\column{.5\linewidth}
\begin{itemize}
\item Hacky solution: use MIMICK (Pinter et al., 2017)
\begin{itemize}
\item Train on Amharic characters
\item Apply to Tigrinya
\begin{center}
\includegraphics[width=.9\linewidth]{clwe/amharic}
\end{center}
\end{itemize}
\item Can we do better with a full model?
\end{itemize}
\end{columns}
\end{frame}
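\begin{frame}{MIMICK in one equation}
Roughly (our paraphrase of Pinter et al., 2017, not their exact notation):
train a character \abr{lstm} $g_{\phi}$ to reproduce the pre-trained
embeddings we already have,
\begin{equation*}
\min_{\phi} \sum_{w \in V} \left\| g_{\phi}(c_1 \dots c_{|w|}) - e(w) \right\|_2^2,
\end{equation*}
then embed \abr{oov} words (or, as above, words in a related language with a
shared script) from their spelling alone.
The full model on the next slides keeps this mimicking idea as one term of a
larger objective.
\end{frame}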
\fsi{clwe/architecture}{Embedder (LSTM) feeds into classifier (DAN)}
\begin{frame}{Objective}
\begin{itemize}
\item Classification loss (source language)
\begin{equation}
L_s(\theta) = -\frac{1}{|S|}\sum_{\langle \vect{w}, y \rangle \in S}
\log p(y \mid \vect{w}; \theta),
\end{equation}
\item Keep dictionary translation pairs close in embedding space
\begin{equation}
L_d(\theta) = \frac{1}{|D|}\sum_{\langle w, w' \rangle \in D}
||e(w) - e(w')||_2^2.
\end{equation}
\item Stay close to pre-trained embeddings if you have / trust them (mimicking)
\begin{equation}
L_e(\theta) = \frac{1}{V} \sum_{i=1}^V ||e(w_i) - x_i||_2^2
\end{equation}
\item Complete objective
\begin{equation}
L(\theta) = \explain{classifier}{L_s(\theta)} + \lambda_d \explain{dictionary}{L_d(\theta)} + \lambda_e \explain{embed}{L_e(\theta)},
\end{equation}
\end{itemize}
\end{frame}
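\begin{frame}[fragile]{Putting the objective together: sketch}
\footnotesize
A minimal sketch of the combined loss (illustrative only, not the released
code; \texttt{embedder}, \texttt{classifier}, and the data structures are
assumptions):
\begin{verbatim}
import torch
import torch.nn.functional as F

def caco_loss(docs, labels, dictionary, pretrained,
              embedder, classifier, lam_d=1.0, lam_e=1.0):
    # L_s: classification loss on labeled source-language docs
    doc_vecs = torch.stack(
        [torch.stack([embedder(w) for w in doc]).mean(0)
         for doc in docs])
    l_s = F.cross_entropy(classifier(doc_vecs), labels)
    # L_d: keep dictionary translation pairs close
    l_d = torch.stack(
        [(embedder(w) - embedder(wp)).pow(2).sum()
         for w, wp in dictionary]).mean()
    # L_e: stay near trusted pre-trained embeddings
    l_e = torch.stack(
        [(embedder(w) - x).pow(2).sum()
         for w, x in pretrained]).mean()
    return l_s + lam_d * l_d + lam_e * l_e
\end{verbatim}
\end{frame}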
\begin{frame}{Character Transfer Results}
\begin{itemize}
\item Sanity check: compare with MIMICK hack
\begin{itemize}
\item Improvement on IL5 (+0.1)
\item Drop on IL6 (-0.2)
\end{itemize}
\pause
\item Roughly matches what we had before
\item The fuller model allows more exploration
\item Other annotations
\end{itemize}
\end{frame}
\begin{frame}{Experiment Setup}
\begin{itemize}
\item Cross-lingual document classification (CLDC) experiments on RCV2 (Klementiev et al., 2012).
\item Two North Germanic and three Romance languages.
\item Word-based models: DAN with 40-dim multiCCA/multiCluster CLWE (Ammar et al., 2016).
\item {\bf C}lassification {\bf A}ided by {\bf C}onvergent {\bf
O}rthography (CACO) variants:
\begin{itemize}
\item SRC: labeled data only
\item DICT: labeled data + dictionary matching
\item MIM: labeled data + mimicking
\item ALL: everything
\item DICT+, ALL+: variants with language identifiers.
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Character Transfer Results}
\tiny
\begin{table}
\centering
\begin{tabular}{ll|rr|rrrrrr}
& & \multicolumn{2}{c|}{word-based \abr{dan}} & \multicolumn{6}{c}{\name{} variants}\\
source & target & multiCCA & multiCluster & \abr{src} & \abr{dict} & \abr{dict+} & \abr{mim} & \abr{all} & \abr{all+}\\
\hline
\flag{da}~\abr{da} & \flag{sv}~\abr{sv} & 67.5 & 58.5 & 51.0 & 60.5 & 53.5 & 66.0 & \textbf{69.5} & 68.0\\
\flag{sv}~\abr{sv} & \flag{da}~\abr{da} & 51.0 & 36.0 & 52.0 & \textbf{70.5} & 46.5 & 63.0 & 59.5 & 55.0\\
\flag{es}~\abr{es} & \flag{fr}~\abr{fr} & \textbf{60.0} & 37.0 & 47.0 & 51.0 & 48.5 & 51.5 & 48.5 & 48.5\\
\flag{es}~\abr{es} & \flag{it}~\abr{it} & 54.0 & 55.0 & 49.0 & 50.0 & 47.0 & 46.5 & \textbf{59.0} & 44.5\\
\flag{fr}~\abr{fr} & \flag{es}~\abr{es} & 61.0 & \textbf{65.5} & 62.5 & 60.5 & 65.0 & 58.0 & 60.0 & 54.0\\
\flag{fr}~\abr{fr} & \flag{it}~\abr{it} & 46.0 & \textbf{64.5} & 46.5 & 43.5 & 57.0 & 48.5 & 53.5 & 47.0\\
\flag{it}~\abr{it} & \flag{es}~\abr{es} & 42.5 & 42.0 & 54.5 & \textbf{62.0} & 54.5 & 58.0 & 45.5 & 47.5\\
\flag{it}~\abr{it} & \flag{fr}~\abr{fr} & 26.5 & \textbf{59.5} & 42.5 & 56.5 & 52.0 & 46.5 & 56.0 & 50.5\\
\hline
\multicolumn{2}{r|}{average} & 51.1 & 52.3 & 50.6 & \textbf{56.8} & 53.0 & 54.8 & 56.4 & 51.9\\
\end{tabular}
\caption{\label{tab:result} Results of \abr{cldc} experiments on eight
related language pairs from Reuters RCV2. The best result for each row is in
\textbf{boldface}. The \name{} models are competitive with \abr{clwe}-based
\abr{dan} models that use far more resources.}
\end{table}
\end{frame}
\begin{frame}{Challenges}
\begin{itemize}
\item Doesn't always work (Romance to Germanic)
\item What about different scripts? (Uighur and Tajik)
\item Using multiple languages at once
\pause
\item Annotation remains bottleneck
\end{itemize}
\end{frame}
\begin{frame}{Wrapup}
\begin{itemize}
\item We need good CLWE models; how can we diagnose them?
\item Sequence models are brittle
\item Transferring information between languages
\pause
\item Other things we're thinking about
\begin{itemize}
\item Harvesting / annotating data
\item Using the spatio-temporal domain
\item Corpus-level interactions with NI
\end{itemize}
\end{itemize}
\end{frame}
\fsi{clwe/il5_med}{IL5: Medicine}
\fsi{clwe/il6_regime}{IL6: Regime}
\end{document}