-
Notifications
You must be signed in to change notification settings - Fork 2
/
2015_lorelei_site.tex
executable file
·357 lines (292 loc) · 11.7 KB
/
2015_lorelei_site.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
\documentclass[compress]{beamer}
\usepackage{xmpmulti}
\usepackage{graphicx,float}
\usepackage{amsfonts}
\usepackage{mdwlist}
\usepackage{colortbl}
\pgfdeclareimage[width=\paperwidth]{mybackground}{../../common/boulder.pdf}
% \danquote{text}: right-aligned "speech bubble" quote.
%   #1  text placed in a 4cm-wide \parbox at picture position (10,30) on top
%       of the speech_bubble image (5.5cm wide); the milkman_dan image
%       (1.5cm wide) follows the bubble.
% NOTE(review): the overpic environment is provided by the overpic package,
% which is not loaded in the visible preamble -- presumably the theme loads
% it or this macro is unused; confirm before relying on it.
\newcommand{\danquote}[1]{
\begin{flushright}
\begin{overpic}[width=5.5cm,tics=10]{general_figures/speech_bubble}
\put(10,30) { \parbox{4cm}{#1 }}
\end{overpic}
\includegraphics[width=1.5cm]{general_figures/milkman_dan}
\end{flushright}
}
% ---- Math helpers ----------------------------------------------------------
% \explain{label}{expr}: underbrace expr and put a footnotesize text label below.
\newcommand{\explain}[2]{\underbrace{#2}_{\mbox{\footnotesize{#1}}}}
% \e{sub}{expr}: blackboard-bold E with subscript sub and auto-sized [expr].
\newcommand{\e}[2]{\mathbb{E}_{#1}\left[ #2 \right] }
% \ind{cond}: blackboard-bold I followed by auto-sized [cond].
\newcommand{\ind}[1]{\mathbb{I}\left[ #1 \right] }
% \ex{arg}: exp{arg} with auto-sized braces.  Uses the \exp operator instead
% of \mbox{exp} so the name is set as a math operator and scales correctly
% when used inside sub-/superscripts.
\newcommand{\ex}[1]{\exp\left\{ #1\right\} }
%\newcommand{\g}{\, | \,}
% ---- Text helpers ----------------------------------------------------------
% \citename{name}: prints the name followed by a space.
\newcommand{\citename}[1]{#1 }
% Coloured text.  NOTE(review): the colours caribbeangreen and amber are not
% defined in this file -- presumably supplied by the theme; confirm.
\newcommand{\greentext}[1]{\textcolor{caribbeangreen}{#1}}
\newcommand{\yellowtext}[1]{\textcolor{amber}{#1}}
\newcommand{\redtext}[1]{\textcolor{red}{#1}}
\newcommand{\bluetext}[1]{\textcolor{blue}{#1}}
% \bm{symbol}: bold math via \boldmath inside an \mbox.
\newcommand{\bm}[1]{\mbox{\boldmath$#1$}}
% Upright distribution names.
\newcommand{\Dir}{\mathrm{Dir}}
\newcommand{\Mult}{\mathrm{Mult}}
% \g{arg}: Gamma function with auto-sized parentheses.
\newcommand{\g}[1]{\Gamma \left( #1 \right)}
% \paragraph{title}: 1cm vertical skip followed by a bold, large run-in title.
% \bfseries replaces the obsolete two-letter font switch \bf.
\newcommand{\paragraph}[1]{ \vskip 1cm {\bfseries \large #1}}
% \tb{text}: shorthand for bold text.
\newcommand{\tb}[1]{\textbf{#1}}
% ---- Multi-index sub-/superscripts -----------------------------------------
% \subtwo / \subthree: comma-separated subscripts; \minus... variants prefix
% the index group with a minus sign; \sup... variants do the same as
% superscripts.
\newcommand{\subtwo}[2]{_{#1, #2}}
\newcommand{\subthree}[3]{_{#1, #2, #3}}
\newcommand{\minussubtwo}[2]{_{-{#1, #2}}}
\newcommand{\minussubthree}[3]{_{-{#1, #2, #3}}}
\newcommand{\suptwo}[2]{^{#1, #2}}
\newcommand{\supthree}[3]{^{#1, #2, #3}}
\newcommand{\minussuptwo}[2]{^{-{#1, #2}}}
\newcommand{\minussupthree}[3]{^{-{#1, #2, #3}}}
% \prior{x}: calligraphic B immediately followed by x.
\newcommand{\prior}[1]{\mathcal{B}#1} % to be revised
% \gfx{file}{fraction}: centred figure from teaparty/figures/.
%   #1  file name relative to teaparty/figures/
%   #2  width as a fraction of \linewidth
\newcommand{\gfx}[2]{
  \begin{center}
    \includegraphics[width=#2\linewidth]{teaparty/figures/#1}
  \end{center}
}
% ---- Theme -----------------------------------------------------------------
\usetheme[
  bullet=circle, % Use circles instead of squares for bullets.
  titleline=true, % Show a line below the frame title.
  showdate=true, % show the date on the title page
  alternativetitlepage=true, % Use the fancy title page.
  titlepagelogo=general_figures/culogo, % Logo for the first page.
  % Logo for the header on first page.
  headerlogo=general_figures/boulder_cs,
]{UCBoulder}
\usecolortheme{ucdblack}

% ---- Title-page metadata (short forms in brackets) -------------------------
\title[ITM]{Creating and Evaluating Multilingual Topic Models}
\author[Boyd-Graber]{Shudong Hao~(UCB), Mans Hulden~(UCB), Jordan Boyd-Graber~(UCB), Philip~Resnik~(UMD)}
\date{December 2015}
% Short institute name is optional, but mostly needed.
\institute[Boulder]{University of Maryland and University of Colorado Boulder}
%\AtBeginSection[] % "Beamer, do the following at the start of every section"
%{ \begin{frame} \frametitle{Outline} % make a frame titled "Outline"
%\tableofcontents[currentsection] % show TOC and highlight current section
%\end{frame} }
\begin{document}
\maketitle
% Slide: project goals, with an example figure below the bullet list.
\begin{frame}{Goal: Consistent Representations Across Languages}
  \begin{itemize}
    \item Learn multilingual topics that can serve as bridges between languages
    \item Goal 1: Aid understanding of analyst (multilingual or monolingual)
    \item Goal 2: Serve as machine representation for classification into ontology
    \item Mimic Year 1 topic evaluation
  \end{itemize}
  \begin{center}
    \includegraphics[width=.6\linewidth]{mlslda/chinese_amazon_dict}
  \end{center}
\end{frame}
% Slide: model that links languages through documents on the same subject.
\begin{frame}{Document-Links Model}
  \begin{center}
    \includegraphics[height=0.4\textheight]{multilingual_itm/polylingual-new.pdf}
  \end{center}
  \begin{itemize}
    \item Use documents about the same subject to create links across languages (e.g., these documents discuss the same election in Arabic and English)
  \end{itemize}
\end{frame}
% Slide: model that links languages through dictionary entries.
% Wiktionary and the fromage/cheese pair are examples, hence "e.g." (not
% "i.e."); the redundant "as the dictionary" is dropped.
\begin{frame}{Vocabulary-Links Model}
  \begin{center}
    \includegraphics[height=0.4\textheight]{multilingual_itm/treeprior.pdf}
  \end{center}
  \begin{itemize}
    \item Use a dictionary (e.g., Wiktionary) to create links between words (e.g., if ``fromage'' is in a topic in French, ``cheese'' should be in English)
  \end{itemize}
\end{frame}
% Slide: model that combines document-level and word-level information.
\begin{frame}{Combined Model}
  \begin{center}
    \includegraphics[height=0.4\textheight]{multilingual_itm/link-prior.pdf}
  \end{center}
  \begin{itemize}
    \item Allow for either information about documents or words to improve multilingual topics
  \end{itemize}
\end{frame}
% Slide: requirements for a topic-quality metric; the final bullet is
% revealed on the second overlay (\pause).
\begin{frame}{Problem: Detecting Good Multilingual Topics}
  \begin{itemize}
    \item If we don't understand the language, how do we know if we have good topics?
    \item Need metrics that:
      \begin{itemize}
        \item Are simple, not dependent on expensive resources
        \item Correlate well with classification accuracy
        \item Are relatively language independent
      \end{itemize}
    \pause
    % \textbf instead of the obsolete {\bf ...}, matching the other slides.
    \item \textbf{NPMI} and \textbf{human judgments} as proxy for classification accuracy
  \end{itemize}
\end{frame}
% Slide: monolingual NPMI definition and illustration.
% Title fixed: NPMI stands for Normalized Pointwise Mutual Information.
\begin{frame}{Normalized Pointwise Mutual Information (NPMI)}
  \begin{itemize}
    \item \textbf{NPMI-Internal} focuses on a single language;
      % \mathrm keeps the name upright; bare NPMI in math mode is typeset as
      % the product of four italic variables.
      \begin{align}
        \mathrm{NPMI}(w_i,w_j)=\frac{\log\left(\frac{\Pr(w_i,w_j)}{\Pr(w_i)\Pr(w_j)}\right)}{-\log\Pr(w_i,w_j)}.
      \end{align}
    \item Correlates well with human judgments of topic quality
  \end{itemize}
  \begin{center}
    \includegraphics[height=0.3\textheight]{multilingual_itm/npmi-internal.pdf}
  \end{center}
\end{frame}
% Slide: cross-lingual NPMI definition and illustration.
% Title fixed: NPMI stands for Normalized Pointwise Mutual Information.
\begin{frame}{Normalized Pointwise Mutual Information (NPMI)}
  \begin{itemize}
    \item \textbf{NPMI-Cross} focuses on language pairs;
      % The right-hand side carries the same language superscripts as the
      % left-hand side, so both sides refer to the same pair of words.
      \begin{align}
        \mathrm{NPMI}\left(w_i^{(l_1)},w_j^{(l_2)}\right)
        =\frac{\log\left(\frac{\Pr(w_i^{(l_1)},w_j^{(l_2)})}{\Pr(w_i^{(l_1)})\Pr(w_j^{(l_2)})}\right)}{-\log\Pr(w_i^{(l_1)},w_j^{(l_2)})}.
      \end{align}
      \begin{center}
        \includegraphics[height=0.3\textheight]{multilingual_itm/npmi-cross.pdf}
      \end{center}
    \item When there are more than two languages, we calculate the NPMI-Cross for each pair of languages and take the average.
    \item Seems to work well even with very limited parallel data
  \end{itemize}
\end{frame}
% Slide: cross-lingual classification setup (left column) next to a figure
% that changes between overlay 1 and overlay 2 (right column).
\begin{frame}{Classification}
  \begin{columns}
    \column{.3\linewidth}
    \begin{itemize}
      \item Train SVM on topic distribution of $l_1$ docs
      \item Test on topic distribution of $l_2$ docs
      % "NPMI-Cross" is the name used for this metric on the other slides.
      \item Classification correlates well with NPMI-Cross.
    \end{itemize}
    \column{.6\linewidth}
    \only<1>{\includegraphics[width=1.0\linewidth]{multilingual_itm/classification_1}}
    \only<2>{\includegraphics[width=1.0\linewidth]{multilingual_itm/classification_2}}
  \end{columns}
\end{frame}
% \begin{frame}{Dataset}
% Note on the corpus:
% \begin{itemize}
% \item It requires multilingual labeled corpus;
% \item I still use TED Talks 2013, since each article has many categories that can be used as labels;
% \item I choose \textbf{art} and \textbf{biology} as labels, since there are only three articles contain both labels. They have the minimal overlaps and comparable amounts of documents in each label;
% \item \textbf{Art}: $122$ documents in each language;
% \item \textbf{Biology}: $88$ documents in each language.
% \end{itemize}
% \end{frame}
% Slide: TED Talk corpus summary.
\begin{frame}{TED Talk Translations}
  \begin{itemize}
    \item Languages: English, Chinese, Turkish;
    \item $970$ documents in each language;
    \item Labeled with subjects (business, society, technology)
  \end{itemize}
\end{frame}
% Slide: Wikipedia corpus summary; the closing question appears on the
% second overlay (\pause).
\begin{frame}{Wikipedia}
  \begin{itemize}
    \item Obtained from \url{http://linguatools.org/tools/corpora/wikipedia-comparable-corpora/};
    \item Languages: English, Chinese, Turkish;
    \item $1600$ parallel documents;
    \item Used for evaluating NPMI scores.
    \pause
    \item Could we get by with fewer documents?
  \end{itemize}
\end{frame}
% Slide: LORELEI brief-news corpus summary.
\begin{frame}{Brief News}
  \begin{itemize}
    \item Provided by LORELEI;
    \item Languages: English, Uzbek;
    % Braces around the comma stop math mode from treating it as
    % punctuation and inserting a gap ("1, 731").
    \item $1{,}731$ documents in each language;
    \item Working on hand-annotation of subject (not complete)
  \end{itemize}
\end{frame}
% Slide: English--Chinese correlations with two result plots below.
\begin{frame}{English and Chinese}
  \begin{itemize}
    \item Correlation between \textbf{classification accuracy} and \textbf{NPMI-Internal}:
      \begin{itemize}
        \item English: $0.8309$
        \item Chinese: $0.8095$
      \end{itemize}
    \item Correlation between \textbf{average classification accuracy} and \textbf{NPMI-Cross}:
      \begin{itemize}
        \item $0.8216$
      \end{itemize}
  \end{itemize}
  \begin{center}
    \includegraphics[height=0.5\textheight]{multilingual_itm/clf-en-cmn.pdf}
    \includegraphics[height=0.5\textheight]{multilingual_itm/npmi-en-cmn.pdf}
  \end{center}
\end{frame}
% Slide: English--Turkish correlations with two result plots below.
\begin{frame}{English and Turkish}
  \begin{itemize}
    \item Correlation between \textbf{classification accuracy} and \textbf{NPMI-Internal}:
      \begin{itemize}
        \item English: $0.8244$
        % Label corrected from "Chinese": this slide and its en-tr plots
        % cover the English--Turkish pair.
        \item Turkish: $0.6292$
      \end{itemize}
    \item Correlation between \textbf{average classification accuracy} and \textbf{NPMI-Cross}:
      \begin{itemize}
        \item $0.7963$
      \end{itemize}
  \end{itemize}
  \begin{center}
    \includegraphics[height=0.5\textheight]{multilingual_itm/clf-en-tr.pdf}
    \includegraphics[height=0.5\textheight]{multilingual_itm/npmi-en-tr.pdf}
  \end{center}
\end{frame}
% Slide: Chinese--Turkish correlations with two result plots below.
\begin{frame}{Chinese and Turkish}
  \begin{itemize}
    \item Correlation between \textbf{classification accuracy} and \textbf{NPMI-Internal}:
      \begin{itemize}
        \item Chinese: $0.6875$
        \item Turkish: $0.6140$
      \end{itemize}
    \item Correlation between \textbf{average classification accuracy} and \textbf{NPMI-Cross}:
      \begin{itemize}
        \item $0.7394$
      \end{itemize}
  \end{itemize}
  \begin{center}
    \includegraphics[height=0.5\textheight]{multilingual_itm/clf-cmn-tr.pdf}
    \includegraphics[height=0.5\textheight]{multilingual_itm/npmi-cmn-tr.pdf}
  \end{center}
\end{frame}
% Slide: description of the crowdsourced topic-matching task.
\begin{frame}{Human Interpretation}
  \begin{itemize}
    \item Crowdsourcing with bilingual users
    \item Given a topic in one language and five topics in another language, see if human picks corresponding topic
    \item Experiments underway
  \end{itemize}
\end{frame}
% Slide: centred crowdflower.png image for the human-interpretation task.
\begin{frame}{Human Interpretation}
  \begin{center}
    \includegraphics[height=0.8\textheight]{multilingual_itm/crowdflower.png}
  \end{center}
\end{frame}
% Slide: kinds of brief user input that could be used to improve the model.
\begin{frame}{Human Improvement}
  \begin{itemize}
    \item Now that we know what a good topic looks like, how can we use brief user inputs to improve model?
      \begin{itemize}
        \item These documents are about the same things
        \item These words are about the same things (connection to morphology / lemmatization)
        \item Direct classification of documents (to improve model)
      \end{itemize}
  \end{itemize}
\end{frame}
% Slide: full-width morphology figure.
\begin{frame}{Example on highly inflected language}
  \includegraphics[width=1.0\linewidth]{multilingual_itm/morphology}
\end{frame}
% Slide: list of artefacts under the heading "LTDE Inputs".
\begin{frame}{LTDE Inputs}
  \begin{itemize}
    % Compound modifier hyphenated to match "Human-readable" below.
    \item Machine-readable vector (useful for clustering / visualization) of topic assignments
    \item Human-readable topics in English and incident languages
    \item Association of documents to labels from ontology
  \end{itemize}
\end{frame}
\end{document}