Diffstat (limited to 'report/intro_slides/opening_slides.tex')
-rw-r--r--  report/intro_slides/opening_slides.tex  814
1 file changed, 0 insertions, 814 deletions
diff --git a/report/intro_slides/opening_slides.tex b/report/intro_slides/opening_slides.tex
deleted file mode 100644
index 98cf4f99..00000000
--- a/report/intro_slides/opening_slides.tex
+++ /dev/null
@@ -1,814 +0,0 @@
-\documentclass{beamer}
-
-\mode<presentation>
-{
- \usetheme{Boadilla}
- \setbeamercovered{transparent}}
-
-\usepackage[english]{babel}
-\usepackage{times}
-
-\usepackage{xcolor}
-\usepackage{colortbl}
-%\usepackage{subfigure}
-
-\usepackage{fontspec}
-\usepackage{xunicode}
-\usepackage{xltxtra}
-\usepackage{booktabs}
-\newenvironment{CJK}{\fontspec[Scale=0.9]{PMingLiU}}{}
-\newenvironment{Geeza}{\fontspec[Scale=0.9]{Geeza Pro}}{}
-
-%% for tables
-\newcommand{\mc}{\multicolumn}
-\newcommand{\lab}[1]{\multicolumn{1}{c}{#1}}
-\newcommand{\ind}[1]{{\fboxsep1pt\raisebox{-.5ex}{\fbox{{\tiny #1}}}}}
-\newcommand{\IND}[1]{{\fboxsep1pt\raisebox{0ex}{\fbox{{\small #1}}}}}
-\newcommand\production[2]{\ensuremath{\langle\mbox{#1}, \mbox{#2}\rangle}}
-
-%% markup
-\newcommand{\buffer}[1]{{\color{blue}\textbf{#1}}}
-\newcommand{\pred}[1]{\code{#1}}
-
-%% colors
-\newcommand{\textred}[1]{\alert{#1}}
-\newcommand{\textblue}[1]{\buffer{#1}}
-\definecolor{tablecolor}{cmyk}{0,0.3,0.3,0}
-\newcommand{\keytab}[1]{\mc{1}{>{\columncolor{tablecolor}}d}{#1}}
-
-% rules
-\newcommand{\psr}[2]{#1 $\rightarrow \langle $ #2 $\rangle$}
-
-\newenvironment{unpacked_itemize}{
-\begin{itemize}
- \setlength{\itemsep}{10pt}
- \setlength{\parskip}{0pt}
- \setlength{\parsep}{0pt}
-}{\end{itemize}}
-
-\newcommand{\condon}{\hspace{0pt} | \hspace{1pt}}
-\definecolor{darkblue}{rgb}{0,0,0.6}
-\newcommand{\blueexample}[1]{\textcolor{darkblue}{\rm #1}}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-\newcommand{\ws}{\ensuremath{\vec{w}}}
-\newcommand{\pu}{\ensuremath{P_0}}
-\newcommand{\bx}{\mathbf{x}}
-\newcommand{\bz}{\mathbf{z}}
-\newcommand{\bd}{\mathbf{d}}
-\newcommand{\by}{\mathbf{y}}
-\newcommand\bleu{${B{\scriptstyle LEU}}$}
-
-
-\title[Models of SCFG Induction]{Models of Synchronous Grammar Induction for SMT}
-
-\author[CLSP Workshop 2010]{
- Workshop 2010
- %Phil Blunsom$^1$ \and Trevor Cohn$^2$ \and Chris Dyer$^3$ \and Adam Lopez$^4$
-}
-
-\institute[Baltimore]{
- The Center for Speech and Language Processing \\ Johns Hopkins University
-% $^1$University of Oxford\\
-% $^2$University of Sheffield\\
-% $^3$Carnegie Mellon University\\
-% $^4$University of Edinburgh
-}
-\date[June 21]{June 21, 2010}
-
-%\subject{Unsupervised models of Synchronous Grammar Induction for SMT}
-
-%\pgfdeclareimage[height=1.0cm]{university-logo}{logo}
-%\logo{\pgfuseimage{university-logo}}
-
-%\AtBeginSection[]
-%{
-% \begin{frame}<beamer>{Outline}
-% %\tableofcontents[currentsection,currentsubsection]
-% \tableofcontents[currentsection]
-% \end{frame}
-%}
-
-%\beamerdefaultoverlayspecification{<+->}
-
-\begin{document}
-
-\begin{frame}
- \titlepage
-\end{frame}
-
-%\begin{frame}{Outline}
-% \tableofcontents
-% You might wish to add the option [pausesections]
-%\end{frame}
-
-%\begin{frame}{Outline}
-% \tableofcontents
-% % You might wish to add the option [pausesections]
-%\end{frame}
-
-
-\begin{frame}[t]{Team members}
-\begin{center}
-{\bf Senior Members} \\
- Phil Blunsom (Oxford)\\
- Trevor Cohn (Sheffield)\\
- Adam Lopez (Edinburgh/COE)\\
- Chris Dyer (CMU)\\
- Jonathan Graehl (ISI)\\
-\vspace{0.2in}
-{\bf Graduate Students} \\
- Jan Botha (Oxford) \\
- Vladimir Eidelman (Maryland) \\
- Ziyuan Wang (JHU) \\
- ThuyLinh Nguyen (CMU) \\
-\vspace{0.2in}
-{\bf Undergraduate Students} \\
- Olivia Buzek (Maryland) \\
- Desai Chen (CMU) \\
-\end{center}
-\end{frame}
-
-
-
-\begin{frame}[t]{Statistical machine translation}
-%\vspace{1.0cm}
-\begin{exampleblock}{Arabic $\rightarrow$ English}
- \begin{figure}
- {\centering \includegraphics[scale=0.55]{arabic.pdf}}
- \end{figure}
-\vspace{0.10cm}
-\end{exampleblock}
-\begin{itemize}
- \item Statistical machine translation: Learn how to translate from parallel corpora.
-\end{itemize}
-\end{frame}
-
-
-\begin{frame}[t]{Statistical machine translation: successes}
-%\vspace{1.0cm}
-\begin{exampleblock}{Arabic $\rightarrow$ English}
- \begin{figure}
- {\centering \includegraphics[scale=0.55]{arabic-good.pdf}}
- \end{figure}
-\end{exampleblock}
-\begin{itemize}
- \item Statistical machine translation: Learn how to translate from parallel corpora.
-\end{itemize}
-\end{frame}
-
-\begin{frame}[t]{Statistical machine translation: limitations}
-%\vspace{1.0cm}
-\begin{alertblock}{Chinese $\rightarrow$ English}
- \begin{figure}
- {\centering \includegraphics[scale=0.7]{chinese-bad.pdf}}
- \end{figure}
-\end{alertblock}
-\begin{itemize}
- \item This workshop: Learn to do it better.
-\end{itemize}
-\end{frame}
-
-
-\begin{frame}[t]{Statistical machine translation: limitations}
-\vspace{1.0cm}
-\begin{exampleblock}{Structural divergence between languages:}
- %\vspace{0.3cm}
- \begin{table}
- \centering
- \only<1>{
- \begin{tabular}{|l|l|}
- \hline
-% {\bf English} & {\bf The plane is faster than the train.}\\
-% \hline
-% Arabic & \begin{Geeza}الطائرة أسرع من القطار\end{Geeza} \\
-% & (the-plane) (faster) (than) (the train) \\
-% \hline
-% Chinese & \begin{CJK}飞机 比 火车 快\end{CJK} \\
-% & (plane) (compared-to) (train) (fast) \\
-% \hline
-% \hline
- {\bf English} & {\bf Who wrote this letter?} \\
- \hline
- Arabic & \begin{Geeza}من الذي كتب هذه الرسالة؟\end{Geeza} \\
- & \textcolor{gray}{(function-word)} (who) (wrote) (this) (the-letter) \\
- \hline
- Chinese & \begin{CJK}这封 信 是 谁 写 的 ?\end{CJK} \\
- & (this) (letter) (be) (who) (write) (come-from) \textcolor{gray}{(function-word)} \\
- \hline
- \end{tabular}
- }
- \only<2>{
- \begin{tabular}{|l|l|}
- \hline
- {\bf English} & {\bf \textcolor{blue}{Who} \textcolor{green}{wrote} \textcolor{red}{this} \textcolor{orange}{letter?}} \\
- \hline
- Arabic & \begin{Geeza}من الذي كتب هذه الرسالة؟\end{Geeza} \\
- & \textcolor{gray}{(function-word)} \textcolor{blue}{(who)} \textcolor{green}{(wrote)} \textcolor{red}{(this)} \textcolor{orange}{(the-letter)} \\
- \hline
- Chinese & \begin{CJK}这封 信 是 谁 写 的 ?\end{CJK} \\
- & (this) (letter) (be) (who) (write) (come-from) \textcolor{gray}{(function-word)} \\
- \hline
- \end{tabular}
- }
- \only<3->{
- \begin{tabular}{|l|l|}
- \hline
- {\bf English} & {\bf \textcolor{blue}{Who wrote} \textcolor{red}{this letter}?} \\
- \hline
- Arabic & \begin{Geeza}من الذي كتب هذه الرسالة؟\end{Geeza} \\
- & \textcolor{gray}{(function-word)} (who) (wrote) (this) (the-letter) \\
- \hline
- Chinese & \begin{CJK}\textcolor{red}{这封 信} \textcolor{blue}{是 谁 写} 的 ?\end{CJK} \\
- & \textcolor{red}{(this) (letter)} \textcolor{blue}{(be) (who) (write) (come-from)} \textcolor{gray}{(function-word)} \\
- \hline
- \end{tabular}
- }
- \end{table}
-\end{exampleblock}
-\only<4>{
- \begin{itemize}
- \item Phrasal translation equivalences \textcolor{green}{(existing models)}
- \item {\bf Constituent reordering \textcolor{blue}{(this workshop!)}}
- \item Morphology \textcolor{red}{(Next year?)}
- \end{itemize}
-}
-\end{frame}
-
-\begin{frame}[t]{Statistical machine translation: successes}
-\begin{center}
- \includegraphics[scale=0.35]{GoogleTranslateLanguages.pdf}
-\end{center}
-\end{frame}
-
-\begin{frame}[t]{Workshop overview}
-Input:
- \begin{itemize}
-% \item Joshua decoder
- \item Existing procedures for synchronous grammar extraction
- \end{itemize}
-\vspace{0.3in}
-Output:
- \begin{itemize}
- \item New unsupervised models for large scale synchronous grammar extraction,
-% \item An implementation of this model,
- \item A systematic comparison and analysis of the existing and proposed models,
- \item Extended decoders (cdec/Joshua) capable of working efficiently with these models.
- \end{itemize}
-\end{frame}
-
-\begin{frame}[t]{Models of translation}
-\begin{exampleblock}{Supervised SCFG: Syntactic Tree-to-String}
-\begin{center}
- \includegraphics[scale=0.55]{JeNeVeuxPasTravailler-tsg.pdf}
- \hspace{0.3in}
- \includegraphics[scale=0.55]{JeVeuxTravailler-tsg.pdf}
-\end{center}
-\end{exampleblock}
-\begin{itemize}
-\item Strong model of sentence structure.
-\item Reliant on a treebank to train the parser.
-\end{itemize}
-\end{frame}
-
-\begin{frame}[t]{Models of translation}
-\begin{block}{Unlabelled SCFG: Hiero}
- \begin{center}
- \includegraphics[scale=0.55]{JeNeVeuxPasTravailler-Hiero.pdf}
- \hspace{0.3in}
- \includegraphics[scale=0.55]{JeVeuxTravailler-Hiero.pdf}
- \end{center}
-\end{block}
-\begin{itemize}
-\item Only requires the parallel corpus.
-\item But weak model of sentence structure.
-\end{itemize}
-\end{frame}
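-
-%% For concreteness, a sketch of the kind of rule Hiero extracts from the
-%% sentence pair above (illustrative only; not necessarily a rule appearing
-%% in the figures), written with the \psr macro from the preamble:
-%%
-%%   \psr{X}{ne veux pas X$_1$ , do not want to X$_1$}
-%%
-%% The gap X$_1$ lets the discontiguous French negation ``ne \ldots pas''
-%% translate as a single English unit.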
-
-%\begin{frame}[t]{Models of translation}
-%\begin{block}{Hierarchical}
-% \begin{center}
-% \includegraphics[scale=0.55]{JeNeVeuxPasTravailler-Hiero.pdf}
-% \hspace{0.3in}
-% \includegraphics[scale=0.55]{JeVeuxTravailler-Hiero.pdf}
-% \end{center}
-%\end{block}
-%\end{frame}
-
-
-%\begin{frame}[t]{Impact}
-% \begin{center}
-% \includegraphics[scale=0.3]{ccb_tree.pdf}
-% \end{center}
-%\end{frame}
-
-
-\begin{frame}[t]{Impact}
-Systems using syntax have outperformed those that didn't:
- \begin{center}
- \includegraphics[scale=1.0]{ccb_graph1.pdf}
- \end{center}
-\end{frame}
-
-
-\begin{frame}[t]{Impact}
-\vspace{0.5in}
-\begin{table}
- \begin{tabular}{l|rl}
- \hline
- Language & Words & Domain \\ \hline
- English & 4.5M& Financial news \\
- Chinese & 0.5M & Broadcasting news \\
- Arabic & 300K (1M planned) & News \\
- Korean & 54K & Military \\ \hline
- \end{tabular}
-\caption{Major treebanks: data size and domain \label{table_treebanks_size}}
-\end{table}
-\end{frame}
-
-
-\begin{frame}[t]{Impact}
-Parallel corpora far exceed treebanks (millions of words):
- \begin{figure}
- {\centering \includegraphics[scale=0.7]{resource_matrix.pdf}}
- \end{figure}
-\end{frame}
-
-
-\begin{frame}[t]{Models of translation}
-\begin{block}{Hierarchical}
- \begin{center}
- \includegraphics[scale=0.55]{JeNeVeuxPasTravailler-Hiero-labelled.pdf}
- \hspace{0.3in}
- \includegraphics[scale=0.55]{JeVeuxTravailler-Hiero-labelled.pdf}
- \end{center}
-\end{block}
-\begin{itemize}
-\item \alert{AIM: Implement a large scale open-source synchronous constituent learning system.}
-\item \alert{AIM: Investigate and understand the relationship between the choice of synchronous grammar and SMT performance,}
-\item \alert{AIM: and fix our decoders accordingly.}
-\end{itemize}
-\end{frame}
-
-
-\begin{frame}[t]{Impact}
-Systems using syntax have outperformed those that didn't:
- \begin{center}
- \includegraphics[scale=1.0]{ccb_graph2.pdf}
- \end{center}
-\end{frame}
-
-\begin{frame}[t]{Evaluation goals}
-We will predominantly evaluate using BLEU, but will also use automatic structured metrics and perform a small-scale human evaluation:
-\vspace{0.25in}
-\begin{unpacked_itemize}
-\item Evaluate phrasal, syntactic, and unsupervised syntactic models.
-\item Aim 1: Do no harm (not true of existing syntactic approaches).
-\item Aim 2: Exceed the performance of current non-syntactic systems.
-\item Aim 3: Meet or exceed performance of existing syntactic systems.
-\end{unpacked_itemize}
-\end{frame}
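-
-%% For reference, the BLEU score used throughout combines modified $n$-gram
-%% precisions $p_n$ with a brevity penalty (Papineni et al., 2002):
-%% \[ \mathrm{BLEU} = \mathrm{BP} \cdot \exp \Big( \sum_{n=1}^{4} \tfrac{1}{4} \log p_n \Big),
-%%    \qquad \mathrm{BP} = \min\big(1, e^{\,1 - r/c}\big) \]
-%% with $r$ the total reference length and $c$ the candidate length; the
-%% per-$n$ precisions and BP are the columns reported in the results table
-%% later in the deck.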
-
-%\begin{frame}[t]{Impact}
-%Success will have a significant impact on two areas of CL:
-%\vspace{0.25in}
-%\begin{unpacked_itemize}
-%\item Machine translation
-%\begin{unpacked_itemize}
-% \item Make the benefits of richly structured translation models available to a much wider range of researchers and for a wider range of languages.
-%% \item Change the research outlook of the field.
-%\end{unpacked_itemize}
-%\item Grammar induction:
-%\begin{unpacked_itemize}
-% \item Provide an empirical validation of state-of-the-art grammar induction techniques.
-%\end{unpacked_itemize}
-%\end{unpacked_itemize}
-%\end{frame}
-
-
-\begin{frame}[t]{Workshop Streams}
-\vspace{0.25in}
-\begin{unpacked_itemize}
-\item Implement scalable SCFG grammar extraction algorithms.
-\item Improve SCFG decoders to efficiently handle the grammars produced.
-\item Investigate discriminative training regimes that leverage features extracted from these grammars.
-\end{unpacked_itemize}
-\end{frame}
-
-
-%\begin{frame}[t]
-%\frametitle{Inducing a STSG given an observed tree:}
-%\only<1>{\frametitle{Inducing a STSG given an observed tree:}}
-%\only<2->{\frametitle{Existing approach (Galley et al. 2004):}}
-%
-%\begin{center}
-% \only<1>{\hspace{1mm}\includegraphics[scale=0.45]{full_of_fun_slides_start.pdf}}
-% \only<2>{\includegraphics[scale=0.45]{full_of_fun_slides_waligned.pdf}}
-% \only<3>{\vspace{-2mm}\includegraphics[scale=0.45]{full_of_fun_slides_waligned_overlay.pdf}}
-%% \only<4>{\includegraphics[scale=0.4]{full_of_fun_slides_third.pdf}}
-%% \only<5>{\includegraphics[scale=0.4]{full_of_fun_slides_forth.pdf}}
-%
-% \only<1>{Training instance}
-% \only<2>{Step 1: word alignment}
-% \only<3>{Step 2: rule extraction heuristic}
-%% \only<4>{Step 2: the rules extracted}
-%% \only<5>{Step 3: estimate a grammar}
-%\end{center}
-%\end{frame}
-
-
-% Il ne veut pas travailler
-
-
-%\begin{frame}[t]{Models of translation}
-%\begin{block}{Hierarchical}
-% \begin{center}
-% \includegraphics[scale=0.55]{JeNeVeuxPasTravailler-Hiero-labelled.pdf}
-% \hspace{0.3in}
-% \includegraphics[scale=0.55]{JeVeuxTravailler-Hiero-labelled.pdf}
-% \end{center}
-%\end{block}
-%\begin{itemize}
-%\item \alert{AIM: Implement a large scale open-source synchronous constituent labelling system.}
-%\item \alert{AIM: Investigate and understand the relationship between synchronous constituency and SMT performance.}
-%\end{itemize}
-%\end{frame}
-%
-%\begin{frame}[t]{Models of translation}
-%\begin{block}{Hierarchical}
-% \begin{center}
-% \includegraphics[scale=0.5]{JeNeVeuxPasTravailler-Hiero-labelled.pdf}
-% \includegraphics[scale=0.5]{IlNeVeutPasTravailler-Hiero-labelled.pdf}
-% \end{center}
-% \vspace{0.001in}
-%\end{block}
-%\begin{itemize}
-%\item \alert{AIM: Implement a large scale open-source synchronous constituent labelling system.}
-%\item \alert{AIM: Investigate and understand the relationship between synchronous constituency and SMT performance.}
-%\end{itemize}
-%\end{frame}
-
-\begin{frame}[t]{Unsupervised grammar induction}
-There has been significant research into monolingual grammar induction:
-\vspace{0.1in}
-\alert{Constituent context is a prime indicator of constituency.}
-\begin{unpacked_itemize}
-\item Alexander Clark. Unsupervised induction of stochastic context-free grammars using distributional clustering, 2001
-\item Dan Klein and Chris Manning. A Generative Constituent-Context Model for Improved Grammar Induction, 2002
-\end{unpacked_itemize}
-\vspace{0.1in}
-\alert{We can formalise this notion in algebraic structures:}
-\begin{itemize}
-\item Alexander Clark. A learnable representation for syntax using residuated lattices, 2009
-\end{itemize}
-\vspace{0.1in}
-There are deep connections to unsupervised word sense disambiguation, thesaurus extraction, etc.
-\end{frame}
-
-%\begin{frame}[t]{Monolingual grammar induction}
-%Induce bracketing phrase-structure grammars:
-% \includegraphics[scale=1]{klein_ccm.pdf}
-%
-%\vspace{2ex}
-%And dependency trees: \\
-% \includegraphics[scale=1]{klein_dependency.pdf}
-%
-%\vspace{2ex}
-%Informed by constituent context: surrounding words are a good indicator of substitutability
-%\end{frame}
-
-
-\begin{frame}[t]{SCFG Grammar Induction}
-%\vspace{1.0cm}
-\begin{exampleblock}{Distributional Hypothesis}
-\begin{quote}
-\emph{Words that occur in the same contexts tend to have similar meanings}
-\end{quote}
-\hfill (Zellig Harris, 1954)
-\end{exampleblock}
-
-\vspace{3ex}
-
-We will leverage this in a translation setting:
-\begin{itemize}
- \item Use the contexts to \alert{cluster} translation units into groups
- \item Units in the same group are expected to be semantically and syntactically similar
- \item Then use these cluster labels to guide translation
- \begin{itemize}
- \item lexical selection: translating ambiguous source words
- \item reordering: consistent syntactic patterns of reordering
- \end{itemize}
-\end{itemize}
-\end{frame}
-
-\begin{frame}[t]{Monolingual Example}
-Task: cluster words into their parts-of-speech. \\
-
-\vspace{1ex}
-Illustrate by starting with the word `deal' (noun or verb):
-
-\only<1>{\includegraphics[width=\columnwidth]{deal_first.pdf} \\ Step 1: Find contexts for `deal'}
-\only<2->{\includegraphics[width=\columnwidth]{deal.pdf} \\ Step 2: Find other words which occur in these contexts}
-%\only<3>{\includegraphics[width=\columnwidth]{deal_more.pdf} \\ \ldots continue to expand}
-
-\only<3>{
-\vspace{1ex}
-Notice that the instances of `deal' can be split into two connected sub-graphs:
-\begin{itemize}
- \item noun: the left two contexts ``a \ldots with'' and ``a \ldots that''
- \item verb: the right two contexts ``to \ldots with'' and ``not \ldots with''
- \item neighbouring words of these contexts share the same PoS
-\end{itemize}
-}
-
-\end{frame}
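-
-%% A minimal sketch of Steps 1--2 above (hypothetical Python, not the
-%% workshop pipeline): collect one-word contexts per token, then find other
-%% words sharing a context with `deal'.
-%
-% from collections import defaultdict
-%
-% def contexts(corpus):
-%     """Map each word to the set of (left, right) one-word contexts."""
-%     ctx = defaultdict(set)
-%     for sent in corpus:
-%         toks = ["<s>"] + sent.split() + ["</s>"]
-%         for i in range(1, len(toks) - 1):
-%             ctx[toks[i]].add((toks[i - 1], toks[i + 1]))
-%     return ctx
-%
-% ctx = contexts(["a deal with", "to deal with", "a pact with", "not deal with"])
-% # words sharing a context with `deal' are candidate cluster-mates
-% shared = {w for w, cs in ctx.items() if w != "deal" and cs & ctx["deal"]}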
-
-%\begin{frame}[t]{More Formally}
-%
-%Construct a bipartite graph
-%\begin{itemize}
-% \item Nodes on the top layer denote word types (bilingual phrase pairs)
-% \item Nodes on the bottom layer denote context types (monlingual/bilingual words)
-% \item Edges connect words and their contexts
-%\end{itemize}
-%
-%\includegraphics[width=\columnwidth]{bipartite.pdf}
-%
-%\end{frame}
-
-\begin{frame}[t]{Clustering}
-
-The task is to cluster the graph into sub-graphs whose nodes are
-\begin{itemize}
-\item strongly connected to one another
-\item weakly connected to nodes outside the sub-graph
-\end{itemize}
-This can be formulated as either \emph{hard} or \emph{soft} clustering;
-we choose \alert{soft clustering} to allow for syntactic and semantic ambiguity
-
-\centering
-\includegraphics[width=0.7\columnwidth]{bipartite_lda.pdf}
-
-\end{frame}
-
-\begin{frame}[t]{Constituency and context}
-\vspace{0.25in}
-\begin{center}
-\only<1>{
- \includegraphics[scale=0.5]{WantTo_Veux_context.pdf}
- \includegraphics[scale=0.5]{WantTo_Veux_context2.pdf}
-}
-\only<2>{
- \includegraphics[scale=0.5]{WantTo_Veux_context_split.pdf}
- \includegraphics[scale=0.5]{WantTo_Veux_context2_split.pdf}
-}
-\only<3>{
- \includegraphics[scale=0.5]{WantTo_Veux_context_split_mono.pdf}
- \includegraphics[scale=0.5]{WantTo_Veux_context2_split_mono.pdf}
-}
-\end{center}
-\vspace{0.1in}
-%\only<1>{
-% There has been significant research into monolingual grammar induction:
-% \vspace{0.1in}
-% \begin{unpacked_itemize}
-% \item Alexander Clark. Unsupervised induction of stochastic context-free grammars using distributional clustering, 2001
-% \item Dan Klein and Chris Manning. A Generative Constituent-Context Model for Improved Grammar Induction, 2002
-% \end{unpacked_itemize}
-% \alert{Constituent context is a prime indicator of constituency.}
-%}
-%\only<1>{
-\begin{unpacked_itemize}
-\item Design and apply large scale clustering and topic modelling algorithms (LDA, HDPs, HPYPs, etc.),
-\item identify sets of frequent contexts that distinguish synchronous constituent properties.
-\item Motivated by successful models of monolingual grammar induction, with deep connections to unsupervised word sense disambiguation, thesaurus extraction, etc.
-\end{unpacked_itemize}
-%}
-\end{frame}
-
-\begin{frame}[t]{Latent Dirichlet Allocation (LDA)}
-
-LDA is a generative model which treats documents as bags of words
-\begin{itemize}
- \item each word is assigned a \alert{topic} (cluster tag)
- \item words are generated from a topic-specific multinomial
- \item topics are \alert{tied} across a document using a Dirichlet prior
- \item $\alpha < 1$ biases towards \alert{sparse} distributions, i.e., topic reuse
- \item inferred $\theta_d$ describes a document and $\phi_t$ describes a topic
-\end{itemize}
-
-\vspace{-3ex}
-\includegraphics[scale=0.55]{lda.pdf}
-
-\end{frame}
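-
-%% The generative story depicted above, in equations (standard LDA,
-%% Blei et al. 2003):
-%% \[ \theta_d \sim \mathrm{Dirichlet}(\alpha), \qquad
-%%    z_{d,i} \mid \theta_d \sim \mathrm{Multinomial}(\theta_d), \qquad
-%%    w_{d,i} \mid z_{d,i} \sim \mathrm{Multinomial}(\phi_{z_{d,i}}) \]
-%% so a small $\alpha$ concentrates $\theta_d$ on few topics, giving the
-%% sparsity / topic-reuse effect noted above.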
-
-\begin{frame}[t]{LDA over Contexts}
-
-Generative story:
-\begin{itemize}
- \item for each word type $w$, and for each of its $L$ contexts:
- \begin{itemize}
-   \item draw a topic $t$, then generate the context $\vec{c}$ given the topic
- \end{itemize}
- \item the Dirichlet prior ties the topics for each $w$
- \item we are primarily interested in the learnt $\theta$ values
-\end{itemize}
-
-\includegraphics[scale=0.4]{context_lda.pdf}
-
-\end{frame}
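-
-%% The same story as executable pseudo-code (hypothetical Python sketch;
-%% word types play the role of documents, contexts the role of words):
-%
-% import numpy as np
-%
-% rng = np.random.default_rng(0)
-% T, V = 25, 1000          # topics (cluster labels), context vocabulary size
-% alpha, beta = 0.1, 0.01  # sparse Dirichlet hyperparameters
-% phi = rng.dirichlet([beta] * V, size=T)  # per-topic context distributions
-%
-% def generate_contexts(L):
-%     """Generate the L contexts of one word type w."""
-%     theta = rng.dirichlet([alpha] * T)   # topic mixture tied across w's contexts
-%     topics = rng.choice(T, size=L, p=theta)          # a topic per context slot
-%     return [rng.choice(V, p=phi[t]) for t in topics] # a context id per topic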
-
-\begin{frame}[t]{Scalable grammar extraction with MapReduce}
-\begin{itemize}
-\item Divide and conquer approach to...counting
-\begin{itemize}
-\item write a map function $\mathcal{M}(x) \rightarrow \langle k_1, v_1 \rangle, \langle k_2, v_2 \rangle, \ldots$
-\item write a reduce function $\mathcal{R}(k_i : v_7, v_{13} , \ldots) \rightarrow \langle k_i, \overline{v} \rangle$
-\end{itemize}
-\end{itemize}
-\begin{center}
- \includegraphics[scale=0.4]{mroutline.pdf}
-\end{center}
-\end{frame}
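-
-%% A toy map/reduce pair matching the signatures above (hypothetical Python;
-%% extract_rules is an assumed helper, and the real pipeline runs on Hadoop
-%% rather than this in-memory driver):
-%
-% from collections import defaultdict
-%
-% def mapper(sentence_pair):
-%     """Emit (rule, 1) for each rule extracted from one aligned pair."""
-%     for rule in extract_rules(sentence_pair):  # extract_rules: assumed given
-%         yield rule, 1
-%
-% def reducer(key, values):
-%     """Collapse all counts emitted for one rule into a total."""
-%     return key, sum(values)
-%
-% def run(corpus):
-%     groups = defaultdict(list)
-%     for pair in corpus:               # map phase
-%         for k, v in mapper(pair):
-%             groups[k].append(v)       # shuffle: group values by key
-%     return dict(reducer(k, vs) for k, vs in groups.items())  # reduce phase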
-\begin{frame}[t]{Scalable grammar extraction with MapReduce: mapper}
-\begin{center}
- \includegraphics[scale=0.4]{mapper.pdf}
-\end{center}
-\end{frame}
-
-\begin{frame}[t]{Scalable grammar extraction with MapReduce: reducer}
-\begin{center}
- \includegraphics[scale=0.4]{reducer.pdf}
-\end{center}
-\end{frame}
-
-\begin{frame}[t]{Scalable grammar extraction with MapReduce: Hadoop}
-\begin{center}
- \includegraphics[scale=0.4]{hadoop-extract.pdf}
-\end{center}
-\end{frame}
-
-\begin{frame}[t]{Scalable grammar extraction with MapReduce: Hadoop}
-\begin{center}
- \includegraphics[scale=0.4]{hadoop-extract-arrows.pdf}
-\end{center}
-\end{frame}
-
-
-%\begin{frame}[t]{Discriminative training}
-%\begin{unpacked_itemize}
-%\item MIRA
-%\item Expected loss minimisation.
-%\end{unpacked_itemize}
-%\end{frame}
-
-
-\begin{frame}[t]{Language pairs (small)}
-\begin{itemize}
-\item BTEC Chinese-English:
- \begin{itemize}
- \item 44k sentence pairs, short sentences
- \item Widely reported `prototyping' corpus
- \item Hiero baseline score: 52.4 (16 references)
- \item Prospects: BTEC always gives you good results
- \end{itemize}
-\item NIST Urdu-English:
- \begin{itemize}
- \item 50k sentence pairs
- \item Hiero baseline score: MT05 - 23.7 (4 references)
- \item Major challenges: long-range reordering, SOV word order
- \item Prospects: small data, previous gains with supervised syntax
- \end{itemize}
-\end{itemize}
-\end{frame}
-
-\begin{frame}[t]{Language pairs (large)}
-\begin{itemize}
-\item NIST Chinese-English:
- \begin{itemize}
- \item 1.7M sentence pairs, Standard NIST test sets
- \item Hiero baseline score: MT05 - 33.9 (4 references)
- \item Major challenges: large data, mid-range reordering, lexical ambiguity
- \item Prospects: supervised syntax gains reported
- \end{itemize}
-\item NIST Arabic-English:
- \begin{itemize}
- \item 900k sentence pairs
- \item Hiero baseline score: MT05 - 48.9 (4 references)
- \item Major challenges: strong baseline, local reordering, VSO word order
- \item Prospects: difficult
- \end{itemize}
-\item Europarl Dutch-French:
- \begin{itemize}
- \item 1.5M sentence pairs, standard Europarl test sets
- \item Hiero baseline score: Europarl 2008 - 26.3 (1 reference)
- \item Major challenges: V2 / V-final word order, many non-literal translations
- \item Prospects: ???
- \end{itemize}
-\end{itemize}
-\end{frame}
-
-%\begin{frame}[t]{Draft Schedule}
-%\begin{itemize}
-%\item Pre-workshop:
-% \begin{itemize}
-% \item Collect existing open-source tools for synchronous grammar induction,
-% \item Collect corpora across a range of translations conditions: small, large, low-density languages etc.
-% \item Implement phrase and context extraction algorithms.
-% \item Design the integration of various existing approaches into the decoders.
-% \end{itemize}
-%\item Week 1:
-% \begin{itemize}
-% \item Optimise and reconfigure decoders to handle labelled synchronous grammars,
-% \item Perform a empirical study of synchronous constituency models.
-% \end{itemize}
-%\end{itemize}
-%\end{frame}
-
-%\begin{frame}[t]{Draft Schedule}
-%\begin{itemize}
-%\item Week 2-3:
-% \begin{itemize}
-% \item Continue optimising decoder to handle labelled synchronous grammars,
-% \item Implement unsupervised label induction algorithms, initially inducing a single label per-phrase.
-% \item Extend to ''topic"-modelling style representation where a phrase may have multiple labellings.
-% \item Perform experimental comparison of existing synchronous grammar translation models.
-% \end{itemize}
-%\item Week 3-6:
-% \begin{itemize}
-% \item Perform experimental comparison of unsupervised synchronous grammar translation models.
-% \item Extend the evaluation to small/big data sets, hi-density vs. low-density language pairs.
-% \item Create ``semi-supervised'' models combining knowledge from treebank parser into the unsupervised algorithms.
-% \item Wrap-up and write final report.
-% \end{itemize}
-%\end{itemize}
-%\end{frame}
-
-
-\begin{frame}[t]{Pre-workshop experiments}
-\vspace{0.25in}
-We have implemented a baseline constituent modelling and distributed grammar extraction pipeline. Initial results on the small BTEC corpus:
-
-\vspace{0.25in}
-\begin{exampleblock}{}
-\footnotesize
-\centering
-\begin{tabular}{lcccccc}
-\toprule
-Categories & \small 1-gram & \small 2-gram & \small 3-gram & \small 4-gram & \small BP & BLEU \\
-\midrule
-1 & \small 84.7 & \small 62.0 & \small 47.2 & \small 36.4 & \small 0.969 & \textcolor{blue}{53.10} \\
-10 & \small 84.0 & \small 60.9 & \small 46.4 & \small 35.9 & \small 0.979 & \textcolor{red}{52.88} \\
-25 & \small 84.4 & \small 61.8 & \small 47.6 & \small 36.7 & \small 0.973 & \textcolor{blue}{53.47} \\
-50 & \small 84.8 & \small 61.2 & \small 46.6 & \small 36.2 & \small 0.971 & \textcolor{red}{52.83} \\
-100 & \small 83.5 & \small 60.1 & \small 45.7 & \small 35.3 & \small 0.972 & \textcolor{red}{51.86} \\
-\bottomrule
-\end{tabular}
-\end{exampleblock}
-\end{frame}
-
-
-%{\centering
-%A unique opportunity to bring together researchers operating at the coal face of SMT development with leading theoreticians in the field of formal grammar induction.
-%}
-%\begin{unpacked_itemize}
-%\item Understand the relationship between constituent labels and performance in SMT,
-%\item Compare monolingual and bilingual induced grammars against parser output in terms of translation quality,
-%\item Produce a large scale implementation of the label induction algorithms,
-%\end{unpacked_itemize}
-%\begin{unpacked_itemize}
-%\item \alert{Learn language-pair dependent structure that produces translation performance gains across all language pairs,}
-%\item \alert{Initiate a research program that redirects the SMT research community back to language neutral unsupervised systems.}
-%\end{unpacked_itemize}
-
-
-\begin{frame}[t]{Summary}
-\begin{itemize}
-\item Scientific Merit:
- \begin{itemize}
- \item A systematic comparison of existing syntactic approaches to SMT.
- \item An empirical study of how constituency is useful in SMT.
- \item An evaluation of existing theories of grammar induction in a practical application (end-to-end evaluation).
- \end{itemize}
-\item Potential Impact:
- \begin{itemize}
- \item Better MT systems, for more languages, across a range of domains.
- \item More accessible high performance translation models for researchers. % all over the world.
- \end{itemize}
-\item Feasibility:
- \begin{itemize}
- \item A great team with a wide range of both theoretical and practical experience.
- %\item Incremental plan without any deal breaking dependencies.
- \item Solid preparation.
- \end{itemize}
-\item Novelty:
- \begin{itemize}
- \item First attempt at large scale unsupervised synchronous grammar induction.
-% \item First study seeking to compare and understand the impact of synchronous structure on translation performance.
- \end{itemize}
-\end{itemize}
-\end{frame}
-
-
-\end{document}