diff options
author | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-26 21:44:09 +0000 |
---|---|---|
committer | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-26 21:44:09 +0000 |
commit | e76460b3d35dd486e6d53ed5b401af4ca79bf410 (patch) | |
tree | 06892374d181b3f0fd0bbc0ccd516200f4c7bc00 /report/intro_slides/tcohn_slides.tex | |
parent | 580c488f2074377026de7312a20faf933e751287 (diff) |
First ci.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@429 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'report/intro_slides/tcohn_slides.tex')
-rw-r--r-- | report/intro_slides/tcohn_slides.tex | 167 |
1 files changed, 167 insertions, 0 deletions
diff --git a/report/intro_slides/tcohn_slides.tex b/report/intro_slides/tcohn_slides.tex new file mode 100644 index 00000000..950a0036 --- /dev/null +++ b/report/intro_slides/tcohn_slides.tex @@ -0,0 +1,167 @@ +\documentclass{beamer} + +\mode<presentation> +{ + \usetheme{Boadilla} + \setbeamercovered{transparent}} + +\usepackage[english]{babel} +\usepackage{times} + +\usepackage{xcolor} +\usepackage{colortbl} +%\usepackage{subfigure} + +%% for tables +\newcommand{\mc}{\multicolumn} +\newcommand{\lab}[1]{\multicolumn{1}{c}{#1}} +\newcommand{\ind}[1]{{\fboxsep1pt\raisebox{-.5ex}{\fbox{{\tiny #1}}}}} +\newcommand{\IND}[1]{{\fboxsep1pt\raisebox{0ex}{\fbox{{\small #1}}}}} +\newcommand\production[2]{\ensuremath{\langle\mbox{#1}, \mbox{#2}\rangle}} + +%% markup +\newcommand{\buffer}[1]{{\color{blue}\textbf{#1}}} +\newcommand{\pred}[1]{\code{#1}} + +%% colors +\newcommand{\textred}[1]{\alert{#1}} +\newcommand{\textblue}[1]{\buffer{#1}} +\definecolor{tablecolor}{cmyk}{0,0.3,0.3,0} +\newcommand{\keytab}[1]{\mc{1}{>{\columncolor{tablecolor}}d}{#1}} + +% rules +\newcommand{\psr}[2]{#1 $\rightarrow \langle $ #2 $\rangle$} + +\newenvironment{unpacked_itemize}{ +\begin{itemize} + \setlength{\itemsep}{10pt} + \setlength{\parskip}{0pt} + \setlength{\parsep}{0pt} +}{\end{itemize}} + +\newcommand{\condon}{\hspace{0pt} | \hspace{1pt}} +\definecolor{darkblue}{rgb}{0,0,0.6} +\newcommand{\blueexample}[1]{\textcolor{darkblue}{\rm #1}} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\newcommand{\ws}{\ensuremath{\vec{w}}} +\newcommand{\pu}{\ensuremath{P_0}} +\newcommand{\bx}{\mathbf{x}} +\newcommand{\bz}{\mathbf{z}} +\newcommand{\bd}{\mathbf{d}} +\newcommand{\by}{\mathbf{y}} +\newcommand\bleu{${B{\scriptstyle LEU}}$} + + +\title{Clustering of phrases and contexts} +\author{Trevor Cohn} +\date{\today} + +\begin{document} + +\begin{frame}[t]{Motivation} +%\vspace{1.0cm} +\begin{exampleblock}{Distributional Hypothesis} +\begin{quote} +\emph{Words that occur in the same contexts tend to have similar meanings} +\end{quote} +\hfill (Zellig Harris, 1954) +\end{exampleblock} + +\vspace{3ex} + +We will leverage this in a translation setting: +\begin{itemize} + \item Use the contexts to \alert{cluster} translation units into groups + \item Units in the same group expected to be semantically and syntactically similar + \item Then use these cluster labels to guide translation + \begin{itemize} + \item lexical selection: translating ambiguous source word/s + \item reordering: consistent syntactic patterns of reordering + \end{itemize} +\end{itemize} +\end{frame} + +\begin{frame}[t]{Monolingual Example} +Task: cluster words into their parts-of-speech. \\ + +\vspace{1ex} +Illustrate by starting with the word `deal' (noun or verb): + +\only<1>{\includegraphics[width=\columnwidth]{deal_first.pdf} \\ Step 1: Find contexts for `deal'} +\only<2->{\includegraphics[width=\columnwidth]{deal.pdf} \\ Step 2: Find other words which occur in these contexts} +%\only<3>{\includegraphics[width=\columnwidth]{deal_more.pdf} \\ \ldots continue to expand} + +\only<3>{ +\vspace{1ex} +Notice that the instances of deal can be split into two connected sub-graphs: +\begin{itemize} + \item noun: the left two contexts ``a \ldots with'' and ``a \ldots that'' + \item verb: the right two contexts ``to \ldots with'' and ``not \ldots with'' + \item neighbouring words of these contexts share the same PoS +\end{itemize} +} + +\end{frame} + +\begin{frame}[t]{More Formally} + +Construct a bipartite graph +\begin{itemize} + \item Nodes on the top layer denote word types (bilingual phrase pairs) + \item Nodes on the bottom layer denote context types (monlingual/bilingual words) + \item Edges connect words and their contexts +\end{itemize} + +\includegraphics[width=\columnwidth]{bipartite.pdf} + +\end{frame} + +\begin{frame}[t]{Clustering} + +Task is to cluster the graph into sub-graphs. Nodes in the sub-graphs should be +\begin{itemize} +\item strongly connected to one another +\item weakly connected to nodes outside the sub-graph +\item could formulate as either \emph{hard} or \emph{soft} clustering +\end{itemize} +Choose \alert{soft clustering} to allow for syntactic and semantic ambiguity + +\centering +\includegraphics[width=0.7\columnwidth]{bipartite_lda.pdf} + +\end{frame} + +\begin{frame}[t]{Latent Dirichlet Allocation (LDA)} + +LDA is a generative model which treats documents as bags of words +\begin{itemize} + \item each word is assign a \alert{topic} (cluster tag) + \item words are generated from a topic-specific multinomial + \item topics are \alert{tied} across a document using a Dirichlet prior + \item $\alpha < 1$ biases towards \alert{sparse} distributions, i.e., topic reuse + \item inferred $\theta_d$ describes a document and $\phi_t$ describes a topic +\end{itemize} + +\vspace{-3ex} +\includegraphics[scale=0.55]{lda.pdf} + +\end{frame} + +\begin{frame}[t]{LDA over Contexts} + +Generative story: +\begin{itemize} + \item for each word type $w$ + \item for each of the $L$ contexts + \item first we draw a topic $t$, then generate the context $\vec{c}$ given the topic + \item the Dirichlet prior ties the topics for each $w$ + \item we're primarily interested in the learnt $\theta$ values +\end{itemize} + +\includegraphics[scale=0.4]{context_lda.pdf} + +\end{frame} + +\end{document} |