summaryrefslogtreecommitdiff
path: root/report/pyp_clustering/acl09-short/slides.tex
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2010-12-22 08:58:07 -0600
committerChris Dyer <cdyer@cs.cmu.edu>2010-12-22 08:58:07 -0600
commitb5ca2bd7001a385594af8dc4b9206399c679f8c5 (patch)
tree332cb09a27f783760532e688e14bde21a128bb2b /report/pyp_clustering/acl09-short/slides.tex
parent86805dcb8aaaa716fdc73725ad41e411be53f6a6 (diff)
remove report
Diffstat (limited to 'report/pyp_clustering/acl09-short/slides.tex')
-rw-r--r--report/pyp_clustering/acl09-short/slides.tex345
1 files changed, 0 insertions, 345 deletions
diff --git a/report/pyp_clustering/acl09-short/slides.tex b/report/pyp_clustering/acl09-short/slides.tex
deleted file mode 100644
index af1db1fc..00000000
--- a/report/pyp_clustering/acl09-short/slides.tex
+++ /dev/null
@@ -1,345 +0,0 @@
-\documentclass{beamer}
-%\documentclass[serif]{beamer}
-
-
-\mode<presentation>
-{
-% \usetheme{Warsaw}
-% \usetheme{Madrid}
- \usetheme{Boadilla}
-
- \setbeamercovered{transparent}
-}
-
-
-\usepackage[english]{babel}
-\usepackage[utf8]{inputenc}
-\usepackage{times}
-\usepackage[T1]{fontenc}
-
-\usepackage{xcolor}
-\usepackage{colortbl}
-\usepackage{subfigure}
-\usepackage{CJK}
-
-% Or whatever. Note that the encoding and the font should match. If T1
-% does not look nice, try deleting the line with the fontenc.
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%% abbreviations
-
-%% for tables
-\newcommand{\mc}{\multicolumn}
-\newcommand{\lab}[1]{\multicolumn{1}{c}{#1}}
-\newcommand{\ind}[1]{{\fboxsep1pt\raisebox{-.5ex}{\fbox{{\tiny #1}}}}}
-
-%% for dcolumn
-%\newcolumntype{d}{D{.}{.}{1.4}}
-%\newcolumntype{s}{D{.}{.}{0.3}}
-
-%% markup
-%\renewcommand{\key}[1]{\alert{\textit{#1}}}
-\newcommand{\buffer}[1]{{\color{blue}\textbf{#1}}}
-\newcommand{\pred}[1]{\code{#1}}
-
-%% colors
-\newcommand{\textred}[1]{\alert{#1}}
-\newcommand{\textblue}[1]{\buffer{#1}}
-\definecolor{tablecolor}{cmyk}{0,0.3,0.3,0}
-\newcommand{\keytab}[1]{\mc{1}{>{\columncolor{tablecolor}}d}{#1}}
-
-% rules
-\newcommand{\psr}[2]{#1 $\rightarrow \langle $ #2 $\rangle$}
-
-\newenvironment{unpacked_itemize}{
-\begin{itemize}
- \setlength{\itemsep}{10pt}
- \setlength{\parskip}{0pt}
- \setlength{\parsep}{0pt}
-}{\end{itemize}}
-
-\newcommand{\condon}{\hspace{0pt} | \hspace{1pt}}
-\definecolor{darkblue}{rgb}{0,0,0.6}
-\newcommand{\blueexample}[1]{\textcolor{darkblue}{\rm #1}}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-\newcommand{\bx}{\mathbf{x}}
-\newcommand{\bz}{\mathbf{z}}
-\newcommand{\bd}{\mathbf{d}}
-\newcommand{\by}{\mathbf{y}}
-\newcommand\bleu{${B{\scriptstyle LEU}}$}
-
-
-\title[A Note on the Implemention of HDPs]{A Note on the Implementation of Hierarchical Dirichlet Processes}
-
-\author[Blunsom et al.]{{\bf Phil Blunsom}$^*$, Trevor Cohn$^*$, \\Sharon
-Goldwater$^*$ and Mark Johnson$^\dagger$}
-
-\institute[Uni. of Edinburgh] % (optional, but mostly needed)
-{
- $^*$School of Informatics, University of Edinburgh \\
- $^\dagger$Department of Cognitive and Linguistic Sciences, Brown University \\
-}
-
-\date{August 4, 2009}
-
-
-\subject{Hierarchical Dirichlet Processes}
-
-
-\pgfdeclareimage[height=1.0cm]{university-logo}{logo}
-\logo{\pgfuseimage{university-logo}}
-
-\AtBeginSection[]
-{
- \begin{frame}<beamer>{Outline}
- %\tableofcontents[currentsection,currentsubsection]
- \tableofcontents[currentsection]
- \end{frame}
-}
-
-
-%\beamerdefaultoverlayspecification{<+->}
-
-
-\begin{document}
-
-\begin{frame}
- \titlepage
-\end{frame}
-
-
-\begin{frame}[t]{Outline}
-%\begin{exampleblock}{An example}
-\vspace{0.5in}
-\Large
-\begin{unpacked_itemize}
-\onslide<1-> \item GGJ06\footnote{S. Goldwater, T. Griffiths, M. Johnson.
-Contextual dependencies in unsupervised word segmentation. ACL/COLING-06}
-introduced an approximation for use in hierarchical Dirichlet process (HDP) inference: \\
-\onslide<2->{\alert{\textbf{It's wrong, don't use it.}}}
-\onslide<3-> \item We correct that approximation for DP models. \\
-\onslide<4->{\alert{\textbf{However, this doesn't extend to HDPs.}}}
-\onslide<5> \item But that's ok because we'll describe an efficient exact implementation.
-\end{unpacked_itemize}
-%\end{exampleblock}
-\end{frame}
-
-\begin{frame}
-\frametitle{The Chinese Restaurant Process}
-In a Dirichlet Process unigram language model words $w_1 \ldots w_n$ are generated as follows:
-\begin{align}
-\nonumber G | & \alpha_0, P_0 &\sim & ~ \mbox{DP}(\alpha_0,P_0) \\
-\nonumber w_i | & G &\sim & ~ G
-\end{align}
-\begin{itemize}
- \item $G$ is a distribution over an infinite set of words,
- \item $P_0$ is the probability that an word will be in the support of $G$,
- \item $\alpha_0$ determines the variance of $G$.
-\end{itemize}
-\vspace{0.2in}
-One way of understanding the predictions made by the DP model is through the Chinese restaurant process (CRP) \dots
-\end{frame}
-
-\begin{frame}
-\frametitle{The Chinese Restaurant Process}
-\only<1-9>{\vspace{-0.4in}}
-\begin{figure}
-\begin{center}
- \only<1>{\includegraphics[scale=0.7]{tables0.pdf}}
- \only<2>{\includegraphics[scale=0.7]{tables1.pdf}}
- \only<3>{\includegraphics[scale=0.7]{tables2.pdf}}
- \only<4>{\includegraphics[scale=0.7]{tables3.pdf}}
- \only<5>{\includegraphics[scale=0.7]{tables4.pdf}}
- \only<6>{\includegraphics[scale=0.7]{tables5.pdf}}
- \only<7>{\includegraphics[scale=0.7]{tables7.pdf}}
- \only<8>{\includegraphics[scale=0.7]{tables8.pdf}}
- \only<9>{\includegraphics[scale=0.7]{tables6.pdf}}
-\end{center}
-\end{figure}
-\only<1-6>{
-\vspace{-0.6in}
-Customers (words) enter a restaurant and choose a table according to the distribution:
-\begin{align}
-\nonumber P(z_i = k | w_i = w, \mathbf{z}_{-i}) = \left\{
-\begin{array}{ll}
- \frac{n_k^{\mathbf{z}_{-i}}}{n_w + \alpha_0 P_0(w)}, 0 \leq k < |k| \\
- \\ \frac{\alpha_0 P_0(w)}{n_w + \alpha_0 P_0(w)}, k = |k|
-\end{array} \right.
-\end{align}
-%where $\mathbf{z}_{-i} = z_1 \dots z_{i-1}$ are the table assignments of the previous customers, $n_k^{\mathbf{z}_{-i}}$ is the number of customers at table $k$ in ${\mathbf{z}_{-i}}$, and $K(\mathbf{z}_{-i})$ is the total number of occupied tables.
-}
-\only<7-9>{
-\vspace{-0.4in}
-The 7$^{th}$ customer `{\em the}' enters the restaurant and choses a table from
-those already seating `{\em the}', or opening a new table:
-}
-\only<7>{
-\begin{align}
-\nonumber P(z_6 = 0 | w_6 = the, \mathbf{z}_{-6}) = \frac{2}{3 + \alpha_0 P_0(the)}
-\end{align}
-}
-\only<8>{
-\begin{align}
-\nonumber P(z_6 = 2 | w_6 = the, \mathbf{z}_{-6}) = \frac{1}{3 + \alpha_0 P_0(the)}
-\end{align}
-}
-\only<9>{
-\begin{align}
-\nonumber P(z_6 = 4 | w_6 = the, \mathbf{z}_{-6}) = \frac{P_0(the)}{3 + \alpha_0 P_0(the)}
-\end{align}
-}
-\only<7-9>{\vspace{0.32in}}
-\end{frame}
-
-\begin{frame}
-\frametitle{Approximating the table counts}
-\begin{figure}
-\begin{center}
- \includegraphics[scale=0.7]{tables_expectation.pdf}
-\end{center}
-\end{figure}
-
-\begin{itemize}
- \item GGJ06 sought to avoid explicitly tracking tables by reasoning under
-the expected table counts ($E[t_w]$).
-
-\item Antoniak(1974) derives the expected table count as equal to the recurrence:
-\begin{align}
-\nonumber E[t_w] = \alpha_0 P_0(w) \sum_{i=1}^{n_w} \frac{1}{\alpha_0 P_0(w) + i - 1}
-\label{eqn:true_expected}
-\end{align}
-\item Antoniak also suggests an approximation to this expectation which GGJ06
- presents as: \only<2>{\alert{(corrected)}}
-\only<1> {
-\begin{align}
- \nonumber E[t_w] \approx \alpha_0 \log \frac{n_w + \alpha_0}{\alpha_0}
-\end{align}
-}
-\only<2> {
-\begin{align}
- \nonumber E[t_w] \approx \alpha_0 \alert{P_0(w)} \log \frac{n_w + \alpha_0
- \alert{P_0(w)}}{\alpha_0 \alert{P_0(w)}}
-\end{align}
-\vspace{-0.32cm}
-}
-\end{itemize}
-\end{frame}
-
-
-\begin{frame}
-\frametitle{A better table count approximation}
-\begin{itemize}
-\item Antoniak's approximation makes two assumptions:
- \begin{unpacked_itemize}
- \item $\alpha_0$ is large, not the predominant situation in recent applications which employ a DP as a sparse prior,
- \item $P_0(w)$ is constant, which is not applicable to HDPs.
- \end{unpacked_itemize}
-\vspace{1.0cm}
-\item In our paper we derive an improved approximation based on a difference of digamma ($\psi$) functions:
-\begin{align}
-\nonumber E[t_w] = \alpha_0 P_0(w) \cdot \Bigg [\psi{\Big (\alpha_0
-P_0(w)+n_w \Big)} - \psi{\Big (\alpha_0 P_0(w)} \Big ) \Bigg ]
-\end{align}
-\vspace{0.5cm}
-\item However the restriction on $P_0(w)$ being constant remains \dots
-\end{itemize}
-\end{frame}
-
-\begin{frame}
-\frametitle{DP performance}
-\begin{figure}
-{\centering \includegraphics[scale=0.45]{code/plot0.pdf}}
-\end{figure}
-\end{frame}
-
-\begin{frame}
-\frametitle{DP performance}
-\begin{figure}
-{\centering \includegraphics[scale=0.45]{code/plot1.pdf}}
-\end{figure}
-\end{frame}
-
-\begin{frame}
-\frametitle{DP performance}
-\begin{figure}
-{\centering \includegraphics[scale=0.45]{code/plot2.pdf}}
-\end{figure}
-\end{frame}
-
-\begin{frame}
-\frametitle{HDP performance}
-\begin{figure}
-{\centering \includegraphics[scale=0.45]{code/plot3.pdf}}
-\end{figure}
-\end{frame}
-
-\begin{frame}
-\frametitle{Histogram Method}
-\begin{unpacked_itemize}
- \item At this point we don't have a useful approximation of the expected table
- counts in a HDP model.
- \item However, we can describe a more compact representation for the state of the restaurant that doesn't require explicit table tracking.
- \item Instead we maintain a histogram for each dish $w_i$ of the frequency of a table having a particular number of customers.
-\end{unpacked_itemize}
-\end{frame}
-
-\begin{frame}[t]
-\frametitle{Histogram Method}
-\begin{center}
- \only<1>{\includegraphics[scale=0.7]{tables0.pdf}}
- \only<2>{\includegraphics[scale=0.7]{tables1.pdf}}
- \only<3>{\includegraphics[scale=0.7]{tables2.pdf}}
- \only<4>{\includegraphics[scale=0.7]{tables3.pdf}}
- \only<5>{\includegraphics[scale=0.7]{tables4.pdf}}
- \only<6>{\includegraphics[scale=0.7]{tables5.pdf}}
- \only<7>{\includegraphics[scale=0.7]{tables7.pdf}}
- \only<8>{\includegraphics[scale=0.7]{tables9.pdf}}
-\end{center}
-\vspace{-2.5cm}
-\only<6->{\vspace{0.47cm}}
-\begin{center}
- \only<1>{\includegraphics[scale=0.2]{histogram_1.pdf}\hspace{-0.6cm}}
- \only<2>{\includegraphics[scale=0.2]{histogram_2.pdf}\hspace{-0.5cm}}
- \only<3>{\includegraphics[scale=0.2]{histogram_3.pdf}\hspace{-0.4cm}}
- \only<4>{\includegraphics[scale=0.2]{histogram_4.pdf}\hspace{-0.3cm}}
- \only<5>{\includegraphics[scale=0.2]{histogram_5.pdf}\hspace{-0.2cm}}
- \only<6>{\includegraphics[scale=0.2]{histogram_6.pdf}}
- \only<7>{\includegraphics[scale=0.2]{histogram_7.pdf}}
- \only<8>{\includegraphics[scale=0.2]{histogram_8.pdf}}
-\end{center}
-\end{frame}
-
-\begin{frame}
-\frametitle{Summary}
-\begin{center}
- \Large \textbf{The table count approximation of Goldwater et al. 2006
- is broken, \alert{don't use it!}}
-\end{center}
-\end{frame}
-
-\begin{frame}
-% \frametitle{Summary}
- \begin{center}
- \Large Thank you.
- \end{center}
-
-\begin{block}{References}
- \footnotesize
- \vspace{0.5cm}
- P. Blunsom, T. Cohn, S. Goldwater and M. Johnson. A note on the
- implementation of hierarchical Dirichlet processes,
- {\em In the Proceedings of ACL-IJCNLP 2009}. \\
- \vspace{0.5cm}
- C. E. Antoniak. 1974. Mixtures of dirichlet processes with
- applications to bayesian nonparametric problems.
- {\em The Annals of Statistics}, 2(6):1152-1174. \\
- \vspace{0.5cm}
- S. Goldwater, T. Griffiths, M. Johnson.
- Contextual dependencies in unsupervised word segmentation.
- {\em In the Proceedings of (COLING/ACL-2006)}.
- \vspace{0.5cm}
-\end{block}
-\end{frame}
-
-\end{document}