\documentclass{beamer}
%\documentclass[serif]{beamer}


\mode<presentation>
{
% \usetheme{Warsaw}
%  \usetheme{Madrid}
 \usetheme{Boadilla}

  \setbeamercovered{transparent}
}


\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage{times}
\usepackage[T1]{fontenc}

\usepackage{xcolor}
\usepackage{colortbl}
\usepackage{subfigure}
\usepackage{CJK}

% Or whatever. Note that the encoding and the font should match. If T1
% does not look nice, try deleting the line with the fontenc.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% abbreviations

%% for tables
\newcommand{\mc}{\multicolumn}
\newcommand{\lab}[1]{\multicolumn{1}{c}{#1}}
\newcommand{\ind}[1]{{\fboxsep1pt\raisebox{-.5ex}{\fbox{{\tiny #1}}}}}

%% for dcolumn
%\newcolumntype{d}{D{.}{.}{1.4}}
%\newcolumntype{s}{D{.}{.}{0.3}}

%% markup
%\renewcommand{\key}[1]{\alert{\textit{#1}}}
\newcommand{\buffer}[1]{{\color{blue}\textbf{#1}}}
\newcommand{\pred}[1]{\texttt{#1}}

%% colors
\newcommand{\textred}[1]{\alert{#1}}
\newcommand{\textblue}[1]{\buffer{#1}}
\definecolor{tablecolor}{cmyk}{0,0.3,0.3,0}
\newcommand{\keytab}[1]{\mc{1}{>{\columncolor{tablecolor}}d}{#1}}

% rules
\newcommand{\psr}[2]{#1 $\rightarrow \langle $ #2 $\rangle$}

\newenvironment{unpacked_itemize}{
\begin{itemize}
  \setlength{\itemsep}{10pt}
  \setlength{\parskip}{0pt}
  \setlength{\parsep}{0pt}
}{\end{itemize}}

\newcommand{\condon}{\hspace{0pt} | \hspace{1pt}}
\definecolor{darkblue}{rgb}{0,0,0.6}
\newcommand{\blueexample}[1]{\textcolor{darkblue}{\rm #1}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newcommand{\bx}{\mathbf{x}}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bd}{\mathbf{d}}
\newcommand{\by}{\mathbf{y}}
\newcommand\bleu{${B{\scriptstyle LEU}}$}


\title[A Note on the Implementation of HDPs]{A Note on the Implementation of Hierarchical Dirichlet Processes}

\author[Blunsom et al.]{{\bf Phil Blunsom}$^*$, Trevor Cohn$^*$, \\Sharon
Goldwater$^*$ and Mark Johnson$^\dagger$}

\institute[Uni. of Edinburgh] % (optional, but mostly needed)
{
  $^*$School of Informatics, University of Edinburgh \\
  $^\dagger$Department of Cognitive and Linguistic Sciences, Brown University \\
}

\date{August 4, 2009}


\subject{Hierarchical Dirichlet Processes}


\pgfdeclareimage[height=1.0cm]{university-logo}{logo}
\logo{\pgfuseimage{university-logo}}

\AtBeginSection[]
{
  \begin{frame}<beamer>{Outline}
    %\tableofcontents[currentsection,currentsubsection]
    \tableofcontents[currentsection]
  \end{frame}
}


%\beamerdefaultoverlayspecification{<+->}


\begin{document}

\begin{frame}
  \titlepage
\end{frame}


\begin{frame}[t]{Outline}
%\begin{exampleblock}{An example}
\vspace{0.5in}
\Large
\begin{unpacked_itemize}
\onslide<1-> \item GGJ06\footnote{S. Goldwater, T. Griffiths, M. Johnson.
Contextual dependencies in unsupervised word segmentation. COLING/ACL 2006.}
introduced an approximation for use in hierarchical Dirichlet process (HDP) inference: \\
\onslide<2->{\alert{\textbf{It's wrong, don't use it.}}}
\onslide<3-> \item We correct that approximation for DP models. \\
\onslide<4->{\alert{\textbf{However, the correction doesn't extend to HDPs.}}}
\onslide<5-> \item But that's OK: we describe an efficient exact implementation instead.
\end{unpacked_itemize}
%\end{exampleblock}
\end{frame}

\begin{frame}
\frametitle{The Chinese Restaurant Process}
In a Dirichlet process (DP) unigram language model, words $w_1 \ldots w_n$ are generated as follows:
\begin{align}
\nonumber G \mid \alpha_0, P_0 & \sim \mbox{DP}(\alpha_0, P_0) \\
\nonumber w_i \mid G & \sim G
\end{align}
\begin{itemize}
  \item $G$ is a distribution over an infinite set of words, 
  \item $P_0$ is the base distribution, giving the prior probability that a word will be in the support of $G$,
  \item the concentration parameter $\alpha_0$ determines the variance of $G$.
\end{itemize}
\vspace{0.2in}
One way of understanding the predictions made by the DP model is through the Chinese restaurant process (CRP) \dots
\end{frame}

\begin{frame}
\frametitle{The Chinese Restaurant Process}
\only<1-9>{\vspace{-0.4in}}
\begin{figure}
\begin{center}
  \only<1>{\includegraphics[scale=0.7]{tables0.pdf}}
  \only<2>{\includegraphics[scale=0.7]{tables1.pdf}}
  \only<3>{\includegraphics[scale=0.7]{tables2.pdf}}
  \only<4>{\includegraphics[scale=0.7]{tables3.pdf}}
  \only<5>{\includegraphics[scale=0.7]{tables4.pdf}}
  \only<6>{\includegraphics[scale=0.7]{tables5.pdf}}
  \only<7>{\includegraphics[scale=0.7]{tables7.pdf}}
  \only<8>{\includegraphics[scale=0.7]{tables8.pdf}}
  \only<9>{\includegraphics[scale=0.7]{tables6.pdf}}
\end{center}
\end{figure}
\only<1-6>{
\vspace{-0.6in}
Customers (words) enter a restaurant and choose a table according to the distribution:
\begin{align}
\nonumber P(z_i = k | w_i = w, \mathbf{z}_{-i}) = \left\{
\begin{array}{ll}
  \frac{n_k^{\mathbf{z}_{-i}}}{n_w + \alpha_0 P_0(w)}, & 0 \leq k < K(\mathbf{z}_{-i}) \\[10pt]
  \frac{\alpha_0 P_0(w)}{n_w + \alpha_0 P_0(w)}, & k = K(\mathbf{z}_{-i})
\end{array} \right.
\end{align}
%where $\mathbf{z}_{-i} = z_1 \dots z_{i-1}$ are the table assignments of the previous customers, $n_k^{\mathbf{z}_{-i}}$ is the number of customers at table $k$ in ${\mathbf{z}_{-i}}$, and $K(\mathbf{z}_{-i})$ is the total number of occupied tables.  
}
\only<7-9>{
\vspace{-0.4in}
The 7$^{th}$ customer `{\em the}' enters the restaurant and chooses a table from
those already serving `{\em the}', or opens a new table:
}
\only<7>{
\begin{align}
\nonumber P(z_6 = 0 | w_6 = the, \mathbf{z}_{-6}) = \frac{2}{3 + \alpha_0 P_0(the)}
\end{align}
}
\only<8>{
\begin{align}
\nonumber P(z_6 = 2 | w_6 = the, \mathbf{z}_{-6}) = \frac{1}{3 + \alpha_0 P_0(the)}
\end{align}
}
\only<9>{
\begin{align}
\nonumber P(z_6 = 4 | w_6 = the, \mathbf{z}_{-6}) = \frac{\alpha_0 P_0(the)}{3 + \alpha_0 P_0(the)}
\end{align}
}
\only<7-9>{\vspace{0.32in}}
\end{frame}
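
\begin{frame}[fragile]
\frametitle{The Chinese Restaurant Process: a sampling sketch}
A minimal sketch (ours, purely illustrative) of sampling from the table-choice
distribution above, assuming Python and a per-word list of table occupancies:
{\scriptsize
\begin{verbatim}
import random

def sample_table(tables_w, alpha0, p0_w):
    """tables_w: customer counts, one entry per table serving word w."""
    n_w = sum(tables_w)                          # customers already eating w
    denom = n_w + alpha0 * p0_w
    weights = [n_k / denom for n_k in tables_w]  # join existing table k
    weights.append(alpha0 * p0_w / denom)        # open a new table
    k = random.choices(range(len(weights)), weights=weights)[0]
    if k == len(tables_w):
        tables_w.append(1)                       # seat customer at a new table
    else:
        tables_w[k] += 1                         # seat customer at table k
    return k
\end{verbatim}
}
\end{frame}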

\begin{frame}
\frametitle{Approximating the table counts}
\begin{figure}
\begin{center}
  \includegraphics[scale=0.7]{tables_expectation.pdf}
\end{center}
\end{figure}

\begin{itemize}
  \item GGJ06 sought to avoid explicitly tracking tables by reasoning in terms of
the expected table counts ($E[t_w]$).

\item Antoniak (1974) derives the expected table count as the sum:
\begin{align}
\nonumber E[t_w] = \alpha_0 P_0(w) \sum_{i=1}^{n_w} \frac{1}{\alpha_0 P_0(w) + i - 1}
\label{eqn:true_expected}
\end{align}
\item Antoniak also suggests an approximation to this expectation which GGJ06 
 presents as: \only<2>{\alert{(corrected)}}
\only<1> {
\begin{align}
  \nonumber E[t_w] \approx \alpha_0 \log \frac{n_w + \alpha_0}{\alpha_0}
\end{align}
}
\only<2> {
\begin{align}
  \nonumber E[t_w] \approx \alpha_0 \alert{P_0(w)} \log \frac{n_w + \alpha_0
  \alert{P_0(w)}}{\alpha_0 \alert{P_0(w)}}
\end{align}
\vspace{-0.32cm}
}
\end{itemize}
\end{frame}
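
\begin{frame}[fragile]
\frametitle{Approximating the table counts: a sketch}
A minimal sketch (ours, purely illustrative) of Antoniak's exact sum and the
corrected logarithmic approximation above, assuming Python:
{\scriptsize
\begin{verbatim}
import math

def expected_tables_exact(alpha0, p0_w, n_w):
    """Antoniak's expectation, computed by summing the series directly."""
    a = alpha0 * p0_w
    return a * sum(1.0 / (a + i - 1) for i in range(1, n_w + 1))

def expected_tables_approx(alpha0, p0_w, n_w):
    """The (corrected) logarithmic approximation to the same expectation."""
    a = alpha0 * p0_w
    return a * math.log((n_w + a) / a)
\end{verbatim}
}
\end{frame}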


\begin{frame}
\frametitle{A better table count approximation}
\begin{itemize}
\item Antoniak's approximation makes two assumptions:
  \begin{unpacked_itemize}
    \item that $\alpha_0$ is large, which is rarely the case in recent applications that employ the DP as a sparse prior,
    \item that $P_0(w)$ is constant, which does not hold for HDPs.
  \end{unpacked_itemize}
\vspace{1.0cm}
\item In our paper we derive an improved approximation based on a difference of digamma ($\psi$) functions:
\begin{align}
\nonumber E[t_w] = \alpha_0 P_0(w) \cdot \Big[ \psi\big(\alpha_0 P_0(w) + n_w\big)
- \psi\big(\alpha_0 P_0(w)\big) \Big]
\end{align}
\vspace{0.5cm}
\item However, the restriction that $P_0(w)$ be constant remains \dots
\end{itemize}
\end{frame}
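
\begin{frame}[fragile]
\frametitle{A better table count approximation: a sketch}
A minimal sketch (ours, purely illustrative) of the digamma form above, assuming
Python with SciPy; for integer $n_w$ it agrees with Antoniak's sum up to
floating-point error:
{\scriptsize
\begin{verbatim}
from scipy.special import digamma

def expected_tables_digamma(alpha0, p0_w, n_w):
    """E[t_w] via a difference of digamma functions."""
    a = alpha0 * p0_w
    return a * (digamma(a + n_w) - digamma(a))
\end{verbatim}
}
\end{frame}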

\begin{frame}
\frametitle{DP performance}
\begin{figure}
{\centering \includegraphics[scale=0.45]{code/plot0.pdf}}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{DP performance}
\begin{figure}
{\centering \includegraphics[scale=0.45]{code/plot1.pdf}}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{DP performance}
\begin{figure}
{\centering \includegraphics[scale=0.45]{code/plot2.pdf}}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{HDP performance}
\begin{figure}
{\centering \includegraphics[scale=0.45]{code/plot3.pdf}}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Histogram Method}
\begin{unpacked_itemize}
  \item At this point we don't have a useful approximation of the expected table
    counts in an HDP model.
  \item However, we can describe a more compact representation of the restaurant state that doesn't require tracking individual tables.
  \item Instead, for each dish $w$ we maintain a histogram recording how many tables have each possible number of customers.
\end{unpacked_itemize}
\end{frame}
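
\begin{frame}[fragile]
\frametitle{Histogram Method: a sketch}
A minimal sketch (ours, purely illustrative) of one way to update such a
histogram when seating a customer, assuming Python and a dict mapping table
occupancy to the number of tables with that occupancy:
{\scriptsize
\begin{verbatim}
import random

def add_customer(hist_w, alpha0, p0_w):
    """hist_w: {occupancy c: number of tables serving w with c customers}"""
    occs = list(hist_w.keys())
    # joining some table of occupancy c has total weight c * (#such tables)
    weights = [c * hist_w[c] for c in occs] + [alpha0 * p0_w]
    choice = random.choices(occs + ['new'], weights=weights)[0]
    if choice == 'new':
        hist_w[1] = hist_w.get(1, 0) + 1     # open a table with 1 customer
    else:
        hist_w[choice] -= 1                  # one table of size c ...
        if hist_w[choice] == 0:
            del hist_w[choice]
        hist_w[choice + 1] = hist_w.get(choice + 1, 0) + 1  # ... grows to c+1
\end{verbatim}
}
\end{frame}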

\begin{frame}[t]
\frametitle{Histogram Method}
\begin{center}
  \only<1>{\includegraphics[scale=0.7]{tables0.pdf}}
  \only<2>{\includegraphics[scale=0.7]{tables1.pdf}}
  \only<3>{\includegraphics[scale=0.7]{tables2.pdf}}
  \only<4>{\includegraphics[scale=0.7]{tables3.pdf}}
  \only<5>{\includegraphics[scale=0.7]{tables4.pdf}}
  \only<6>{\includegraphics[scale=0.7]{tables5.pdf}}
  \only<7>{\includegraphics[scale=0.7]{tables7.pdf}}
  \only<8>{\includegraphics[scale=0.7]{tables9.pdf}}
\end{center}
\vspace{-2.5cm}
\only<6->{\vspace{0.47cm}}
\begin{center}
  \only<1>{\includegraphics[scale=0.2]{histogram_1.pdf}\hspace{-0.6cm}}
  \only<2>{\includegraphics[scale=0.2]{histogram_2.pdf}\hspace{-0.5cm}}
  \only<3>{\includegraphics[scale=0.2]{histogram_3.pdf}\hspace{-0.4cm}}
  \only<4>{\includegraphics[scale=0.2]{histogram_4.pdf}\hspace{-0.3cm}}
  \only<5>{\includegraphics[scale=0.2]{histogram_5.pdf}\hspace{-0.2cm}}
  \only<6>{\includegraphics[scale=0.2]{histogram_6.pdf}}
  \only<7>{\includegraphics[scale=0.2]{histogram_7.pdf}}
  \only<8>{\includegraphics[scale=0.2]{histogram_8.pdf}}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Summary}
\begin{center} 
  \Large \textbf{The table count approximation of Goldwater et al. (2006)
  is broken, \alert{don't use it!}}
\end{center}
\end{frame}

\begin{frame}
%    \frametitle{Summary}
    \begin{center}
        \Large Thank you.
    \end{center}

\begin{block}{References}
    \footnotesize
    \vspace{0.5cm}
    P. Blunsom, T. Cohn, S. Goldwater and M. Johnson. 2009. A note on the
    implementation of hierarchical Dirichlet processes.
    In {\em Proceedings of ACL-IJCNLP 2009}. \\
    \vspace{0.5cm}
    C. E. Antoniak. 1974. Mixtures of Dirichlet processes with
    applications to Bayesian nonparametric problems.
    {\em The Annals of Statistics}, 2(6):1152--1174.  \\
    \vspace{0.5cm}
    S. Goldwater, T. Griffiths and M. Johnson. 2006.
    Contextual dependencies in unsupervised word segmentation.
    In {\em Proceedings of COLING/ACL 2006}.
    \vspace{0.5cm}
\end{block}
\end{frame} 

\end{document}