Diffstat (limited to 'report/pyp_clustering/acl09-short/slides.tex')
-rw-r--r--  report/pyp_clustering/acl09-short/slides.tex  345
1 file changed, 345 insertions(+), 0 deletions(-)
diff --git a/report/pyp_clustering/acl09-short/slides.tex b/report/pyp_clustering/acl09-short/slides.tex
new file mode 100644
index 00000000..af1db1fc
--- /dev/null
+++ b/report/pyp_clustering/acl09-short/slides.tex
@@ -0,0 +1,345 @@
\documentclass{beamer}
%\documentclass[serif]{beamer}


\mode<presentation>
{
% \usetheme{Warsaw}
% \usetheme{Madrid}
  \usetheme{Boadilla}

  \setbeamercovered{transparent}
}


\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage{times}
\usepackage[T1]{fontenc}

\usepackage{xcolor}
\usepackage{colortbl}
\usepackage{subfigure}
\usepackage{CJK}

% Or whatever. Note that the encoding and the font should match. If T1
% does not look nice, try deleting the line with the fontenc.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% abbreviations

%% for tables
\newcommand{\mc}{\multicolumn}
\newcommand{\lab}[1]{\multicolumn{1}{c}{#1}}
\newcommand{\ind}[1]{{\fboxsep1pt\raisebox{-.5ex}{\fbox{{\tiny #1}}}}}

%% for dcolumn
%\newcolumntype{d}{D{.}{.}{1.4}}
%\newcolumntype{s}{D{.}{.}{0.3}}

%% markup
%\renewcommand{\key}[1]{\alert{\textit{#1}}}
\newcommand{\buffer}[1]{{\color{blue}\textbf{#1}}}
\newcommand{\pred}[1]{\texttt{#1}}

%% colors
\newcommand{\textred}[1]{\alert{#1}}
\newcommand{\textblue}[1]{\buffer{#1}}
\definecolor{tablecolor}{cmyk}{0,0.3,0.3,0}
\newcommand{\keytab}[1]{\mc{1}{>{\columncolor{tablecolor}}d}{#1}}

% rules
\newcommand{\psr}[2]{#1 $\rightarrow \langle $ #2 $\rangle$}

\newenvironment{unpacked_itemize}{
\begin{itemize}
  \setlength{\itemsep}{10pt}
  \setlength{\parskip}{0pt}
  \setlength{\parsep}{0pt}
}{\end{itemize}}

\newcommand{\condon}{\hspace{0pt} | \hspace{1pt}}
\definecolor{darkblue}{rgb}{0,0,0.6}
\newcommand{\blueexample}[1]{\textcolor{darkblue}{\rm #1}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newcommand{\bx}{\mathbf{x}}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bd}{\mathbf{d}}
\newcommand{\by}{\mathbf{y}}
\newcommand\bleu{${B{\scriptstyle LEU}}$}


\title[A Note on the Implementation of HDPs]{A Note on the Implementation of Hierarchical Dirichlet Processes}

\author[Blunsom et al.]{{\bf Phil Blunsom}$^*$, Trevor Cohn$^*$, \\Sharon
Goldwater$^*$ and Mark Johnson$^\dagger$}

\institute[Uni. of Edinburgh] % (optional, but mostly needed)
{
  $^*$School of Informatics, University of Edinburgh \\
  $^\dagger$Department of Cognitive and Linguistic Sciences, Brown University \\
}

\date{August 4, 2009}


\subject{Hierarchical Dirichlet Processes}


\pgfdeclareimage[height=1.0cm]{university-logo}{logo}
\logo{\pgfuseimage{university-logo}}

\AtBeginSection[]
{
  \begin{frame}<beamer>{Outline}
    %\tableofcontents[currentsection,currentsubsection]
    \tableofcontents[currentsection]
  \end{frame}
}


%\beamerdefaultoverlayspecification{<+->}


\begin{document}

\begin{frame}
  \titlepage
\end{frame}


\begin{frame}[t]{Outline}
%\begin{exampleblock}{An example}
\vspace{0.5in}
\Large
\begin{unpacked_itemize}
\onslide<1-> \item GGJ06\footnote{S. Goldwater, T. Griffiths, M. Johnson.
Contextual dependencies in unsupervised word segmentation. COLING/ACL-06}
introduced an approximation for use in hierarchical Dirichlet process (HDP) inference: \\
\onslide<2->{\alert{\textbf{It's wrong, don't use it.}}}
\onslide<3-> \item We correct that approximation for DP models. \\
\onslide<4->{\alert{\textbf{However, this doesn't extend to HDPs.}}}
\onslide<5> \item But that's OK, because we'll describe an efficient exact implementation.
\end{unpacked_itemize}
%\end{exampleblock}
\end{frame}

\begin{frame}
\frametitle{The Chinese Restaurant Process}
In a Dirichlet process unigram language model, words $w_1 \ldots w_n$ are generated as follows:
\begin{align}
\nonumber G | & \alpha_0, P_0 &\sim & ~ \mbox{DP}(\alpha_0,P_0) \\
\nonumber w_i | & G &\sim & ~ G
\end{align}
\begin{itemize}
  \item $G$ is a distribution over an infinite set of words,
  \item $P_0$ is the base distribution, giving the probability that a word is in the support of $G$,
  \item $\alpha_0$ determines the variance of $G$.
\end{itemize}
\vspace{0.2in}
One way of understanding the predictions made by the DP model is through the Chinese restaurant process (CRP) \dots
\end{frame}
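% [Editor's sketch] The CRP seating rule shown on the following frame can be
% stated compactly in code. This is an illustrative sketch only, not the
% implementation evaluated in the paper; the function name and arguments are
% hypothetical.
%
%   def crp_table_probs(table_counts, alpha0, p0_w):
%       """Distribution over tables for the next customer eating dish w.
%
%       table_counts: customers per existing table serving w.
%       alpha0:       DP concentration parameter.
%       p0_w:         base probability P_0(w) of the dish.
%       Returns one probability per existing table, plus a final entry
%       for opening a new table.
%       """
%       n_w = sum(table_counts)                        # customers eating w so far
%       denom = n_w + alpha0 * p0_w
%       probs = [n_k / denom for n_k in table_counts]  # join existing table k
%       probs.append(alpha0 * p0_w / denom)            # open a new table
%       return probs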
\begin{frame}
\frametitle{The Chinese Restaurant Process}
\only<1-9>{\vspace{-0.4in}}
\begin{figure}
\begin{center}
  \only<1>{\includegraphics[scale=0.7]{tables0.pdf}}
  \only<2>{\includegraphics[scale=0.7]{tables1.pdf}}
  \only<3>{\includegraphics[scale=0.7]{tables2.pdf}}
  \only<4>{\includegraphics[scale=0.7]{tables3.pdf}}
  \only<5>{\includegraphics[scale=0.7]{tables4.pdf}}
  \only<6>{\includegraphics[scale=0.7]{tables5.pdf}}
  \only<7>{\includegraphics[scale=0.7]{tables7.pdf}}
  \only<8>{\includegraphics[scale=0.7]{tables8.pdf}}
  \only<9>{\includegraphics[scale=0.7]{tables6.pdf}}
\end{center}
\end{figure}
\only<1-6>{
\vspace{-0.6in}
Customers (words) enter a restaurant and choose a table according to the distribution:
\begin{align}
\nonumber P(z_i = k | w_i = w, \mathbf{z}_{-i}) = \left\{
\begin{array}{ll}
  \frac{n_k^{\mathbf{z}_{-i}}}{n_w + \alpha_0 P_0(w)} & 0 \leq k < K(\mathbf{z}_{-i}) \\
  \\ \frac{\alpha_0 P_0(w)}{n_w + \alpha_0 P_0(w)} & k = K(\mathbf{z}_{-i})
\end{array} \right.
\end{align}
where $\mathbf{z}_{-i} = z_1 \dots z_{i-1}$ are the table assignments of the previous customers, $n_k^{\mathbf{z}_{-i}}$ is the number of customers at table $k$ in ${\mathbf{z}_{-i}}$, and $K(\mathbf{z}_{-i})$ is the total number of occupied tables.
}
\only<7-9>{
\vspace{-0.4in}
The 7$^{th}$ customer `{\em the}' enters the restaurant and chooses a table from
those already serving `{\em the}', or opens a new table:
}
\only<7>{
\begin{align}
\nonumber P(z_7 = 0 | w_7 = the, \mathbf{z}_{-7}) = \frac{2}{3 + \alpha_0 P_0(the)}
\end{align}
}
\only<8>{
\begin{align}
\nonumber P(z_7 = 2 | w_7 = the, \mathbf{z}_{-7}) = \frac{1}{3 + \alpha_0 P_0(the)}
\end{align}
}
\only<9>{
\begin{align}
\nonumber P(z_7 = 4 | w_7 = the, \mathbf{z}_{-7}) = \frac{\alpha_0 P_0(the)}{3 + \alpha_0 P_0(the)}
\end{align}
}
\only<7-9>{\vspace{0.32in}}
\end{frame}

\begin{frame}
\frametitle{Approximating the table counts}
\begin{figure}
\begin{center}
  \includegraphics[scale=0.7]{tables_expectation.pdf}
\end{center}
\end{figure}

\begin{itemize}
  \item GGJ06 sought to avoid explicitly tracking tables by reasoning in terms of the expected table counts ($E[t_w]$).
  \item Antoniak (1974) shows that the expected table count is given by the sum:
\begin{align}
\nonumber E[t_w] = \alpha_0 P_0(w) \sum_{i=1}^{n_w} \frac{1}{\alpha_0 P_0(w) + i - 1}
\end{align}
  \item Antoniak also suggests an approximation to this expectation, which GGJ06 present as: \only<2>{\alert{(corrected)}}
\only<1> {
\begin{align}
  \nonumber E[t_w] \approx \alpha_0 \log \frac{n_w + \alpha_0}{\alpha_0}
\end{align}
}
\only<2> {
\begin{align}
  \nonumber E[t_w] \approx \alpha_0 \alert{P_0(w)} \log \frac{n_w + \alpha_0 \alert{P_0(w)}}{\alpha_0 \alert{P_0(w)}}
\end{align}
\vspace{-0.32cm}
}
\end{itemize}
\end{frame}
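% [Editor's sketch] A quick numerical check of the two quantities above. This
% is illustrative only (it is not the code behind the plots later in the
% deck), and all names are hypothetical.
%
%   import math
%
%   def expected_tables_exact(n_w, alpha0, p0_w):
%       """Antoniak's exact expectation E[t_w], the sum on the slide."""
%       a = alpha0 * p0_w
%       return a * sum(1.0 / (a + i - 1) for i in range(1, n_w + 1))
%
%   def expected_tables_log(n_w, alpha0, p0_w):
%       """The (corrected) logarithmic approximation presented by GGJ06."""
%       a = alpha0 * p0_w
%       return a * math.log((n_w + a) / a)
%
%   # The approximation assumes alpha0 * P_0(w) is large; with a sparse prior
%   # it is badly off:
%   print(expected_tables_exact(100, 1.0, 0.01))  # ~1.05
%   print(expected_tables_log(100, 1.0, 0.01))    # ~0.09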
\begin{frame}
\frametitle{A better table count approximation}
\begin{itemize}
\item Antoniak's approximation makes two assumptions:
  \begin{unpacked_itemize}
  \item $\alpha_0$ is large, which is rarely the case in recent applications, where the DP serves as a sparse prior,
  \item $P_0(w)$ is constant, an assumption that does not hold for HDPs.
  \end{unpacked_itemize}
\vspace{1.0cm}
\item In our paper we derive an improved approximation based on a difference of digamma ($\psi$) functions:
\begin{align}
\nonumber E[t_w] = \alpha_0 P_0(w) \cdot \Bigg [ \psi \Big ( \alpha_0 P_0(w) + n_w \Big ) - \psi \Big ( \alpha_0 P_0(w) \Big ) \Bigg ]
\end{align}
\vspace{0.5cm}
\item However, the restriction that $P_0(w)$ be constant remains \dots
\end{itemize}
\end{frame}
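% [Editor's note] A step the frame above leaves implicit: by the standard
% digamma recurrence $\psi(x+1) = \psi(x) + 1/x$, we have
% $\psi(x+n) - \psi(x) = \sum_{i=0}^{n-1} \frac{1}{x+i}$ for integer $n \geq 1$,
% so substituting $x = \alpha_0 P_0(w)$ and $n = n_w$ recovers Antoniak's sum
% term by term:
% \[
%   \alpha_0 P_0(w) \Big [ \psi \big ( \alpha_0 P_0(w) + n_w \big ) - \psi \big ( \alpha_0 P_0(w) \big ) \Big ]
%     = \alpha_0 P_0(w) \sum_{i=1}^{n_w} \frac{1}{\alpha_0 P_0(w) + i - 1} .
% \]
% The advantage is computational: two digamma evaluations replace an
% $n_w$-term sum.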
\begin{frame}
\frametitle{DP performance}
\begin{figure}
{\centering \includegraphics[scale=0.45]{code/plot0.pdf}}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{DP performance}
\begin{figure}
{\centering \includegraphics[scale=0.45]{code/plot1.pdf}}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{DP performance}
\begin{figure}
{\centering \includegraphics[scale=0.45]{code/plot2.pdf}}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{HDP performance}
\begin{figure}
{\centering \includegraphics[scale=0.45]{code/plot3.pdf}}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{Histogram Method}
\begin{unpacked_itemize}
  \item At this point we don't have a useful approximation of the expected table counts in an HDP model.
  \item However, we can describe a more compact representation of the restaurant state that doesn't require explicit table tracking.
  \item Instead, for each dish $w$ we maintain a histogram recording how many of its tables seat a given number of customers.
  % (One possible realisation of this data structure is sketched in the comments after \end{document}.)
\end{unpacked_itemize}
\end{frame}

\begin{frame}[t]
\frametitle{Histogram Method}
\begin{center}
  \only<1>{\includegraphics[scale=0.7]{tables0.pdf}}
  \only<2>{\includegraphics[scale=0.7]{tables1.pdf}}
  \only<3>{\includegraphics[scale=0.7]{tables2.pdf}}
  \only<4>{\includegraphics[scale=0.7]{tables3.pdf}}
  \only<5>{\includegraphics[scale=0.7]{tables4.pdf}}
  \only<6>{\includegraphics[scale=0.7]{tables5.pdf}}
  \only<7>{\includegraphics[scale=0.7]{tables7.pdf}}
  \only<8>{\includegraphics[scale=0.7]{tables9.pdf}}
\end{center}
\vspace{-2.5cm}
\only<6->{\vspace{0.47cm}}
\begin{center}
  \only<1>{\includegraphics[scale=0.2]{histogram_1.pdf}\hspace{-0.6cm}}
  \only<2>{\includegraphics[scale=0.2]{histogram_2.pdf}\hspace{-0.5cm}}
  \only<3>{\includegraphics[scale=0.2]{histogram_3.pdf}\hspace{-0.4cm}}
  \only<4>{\includegraphics[scale=0.2]{histogram_4.pdf}\hspace{-0.3cm}}
  \only<5>{\includegraphics[scale=0.2]{histogram_5.pdf}\hspace{-0.2cm}}
  \only<6>{\includegraphics[scale=0.2]{histogram_6.pdf}}
  \only<7>{\includegraphics[scale=0.2]{histogram_7.pdf}}
  \only<8>{\includegraphics[scale=0.2]{histogram_8.pdf}}
\end{center}
\end{frame}

\begin{frame}
\frametitle{Summary}
\begin{center}
  \Large \textbf{The table count approximation of Goldwater et al.\ (2006)
  is broken, \alert{don't use it!}}
\end{center}
\end{frame}

\begin{frame}
% \frametitle{Summary}
  \begin{center}
    \Large Thank you.
  \end{center}

\begin{block}{References}
  \footnotesize
  \vspace{0.5cm}
  P. Blunsom, T. Cohn, S. Goldwater and M. Johnson. 2009. A note on the
  implementation of hierarchical Dirichlet processes.
  {\em In Proceedings of ACL-IJCNLP 2009}. \\
  \vspace{0.5cm}
  C. E. Antoniak. 1974. Mixtures of Dirichlet processes with
  applications to Bayesian nonparametric problems.
  {\em The Annals of Statistics}, 2(6):1152--1174. \\
  \vspace{0.5cm}
  S. Goldwater, T. Griffiths and M. Johnson. 2006.
  Contextual dependencies in unsupervised word segmentation.
  {\em In Proceedings of COLING/ACL 2006}.
  \vspace{0.5cm}
\end{block}
\end{frame}

\end{document}
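% [Editor's sketch] One possible realisation of the histogram representation
% described on the ``Histogram Method'' frames, kept here as a source comment
% (LaTeX ignores everything after \end{document}). It is a minimal sketch
% under the stated assumptions, not the implementation the paper evaluates;
% all names are hypothetical.
%
%   from collections import defaultdict
%
%   class DishHistogram:
%       """State of one dish w: how many tables seat m customers, for each m.
%
%       Enough information to run the CRP sampler without an explicit seating
%       arrangement: individual table identities never matter, only the
%       multiset of table occupancies.
%       """
%       def __init__(self):
%           self.tables = defaultdict(int)  # occupancy m -> number of tables
%           self.n = 0                      # total customers eating this dish
%
%       def seat(self, m):
%           """Seat a customer at a table currently holding m customers
%           (m == 0 opens a new table)."""
%           if m > 0:
%               assert self.tables[m] > 0
%               self.tables[m] -= 1
%               if self.tables[m] == 0:
%                   del self.tables[m]
%           self.tables[m + 1] += 1
%           self.n += 1
%
%       def unseat(self, m):
%           """Remove a customer from a table holding m customers (used when
%           a Gibbs update retracts a word); the table vanishes when empty."""
%           assert self.tables[m] > 0
%           self.tables[m] -= 1
%           if self.tables[m] == 0:
%               del self.tables[m]
%           if m > 1:
%               self.tables[m - 1] += 1
%           self.n -= 1
%
%   # A new customer joins some table of occupancy m with probability
%   # proportional to m * tables[m], or opens a new table with probability
%   # proportional to alpha0 * P0(w); only the chosen occupancy is needed,
%   # never a table id.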