3 files changed, 44 insertions, 7 deletions
diff --git a/report/biblio.bib b/report/biblio.bib
index 261e965f..1dd81c8c 100644
--- a/report/biblio.bib
+++ b/report/biblio.bib
@@ -1,3 +1,38 @@
+@inproceedings{blunsom:acl2009,
+ author = {Blunsom, Phil and Cohn, Trevor and Dyer, Chris and Osborne, Miles},
+ title = {A {Gibbs} sampler for phrasal synchronous grammar induction},
+ booktitle = {ACL-IJCNLP '09: Proceedings of the Joint Conference of the 47th Annual Meeting of the ACL and the 4th International Joint Conference on Natural Language Processing of the AFNLP: Volume 2},
+ year = {2009},
+ pages = {782--790},
+ location = {Suntec, Singapore},
+ }
+
+@InProceedings{zhang:2008,
+  author    = {Zhang, Hao  and  Quirk, Chris  and  Moore, Robert C.  and  Gildea, Daniel},
+  title     = {{Bayesian} Learning of Non-Compositional Phrases with Synchronous Parsing},
+  booktitle = {Proceedings of ACL-08: HLT},
+  month     = jun,
+  year      = {2008},
+  address   = {Columbus, Ohio},
+  pages     = {97--105},
+}
+
+@inproceedings{blunsom:nips2008,
+  author    = {Phil Blunsom and
+               Trevor Cohn and
+               Miles Osborne},
+  title     = {Bayesian Synchronous Grammar Induction},
+  booktitle = {Advances in Neural Information Processing Systems 21, Proceedings of the Twenty-Second Annual Conference on Neural Information Processing Systems},
+  pages     = {161--168},
+  editor    = {Daphne Koller and
+               Dale Schuurmans and
+               Yoshua Bengio and
+               L{\'e}on Bottou},
+  publisher = {MIT Press},
+  year      = {2009},
+}
+
+
 @string{acl-1989 = {27th Annual Meeting of the Association for Computational Linguistics (ACL-1989)}}
 @string{acl-1989-address = {Vancouver, British Columbia, Canada}}
 @string{acl-1995 = {33rd Annual Meeting of the Association for Computational Linguistics (ACL-1995)}}
 @string{acl-1995-address = {Cambridge, Massachusetts}}
 @string{acl-1996 = {34rd Annual Meeting of the Association for Computational Linguistics (ACL-1996)}}
 @string{acl-1996-address = {Santa Cruz, California}}
 @string{acl-1997 = {35th Annual Meeting of the Association for Computational Linguistics (ACL-1997)}}
 @string{acl-1997-address = {Madrid, Spain}}
 @string{acl-1998 = {36th Annual Meeting of the Association for Computational Linguistics and 17th International Conference on Computational Linguistics (ACL-CoLing-1998)}}
 @string{acl-1998-address = {Montreal, Canada}}
 @string{acl-1999 = {Proceedings of the 37th Annual Meeting of the Association for Computational Linguistics (ACL)}}
 @string{acl-1999-address = {College Park, Maryland}}
 @string{acl-2000 = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL-2000)}}
 @string{acl-2000-address = {Hong Kong}}
 @string{acl-2001 = {Proceedings of the 39th Annual Meeting of the Association for Computational Linguistics (ACL-2001)}}
 @string{acl-2001-address = {Toulouse, France}}
 @string{acl-2002 = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL-2002)}}
 @string{acl-2002-address = {Philadelphia, Pennsylvania}}
 @string{acl-2003 = {Proceedings of the 41st Annual Meeting of the Association for Computational Linguistics (ACL-2003)}}
 @string{acl-2003-address = {Sapporo, Japan}}
 @string{acl-2004 = {Proceedings of the 42nd Annual Meeting of the Association for Computational Linguistics (ACL-2004)}}
 @string{acl-2004-address = {Barcelona, Spain}}
 @string{acl-2005 = {Proceedings of the 43rd Annual Meeting of the Association for Computational Linguistics (ACL-2005)}}
 @string{acl-2005-address = {Ann Arbor, Michigan}}
 @string{acl-2006 = {Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the Association for Computational Linguistics (ACL-CoLing-2006)}}
 @string{acl-2006-address = {Sydney, Australia}}
 @string{acl-2007 = {Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics (ACL-2007)}}
 @string{acl-2007-address = {Prague, Czech Republic}}
 @string{acl-2008 = {Proceedings of the 46th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}}
 @string{acl-2008-address = {Colmbus, Ohio}}
 @string{acl-2009-address = {Singapore}}
 @string{amta-2002 = {Proceedings of the 5th Biennial Conference of the Association for Machine Translation in the Americas (AMTA-2002)}}
 @string{amta-2002-address = {Tiburon, California}}
 @string{amta-2004 = {Proceedings of the 6th Biennial Conference of the Association for Machine Translation in the Americas (AMTA-2004)}}
 @string{amta-2004-address = {Washington DC}}
 @string{amta-2006 = {Proceedings of the 7th Biennial Conference of the Association for Machine Translation in the Americas (AMTA-2006)}}
 @string{amta-2006-address = {Cambridge, Massachusetts}}
 @string{amta-2008 = {Proceedings of the 8th Biennial Conference of the Association for Machine Translation in the Americas (AMTA-2008)}}
 @string{amta-2008-address = {Honolulu, Hawaii}}
 @string{coling-2008 = {Proceedings of the 22nd International Conference on Computational Linguistics (COLING-2008)}}
 @string{coling-2008-address = {Manchester, England}}
 @string{eacl-1989 = {4th Conference of the European Chapter of the Association for Computational Linguistics (EACL-1989)}}
 @string{eacl-1989-address = {Manchester, England}}
 @string{eacl-2003 = {10th Conference of the European Chapter of the Association for Computational Linguistics (EACL-2003)}}
 @string{eacl-2003-address = {Budapest, Hungary}}
 @string{eacl-2006 = {11th Conference of the European Chapter of the Association for Computational Linguistics (EACL-2006)}}
 @string{eacl-2006-address = {Trento, Italy}}
 @string{eacl-2009 = {12th Conference of the European Chapter of the Association for Computational Linguistics (EACL-2009)}}
 @string{eacl-2009-address = {Athens, Greece}}
 @string{emnlp-2000 = {2000 Joint SIGDAT Conference on Empirical Methods in Natural Language Processing and Very Large Corpora}}
 @string{emnlp-2000-address = {Hong Kong}}
 @string{emnlp-2001 = {Proceedings of the 2001 Conference on Empirical Methods in Natural Language Processing (EMNLP-2001)}}
 @string{emnlp-2001-address = {Pittsburgh, Pennsylvania}}
 @string{emnlp-2002 = {Proceedings of the 2002 Conference on Empirical Methods in Natural Language Processing (EMNLP-2002)}}
 @string{emnlp-2002-address = {Philadelphia, Pennsylvania}}
 @string{emnlp-2003 = {Proceedings of the 2003 Conference on Empirical Methods in Natural Language Processing (EMNLP-2003)}}
 @string{emnlp-2003-address = {Sapporo, Japan}}
 @string{emnlp-2004 = {Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing (EMNLP-2004)}}
 @string{emnlp-2004-address = {Barcelona, Spain}}
 @string{emnlp-2005 = {Proceedings of the 2005 Conference on Empirical Methods in Natural Language Processing (EMNLP-2005)}}
 @string{emnlp-2005-address = {Vancouver, British Columbia., Canada}}
 @string{emnlp-2006 = {Proceedings of the 2006 Conference on Empirical Methods in Natural Language Processing (EMNLP-2006)}}
 @string{emnlp-2006-address = {Sydney, Australia}}
 @string{emnlp-2007 = {Proceedings of the 2007 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CoNLL)}}
 @string{emnlp-2007-address = {Prague, Czech Republic}}
 @string{emnlp-2008 = {Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing (EMNLP-2008)}}
 @string{emnlp-2008-address = {Honolulu, Hawaii}}
 @string{emnlp-2009 = {Proceedings of the 2009 Conference on Empirical Methods in Natural Language Processing (EMNLP-2009)}}
 @string{emnlp-2009-address = {Singapore}}
 @string{hlt-2002 = {Proceedings of Second International Conference on Human Language Technology Research (HLT-02)}}
 @string{hlt-2002-address = {San Diego}}
 @string{hlt-naacl-2003 = {Proceedings of the Human Language Technology Conference of the North American chapter of the Association for Computational Linguistics (HLT/NAACL-2003)}}
 @string{hlt-naacl-2003-address = {Edmonton, Alberta}}
 @string{hlt-naacl-2004 = {Proceedings of the Human Language Technology Conference of the North American chapter of the Association for Computational Linguistics (HLT/NAACL-2004)}}
 @string{hlt-naacl-2004-address = {Boston, Massachusetts}}
 @string{hlt-naacl-2006 = {Proceedings of the Human Language Technology Conference of the North American chapter of the Association for Computational Linguistics (HLT/NAACL-2006)}}
 @string{hlt-naacl-2006-address = {New York, New York}}
 @string{hlt-naacl-2007 = {Proceedings of the Human Language Technology Conference of the North American chapter of the Association for Computational Linguistics (HLT/NAACL-2007)}}
 @string{hlt-naacl-2007-address = {Rochester, New York}}
 @string{hlt-naacl-2009-address = {Boulder, Colorado}}
 @string{iwpt = {Proceedings of the International Workshop on Parsing Technologies}}
 @string{iwpt-2005-address = {Vancouver, BC, Canada}}
 @string{iwslt = {Proceedings of the International Workshop on Spoken Language Technology}}
 @string{kdd = {Proceeding of the ACM SIGKDD international conference on Knowledge discovery and data mining}}
 @string{kdd-2008-address = {New York}}
 @string{mt-summit-9-address = {New Orleans, Louisiana}}
 @string{naacl-2001 = {Second Meeting of the North American Chapter of the Association for Computational Linguistics}}
 @string{naacl-2001-address = {Pittsburgh, Pennsylvania}}
 @string{wmt = {Proceedings of the Workshop on Statistical Machine Translation}}
 
 @inproceedings{Chiang2005,
diff --git a/report/np_clustering.tex b/report/np_clustering.tex
index 002877b5..17ff31a4 100644
--- a/report/np_clustering.tex
+++ b/report/np_clustering.tex
@@ -3,27 +3,28 @@
 
 \chapter{Nonparametric Models}
 
-In this chapter we describe several closely related Bayesian nonparametric models for inducing categories in a synchronous context-free grammar.  Our nonparametric models are variations on Latent Dirichlet Allocation (LDA) model of \cite{blei:2003}.  Rather than modeling sentences (or sentence pairs), we assume that rule extraction heuristics determine the set of valid constituents and grammar rules, and so our task is only to determine the category labels.  As discussed in the previous chapter, we make the critical assumption that each phrase (or pair), $\p$, can be clustered on the basis of the contexts it occurs in.  We therefore define a generative model of a corpus that consists of collections of contexts (one context collection for each phrase pair type).
+In this chapter we describe a Bayesian nonparametric model for inducing categories in a synchronous context-free grammar.  As discussed in Chapter~\ref{chapter:setup}, we hypothesize that each phrase pair, $\p$, can be clustered on the basis of the contexts it occurs in.  Using this as our starting point, we define a generative model where contexts are generated by the (latent) category type of the phrases they occur in.  In contrast to most prior work using Bayesian models for synchronous grammar induction \citep{blunsom:nips2008,blunsom:acl2009,zhang:2008}, we do not model parallel sentence pairs directly.  Rather, we assume that our corpus is a \emph{collection of contexts} (grouped according to the phrases they occur in), where each context is conditionally independent of the others, given the type of the category it surrounds.  The models used here are thus variations on the Latent Dirichlet Allocation (LDA) model of \cite{blei:2003}.
 
-\section{Model}
+In Section~\ref{sec:npmodel} we describe the basic structure of our nonparametric models as well as how inference was carried out.
 
-The high-level structure of our model is as follows: each observed phrase (pair), $\p$, consists of a finite mixture of categories, $\theta_{\p}$.  The list of contexts $C_{\p}$ is generated as follows.  A category type $z_i$ is drawn from $\theta_{\p}$, and this generates the observed context, $\textbf{c}_i$, according to a category-specific distribution over contexts types, $\phi_{z_i}$.  Since we do not know the values of $\theta_{\p}$ and $\phi_z$, we place priors on the distributions, to reflect our prior beliefs about the shape these distributions should have and infer their values from the data we can observe.  Specifically, our {\emph a priori} expectation is that both parameters will be relatively peaked, since each phrase, $\p$, should relatively unambiguous belong to particular category, and each category to generate a relatively small number of context strings, $\textbf{c}$.
+\section{Model}
+\label{sec:npmodel}
 
-To encode these prior beliefs, we make use of Pitman-Yor processes \citep{pitman:1997}, which can capture these intuitions and which have already been demonstrated to be particularly effective models for language \citep{teh:2006,goldwater:2006}.
+This section describes the details of the phrase clustering model.  Each observed phrase (pair), $\p$, is characterized by a finite mixture of categories, $\theta_{\p}$.  The collection of contexts for each phrase, $C_{\p}$, is generated as follows.  A category type $z_i$ is drawn from $\theta_{\p}$, and this generates the observed context, $\textbf{c}_i$, according to a category-specific distribution over context types, $\phi_{z_i}$.  Since we do not know the values of $\theta_{\p}$ and $\phi_z$, we place priors on the distributions, to reflect our prior beliefs about the shape these distributions should have and infer their values from the data we can observe.  Specifically, our \emph{a priori} expectation is that both parameters will be relatively peaked, since each phrase, $\p$, should relatively unambiguously belong to a particular category, and each category should generate a relatively small number of context strings, $\textbf{c}$. To encode these intuitions, we make use of Pitman-Yor processes \citep{pitman:1997}, which have already been demonstrated to be particularly effective models for language \citep{teh:2006,goldwater:2006}.
 
-Our models assume a fixed number of categories, $K$. The category type, $z \in \{ 1 , 2 , \ldots , K \}$, is generated from a PYP with a uniform base distribution:
+Our model assumes a fixed number of categories, $K$. The category type, $z \in \{ 1 , 2 , \ldots , K \}$, is generated from a PYP with a uniform base distribution:
 \begin{align*}
 z &| \p & \sim \theta_{\p} \\
 \theta_{\p} &| a_{\p},b_{\p},K & \sim \textrm{PYP}(a_{\p},b_{\p},\frac{1}{K})
 \end{align*}
-\noindent Alternatively, we used hierarchical PYP process which shares statistics about the use of categories across phrases:
+\noindent We also define a variant of the model with a hierarchical prior on the distribution over categories for a phrase.  This prior shares statistics about category use across phrases, encouraging a more peaked distribution of categories:
 \begin{align*}
 z &| \p & \sim \theta_{\p} \\
 \theta_{\p} &| a_{\p},b_{\p} & \sim \textrm{PYP}(a_{\p},b_{\p},\theta_0) \\
 \theta_0 &| a_0,b_0,K & \sim \textrm{PYP}(a_0,b_0,\frac{1}{K})
 \end{align*}
 
-\noindent Each category $z$ token then generates the context $\textbf{c}_i$. We again model this using a PYP, which will tend to cluster commonly used contexts across phrases into a single category. Additionally, by using hierarchical PYPs, we can smooth highly specific contexts by backing off to less specific contexts (e.g., composed of fewer words or word classes).
+\noindent Now that we have described how category labels are generated, we describe how contexts are generated from the category.  We again model this process using a PYP. Not only does this model tend to favor solutions where contexts used repeatedly are clustered, but it also provides a natural way to do smoothing.  Since many contexts may be only infrequently observed in the training data, proper smoothing is crucial.  Specifically, we can smooth specific contexts by backing off to less specific contexts (e.g., composed of fewer words or word classes).
 
 The most basic version of our model uses a uniform base distribution over contexts. This model was most useful when generating contexts consisting of a single word or word class (i.e., $\textbf{c}=c_{-1}c_1$) in either the source or target language on either side.
 
diff --git a/report/setup.tex b/report/setup.tex
index b4f3f07d..e3357d3c 100644
--- a/report/setup.tex
+++ b/report/setup.tex
@@ -1,4 +1,5 @@
 \chapter{Experimental Setup}
+\label{chapter:setup}
 
 Our approach is based upon the popular and influential Hiero system \citep{chiang:2007} which uses a synchronous context free grammar (SCFG) to model translation. 
 This translation system uses only a single non-terminal symbol and therefore the system is inherently stateless.