1 files changed, 17 insertions, 16 deletions
diff --git a/report/np_clustering.tex b/report/np_clustering.tex
index 770a7da3..439bfdbe 100644
--- a/report/np_clustering.tex
+++ b/report/np_clustering.tex
@@ -99,6 +99,23 @@ Num. references & 16 & 4
 \label{tab:corpbtecur}
 \end{table}%
 
+\subsection{Features used}
+
+In addition to the language model and word penalty, we made use of the following features to score each rule $\textrm{Y} \rightarrow \langle \textbf{f},\textbf{e} \rangle$ used in a derivation.  Some features are meaningful only in the context of multi-category grammars.
+
+\begin{enumerate}
+\item The lexical translation log probability of the words in both phrases, $\log \textrm{\emph{lex}}(\textbf{e}|\textbf{f})$, as defined in \cite{Koehn2003}.
+\item The inverse lexical translation log probability, $\log \textrm{\emph{lex}}(\textbf{f}|\textbf{e})$.
+\item The log frequency of occurrence of the LHS category, $\log f(\textrm{Y})$. This feature always has a value of 0 in the single-category (baseline) system.
+\item The relative frequency of \textbf{e} given \textbf{f}, collapsing all non-terminals into the symbol X, $f_{\textbf{X}}(\textbf{e}|\textbf{f})$. This is equivalent to the relative frequency of the rule in the 1-category `Hiero' grammar.
+\item The inverse relative frequency, $f_{\textbf{X}}(\textbf{f}|\textbf{e})$.
+\item The relative frequency of $\langle \textbf{f}, \textbf{e} \rangle$ given Y, $f(\textbf{f}, \textbf{e} | \textrm{Y})$.
+\item The log rule count, $\log C(\textrm{Y} \rightarrow \langle \textbf{f},\textbf{e} \rangle)$.
+\item A constant feature with value 1 for every rule, which yields a count of the number of rules in the derivation.
+\end{enumerate}
+
+\noindent The weights of the above features were tuned using minimum error rate training (\textsc{mert}) to optimize 1-best \textsc{bleu} on a held-out development set.
+
 \subsection{Baseline and benchmark systems}
 
 We provide two baseline systems: a single-category system constructed using the procedure described by \cite{chiang:2007} and a system constructed by assigning categories to each phrasal occurrence in the training data.  Additionally, we provide a benchmark system using supervised English (target) language parse trees \citep{samt}.  Table~\ref{tab:npbaselines} summarizes these baseline conditions.
@@ -124,22 +141,6 @@ POS-only & 56.2 & 22.3 \\
 
 Because the margin of improvement from the 1-category baseline to the supervised condition is much more substantial in the Urdu-English condition than in the BTEC condition, some experiments were only carried out on Urdu.
 
-\subsection{Features in the multi-category systems}
-
-The features used in the baseline system to evaluate translation hypotheses were generalized to exploit the presence of category labels.  In addition to the language model and word penalty, we made use of the following features to score each rule $\textrm{Y} \rightarrow \langle \textbf{f},\textbf{e} \rangle$ in a derivation.
-
-\begin{enumerate}
-\item The lexical translation probability of the words in both phrases, $\textrm{{\emph lex}}(\textbf{e}|\textbf{f})$, as defined in \cite{Koehn2003}.
-\item The inverse lexical translation probability, $\textrm{{\emph lex}}(\textbf{f}|\textbf{e})$.
-\item The frequency of occurrence of the LHS category, $f(\textrm{Y})$.
-\item The relative frequency of \textbf{e} given \textbf{f}, collapsing all non-terminals into the symbol X, $f_{\textbf{X}}(\textbf{e}|\textbf{f})$. This is equivalent to the relative frequency of the rule in the 1-category `Hiero' grammar.
-\item The inverse relative frequency, $f_{\textbf{X}}(\textbf{f}|\textbf{e})$.
-\item The relative frequency of $\langle \textbf{f}, \textbf{e} \rangle$ given Y, $f(\textbf{f}, \textbf{e} | \textrm{Y})$.
-\item The log rule count, $\log C(\textrm{Y} \rightarrow \langle \textbf{f},\textbf{e} \rangle)$.
-\item A feature with value 1 (creates a count of the number of rules in the derivation).
-\end{enumerate}
-
-
 \subsection{Number of categories}
 
 \begin{table}[h]