From 1080affc3484d7c5e5a0c3124273be8a7e76c2e0 Mon Sep 17 00:00:00 2001 From: "ccb@cs.jhu.edu" Date: Tue, 17 Aug 2010 20:07:57 +0000 Subject: Working on the SCFG section git-svn-id: https://ws10smt.googlecode.com/svn/trunk@581 ec762483-ff6d-05da-a07a-a48fb63a330f --- report/SCFGs.tex | 51 ++++++++++++++++++++++++++++++++--------- report/SCFGs/english-step0.pdf | Bin 0 -> 21396 bytes report/SCFGs/english-step1.pdf | Bin 0 -> 21961 bytes report/SCFGs/english-step2.pdf | Bin 0 -> 22319 bytes report/SCFGs/english-step3.pdf | Bin 0 -> 22999 bytes report/SCFGs/english-step4.pdf | Bin 0 -> 23985 bytes report/SCFGs/example-scfg.pdf | Bin 0 -> 28986 bytes report/SCFGs/urdu-input.pdf | Bin 0 -> 12765 bytes report/SCFGs/urdu-step0.pdf | Bin 0 -> 24914 bytes report/SCFGs/urdu-step1.pdf | Bin 0 -> 25488 bytes report/SCFGs/urdu-step2.pdf | Bin 0 -> 25868 bytes report/SCFGs/urdu-step3.pdf | Bin 0 -> 26522 bytes report/SCFGs/urdu-step4.pdf | Bin 0 -> 27517 bytes report/report.tex | 1 + 14 files changed, 41 insertions(+), 11 deletions(-) create mode 100644 report/SCFGs/english-step0.pdf create mode 100644 report/SCFGs/english-step1.pdf create mode 100644 report/SCFGs/english-step2.pdf create mode 100644 report/SCFGs/english-step3.pdf create mode 100644 report/SCFGs/english-step4.pdf create mode 100644 report/SCFGs/example-scfg.pdf create mode 100644 report/SCFGs/urdu-input.pdf create mode 100644 report/SCFGs/urdu-step0.pdf create mode 100644 report/SCFGs/urdu-step1.pdf create mode 100644 report/SCFGs/urdu-step2.pdf create mode 100644 report/SCFGs/urdu-step3.pdf create mode 100644 report/SCFGs/urdu-step4.pdf diff --git a/report/SCFGs.tex b/report/SCFGs.tex index 0002405d..3441c7db 100644 --- a/report/SCFGs.tex +++ b/report/SCFGs.tex @@ -1,22 +1,51 @@ \chapter{Synchronous context free grammars} \label{sec:scfg} -%\subsubsection*{Synchronous context free grammar} \label{sec:scfg} -\begin{figure}[t] + +The translation models used in this workshop are synchronous context free grammars 
(SCFGs). +SCFGs \cite{lewis68scfg} generalize context-free grammars to generate strings concurrently in two (or more) languages. A string pair is generated by applying a series of paired rewrite rules of the form, $X \rightarrow \langle \mathbf{e}, \mathbf{f}, \mathbf{a} \rangle$, where $X$ is a non-terminal, $\mathbf{e}$ and $\mathbf{f}$ are strings of terminals and non-terminals and $\mathbf{a}$ specifies a one-to-one alignment between non-terminals in $\mathbf{e}$ and $\mathbf{f}$. +In statistical machine translation, the two righthand sides of SCFG rules represent the source and target languages. The process of translation occurs by parsing the source sentence, which induces a parallel tree structure and translation in the target language \cite{chiang07hierarchical}. +Terminals are rewritten as pairs of strings of terminal symbols in the source and target languages. Additionally, one side of a terminal expansion may be the special symbol $\epsilon$, which indicates a null alignment which permits arbitrary insertions and deletions. +Figure \ref{fig:toy-scfg} gives an example SCFG between Urdu and English. Figure \ref{fig:toy-scfg-parse} shows how the SCFG is used to derive the translation of an input Urdu sentence. + + + + +\begin{figure} \begin{center} -\includegraphics[width=0.6\columnwidth]{example_derivation2.pdf} +\includegraphics[width=.6\linewidth]{SCFGs/example-scfg} \end{center} -\caption[Derivation]{An example SCFG derivation from a Chinese source sentence which yields the English sentence: {\em ``Brown arrived in Shanghai from Beijing late last night.''}. The non-terminal alignment $\mathbf{a}$ is specified by the variable subscripts.} -\label{fig:intro_example_derivation} +\caption{A toy example that illustrates an SCFG that can translate (romanized) Urdu into English for one sentence.}\label{fig:toy-scfg} \end{figure} -The translation models discussed explored in this workshop are based on synchronous grammars. 
-Here we provide a short definition of the formalism we've employed: synchronous context free grammar (SCFG). -A synchronous context free grammar (SCFG, \cite{lewis68scfg}) generalizes context-free grammars to generate strings concurrently in two (or more) languages. A string pair is generated by applying a series of paired rewrite rules of the form, $X \rightarrow \langle \mathbf{e}, \mathbf{f}, \mathbf{a} \rangle$, where $X$ is a non-terminal, $\mathbf{e}$ and $\mathbf{f}$ are strings of terminals and non-terminals and $\mathbf{a}$ specifies a one-to-one alignment between non-terminals in $\mathbf{e}$ and $\mathbf{f}$. -In the context of SMT, by assigning the source and target languages to the respective sides of a probabilistic SCFG it is possible to describe translation as the process of parsing the source sentence, which induces a parallel tree structure and translation in the target language \cite{chiang07hierarchical}. -Terminal are rewritten as pairs of strings of terminal symbols in the source and target languages. Additionally, one side of a terminal expansion may be the special symbol $\epsilon$, which indicates a null alignment which permits arbitrary insertions and deletions. -Figure \ref{fig:intro_example_derivation} is an example derivation for Chinese to English translation using an SCFG of the form that I propose to learn using non-parametric Bayesian models. + +\begin{figure} +\begin{tabular}{lll} +\multicolumn{3}{>{\columncolor[rgb]{0.95,0.95,0.75}}c}{The input is an Urdu sentence which is initially unanalyzed.}\\ +\includegraphics[width=.45\linewidth]{SCFGs/urdu-input} & & +\\ \hline +\multicolumn{3}{>{\columncolor[rgb]{0.95,0.95,0.75}}c}{Here all of the terminal symbols receive non-terminal labels. 
The English words are in Urdu order.}\\ +\includegraphics[width=.45\linewidth]{SCFGs/urdu-step0} & & +\includegraphics[width=.45\linewidth]{SCFGs/english-step0} \\ \hline +\multicolumn{3}{>{\columncolor[rgb]{0.95,0.95,0.75}}c}{The PP rule reorders the Urdu postpositional phrase to be a prepositional phrase on the English side.}\\\includegraphics[width=.45\linewidth]{SCFGs/urdu-step1} & & +\includegraphics[width=.45\linewidth]{SCFGs/english-step1} \\ \hline\multicolumn{3}{>{\columncolor[rgb]{0.95,0.95,0.75}}c}{The English auxiliary verb and main verb get reordered with the application of the VP rule.}\\ +\includegraphics[width=.45\linewidth]{SCFGs/urdu-step2} & & +\includegraphics[width=.45\linewidth]{SCFGs/english-step2} \\ \hline +\multicolumn{3}{>{\columncolor[rgb]{0.95,0.95,0.75}}c}{This VP rule moves the English verb from the Urdu verb-final position to its correct place before the PP.}\\ +\includegraphics[width=.45\linewidth]{SCFGs/urdu-step3} & & \includegraphics[width=.45\linewidth]{SCFGs/english-step3} \\ \hline +\multicolumn{3}{>{\columncolor[rgb]{0.95,0.95,0.75}}c}{Applying the S rule means that we have a complete translation of the Urdu sentence.}\\ +\includegraphics[width=.45\linewidth]{SCFGs/urdu-step4} & & \includegraphics[width=.45\linewidth]{SCFGs/english-step4} +\end{tabular} +\caption{Using SCFGs as the underlying formalism means that the process of translation is one of parsing. This shows how an English sentence can be generated by parsing the Urdu sentence using the rules given in Figure \ref{fig:toy-scfg}.}\label{fig:toy-scfg-parse} +\end{figure} + + + + + + +% of the form that I propose to learn using non-parametric Bayesian models. The generative story is as follows. In the beginning was the grammar, in which we allow two types of rules: {\emph non-terminal} and {\emph terminal} expansions. 
diff --git a/report/SCFGs/english-step0.pdf b/report/SCFGs/english-step0.pdf new file mode 100644 index 00000000..e8964fba Binary files /dev/null and b/report/SCFGs/english-step0.pdf differ diff --git a/report/SCFGs/english-step1.pdf b/report/SCFGs/english-step1.pdf new file mode 100644 index 00000000..29f56f1b Binary files /dev/null and b/report/SCFGs/english-step1.pdf differ diff --git a/report/SCFGs/english-step2.pdf b/report/SCFGs/english-step2.pdf new file mode 100644 index 00000000..4afb06b7 Binary files /dev/null and b/report/SCFGs/english-step2.pdf differ diff --git a/report/SCFGs/english-step3.pdf b/report/SCFGs/english-step3.pdf new file mode 100644 index 00000000..1926b4dd Binary files /dev/null and b/report/SCFGs/english-step3.pdf differ diff --git a/report/SCFGs/english-step4.pdf b/report/SCFGs/english-step4.pdf new file mode 100644 index 00000000..ccc13807 Binary files /dev/null and b/report/SCFGs/english-step4.pdf differ diff --git a/report/SCFGs/example-scfg.pdf b/report/SCFGs/example-scfg.pdf new file mode 100644 index 00000000..bc7e411e Binary files /dev/null and b/report/SCFGs/example-scfg.pdf differ diff --git a/report/SCFGs/urdu-input.pdf b/report/SCFGs/urdu-input.pdf new file mode 100644 index 00000000..a367dbb8 Binary files /dev/null and b/report/SCFGs/urdu-input.pdf differ diff --git a/report/SCFGs/urdu-step0.pdf b/report/SCFGs/urdu-step0.pdf new file mode 100644 index 00000000..31a225b9 Binary files /dev/null and b/report/SCFGs/urdu-step0.pdf differ diff --git a/report/SCFGs/urdu-step1.pdf b/report/SCFGs/urdu-step1.pdf new file mode 100644 index 00000000..5d386332 Binary files /dev/null and b/report/SCFGs/urdu-step1.pdf differ diff --git a/report/SCFGs/urdu-step2.pdf b/report/SCFGs/urdu-step2.pdf new file mode 100644 index 00000000..13cd73dc Binary files /dev/null and b/report/SCFGs/urdu-step2.pdf differ diff --git a/report/SCFGs/urdu-step3.pdf b/report/SCFGs/urdu-step3.pdf new file mode 100644 index 00000000..15636844 Binary files 
/dev/null and b/report/SCFGs/urdu-step3.pdf differ diff --git a/report/SCFGs/urdu-step4.pdf b/report/SCFGs/urdu-step4.pdf new file mode 100644 index 00000000..6a5fa2a7 Binary files /dev/null and b/report/SCFGs/urdu-step4.pdf differ diff --git a/report/report.tex b/report/report.tex index b0f8e223..98a3ff0a 100755 --- a/report/report.tex +++ b/report/report.tex @@ -10,6 +10,7 @@ \usepackage{epic,eepic} \usepackage{boxedminipage} \usepackage{fancybox} +\usepackage{colortbl} \usepackage[square]{natbib} \usepackage{epsfig} %\usepackage{subfig} -- cgit v1.2.3