From 59586b8b10f40b36178bd17f01497a73001cec5e Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Tue, 17 Aug 2010 11:09:05 +0000 Subject: Some more work on setup chapter. Nb. added some Chinese in UTF-8. Requires CJK package and the appropriate fonts to be installed. Tex-Live CJK does the trick. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@572 ec762483-ff6d-05da-a07a-a48fb63a330f --- report/align.pdf | Bin 0 -> 17220 bytes report/report.tex | 8 +++ report/setup.tex | 51 ++++++++++++-- report/viewer/PMingLiU.ttf | Bin 0 -> 19389344 bytes report/viewer/alignment.py | 162 +++++++++++++++++++++++++++++++++++++++++++++ report/viewer/display.py | 22 ++++++ report/viewer/render.py | 128 +++++++++++++++++++++++++++++++++++ 7 files changed, 365 insertions(+), 6 deletions(-) create mode 100644 report/align.pdf create mode 100644 report/viewer/PMingLiU.ttf create mode 100644 report/viewer/alignment.py create mode 100755 report/viewer/display.py create mode 100644 report/viewer/render.py (limited to 'report') diff --git a/report/align.pdf b/report/align.pdf new file mode 100644 index 00000000..fd9a9965 Binary files /dev/null and b/report/align.pdf differ diff --git a/report/report.tex b/report/report.tex index 30ed260d..dd286dab 100755 --- a/report/report.tex +++ b/report/report.tex @@ -16,6 +16,14 @@ \usepackage{subfigure} \usepackage{booktabs} +\usepackage[encapsulated]{CJK} +\usepackage{ucs} +\usepackage[utf8x]{inputenc} +% use one of bsmi(trad Chinese), gbsn(simp Chinese), min(Japanese), mj(Korean); see: +% /usr/share/texmf-dist/tex/latex/cjk/texinput/UTF8/*.fd +\newcommand{\cntext}[1]{\begin{CJK}{UTF8}{gbsn}#1\end{CJK}} + + \oddsidemargin 0mm \evensidemargin 5mm \topmargin -20mm diff --git a/report/setup.tex b/report/setup.tex index abfde372..edb0bbb6 100644 --- a/report/setup.tex +++ b/report/setup.tex @@ -5,11 +5,13 @@ This translation system uses only a single non-terminal symbol and therefore the However, we know that using a richer set of non-terminals can 
greatly improve translation, as evidenced by the improvements obtained by the SAMT system \cite{samt}, which augments a Hiero-style SCFG model with syntactic labels. This is best explained in terms of the generalisation capability: a single-category grammar can create all manner of string pairs, the majority of which are nonsensical and ungrammatical, while a model with syntactic categories inherently limits the sets of string pairs to those which are grammatical (largely). This can be seen from the following example rules, showing how rules can be combined to arrive at ungrammatical string pairs. -\begin{align*} -X &\rightarrow \langle \mbox{does not}~X, \mbox{ne}~X~\mbox{pas} \rangle \\ -X &\rightarrow \langle \mbox{cat}, \mbox{chat} \rangle \\ -X &\Rightarrow \langle \mbox{does not cat}, \mbox{ne cat pas} \rangle -\end{align*} +\begin{align} +X &\rightarrow \langle \mbox{does not}~X, \mbox{ne}~X~\mbox{pas} \rangle \label{eq:nepas}\\ +X &\rightarrow \langle \mbox{cat}, \mbox{chat} \rangle \label{eq:chat} \\ +X &\Rightarrow^{\ref{eq:nepas},\ref{eq:chat}} \langle \mbox{does not cat}, \mbox{ne chat pas} \rangle +\nonumber +\end{align} +If the non-terminals in the above rules were replaced with, e.g., parts of speech, such that rule \ref{eq:nepas} required a verb instead of an X in its right-hand side, and the rule \ref{eq:chat} rewrote a noun, then this ungrammatical string pair would no longer be licensed. As such, the single-category model licenses all manner of word-salad output, thus relying on the language model to salvage a coherent sentence from these options. In contrast, the set of translation options for the grammar with syntactic labels is much smaller and more coherent, and thus the language model has less heavy lifting to do. @@ -30,7 +32,44 @@ Our bilingual translation setting differs from the earlier monolingual settings \section{Clustering Configuration} -Notion of context. +It still remains to define what we mean by context.
While we could choose the entire sentence or even document in which the phrase appears, we adopt a more local definition of context. We use a window of one or two words preceding the phrase and one or two words following. This local context is still highly informative about the syntactic role of the phrase, but given its small size we expect to observe the same context many times in a small corpus. That is, the sparsity of the data should no lo + +It still remains to define what we mean by context. While we could choose the entire sentence or even document in which the phrase appears, we adopt a more local definition of context. We use a window of one or two words preceding the phrase and one or two words following. This local context is still highly informative about the syntactic role of the phrase, but given its small size we expect to observe the same context many times in a small corpus. That is, the sparsity of the data should no longer be a serious problem, as it would be for full sentences, for example. + +The input to the clustering system consists of bilingual phrases in their bilingual contexts, as extracted using the standard phrase-based grammar extraction pipeline. However, we choose to primarily perform monolingual clustering by discarding one side of the data, typically the source. Focusing our attention on the target side of the corpus replicates the configuration which is known to work best for syntactic translation. The target-side syntax is more informative than source-side syntax, as it better models the target language and also avoids an errorful projection step onto the target (due to the grammars of the two languages not being isomorphic). On the flip side, source-side syntax could be more useful in pruning the beam search for decoding, where we can use the source parse to prune away improbable items before translating into the target language.
We will determine experimentally whether better pruning or better modelling of the target grammar is more important for translation. +nger be a serious problem, as it would be for full sentences, for example. + +The input to the clustering system consists of bilingual phrases in their bilingual contexts, as extracted using the standard phrase-based grammar extraction pipeline. This is illustrated in Figure~\ref{fig:extraction}. Instead of using the bilingual data directly, we choose to primarily perform monolingual clustering by discarding one side of the data, typically the source. Focusing our attention on the target side of the corpus replicates the configuration which is known to work best for syntactic translation. The target-side syntax is more informative than source-side syntax, as it better models the target language, thus better informing reordering and lexical selection. Target-side syntax avoids an errorful projection step onto the target, due to the grammars of the two languages rarely being even approximately isomorphic. On the flip side, source-side syntax could be more useful in pruning the beam search for decoding, where we can use the source parse to prune away improbable items before translating into the target language. We will determine experimentally whether better pruning or better modelling of the target grammar is more important for translation. + +\begin{CJK}{UTF8}{gbsn} +\begin{figure} +% input: 今天 有 棒球 比赛 吗 ? ||| are there any baseball games today ? ||| 1-0 1-1 1-2 2-3 3-4 0-5 4-6 5-6 +% from btec/split.zh-en.al line 26 +\centering +\subfigure[Example word-aligned sentence pair]{\hfill \includegraphics{align} \hfill} + +\subfigure[Some bilingual phrases and contexts extracted from (a)]{ +\centering +\begin{tabular}{rcl} +\toprule +$<$s$>$ $<$s$>$ & are there any baseball games today & ? $<$/s$>$ \\ +$<$s$>$ $<$s$>$ & 今天 有 棒球 比赛 & 吗 ? \\ +\midrule +baseball games & today & ?
$<$/s$>$ \\ +$<$s$>$ $<$s$>$ & 今天 & 有 棒球 \\ +\midrule +$<$s$>$ $<$s$>$ & are there any & baseball games \\ +$<$s$>$ 今天 & 有 & 棒球 比赛 \\ +\midrule +any baseball & games & today ? \\ +有 棒球 & 比赛 & 吗 ? \\ +\bottomrule +\end{tabular}} +\caption{Example showing how a sentence pair gives rise to bilingual phrases and their contexts. These serve as input to the clusterer, typically by discarding the source side and processing the target.} +\label{fig:extraction} +\end{figure} +\end{CJK} + Mono/source/target/bi, words/classes/POS. Give example. Notation. diff --git a/report/viewer/PMingLiU.ttf b/report/viewer/PMingLiU.ttf new file mode 100644 index 00000000..03e923bf Binary files /dev/null and b/report/viewer/PMingLiU.ttf differ diff --git a/report/viewer/alignment.py b/report/viewer/alignment.py new file mode 100644 index 00000000..5fe03734 --- /dev/null +++ b/report/viewer/alignment.py @@ -0,0 +1,162 @@ +class Alignment: + SURE, POSSIBLE = 'S', 'P' + + def __init__(self, swords, twords, align): + self.swords = swords + self.twords = twords + self.align = align + + def reverse(self): + als = {} + for (frm, to), conf in self.align.items(): + als[to, frm] = conf + return Alignment(self.twords, self.swords, als) + + def merge(self, other): + assert self.swords == other.swords + assert self.twords == other.twords + + als = {} + for frm, to in self.align.keys(): + als[frm, to] = Alignment.POSSIBLE + + for frm, to in other.align.keys(): + if (frm, to) in als: + als[frm, to] = Alignment.SURE + else: + als[frm, to] = Alignment.POSSIBLE + + return Alignment(self.swords, self.twords, als) + + def __repr__(self): + return 'Alignment(swords=%s, twords=%s, align=%s)' % (self.swords, self.twords, self.align) + +def read_pharaoh_text(infile): + return infile.readline().strip().split() + +def parse_pharaoh_align(text): + als = {} + for part in text.strip().split(): + frm, to = map(int, part.split('-')) + als[frm, to] = Alignment.SURE + return als + +def read_pharaoh_align(infile): + als = 
{} + for part in infile.readline().strip().split(): + frm, to = map(int, part.split('-')) + als[frm, to] = Alignment.SURE + return als + +def read_pharaoh_alignment(swfile, twfile, afile): + sw = read_pharaoh_text(swfile) + tw = read_pharaoh_text(twfile) + als = read_pharaoh_align(afile) + return Alignment(sw, tw, als) + +def read_giza_alignment(infile): + infile.readline() # ignore + swords = infile.readline().strip().split() + twords = [] + als = {} + state = 0 + for token in infile.readline().strip().split(): + if state == 0: + if token != 'NULL': + if token != '({': + twords.append(token) + else: + state = 1 + elif state == 1: + if token != '})': + if twords: + als[int(token)-1, len(twords)-1] = Alignment.SURE + else: + state = 0 + return Alignment(swords, twords, als) + +def read_naacl_aligns(infile): + aligns = [] + last = None + for line in infile: + index, frm, to, conf = line.rstrip().split() + if int(index) != last: + aligns.append({}) + aligns[-1][int(frm)-1, int(to)-1] = conf + last = int(index) + return aligns + +# +# This phrase-extraction function largely mimics Pharaoh's phrase-extract +# code. It also supports the option to not advance over NULL alignments. 
+# + +def xextract_phrases(alignment, maxPhraseLength=None, advance=True): + T = len(alignment.twords) + S = len(alignment.swords) + if not maxPhraseLength: + maxPhraseLength = max(T, S) + + alignedCountS = [0 for s in alignment.swords] + alignedToT = [[] for t in alignment.twords] + alignedToS = [[] for s in alignment.swords] + for (s, t), conf in alignment.align.items(): + if conf == Alignment.SURE: + alignedCountS[s] += 1 + alignedToT[t].append(s) + alignedToS[s].append(t) + + # check alignments for english phrase startT...endT + for st in range(T): + for et in range(st, min(T, st + maxPhraseLength)): + minS = 9999 + maxS = -1 + usedS = alignedCountS[:] + for ti in range(st, et+1): + for si in alignedToT[ti]: + #print 'point (%d, %d)' % (si, ti) + if simaxS: maxS = si + usedS[si] -= 1 + + #print 's projected (%d-%d, %d, %d)' % (minS, maxS, st, et) + if (maxS >= 0 and # aligned to any foreign words at all + maxS-minS < maxPhraseLength): # foreign phrase within limits + # check if foreign words are aligned to out of bound english words + out_of_bounds = False + for si in range(minS, maxS): + if usedS[si] > 0: + #print 'out of bounds:', si + out_of_bounds = True + break + + # Pharoah doesn't use this check, but I think it's required + if not out_of_bounds: + for s in range(minS, maxS+1): + for t in alignedToS[s]: + if not (st <= t <= et): + #print 'out of bounds2:', t,s + out_of_bounds = True + break + + #print 'doing it for (%d-%d, %d, %d)' % (minS, maxS, st, et) + if not out_of_bounds: + if advance: + #print 'attempting to advance' + # start point of foreign phrase may advance over unaligned + ss = minS + while (ss>=0 and + ss>maxS-maxPhraseLength and # within length limit + (ss==minS or alignedCountS[ss]==0)): # unaligned + # end point of foreign phrase may advance over unaligned + es = maxS + while (es>sys.stderr, 'WARNING: replacing existing flowable' + self._body = flowable + + def render(self): + self._body.canv = self._canvas + width, height = 
self._body.wrap(*self._size) + width *= 1.02 + height *= 1.02 + + self._canvas = canvas.Canvas(self._filename, (width, height)) + self._body.canv = self._canvas + self._body.draw() + self._canvas.save() -- cgit v1.2.3