Some more work on setup chapter. Nb. added some Chinese in UTF-8. Requires CJK package and the appropriate fonts to be installed. Tex-Live CJK does the trick.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@572 ec762483-ff6d-05da-a07a-a48fb63a330f
author: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-17 11:09:05 +0000
committer: trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-08-17 11:09:05 +0000
commit: 59586b8b10f40b36178bd17f01497a73001cec5e (patch)
tree: 6e86880a64f31b131a776662cb1da1c97c5dfdf2 /report/viewer
parent: f804499b85bee7ac5c37304412f8a3ae3da8f1e1 (diff)
4 files changed, 312 insertions, 0 deletions
diff --git a/report/viewer/PMingLiU.ttf b/report/viewer/PMingLiU.ttf
new file mode 100644
index 00000000..03e923bf
--- /dev/null
+++ b/report/viewer/PMingLiU.ttf
diff --git a/report/viewer/alignment.py b/report/viewer/alignment.py
new file mode 100644
index 00000000..5fe03734
--- /dev/null
+++ b/report/viewer/alignment.py
@@ -0,0 +1,162 @@
+class Alignment:
+    """Word alignment between two tokenised sentences.
+
+    swords and twords hold the two word sequences.  align is a dict keyed
+    by (swords index, twords index) pairs whose values are confidence
+    labels such as SURE or POSSIBLE.
+    """
+    SURE, POSSIBLE = 'S', 'P'
+
+    def __init__(self, swords, twords, align):
+        self.swords, self.twords, self.align = swords, twords, align
+
+    def reverse(self):
+        """Return a new Alignment with the two sides (and every link) swapped."""
+        flipped = dict(((to, frm), conf)
+                       for (frm, to), conf in self.align.items())
+        return Alignment(self.twords, self.swords, flipped)
+
+    def merge(self, other):
+        """Combine this alignment with another over the same sentence pair.
+
+        A link found in both alignments becomes SURE; a link found in only
+        one of them becomes POSSIBLE.  The confidence labels stored in the
+        two inputs are not consulted.
+        """
+        assert self.swords == other.swords
+        assert self.twords == other.twords
+
+        # Start with every link of self as POSSIBLE, then upgrade the ones
+        # that other also contains and add the ones only other has.
+        merged = dict.fromkeys(self.align, Alignment.POSSIBLE)
+        for link in other.align:
+            if link in merged:
+                merged[link] = Alignment.SURE
+            else:
+                merged[link] = Alignment.POSSIBLE
+
+        return Alignment(self.swords, self.twords, merged)
+
+    def __repr__(self):
+        fields = (self.swords, self.twords, self.align)
+        return 'Alignment(swords=%s, twords=%s, align=%s)' % fields
+
+def read_pharaoh_text(infile):
+    """Read the next line of infile and return its whitespace-separated tokens."""
+    line = infile.readline()
+    # split() with no separator already discards leading/trailing whitespace.
+    return line.split()
+
+def parse_pharaoh_align(text):
+    """Parse whitespace-separated 'i-j' pairs into a dict of SURE links.
+
+    Each pair becomes the key (int(i), int(j)) with value Alignment.SURE.
+    """
+    links = {}
+    for pair in text.split():
+        frm, to = [int(idx) for idx in pair.split('-')]
+        links[frm, to] = Alignment.SURE
+    return links
+
+def read_pharaoh_align(infile):
+    """Read the next line of infile and parse it as 'i-j' alignment pairs.
+
+    Returns a dict mapping (i, j) integer pairs to Alignment.SURE.  The
+    parsing itself is delegated to parse_pharaoh_align so the two entry
+    points cannot drift apart.
+    """
+    return parse_pharaoh_align(infile.readline())
+
+def read_pharaoh_alignment(swfile, twfile, afile):
+    """Read one line from each of the three files and build an Alignment.
+
+    swfile and twfile supply the two word sequences, afile the 'i-j' links.
+    """
+    # Arguments are evaluated left to right, so the files are consumed in
+    # the order swfile, twfile, afile.
+    return Alignment(read_pharaoh_text(swfile),
+                     read_pharaoh_text(twfile),
+                     read_pharaoh_align(afile))
+    
+def read_giza_alignment(infile):
+    """Read one three-line alignment record from infile.
+
+    The first line is skipped.  The second line is split into the words
+    stored as swords.  The third line is a sequence of groups of the form
+    ``word ({ i j ... })``: each word (except the literal token 'NULL') is
+    appended to twords, and each number i inside the braces adds a SURE
+    link (i-1, index of that word).  Numbers that appear before any word
+    has been collected are dropped.
+    """
+    infile.readline()  # first line of the record is not used
+    swords = infile.readline().split()
+    twords = []
+    als = {}
+    inside_braces = False
+    for token in infile.readline().split():
+        if inside_braces:
+            if token == '})':
+                inside_braces = False
+            elif twords:
+                # numbers in the record are 1-based
+                als[int(token) - 1, len(twords) - 1] = Alignment.SURE
+        elif token == '({':
+            inside_braces = True
+        elif token != 'NULL':
+            twords.append(token)
+    return Alignment(swords, twords, als)
+
+def read_naacl_aligns(infile):
+    """Read 'index frm to conf' lines and group the links by index.
+
+    Returns a list of dicts.  A new dict is started whenever the index
+    field differs from that of the previous line; each line contributes
+    the entry (frm-1, to-1) -> conf to the current dict.
+    """
+    sentences = []
+    previous = None
+    for line in infile:
+        index, frm, to, conf = line.rstrip().split()
+        current = int(index)
+        if current != previous:
+            sentences.append({})
+        # positions in the file are 1-based
+        sentences[-1][int(frm) - 1, int(to) - 1] = conf
+        previous = current
+    return sentences
+
+#
+# This phrase-extraction function largely mimics Pharaoh's phrase-extract
+# code. It also supports the option to not advance over NULL alignments.
+#
+
+def xextract_phrases(alignment, maxPhraseLength=None, advance=True):
+    """Generate the phrase pairs that are consistent with the SURE links.
+
+    alignment       -- object with swords, twords and an align dict keyed
+                       by (s, t) index pairs; only links whose value equals
+                       Alignment.SURE are taken into account.
+    maxPhraseLength -- maximum number of words on either side of a phrase
+                       pair; a false value means max(len(twords),
+                       len(swords)).
+    advance         -- when true, the source span is additionally widened,
+                       on both ends, over adjacent source words that have
+                       no SURE link (still subject to maxPhraseLength).
+
+    Yields (ss, es, st, et) tuples of inclusive indices: source words
+    ss..es paired with target words st..et.
+    """
+    T = len(alignment.twords)
+    S = len(alignment.swords)
+    if not maxPhraseLength:
+        maxPhraseLength = max(T, S)
+
+    # Per source word: number of SURE links.  Per target word: the source
+    # words it is SURE-linked to.
+    alignedCountS = [0] * S
+    alignedToT = [[] for _ in range(T)]
+    for (s, t), conf in alignment.align.items():
+        if conf == Alignment.SURE:
+            alignedCountS[s] += 1
+            alignedToT[t].append(s)
+
+    # Enumerate every target span st..et and project it onto the source.
+    for st in range(T):
+        for et in range(st, min(T, st + maxPhraseLength)):
+            minS = S   # larger than any valid source index
+            maxS = -1
+            # usedS[s] ends up as the number of SURE links of s that point
+            # OUTSIDE st..et.
+            usedS = alignedCountS[:]
+            for ti in range(st, et + 1):
+                for si in alignedToT[ti]:
+                    minS = min(minS, si)
+                    maxS = max(maxS, si)
+                    usedS[si] -= 1
+
+            if maxS < 0:
+                continue  # target span is not aligned to any source word
+            if maxS - minS >= maxPhraseLength:
+                continue  # projected source span is too long
+
+            # Consistency: no source word in minS..maxS (inclusive) may be
+            # linked to a target word outside st..et.  The range has to
+            # include maxS itself; one inclusive test over usedS covers
+            # everything a separate per-link scan of the same span would.
+            if any(usedS[si] > 0 for si in range(minS, maxS + 1)):
+                continue
+
+            if not advance:
+                yield (minS, maxS, st, et)
+                continue
+
+            # start point of source phrase may move left over unaligned words
+            ss = minS
+            while (ss >= 0 and
+                   ss > maxS - maxPhraseLength and  # within length limit
+                   (ss == minS or alignedCountS[ss] == 0)):  # unaligned
+                # end point of source phrase may move right over unaligned words
+                es = maxS
+                while (es < S and
+                       es < ss + maxPhraseLength and  # within length limit
+                       (es == maxS or alignedCountS[es] == 0)):  # unaligned
+                    yield (ss, es, st, et)
+                    es += 1
+                ss -= 1
+
diff --git a/report/viewer/display.py b/report/viewer/display.py
new file mode 100755
index 00000000..bf9d2a1f
--- /dev/null
+++ b/report/viewer/display.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+
+import sys, os, gzip, re
+import render, alignment
+
+import reportlab.rl_config
+reportlab.rl_config.warnOnMissingFontGlyphs = 0 
+from reportlab.pdfbase.ttfonts import TTFont
+from reportlab.pdfbase import pdfmetrics
+
+# Register the TrueType font used for the x-axis (source side) labels.
+pdfmetrics.registerFont(TTFont('PMingLiU', 'PMingLiU.ttf'))
+
+# The only command-line argument is the name of the PDF file to write.
+if len(sys.argv) < 2:
+    sys.stderr.write('usage: %s OUTPUT.pdf < alignments\n' % sys.argv[0])
+    sys.exit(1)
+
+doc = render.Document(sys.argv[1])
+
+# Each stdin line has the form "source ||| target ||| i-j i-j ...".
+for line in sys.stdin:
+    if not line.strip():
+        continue  # tolerate blank lines instead of failing on the unpack below
+    src, tgt, align = line.split(' ||| ')
+    src = src.split()
+    tgt = tgt.split()
+    align = alignment.parse_pharaoh_align(align)
+    doc.append(render.Alignment(src, tgt, align, 'PMingLiU', 'Helvetica', 8, 0.4))
+
+doc.render()
diff --git a/report/viewer/render.py b/report/viewer/render.py
new file mode 100644
index 00000000..0934c39c
--- /dev/null
+++ b/report/viewer/render.py
@@ -0,0 +1,128 @@
+from reportlab.pdfgen import canvas
+from reportlab.lib.colors import black, gray, white, magenta, Color
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.styles import getSampleStyleSheet
+from reportlab.lib.units import cm, inch
+from reportlab.platypus import SimpleDocTemplate, Spacer, Paragraph
+from reportlab.platypus.flowables import Flowable
+import re
+
+class Alignment(Flowable):
+    """Flowable that draws a word alignment as a grid of filled cells.
+
+    x_words label the columns and are drawn rotated above the grid;
+    y_words label the rows (first word at the top) and are drawn to the
+    right of the grid.  alignment maps (x, y) index pairs to a key of the
+    colours table.
+    """
+
+    def __init__(self, x_words, y_words, alignment, x_font, y_font, ptsize, unit, scale=True, colours=None):
+        """Store the drawing parameters.
+
+        x_font, y_font -- font names for the column and row labels
+        ptsize         -- label font size in points
+        unit           -- cell edge length in centimetres
+        scale          -- when true, wrap() rescales the drawing to the
+                          available width (by a factor of at most 1.5)
+        colours        -- dict from alignment value to either a Color (cell
+                          filled with a rectangle) or a pair of colours
+                          (cell split along its diagonal); defaults to
+                          'S' black, 'P' gray, 'N' magenta
+        """
+        self._x_words = x_words
+        self._y_words = y_words
+        self._alignment = alignment
+        self._unit = unit*cm
+        self._x_font = x_font
+        self._y_font = y_font
+        self._ptsize = ptsize
+        self._scale = 1
+        self._do_scale = scale
+        self._colours = colours
+        if not colours:
+            self._colours = {'S':black, 'P':gray, 'N':magenta}
+
+    def wrap(self, rw, rh):
+        """Return the (width, height) the drawing needs; rw is the available width.
+
+        rh is not consulted.  As a side effect the scale factor used by
+        draw() is updated when scaling is enabled.
+        """
+        xws = [self.canv.stringWidth(w, self._x_font, self._ptsize)
+               for w in self._x_words]
+        yws = [self.canv.stringWidth(w, self._y_font, self._ptsize)
+               for w in self._y_words]
+        # Grid extent plus room for the longest label on the other axis.
+        # "or [0]" keeps max() from raising when a word list is empty.
+        width = (len(self._x_words) + 0.22)* self._unit + max(yws or [0])
+        height = (len(self._y_words) + 0.22)* self._unit + max(xws or [0])
+
+        if self._do_scale:
+            self._scale = min(rw / width, 1.5)
+            width *= self._scale
+            height *= self._scale
+
+        return (width, height)
+
+    def draw(self):
+        """Paint the filled cells, the grid lines and both sets of labels."""
+        c = self.canv
+
+        X=len(self._x_words)
+        Y=len(self._y_words)
+
+        c.saveState()
+        c.scale(self._scale, self._scale)
+
+        for (x, y), conf in self._alignment.items():
+            col = self._colours[conf]
+            if isinstance(col, Color):
+                # single colour: fill the whole cell, without an outline
+                c.setFillColor(col)
+                c.rect((0.02 + x)*self._unit, (0.02+Y-y-1)*self._unit,
+                       self._unit, self._unit, 0, 1)
+            else:
+                # pair of colours: split the cell along the diagonal that
+                # runs from bottom-right to top-left
+                bl = (x*self._unit, (Y-y-1)*self._unit)
+                tl = (x*self._unit, (Y-y)*self._unit)
+                tr = ((x+1)*self._unit, (Y-y)*self._unit)
+                br = ((x+1)*self._unit, (Y-y-1)*self._unit)
+
+                # upper-right triangle in the first colour
+                p = c.beginPath()
+                p.moveTo(*br)
+                p.lineTo(*tr)
+                p.lineTo(*tl)
+                c.setFillColor(col[0])
+                c.drawPath(p, fill=1)
+                # lower-left triangle in the second colour
+                p = c.beginPath()
+                p.moveTo(*br)
+                p.lineTo(*bl)
+                p.lineTo(*tl)
+                c.setFillColor(col[1])
+                c.drawPath(p, fill=1)
+
+        c.setStrokeColor(black)
+        # Real lists (not map objects) so the coordinates can be indexed.
+        c.grid([(0.02+x)*self._unit for x in range(X+1)],
+               [(0.02+y)*self._unit for y in range(Y+1)])
+
+        # column labels, rotated by 60 degrees above the grid
+        c.setFont(self._x_font, self._ptsize)
+        c.setFillColor(black)
+        for x, word in enumerate(self._x_words):
+            c.saveState()
+            c.translate((x+0.52)*self._unit, (Y+0.22)*self._unit)
+            c.rotate(60)
+            c.drawString(0, 0, word)
+            c.restoreState()
+
+        # row labels, to the right of the grid
+        c.setFont(self._y_font, self._ptsize)
+        for y, word in enumerate(self._y_words):
+            c.drawString((X+0.22)*self._unit, (Y-y+0.42-1)*self._unit, word)
+
+        c.restoreState()
+
+class Document:
+    """Collects flowables and builds them into a file with SimpleDocTemplate."""
+
+    def __init__(self, filename):
+        self._story = []
+        self._doc = SimpleDocTemplate(filename)
+        self._styles = getSampleStyleSheet()
+
+    def append(self, flowable):
+        """Queue a flowable, followed by a 1 cm vertical spacer."""
+        gap = Spacer(1, 1*cm)
+        self._story += [flowable, gap]
+
+    def render(self):
+        """Build the document from everything queued, minus the final spacer."""
+        body = self._story[:-1]
+        self._doc.build(body)
+
+class Canvas:
+    """Renders a single flowable onto a page sized to fit that flowable."""
+
+    def __init__(self, filename):
+        self._filename = filename
+        # Scratch canvas, used by render() only to measure the flowable.
+        self._canvas = canvas.Canvas('.' + filename, A4)
+        self._size = A4
+        self._body = None
+
+    def append(self, flowable):
+        """Set the flowable to render, warning on stderr if one was already set."""
+        if self._body is not None:
+            # Imported here because this module has no file-level import of sys.
+            import sys
+            sys.stderr.write('WARNING: replacing existing flowable\n')
+        self._body = flowable
+
+    def render(self):
+        """Measure the flowable, then draw it on a page 2% larger and save it."""
+        # wrap() needs a canvas for its string-width queries.
+        self._body.canv = self._canvas
+        width, height = self._body.wrap(*self._size)
+        width *= 1.02
+        height *= 1.02
+
+        # Replace the scratch canvas with the real output canvas.
+        self._canvas = canvas.Canvas(self._filename, (width, height))
+        self._body.canv = self._canvas
+        self._body.draw()
+        self._canvas.save()
author	trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-08-17 11:09:05 +0000
committer	trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-08-17 11:09:05 +0000
commit	59586b8b10f40b36178bd17f01497a73001cec5e (patch)
tree	6e86880a64f31b131a776662cb1da1c97c5dfdf2 /report/viewer
parent	f804499b85bee7ac5c37304412f8a3ae3da8f1e1 (diff)