diff options
| author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-17 11:09:05 +0000 | 
|---|---|---|
| committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-08-17 11:09:05 +0000 | 
| commit | 59586b8b10f40b36178bd17f01497a73001cec5e (patch) | |
| tree | 6e86880a64f31b131a776662cb1da1c97c5dfdf2 /report/viewer | |
| parent | f804499b85bee7ac5c37304412f8a3ae3da8f1e1 (diff) | |
Some more work on the setup chapter. NB: added some Chinese text in UTF-8, which requires the CJK package and the appropriate fonts to be installed; the TeX Live CJK package does the trick.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@572 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'report/viewer')
| -rw-r--r-- | report/viewer/PMingLiU.ttf | bin | 0 -> 19389344 bytes | |||
| -rw-r--r-- | report/viewer/alignment.py | 162 | ||||
| -rwxr-xr-x | report/viewer/display.py | 22 | ||||
| -rw-r--r-- | report/viewer/render.py | 128 | 
4 files changed, 312 insertions, 0 deletions
| diff --git a/report/viewer/PMingLiU.ttf b/report/viewer/PMingLiU.ttfBinary files differ new file mode 100644 index 00000000..03e923bf --- /dev/null +++ b/report/viewer/PMingLiU.ttf diff --git a/report/viewer/alignment.py b/report/viewer/alignment.py new file mode 100644 index 00000000..5fe03734 --- /dev/null +++ b/report/viewer/alignment.py @@ -0,0 +1,162 @@ +class Alignment: +    SURE, POSSIBLE = 'S', 'P' +     +    def __init__(self, swords, twords, align): +        self.swords = swords +        self.twords = twords +        self.align = align + +    def reverse(self): +        als = {} +        for (frm, to), conf in self.align.items(): +            als[to, frm] = conf +        return Alignment(self.twords, self.swords, als) + +    def merge(self, other): +        assert self.swords == other.swords +        assert self.twords == other.twords + +        als = {} +        for frm, to in self.align.keys(): +            als[frm, to] = Alignment.POSSIBLE + +        for frm, to in other.align.keys(): +            if (frm, to) in als: +                als[frm, to] = Alignment.SURE +            else: +                als[frm, to] = Alignment.POSSIBLE + +        return Alignment(self.swords, self.twords, als) + +    def __repr__(self): +        return 'Alignment(swords=%s, twords=%s, align=%s)' % (self.swords, self.twords, self.align) + +def read_pharaoh_text(infile): +    return infile.readline().strip().split() + +def parse_pharaoh_align(text): +    als = {} +    for part in text.strip().split(): +        frm, to = map(int, part.split('-')) +        als[frm, to] = Alignment.SURE +    return als + +def read_pharaoh_align(infile): +    als = {} +    for part in infile.readline().strip().split(): +        frm, to = map(int, part.split('-')) +        als[frm, to] = Alignment.SURE +    return als + +def read_pharaoh_alignment(swfile, twfile, afile): +    sw = read_pharaoh_text(swfile) +    tw = read_pharaoh_text(twfile) +    als = read_pharaoh_align(afile) +    return 
Alignment(sw, tw, als) +     +def read_giza_alignment(infile): +    infile.readline() # ignore +    swords = infile.readline().strip().split() +    twords = [] +    als = {} +    state = 0 +    for token in infile.readline().strip().split(): +        if state == 0: +            if token != 'NULL': +                if token != '({': +                    twords.append(token) +                else: +                    state = 1 +        elif state == 1: +            if token != '})': +                if twords: +                    als[int(token)-1, len(twords)-1] = Alignment.SURE +            else: +                state = 0 +    return Alignment(swords, twords, als) + +def read_naacl_aligns(infile): +    aligns = [] +    last = None +    for line in infile: +        index, frm, to, conf = line.rstrip().split() +        if int(index) != last: +            aligns.append({}) +        aligns[-1][int(frm)-1, int(to)-1] = conf +        last = int(index) +    return aligns + +# +# This phrase-extraction function largely mimics Pharaoh's phrase-extract +# code. It also supports the option to not advance over NULL alignments. 
+# + +def xextract_phrases(alignment, maxPhraseLength=None, advance=True): +    T = len(alignment.twords) +    S = len(alignment.swords) +    if not maxPhraseLength: +        maxPhraseLength = max(T, S) + +    alignedCountS = [0 for s in alignment.swords] +    alignedToT = [[] for t in alignment.twords] +    alignedToS = [[] for s in alignment.swords] +    for (s, t), conf in alignment.align.items(): +        if conf == Alignment.SURE: +            alignedCountS[s] += 1 +            alignedToT[t].append(s) +            alignedToS[s].append(t) + +    # check alignments for english phrase startT...endT +    for st in range(T): +        for et in range(st, min(T, st + maxPhraseLength)): +            minS = 9999 +            maxS = -1 +            usedS = alignedCountS[:] +            for ti in range(st, et+1): +                for si in alignedToT[ti]: +                    #print 'point (%d, %d)' % (si, ti) +                    if si<minS: minS = si +                    if si>maxS: maxS = si +                    usedS[si] -= 1 +                     +            #print 's projected (%d-%d, %d, %d)' % (minS, maxS, st, et) +            if (maxS >= 0 and  # aligned to any foreign words at all +                    maxS-minS < maxPhraseLength): # foreign phrase within limits +                # check if foreign words are aligned to out of bound english words +                out_of_bounds = False +                for si in range(minS, maxS): +                    if usedS[si] > 0: +                        #print 'out of bounds:', si +                        out_of_bounds = True +                        break + +                # Pharoah doesn't use this check, but I think it's required +                if not out_of_bounds: +                    for s in range(minS, maxS+1): +                        for t in alignedToS[s]: +                            if not (st <= t <= et): +                                #print 'out of bounds2:', t,s +                                
out_of_bounds = True +                                break + +                #print 'doing it for (%d-%d, %d, %d)' % (minS, maxS, st, et) +                if not out_of_bounds: +                    if advance: +                        #print 'attempting to advance' +                        # start point of foreign phrase may advance over unaligned +                        ss = minS +                        while (ss>=0 and +                                 ss>maxS-maxPhraseLength and # within length limit +                                 (ss==minS or alignedCountS[ss]==0)): # unaligned +                            # end point of foreign phrase may advance over unaligned +                            es = maxS +                            while (es<S and  +                                     es<ss+maxPhraseLength and # within length limit +                                     (es==maxS or alignedCountS[es]==0)): #unaligned +                                yield (ss, es, st, et) +                                es += 1 +                            ss -= 1 +                    else: +                        ss, es = minS, maxS +                        yield (minS, maxS, st, et) + diff --git a/report/viewer/display.py b/report/viewer/display.py new file mode 100755 index 00000000..bf9d2a1f --- /dev/null +++ b/report/viewer/display.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python + +import sys, os, gzip, re +import render, alignment + +import reportlab.rl_config +reportlab.rl_config.warnOnMissingFontGlyphs = 0  +from reportlab.pdfbase.ttfonts import TTFont +from reportlab.pdfbase import pdfmetrics + +pdfmetrics.registerFont(TTFont('PMingLiU', 'PMingLiU.ttf')) + +doc = render.Document(sys.argv[1]) + +for line in sys.stdin: +    src, tgt, align = line.split(' ||| ') +    src = src.split() +    tgt = tgt.split() +    align = alignment.parse_pharaoh_align(align) +    doc.append(render.Alignment(src, tgt, align, 'PMingLiU', 'Helvetica', 8, 0.4)) + +doc.render() diff --git 
a/report/viewer/render.py b/report/viewer/render.py new file mode 100644 index 00000000..0934c39c --- /dev/null +++ b/report/viewer/render.py @@ -0,0 +1,128 @@ +from reportlab.pdfgen import canvas +from reportlab.lib.colors import black, gray, white, magenta, Color +from reportlab.lib.pagesizes import A4 +from reportlab.lib.styles import getSampleStyleSheet +from reportlab.lib.units import cm, inch +from reportlab.platypus import SimpleDocTemplate, Spacer, Paragraph +from reportlab.platypus.flowables import Flowable +import re + +class Alignment(Flowable): +    def __init__(self, x_words, y_words, alignment, x_font, y_font, ptsize, unit, scale=True, colours=None): +        self._x_words = x_words +        self._y_words = y_words +        self._alignment = alignment +        self._unit = unit*cm +        self._x_font = x_font +        self._y_font = y_font +        self._ptsize = ptsize +        self._scale = 1 +        self._do_scale = scale +        self._colours = colours +        if not colours: +            self._colours = {'S':black, 'P':gray, 'N':magenta} + +    def wrap(self, rw, rh): +        xws = [self.canv.stringWidth(w, self._x_font, self._ptsize) +               for w in self._x_words] +        yws = [self.canv.stringWidth(w, self._y_font, self._ptsize) +               for w in self._y_words] +        width = (len(self._x_words) + 0.22)* self._unit + max(yws) +        height = (len(self._y_words) + 0.22)* self._unit + max(xws) +         +        if self._do_scale: +            self._scale = min(rw / width, 1.5) +            width *= self._scale +            height *= self._scale + +        return (width, height) + +    def draw(self): +        c = self.canv +        print c.getAvailableFonts() + +        X=len(self._x_words) +        Y=len(self._y_words) + +        c.saveState() +        c.scale(self._scale, self._scale) + +        for (x, y), conf in self._alignment.items(): +            col = self._colours[conf] +            if isinstance(col, 
Color): +                c.setFillColor(col) +                c.rect((0.02 + x)*self._unit, (0.02+Y-y-1)*self._unit, +                       self._unit, self._unit, 0, 1) +            else: +                bl = (x*self._unit, (Y-y-1)*self._unit) +                tl = (x*self._unit, (Y-y)*self._unit) +                tr = ((x+1)*self._unit, (Y-y)*self._unit) +                br = ((x+1)*self._unit, (Y-y-1)*self._unit) + +                p = c.beginPath() +                p.moveTo(*br) +                p.lineTo(*tr) +                p.lineTo(*tl) +                c.setFillColor(col[0]) +                c.drawPath(p, fill=1) +                p = c.beginPath() +                p.moveTo(*br) +                p.lineTo(*bl) +                p.lineTo(*tl) +                c.setFillColor(col[1]) +                c.drawPath(p, fill=1) + +        c.setStrokeColor(black) +        c.grid(map(lambda x: (0.02+x)*self._unit, range(X+1)), +               map(lambda y: (0.02+y)*self._unit, range(Y+1))) + +        c.setFont(self._x_font, self._ptsize) +        c.setFillColor(black) +        for x, word in enumerate(self._x_words): +            c.saveState() +            c.translate((x+0.52)*self._unit, (Y+0.22)*self._unit) +            c.rotate(60) +            c.drawString(0, 0, word) +            c.restoreState() + +        c.setFont(self._y_font, self._ptsize) +        for y, word in enumerate(self._y_words): +            c.drawString((X+0.22)*self._unit, (Y-y+0.42-1)*self._unit, word) + +        c.restoreState() + +class Document: +    def __init__(self, filename): +        self._styles = getSampleStyleSheet() +        self._doc = SimpleDocTemplate(filename) +        self._story = [] + +    def append(self, flowable): +        self._story.append(flowable) +        self._story.append(Spacer(1, 1*cm)) + +    def render(self): +        self._doc.build(self._story[:-1]) + +class Canvas: +    def __init__(self, filename): +        self._filename = filename +        self._canvas = 
canvas.Canvas('.' + filename, A4) +        self._size = A4 +        self._body = None + +    def append(self, flowable): +        if self._body: +            print >>sys.stderr, 'WARNING: replacing existing flowable'  +        self._body = flowable + +    def render(self): +        self._body.canv = self._canvas +        width, height = self._body.wrap(*self._size) +        width *= 1.02 +        height *= 1.02 + +        self._canvas = canvas.Canvas(self._filename, (width, height)) +        self._body.canv = self._canvas +        self._body.draw() +        self._canvas.save() | 
