summaryrefslogtreecommitdiff
path: root/python/cdec/grammar.pxi
blob: d523e4d218f94527b76fcdbb8595747404f01c21 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
cimport grammar
cimport cdec.sa._sa as _sa
import cdec.sa._sa as _sa

def _phrase(phrase):
    """Render the items of `phrase` as one space-separated string.

    `unicode` items are encoded to UTF-8; every other item (e.g. NT / NTRef
    objects) is converted with str().
    """
    tokens = []
    for item in phrase:
        if isinstance(item, unicode):
            tokens.append(item.encode('utf8'))
        else:
            tokens.append(str(item))
    return ' '.join(tokens)

cdef class NT:
    # Category label of the non-terminal.
    cdef public bytes cat
    # Reference index; 0 means "no index" and is omitted when printing.
    cdef public unsigned ref

    def __init__(self, bytes cat, unsigned ref=0):
        """NT(bytes cat, int ref=0) -> Non-terminal from category `cat`."""
        self.ref = ref
        self.cat = cat

    def __str__(self):
        # `ref` is unsigned, so "not positive" is exactly "zero".
        if self.ref == 0:
            return '[%s]' % self.cat
        return '[%s,%d]' % (self.cat, self.ref)

cdef class NTRef:
    # Index of the non-terminal this reference points to.
    cdef public unsigned ref

    def __init__(self, unsigned ref):
        """NTRef(int ref) -> Non-terminal reference."""
        self.ref = ref

    def __str__(self):
        # Rendered as the bare index in square brackets, e.g. "[1]".
        return '[%d]' % (self.ref,)

cdef TRule convert_rule(_sa.Rule rule):
    # Build a TRule from an _sa.Rule.
    #
    # Source-side variable symbols become NT objects (category only, no
    # index); target-side variable symbols become NTRef objects carrying the
    # symbol's index. All other symbols are converted with _sa.sym_tostring.
    # The scores are copied into a dict and the alignments into a list before
    # being handed to the TRule constructor.
    cdef int* src_syms
    cdef int* tgt_syms
    cdef int sym

    lhs = _sa.sym_tocat(rule.lhs)
    scores = dict(rule.scores)
    source, target = [], []

    # Source phrase.
    src_syms = rule.f.syms
    for pos in range(rule.f.n):
        sym = src_syms[pos]
        if not _sa.sym_isvar(sym):
            source.append(_sa.sym_tostring(sym))
        else:
            source.append(NT(_sa.sym_tocat(sym)))

    # Target phrase.
    tgt_syms = rule.e.syms
    for pos in range(rule.e.n):
        sym = tgt_syms[pos]
        if not _sa.sym_isvar(sym):
            target.append(_sa.sym_tostring(sym))
        else:
            target.append(NTRef(_sa.sym_getindex(sym)))

    alignment = list(rule.alignments())
    return TRule(lhs, source, target, scores, alignment)

cdef class TRule:
    """Translation rule backed by a C++ grammar.TRule.

    All Python-visible state (lhs, f, e, a, scores, arity) is read from and
    written to the underlying C++ object through the properties below.
    """
    # Heap-allocated shared_ptr to the C++ rule. Allocated in __init__ and
    # released in __dealloc__; it stays unset when the object is created with
    # TRule.__new__ until the caller assigns it.
    cdef shared_ptr[grammar.TRule]* rule

    def __init__(self, lhs, f, e, scores, a=None):
        """TRule(lhs, f, e, scores, a=None) -> Translation rule.
        lhs: left hand side non-terminal
        f: source phrase (list of words/NT)
        e: target phrase (list of words/NTRef)
        scores: dictionary of feature scores
        a: optional list of alignment points"""
        self.rule = new shared_ptr[grammar.TRule](new grammar.TRule())
        # Each assignment below goes through the matching property setter.
        self.lhs = lhs
        self.e = e
        self.f = f
        self.scores = scores
        # None and an empty list both leave the alignment vector untouched.
        if a:
            self.a = a
        self.rule.get().ComputeArity()

    def __dealloc__(self):
        del self.rule

    property arity:
        def __get__(self):
            """Value of the C++ rule's arity_ field."""
            return self.rule.get().arity_

    property f:
        def __get__(self):
            """Source phrase as a list of unicode words and NT objects."""
            cdef vector[WordID]* f_ = &self.rule.get().f_
            cdef WordID w
            cdef f = []
            cdef unsigned i
            cdef int idx = 0
            for i in range(f_.size()):
                w = f_[0][i]
                if w < 0:
                    # Negative ids are non-terminals (negated category id);
                    # they are numbered 1, 2, ... in order of appearance.
                    idx += 1
                    f.append(NT(TDConvert(-w).c_str(), idx))
                else:
                    f.append(unicode(TDConvert(w).c_str(), encoding='utf8'))
            return f

        def __set__(self, f):
            """Replace the source phrase with the words/NT objects in `f`.

            `f` must support len() and integer indexing.
            """
            cdef vector[WordID]* f_ = &self.rule.get().f_
            f_.resize(len(f))
            cdef unsigned i
            for i in range(len(f)):
                if isinstance(f[i], NT):
                    # Non-terminals are stored as the negated category id;
                    # the NT's own `ref` is not stored.
                    f_[0][i] = -TDConvert((<NT> f[i]).cat)
                else:
                    # as_str is defined elsewhere; presumably it normalises
                    # the word to a byte string -- confirm against its source.
                    fi = as_str(f[i])
                    f_[0][i] = TDConvert(fi)

    property e:
        def __get__(self):
            """Target phrase as a list of unicode words and NTRef objects."""
            cdef vector[WordID]* e_ = &self.rule.get().e_
            cdef WordID w
            cdef e = []
            cdef unsigned i
            for i in range(e_.size()):
                w = e_[0][i]
                if w < 1:
                    # Ids <= 0 encode references: stored value is 1 - ref.
                    e.append(NTRef(1-w))
                else:
                    e.append(unicode(TDConvert(w).c_str(), encoding='utf8'))
            return e

        def __set__(self, e):
            """Replace the target phrase with the words/NTRef objects in `e`.

            `e` must support len() and integer indexing.
            """
            cdef vector[WordID]* e_ = &self.rule.get().e_
            e_.resize(len(e))
            cdef unsigned i
            for i in range(len(e)):
                if isinstance(e[i], NTRef):
                    # Reference k is stored as 1 - k (so 1 -> 0, 2 -> -1, ...).
                    e_[0][i] = 1-e[i].ref
                else:
                    ei = as_str(e[i])
                    e_[0][i] = TDConvert(ei)

    property a:
        def __get__(self):
            """Generator over the alignment points as (s_, t_) pairs."""
            cdef unsigned i
            cdef vector[grammar.AlignmentPoint]* a = &self.rule.get().a_
            for i in range(a.size()):
                yield (a[0][i].s_, a[0][i].t_)

        def __set__(self, a):
            """Replace the alignment with the (s, t) pairs in `a`.

            `a` must support len() and integer indexing; each item must
            unpack into two integers.
            """
            cdef vector[grammar.AlignmentPoint]* a_ = &self.rule.get().a_
            a_.resize(len(a))
            cdef unsigned i
            cdef int s, t
            for i in range(len(a)):
                s, t = a[i]
                a_[0][i] = grammar.AlignmentPoint(s, t)

    property scores:
        def __get__(self):
            """SparseVector built from the rule's current feature scores.

            A new FastSparseVector is constructed from the rule's scores_ on
            every access.
            """
            cdef SparseVector scores = SparseVector.__new__(SparseVector)
            scores.vector = new FastSparseVector[double](self.rule.get().scores_)
            return scores

        def __set__(self, scores):
            """Replace the feature scores with the items of mapping `scores`.

            Raises KeyError (with the offending name) when FDConvert returns
            a negative id for a feature name.
            """
            cdef FastSparseVector[double]* scores_ = &self.rule.get().scores_
            scores_.clear()
            cdef int fid
            # Must be double: the vector stores doubles, and a C float
            # temporary would truncate every score to single precision.
            cdef double fval
            for fname, fval in scores.items():
                fn = as_str(fname)
                fid = FDConvert(fn)
                if fid < 0: raise KeyError(fname)
                scores_.set_value(fid, fval)

    property lhs:
        def __get__(self):
            """Left-hand side as an NT (category only, ref left at 0)."""
            return NT(TDConvert(-self.rule.get().lhs_).c_str())

        def __set__(self, lhs):
            """Set the left-hand side from an NT or from a category string."""
            if not isinstance(lhs, NT):
                lhs = NT(lhs)
            # Stored as the negated category id, like source non-terminals.
            self.rule.get().lhs_ = -TDConvert((<NT> lhs).cat)

    def __str__(self):
        # Each item yielded by iterating self.scores is fed to '%s=%s', so it
        # must be a (name, value) pair.
        scores = ' '.join('%s=%s' % feat for feat in self.scores)
        return '%s ||| %s ||| %s ||| %s' % (self.lhs,
                _phrase(self.f), _phrase(self.e), scores)

cdef class MRule(TRule):
    def __init__(self, lhs, rhs, scores):
        """MRule(lhs, rhs, scores) -> Monolingual rule.
        lhs: left hand side non-terminal
        rhs: right hand side phrase (list of words/NT)
        scores: dictionary of feature scores"""
        # The target side mirrors the source side: words are copied as-is and
        # the k-th non-terminal (counting from 1) becomes NTRef(k).
        cdef unsigned nt_count = 0
        target = []
        for symbol in rhs:
            if not isinstance(symbol, NT):
                target.append(symbol)
                continue
            nt_count += 1
            target.append(NTRef(nt_count))
        super(MRule, self).__init__(lhs, rhs, target, scores, None)

cdef class Grammar:
    """Base wrapper around a C++ grammar held through a shared_ptr.

    This class never allocates `grammar` itself; a subclass is expected to
    assign it before any method or property here is used.
    """
    # Heap-allocated shared_ptr to the C++ grammar, released in __dealloc__.
    cdef shared_ptr[grammar.Grammar]* grammar

    def __dealloc__(self):
        del self.grammar

    def __iter__(self):
        """Yield a TRule for every rule in the rule bin of the grammar root.

        Only the bin returned by GetRoot().GetRules() is visited.
        """
        cdef grammar.const_GrammarIter* root = self.grammar.get().GetRoot()
        cdef grammar.const_RuleBin* rbin = root.GetRules()
        cdef TRule trule
        cdef unsigned i
        # NOTE(review): GetRules() presumably returns NULL when the root has
        # no rule bin -- confirm against the C++ header. Guarding here turns
        # a NULL dereference into an empty iteration.
        if rbin == NULL:
            return
        for i in range(rbin.GetNumRules()):
            # Bypass TRule.__init__ so no fresh C++ rule is allocated; the
            # wrapper gets its own shared_ptr built from GetIthRule(i).
            trule = TRule.__new__(TRule)
            trule.rule = new shared_ptr[grammar.TRule](rbin.GetIthRule(i))
            yield trule

    property name:
        def __get__(self):
            """Grammar name as reported by GetGrammarName()."""
            # The result must be returned explicitly; without `return` the
            # property always evaluates to None.
            return str(self.grammar.get().GetGrammarName().c_str())

        def __set__(self, name):
            """Set the grammar name (passed through as_str first)."""
            name = as_str(name)
            self.grammar.get().SetGrammarName(name)

cdef class TextGrammar(Grammar):
    def __init__(self, rules):
        """TextGrammar(rules) -> SCFG Grammar containing the rules.

        Each item of `rules` must be a TRule or an _sa.Rule; _sa.Rule items
        are converted with convert_rule first. Any other item raises
        ValueError.
        """
        self.grammar = new shared_ptr[grammar.Grammar](new grammar.TextGrammar())
        # View the freshly created grammar through its concrete type so that
        # AddRule can be called on it.
        cdef grammar.TextGrammar* text_grammar = <grammar.TextGrammar*> self.grammar.get()
        cdef TRule native
        for entry in rules:
            if isinstance(entry, _sa.Rule):
                native = convert_rule(entry)
            elif isinstance(entry, TRule):
                native = <TRule> entry
            else:
                raise ValueError('the grammar should contain TRule objects')
            text_grammar.AddRule(native.rule[0])
ʙ sf⾀bUGwb+p2T&r_q}cu\ZLA>'qU7b3^Y-/\\м~ m_߼{ 7Mf:DTX 58 $(ğѰ}"_N;R|uvչs[28{xg]xww`h{A໳Dzd6Gc [gýH![pNX׷йѳΦ*G66&شsH{zv! DFځԉԁTraPۇt<"B;v ߉ԁ^]TR0R~zo!|ݏ$϶f3R)l7؅vt;4{QijxӤƂBS~&z\ۓkw8snkC 䈆h@ 1#ި|k2s^({kDTCu|J\>_ 1x˔`"_ j4RC4q_Tvj!X&];kD0OiJ[^B)Q+gS jmH0S}4-ojQygTEòovٴyӦ-? endstream endobj 32 0 obj 4436 endobj 33 0 obj << /Type /FontDescriptor /Ascent 909 /CapHeight 808 /Descent -230 /Flags 96 /FontBBox [-12 -230 875 750] /FontName /PEIXHR+GillSans-Italic /ItalicAngle -8 /StemV 0 /MaxWidth 1088 /XHeight 606 /FontFile2 31 0 R >> endobj 34 0 obj [ 278 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 469 469 0 469 438 0 0 469 0 0 458 188 740 469 0 0 0 313 354 0 0 0 0 0 375 0 0 0 333 ] endobj 12 0 obj << /Type /Font /Subtype /TrueType /BaseFont /PEIXHR+GillSans-Italic /FontDescriptor 33 0 R /Widths 34 0 R /FirstChar 32 /LastChar 125 /Encoding /MacRomanEncoding >> endobj 35 0 obj << /Length 36 0 R /Length1 5944 /Filter /FlateDecode >> stream xX{TTGBhnD@qnAE1hHI4$19fLf(x41Lbf&Ɍ݉$'' I֩[ߣ_UĈ(:I$g}rI;m0ֶFI]Dܸ֮z#єsSٕـW~ aצ({%b AoAofnˈ NY-B6;s7!1)n"/1al$2 PiJ5tRM2W˼U{"j-a\?P<tN"S:HY Ut`4\K*=Ӵר~t)NVo7sh[;!Q8%Q!=@OMhǴ>xӭtz0e(7ڻde ״*$ER&R5]t9ZY=KiI.iWH7jPxRz@{TH>KO`(~^Op 톘AF Qi==9?`vymMJ$@(?Eƚ1#jc҅[ppD\%n$mz j)t;mmXDӗL\ X)/TvI8akA BW.>ūRA01gt6T7cH}LvL)-5ȺFЏ('ktQ/ "YeQ5,y a! BP' B/Z$K5dBvI#(G'^UCaaӰðK7fxøm5~j/Sʴ޴ s9:K\ꙋ~ =(U[-n ^d#v8LYY;韥R`wYȢLKOKMIv$)369њk>-*2"|rؤ&AFngS}gq]c6Qm|q=ў,6ee܊MKX2.kS?ź.OlcmirT泹ղdN11+p$>J [,hxj>7om8[˶F[|#) W a@3C58*jg MapW8_})Wl-T>x?b1:¯wo:1XuEiۡ)tcΩijneǮڼ0(#2D==D.k?M qpgTkv!>Lffʰ2+-XٚLCo`h":b띋y<A/fhnDYؕj2W./v;PQ^/z"͖̹,5SxA>gG`|oz Cx,bFsbWނQ! 
.s3 Ͻ)Fcx0q1\ogx(nhW fQf/gUcR켄H&* QMFj]B:dIg1{D j\_C/ڴT`| 8CD1~ gC yfGnA~ ¦2 %$+3)Iy1&_`”)I.(f5JJtܙ(X9 6l]d6:G8ך[yFJG7bFNV (\g 0SC5;3nOm̥eyww, Ok]:A~>>64*t"s#@;܈hc.q~<"cJ( |# 39qzݝ$yv¨IL`Zs .)3̿03b &f7ٗg";h8ס?9f꼫a޸Elk?oiCia_{L_=&C$J u8u3vT>gUHׄ@-X] D'H#G9\uVguڛYK5ߵ)}?z_"n>m^*PKPgfd̷P';JUf6@94*vJfY"$I^1-VL$1Čsdb{'ӄ/c@29¤G>Qj';֛+g$Sd+_H IhM*XXQ{]|̺Nޛb[C c 7zFnM/7!WJ$9.϶+g zJNfäg`+υ+Nzg(u,Br* ؽ}9/OݟVHie))Wd2no5e"$d7ś#)0Dl _sJ@KOh6o`αSf,<-}Ѵ9%gd _n:ᔑuG8›fhNCFQb),SOxg|caVu?> endobj 38 0 obj [ 556 0 0 556 0 0 0 0 0 0 0 0 833 556 0 0 0 0 0 278 0 0 0 0 0 500 ] endobj 13 0 obj << /Type /Font /Subtype /TrueType /BaseFont /VRYEZP+Helvetica-Oblique /FontDescriptor 37 0 R /Widths 38 0 R /FirstChar 97 /LastChar 122 /Encoding /MacRomanEncoding >> endobj 1 0 obj << /Producer (Mac OS X 10.5.8 Quartz PDFContext) /CreationDate (D:20090916202939Z00'00') /ModDate (D:20090916202939Z00'00') >> endobj xref 0 39 0000000000 65535 f 0000023834 00000 n 0000000695 00000 n 0000002895 00000 n 0000000022 00000 n 0000000676 00000 n 0000000799 00000 n 0000001907 00000 n