#!/usr/bin/env python
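
"""Extract hierarchical (Hiero-style) phrase pairs from word-aligned bitext.

Reads lines of the form

    source tokens ||| target tokens ||| 0-0 1-2 ...

from stdin and prints one SCFG rule per extracted phrase pair:

    [X] ||| source side ||| target side ||| alignment

Phrase and bilexical counts are also aggregated in extract_and_aggr,
presumably for later feature estimation.
"""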
import collections, sys
def main(argv):
for line in sys.stdin:
src, tgt, astr = (x.split() for x in line.split('|||'))
al = sorted(tuple(int(y) for y in x.split('-')) for x in astr)
extract_and_aggr(src, tgt, al)
# Extract hierarchical phrase pairs
# This could be far better optimized by integrating it
# with suffix array code. For now, it gets the job done.
def extract_and_aggr(src, tgt, al, max_len=5, max_size=15, max_nt=2, boundary_nt=True):
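    # max_len: limits the number of terminal words per rule
    # max_size: limits the source span width (terminals plus non-terminals)
    # max_nt: limits the number of non-terminals per rule
    # boundary_nt: accepted but not yet used; see the TODO about boundary
    # non-terminals at the end of extract() below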
src_ph = collections.defaultdict(lambda: 0) # src = count
tgt_ph = collections.defaultdict(lambda: 0) # tgt = count
# [src][tgt] = count
phrase_pairs = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
src_w = collections.defaultdict(lambda: 0) # count
tgt_w = collections.defaultdict(lambda: 0) # count
# [src][tgt] = count
cooc_w = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
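    # Note: the aggregate counts above are populated but never read in this
    # prototype; presumably they are intended for later relative-frequency
    # feature estimation.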
# Bilexical counts
for word in tgt:
tgt_w[word] += 1
for word in src:
src_w[word] += 1
for t_word in tgt:
cooc_w[word][t_word] += 1
def next_nt(nt):
if not nt:
return 1
return nt[-1][0] + 1
    src_len = len(src)
    tgt_len = len(tgt)
    a = [[] for i in range(src_len)]
    # Pre-compute alignment min and max (target indices) for each source
    # word; sentinels are a past-the-end target index and -1
    a_span = [[tgt_len + 1, -1] for i in range(src_len)]
for (s, t) in al:
a[s].append(t)
a_span[s][0] = min(a_span[s][0], t)
a_span[s][1] = max(a_span[s][1], t)
    # Target side non-terminal coverage
# Cython bit vector?
cover = [0] * src_len
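    # cover marks target positions already claimed by the current phrase;
    # entries flip between 0 and -1 via bitwise NOT (~0 == -1) and are
    # tested by truthiness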
    # Debug output; left commented out so only rules reach stdout
    # print(src)
    # print(tgt)
    # print(a_span)
# Spans are _inclusive_ on both ends [i, j]
def span_check(vec, i, j):
k = i
while k <= j:
if vec[k]:
return False
k += 1
return True
def span_flip(vec, i, j):
k = i
while k <= j:
vec[k] = ~vec[k]
k += 1
# Extract all possible hierarchical phrases starting at a source index
    # src_i and src_j delimit the current source span; tgt_i and tgt_j
    # carry the target span accumulated so far (before including src_j)
def extract(src_i, src_j, tgt_i, tgt_j, wc, al, nt, nt_open):
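        # wc: terminal word count checked against the max_len limit
        # al: alignment links collected for the current phrase
        # nt: stack of non-terminal records
        #     [id, src_start, src_end, tgt_start, tgt_end]
        # nt_open: whether the last non-terminal may still be extended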
# Phrase extraction limits
        if wc > max_len or src_j >= src_len or \
                (src_j - src_i) + 1 > max_size or len(nt) > max_nt:
return
# Unaligned word
if not a[src_j]:
# Open non-terminal: extend
if nt_open:
nt[-1][2] += 1
extract(src_i, src_j + 1, tgt_i, tgt_j, wc, al, nt, True)
nt[-1][2] -= 1
# No open non-terminal: extend with word
else:
extract(src_i, src_j + 1, tgt_i, tgt_j, wc + 1, al, nt, False)
return
# Aligned word
link_i = a_span[src_j][0]
link_j = a_span[src_j][1]
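        # Target span of the phrase once src_j's links are included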
new_tgt_i = min(link_i, tgt_i)
new_tgt_j = max(link_j, tgt_j)
# Open non-terminal: close, extract, extend
if nt_open:
# Close non-terminal, checking for collisions
old_last_nt = nt[-1][:]
nt[-1][2] = src_j
if link_i < nt[-1][3]:
if not span_check(cover, link_i, nt[-1][3] - 1):
nt[-1] = old_last_nt
return
span_flip(cover, link_i, nt[-1][3] - 1)
nt[-1][3] = link_i
if link_j > nt[-1][4]:
if not span_check(cover, nt[-1][4] + 1, link_j):
nt[-1] = old_last_nt
return
span_flip(cover, nt[-1][4] + 1, link_j)
nt[-1][4] = link_j
add_rule(src_i, new_tgt_i, src[src_i:src_j + 1], tgt[new_tgt_i:new_tgt_j + 1], nt, al)
extract(src_i, src_j + 1, new_tgt_i, new_tgt_j, wc, al, nt, False)
nt[-1] = old_last_nt
if link_i < nt[-1][3]:
span_flip(cover, link_i, nt[-1][3] - 1)
if link_j > nt[-1][4]:
span_flip(cover, nt[-1][4] + 1, link_j)
return
# No open non-terminal
# Extract, extend with word
collision = False
for link in a[src_j]:
if cover[link]:
collision = True
# Collisions block extraction and extension, but may be okay for
# continuing non-terminals
if not collision:
plus_al = []
for link in a[src_j]:
plus_al.append((src_j, link))
cover[link] = ~cover[link]
al.append(plus_al)
add_rule(src_i, new_tgt_i, src[src_i:src_j + 1], tgt[new_tgt_i:new_tgt_j + 1], nt, al)
extract(src_i, src_j + 1, new_tgt_i, new_tgt_j, wc + 1, al, nt, False)
al.pop()
for link in a[src_j]:
cover[link] = ~cover[link]
# Try to add a word to a (closed) non-terminal, extract, extend
if nt and nt[-1][2] == src_j - 1:
# Add to non-terminal, checking for collisions
old_last_nt = nt[-1][:]
nt[-1][2] = src_j
if link_i < nt[-1][3]:
if not span_check(cover, link_i, nt[-1][3] - 1):
nt[-1] = old_last_nt
return
span_flip(cover, link_i, nt[-1][3] - 1)
nt[-1][3] = link_i
if link_j > nt[-1][4]:
if not span_check(cover, nt[-1][4] + 1, link_j):
nt[-1] = old_last_nt
return
span_flip(cover, nt[-1][4] + 1, link_j)
nt[-1][4] = link_j
# Require at least one word in phrase
if al:
add_rule(src_i, new_tgt_i, src[src_i:src_j + 1], tgt[new_tgt_i:new_tgt_j + 1], nt, al)
extract(src_i, src_j + 1, new_tgt_i, new_tgt_j, wc, al, nt, False)
nt[-1] = old_last_nt
if new_tgt_i < nt[-1][3]:
span_flip(cover, link_i, nt[-1][3] - 1)
if link_j > nt[-1][4]:
span_flip(cover, nt[-1][4] + 1, link_j)
# Try to start a new non-terminal, extract, extend
if not nt or src_j - nt[-1][2] > 1:
# Check for collisions
if not span_check(cover, link_i, link_j):
return
span_flip(cover, link_i, link_j)
nt.append([next_nt(nt), src_j, src_j, link_i, link_j])
# Require at least one word in phrase
if al:
add_rule(src_i, new_tgt_i, src[src_i:src_j + 1], tgt[new_tgt_i:new_tgt_j + 1], nt, al)
extract(src_i, src_j + 1, new_tgt_i, new_tgt_j, wc, al, nt, False)
nt.pop()
span_flip(cover, link_i, link_j)
# TODO: try adding NT to start, end, both
# check: one aligned word on boundary that is not part of a NT
# Try to extract phrases from every src index
src_i = 0
while src_i < src_len:
# Skip if phrases won't be tight on left side
if not a[src_i]:
src_i += 1
continue
        extract(src_i, src_i, tgt_len + 1, -1, 1, [], [], False)
src_i += 1
# Create a rule from source, target, non-terminals, and alignments
def add_rule(src_i, tgt_i, src_span, tgt_span, nt, al):
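    # src_i/tgt_i: sentence offsets of src_span/tgt_span, used to map
    # non-terminal spans to positions within the extracted spans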
flat = (item for sub in al for item in sub)
astr = ' '.join('{0}-{1}'.format(x[0], x[1]) for x in flat)
    # print('--- Rule')
    # print(src_span)
    # print(tgt_span)
    # print(nt)
    # print(astr)
    # print('---')
# This could be more efficient but is probably not going to
# be the bottleneck
src_sym = src_span[:]
off = src_i
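    # Each non-terminal replaces nt_len words with a single [X,i] symbol,
    # so subsequent spans shift left by (nt_len - 1)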
for next_nt in nt:
nt_len = (next_nt[2] - next_nt[1]) + 1
i = 0
while i < nt_len:
src_sym.pop(next_nt[1] - off)
i += 1
src_sym.insert(next_nt[1] - off, '[X,{0}]'.format(next_nt[0]))
off += (nt_len - 1)
tgt_sym = tgt_span[:]
off = tgt_i
    for next_nt in sorted(nt, key=lambda x: x[3]):
nt_len = (next_nt[4] - next_nt[3]) + 1
i = 0
while i < nt_len:
tgt_sym.pop(next_nt[3] - off)
i += 1
tgt_sym.insert(next_nt[3] - off, '[X,{0}]'.format(next_nt[0]))
off += (nt_len - 1)
    print('[X] ||| {0} ||| {1} ||| {2}'.format(' '.join(src_sym), ' '.join(tgt_sym), astr))
if __name__ == '__main__':
main(sys.argv)
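
# Example (hypothetical input and script name; any tokenized, word-aligned
# bitext in the 'src ||| tgt ||| alignment' format works):
#   echo 'le chat noir ||| the black cat ||| 0-0 1-2 2-1' | python extract.py
# Each extracted rule is printed as:
#   [X] ||| source side ||| target side ||| alignment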