1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
|
class Alignment:
SURE, POSSIBLE = 'S', 'P'
def __init__(self, swords, twords, align):
self.swords = swords
self.twords = twords
self.align = align
def reverse(self):
als = {}
for (frm, to), conf in self.align.items():
als[to, frm] = conf
return Alignment(self.twords, self.swords, als)
def merge(self, other):
assert self.swords == other.swords
assert self.twords == other.twords
als = {}
for frm, to in self.align.keys():
als[frm, to] = Alignment.POSSIBLE
for frm, to in other.align.keys():
if (frm, to) in als:
als[frm, to] = Alignment.SURE
else:
als[frm, to] = Alignment.POSSIBLE
return Alignment(self.swords, self.twords, als)
def __repr__(self):
return 'Alignment(swords=%s, twords=%s, align=%s)' % (self.swords, self.twords, self.align)
def read_pharaoh_text(infile):
return infile.readline().strip().split()
def parse_pharaoh_align(text):
als = {}
for part in text.strip().split():
frm, to = map(int, part.split('-'))
als[frm, to] = Alignment.SURE
return als
def read_pharaoh_align(infile):
als = {}
for part in infile.readline().strip().split():
frm, to = map(int, part.split('-'))
als[frm, to] = Alignment.SURE
return als
def read_pharaoh_alignment(swfile, twfile, afile):
sw = read_pharaoh_text(swfile)
tw = read_pharaoh_text(twfile)
als = read_pharaoh_align(afile)
return Alignment(sw, tw, als)
def read_giza_alignment(infile):
infile.readline() # ignore
swords = infile.readline().strip().split()
twords = []
als = {}
state = 0
for token in infile.readline().strip().split():
if state == 0:
if token != 'NULL':
if token != '({':
twords.append(token)
else:
state = 1
elif state == 1:
if token != '})':
if twords:
als[int(token)-1, len(twords)-1] = Alignment.SURE
else:
state = 0
return Alignment(swords, twords, als)
def read_naacl_aligns(infile):
aligns = []
last = None
for line in infile:
index, frm, to, conf = line.rstrip().split()
if int(index) != last:
aligns.append({})
aligns[-1][int(frm)-1, int(to)-1] = conf
last = int(index)
return aligns
#
# This phrase-extraction function largely mimics Pharaoh's phrase-extract
# code. It also supports the option to not advance over NULL alignments.
#
def xextract_phrases(alignment, maxPhraseLength=None, advance=True):
T = len(alignment.twords)
S = len(alignment.swords)
if not maxPhraseLength:
maxPhraseLength = max(T, S)
alignedCountS = [0 for s in alignment.swords]
alignedToT = [[] for t in alignment.twords]
alignedToS = [[] for s in alignment.swords]
for (s, t), conf in alignment.align.items():
if conf == Alignment.SURE:
alignedCountS[s] += 1
alignedToT[t].append(s)
alignedToS[s].append(t)
# check alignments for english phrase startT...endT
for st in range(T):
for et in range(st, min(T, st + maxPhraseLength)):
minS = 9999
maxS = -1
usedS = alignedCountS[:]
for ti in range(st, et+1):
for si in alignedToT[ti]:
#print 'point (%d, %d)' % (si, ti)
if si<minS: minS = si
if si>maxS: maxS = si
usedS[si] -= 1
#print 's projected (%d-%d, %d, %d)' % (minS, maxS, st, et)
if (maxS >= 0 and # aligned to any foreign words at all
maxS-minS < maxPhraseLength): # foreign phrase within limits
# check if foreign words are aligned to out of bound english words
out_of_bounds = False
for si in range(minS, maxS):
if usedS[si] > 0:
#print 'out of bounds:', si
out_of_bounds = True
break
# Pharoah doesn't use this check, but I think it's required
if not out_of_bounds:
for s in range(minS, maxS+1):
for t in alignedToS[s]:
if not (st <= t <= et):
#print 'out of bounds2:', t,s
out_of_bounds = True
break
#print 'doing it for (%d-%d, %d, %d)' % (minS, maxS, st, et)
if not out_of_bounds:
if advance:
#print 'attempting to advance'
# start point of foreign phrase may advance over unaligned
ss = minS
while (ss>=0 and
ss>maxS-maxPhraseLength and # within length limit
(ss==minS or alignedCountS[ss]==0)): # unaligned
# end point of foreign phrase may advance over unaligned
es = maxS
while (es<S and
es<ss+maxPhraseLength and # within length limit
(es==maxS or alignedCountS[es]==0)): #unaligned
yield (ss, es, st, et)
es += 1
ss -= 1
else:
ss, es = minS, maxS
yield (minS, maxS, st, et)
|