1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
ibm1_em.py
Patrick Simianer <p@simianer.de>
2010-11-01
"""
def print_probabilities(e_words, f_words, t):
    """Print the probability table ``t`` as a tab-separated grid.

    The first printed line is a header holding every word of ``f_words``,
    each preceded by a tab.  Each following line starts with one word of
    ``e_words`` followed by one tab-separated cell per ``f_words`` entry,
    formatted with two decimals.  A blank line is printed after the table.

    Args:
        e_words: Sequence of row labels (strings).
        f_words: Sequence of column labels (strings).
        t: Nested sequence indexed as ``t[i][j]`` where ``i`` is a position
            in ``f_words`` and ``j`` a position in ``e_words``.
    """
    # Header row: the leading tab leaves the row-label column empty.
    lines = ["".join("\t" + fw for fw in f_words)]
    for j, ew in enumerate(e_words):
        # The table is stored f-major, so walk the rows of ``t`` for column j.
        cells = "".join("\t%.2f" % t[i][j] for i in range(len(f_words)))
        lines.append(ew + cells)
    # The text ends with a newline and print adds one more, which yields the
    # blank separator line.  A single parenthesised argument prints the same
    # text on Python 2 and Python 3.
    print("\n".join(lines) + "\n")
def em(sentence_pairs, iterations):
    """Run EM for IBM Model 1 lexical translation probabilities.

    Every sentence is lower-cased and split on whitespace.  The table is
    initialised uniformly, then ``iterations`` rounds of expectation
    (collect fractional counts) and maximisation (re-normalise per f word)
    are performed.  The table is printed before the first round and after
    every round, followed by the remaining-iterations counter.

    Args:
        sentence_pairs: Iterable of pairs ``p`` where ``p[0]`` is the e-side
            sentence and ``p[1]`` the f-side sentence, both strings.
        iterations: Number of EM rounds; values <= 0 perform none.

    Returns:
        The table ``t`` as a list of lists, indexed ``t[fi][ei]`` by the
        position of the f word and of the e word in their first-seen
        vocabulary order.  (Earlier versions returned nothing; callers that
        ignore the result are unaffected.)
    """
    pairs = [(p[0].lower().split(), p[1].lower().split())
             for p in sentence_pairs]

    # Vocabularies in first-seen order plus word -> index maps.  Checking
    # membership word by word keeps a word that repeats inside a single
    # sentence from being added twice, and the maps replace the linear
    # list.index() scans in the inner loops.
    e_vocab, e_index = [], {}
    f_vocab, f_index = [], {}
    for e, f in pairs:
        for w in e:
            if w not in e_index:
                e_index[w] = len(e_vocab)
                e_vocab.append(w)
        for w in f:
            if w not in f_index:
                f_index[w] = len(f_vocab)
                f_vocab.append(w)

    e_len = len(e_vocab)
    f_len = len(f_vocab)

    # Uniform initialisation: every row sums to one.  The guard avoids a
    # division by zero when there are no e words at all.
    uniform = 1. / e_len if e_len else 0.
    t = [[uniform] * e_len for _ in range(f_len)]

    print("Initial:")
    print_probabilities(e_vocab, f_vocab, t)

    while iterations > 0:
        # Fresh accumulators for this round.
        count = [[0.] * e_len for _ in range(f_len)]
        total = [0.] * f_len

        # E-step: distribute each e token over the f words of its sentence.
        for e, f in pairs:
            f_ids = [f_index[fw] for fw in f]
            for ew in e:
                ei = e_index[ew]
                # The normaliser must cover only this sentence pair; carrying
                # it over from earlier pairs skews the counts of every word
                # that occurs in more than one sentence.
                s_total = sum(t[fi][ei] for fi in f_ids)
                if not s_total:
                    # Empty f side or no probability mass: nothing to share.
                    continue
                for fi in f_ids:
                    delta = t[fi][ei] / s_total
                    count[fi][ei] += delta
                    total[fi] += delta

        # M-step: re-normalise the collected counts per f word.
        for fi in range(f_len):
            if total[fi] > 0:
                t[fi] = [c / total[fi] for c in count[fi]]
            # else: the f word collected no counts this round (its sentences
            # have an empty e side); keep the previous row instead of
            # dividing by zero.

        print("Step")
        print_probabilities(e_vocab, f_vocab, t)
        print(iterations)
        iterations = iterations - 1

    return t
def main():
    """Run five EM rounds on a three-pair toy corpus."""
    corpus = [
        ("The house", "Das Haus"),
        ("The book", "Das Buch"),
        ("A book", "Ein Buch"),
    ]
    em(corpus, 5)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|