author	Patrick Simianer <p@simianer.de>	2014-06-15 03:24:33 +0200
committer	Patrick Simianer <p@simianer.de>	2014-06-15 03:24:33 +0200
commit	cf3a29feb5887344b6633ead1b4b6d5657a15a4b (patch)
tree	f1149508f7305a48dba0226699dfafdd68d81969 /algorithms/ibm1_em.py
parent	5ddc763ab9953eebdaf78af4eb72288d7955b310 (diff)
old stuff: algorithms
Diffstat (limited to 'algorithms/ibm1_em.py')
-rwxr-xr-x	algorithms/ibm1_em.py	93
1 file changed, 93 insertions, 0 deletions
diff --git a/algorithms/ibm1_em.py b/algorithms/ibm1_em.py
new file mode 100755
index 0000000..09b7d78
--- /dev/null
+++ b/algorithms/ibm1_em.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+ibm1_em.py
+
+Patrick Simianer <p@simianer.de>
+2010-11-01
+"""
+
+
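+# IBM Model 1: estimate lexical translation probabilities t(e|f) with EM.
+# Each iteration collects expected counts from all sentence pairs
+# (normalize + count) and re-estimates t(e|f) = count(e|f) / total(f).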
+def print_probabilities(e_words, f_words, t):
+ s = ""
+ for i in range(len(f_words)):
+ s = s + "\t" + f_words[i]
+
+ s = s + "\n"
+
+ for j in range(len(e_words)):
+ s = s + e_words[j]
+ for i in range(len(f_words)):
+ s = s + "\t%.2f" % t[i][j]
+ s = s + "\n"
+
+ print s
+
+
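+# em() builds the two vocabularies from sentence_pairs (a list of
+# (english, foreign) string pairs), initializes t uniformly and runs the
+# given number of EM iterations, printing the t table after each one.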
+def em(sentence_pairs, iterations):
+    pairs = [(p[0].lower().split(), p[1].lower().split()) for p in sentence_pairs]
+
+    e_vocab = []
+    f_vocab = []
+    for (e, f) in pairs:
+        e_vocab.extend([w for w in e if w not in e_vocab])
+        f_vocab.extend([w for w in f if w not in f_vocab])
+
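+    # t[fi][ei] holds t(e|f) for foreign word f_vocab[fi] and English word e_vocab[ei]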
+    # uniform init
+    e_len = len(e_vocab)
+    f_len = len(f_vocab)
+
+    t = []
+    for i in range(f_len):
+        t.append([])
+        for ew in e_vocab:
+            t[i].append(1./e_len)
+
+    print "Initial:"
+    print_probabilities(e_vocab, f_vocab, t)
+
+    while iterations > 0:
+        # initialize counts for this iteration
+        count = []
+        total = []
+        for j in range(f_len):
+            total.append(0.)
+            count.append([])
+            for k in range(e_len):
+                count[j].append(0.)
+
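+        # E-step: for every sentence pair, distribute each English word's
+        # probability mass over the foreign words it co-occurs with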
+        for (e, f) in pairs:
+            # normalize: per-sentence normalization constant for each English word
+            s_total = {}
+            for ew in e:
+                s_total[ew] = 0.
+                ei = e_vocab.index(ew)
+                for fw in f:
+                    fi = f_vocab.index(fw)
+                    s_total[ew] = s_total[ew] + t[fi][ei]
+            # count
+            for ew in e:
+                ei = e_vocab.index(ew)
+                for fw in f:
+                    fi = f_vocab.index(fw)
+                    count[fi][ei] += t[fi][ei]/s_total[ew]
+                    total[fi] += t[fi][ei]/s_total[ew]
+
+        # estimate: t(e|f) = count(e|f) / total(f)
+        for i in range(f_len):
+            for j in range(e_len):
+                t[i][j] = count[i][j]/total[i]
+
+        print "Step (%d iterations remaining):" % iterations
+        print_probabilities(e_vocab, f_vocab, t)
+        iterations = iterations - 1
+
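+# Toy corpus; after a few EM iterations t("the"|"das") and t("book"|"buch")
+# should clearly dominate their columns in the printed table.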
+def main():
+    sp = [("The house", "Das Haus"), ("The book", "Das Buch"), ("A book", "Ein Buch")]
+    em(sp, 5)
+
+if __name__ == '__main__':
+    main()
+