#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
ibm1_em.py

Patrick Simianer
2010-11-01

EM estimation of IBM Model 1 word translation probabilities on a small
parallel corpus.

Recovered from commit cf3a29feb5887344b6633ead1b4b6d5657a15a4b
("old stuff: algorithms", 2014-06-15), file algorithms/ibm1_em.py.
"""


def print_probabilities(e_words, f_words, t):
    """Print the translation table as a tab-separated grid.

    The header row lists ``f_words``; each following row starts with one
    entry of ``e_words`` followed by ``t[i][j]`` (two decimals) for every
    foreign word ``i`` and that target word ``j``.

    e_words -- target-side vocabulary (row labels)
    f_words -- source-side vocabulary (column labels)
    t       -- table indexed as t[f_index][e_index]
    """
    lines = ["".join("\t" + fw for fw in f_words)]
    for j, ew in enumerate(e_words):
        cells = "".join("\t%.2f" % t[i][j] for i in range(len(f_words)))
        lines.append(ew + cells)

    # Single-argument print(...) behaves identically on Python 2 and 3.
    # The text itself ends in "\n", so a blank line follows the table.
    print("\n".join(lines) + "\n")


def _vocabulary(sentences):
    """Return the distinct words of ``sentences`` in first-seen order."""
    seen = set()
    vocab = []
    for sentence in sentences:
        for word in sentence:
            # Checking per word (not per sentence) keeps a word that is
            # repeated inside one sentence from being added twice.
            if word not in seen:
                seen.add(word)
                vocab.append(word)
    return vocab


def em(sentence_pairs, iterations):
    """Run EM for IBM Model 1 and print the table after every step.

    sentence_pairs -- sequence of (e_sentence, f_sentence) strings; both
                      are lower-cased and split on whitespace
    iterations     -- number of EM steps; the loop runs while the value
                      is greater than zero

    Returns the final table ``t`` with ``t[f_index][e_index]`` holding the
    estimate for that word pair; vocabulary indices follow the order in
    which words first appear in ``sentence_pairs``.

    Prints the initial (uniform) table, then after each step the word
    "Step", the table, and the remaining iteration counter.
    """
    pairs = [(p[0].lower().split(), p[1].lower().split())
             for p in sentence_pairs]

    e_vocab = _vocabulary(e for (e, _) in pairs)
    f_vocab = _vocabulary(f for (_, f) in pairs)
    e_len = len(e_vocab)
    f_len = len(f_vocab)

    # Resolve every token to its vocabulary index once, instead of calling
    # list.index() inside the innermost loops of every iteration.
    e_index = {w: i for i, w in enumerate(e_vocab)}
    f_index = {w: i for i, w in enumerate(f_vocab)}
    indexed_pairs = [([e_index[w] for w in e], [f_index[w] for w in f])
                     for (e, f) in pairs]

    # uniform init (guarded so an empty target vocabulary cannot divide by 0)
    uniform = 1. / e_len if e_len else 0.
    t = [[uniform] * e_len for _ in range(f_len)]

    print("Initial:")
    print_probabilities(e_vocab, f_vocab, t)

    while iterations > 0:
        # initialize
        count = [[0.] * e_len for _ in range(f_len)]
        total = [0.] * f_len

        for (e_ids, f_ids) in indexed_pairs:
            # normalize: s_total must be recomputed for every sentence
            # pair; carrying sums over from earlier pairs skews the counts.
            s_total = {}
            for ei in set(e_ids):
                s_total[ei] = sum(t[fi][ei] for fi in f_ids)

            # count
            for ei in e_ids:
                for fi in f_ids:
                    delta = t[fi][ei] / s_total[ei]
                    count[fi][ei] += delta
                    total[fi] += delta

        # estimate
        for fi in range(f_len):
            row_total = total[fi]
            if row_total == 0:
                # This foreign word only occurred next to empty target
                # sentences: nothing was counted, keep the previous row.
                continue
            t[fi] = [c / row_total for c in count[fi]]

        print("Step")
        print_probabilities(e_vocab, f_vocab, t)
        print(iterations)
        iterations = iterations - 1

    return t


def main():
    """Train on a three-sentence toy corpus for five iterations."""
    sp = [("The house", "Das Haus"),
          ("The book", "Das Buch"),
          ("A book", "Ein Buch")]
    em(sp, 5)


if __name__ == '__main__':
    main()