#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
ibm1_em.py

EM training of IBM Model 1 lexical translation probabilities on a small
parallel corpus. The code runs unchanged on Python 2 and Python 3.

Patrick Simianer
2010-11-01
"""


def print_probabilities(e_words, f_words, t):
    """Print the translation table as a tab-separated matrix.

    The header row lists ``f_words``; every following row starts with one
    entry of ``e_words`` followed by ``t[i][j]`` formatted with two decimals,
    where ``i`` indexes ``f_words`` and ``j`` indexes ``e_words``.
    """
    lines = ["".join("\t" + f_word for f_word in f_words)]
    for j, e_word in enumerate(e_words):
        cells = "".join("\t%.2f" % t[i][j] for i in range(len(f_words)))
        lines.append(e_word + cells)
    # Every line is newline-terminated and print adds one more newline, so
    # each table is followed by an empty line.
    print("\n".join(lines) + "\n")


def _vocabulary(sentences):
    """Return the distinct words of ``sentences`` in order of first use."""
    vocab = []
    seen = set()
    for sentence in sentences:
        for word in sentence:
            if word not in seen:
                seen.add(word)
                vocab.append(word)
    return vocab


def em(sentence_pairs, iterations):
    """Run EM for IBM Model 1 and return the translation table.

    ``sentence_pairs`` is a sequence of ``(e_sentence, f_sentence)`` strings;
    both sides are lower-cased and split on whitespace. ``iterations`` is the
    number of EM steps to perform (values <= 0 perform none).

    The table is printed once after the uniform initialisation ("Initial:")
    and once after every step ("Step"), followed by the remaining iteration
    counter.

    Returns a list of rows ``t`` with ``t[fi][ei]`` being the probability of
    the ``ei``-th distinct e word given the ``fi``-th distinct f word (words
    numbered in order of first appearance). Each re-estimated row sums to 1.
    """
    pairs = [(p[0].lower().split(), p[1].lower().split())
             for p in sentence_pairs]
    e_vocab = _vocabulary(e for e, _ in pairs)
    f_vocab = _vocabulary(f for _, f in pairs)
    e_len = len(e_vocab)
    f_len = len(f_vocab)

    # Encode the corpus as vocabulary indices once, instead of calling
    # list.index() inside the innermost loops.
    e_index = dict((word, i) for i, word in enumerate(e_vocab))
    f_index = dict((word, i) for i, word in enumerate(f_vocab))
    corpus = [([e_index[w] for w in e], [f_index[w] for w in f])
              for e, f in pairs]

    # uniform init (guarded so that an empty e vocabulary does not divide by 0)
    uniform = 1. / e_len if e_len else 0.
    t = [[uniform] * e_len for _ in range(f_len)]
    print("Initial:")
    print_probabilities(e_vocab, f_vocab, t)

    while iterations > 0:
        # expected counts, reset for every EM step
        count = [[0.] * e_len for _ in range(f_len)]
        total = [0.] * f_len

        for e_ids, f_ids in corpus:
            for ei in e_ids:
                # The normaliser must be computed per sentence pair: it is the
                # probability mass of this e word over the f words of *this*
                # sentence only, not accumulated over the whole corpus.
                s_total = sum(t[fi][ei] for fi in f_ids)
                if not s_total:
                    # No f words (or no mass) to distribute the count over.
                    continue
                for fi in f_ids:
                    fraction = t[fi][ei] / s_total
                    count[fi][ei] += fraction
                    total[fi] += fraction

        # estimate
        for fi in range(f_len):
            # An f word that collected no counts keeps its previous row
            # rather than triggering a division by zero.
            if total[fi]:
                t[fi] = [c / total[fi] for c in count[fi]]

        print("Step")
        print_probabilities(e_vocab, f_vocab, t)
        print(iterations)
        iterations = iterations - 1

    return t


def main():
    """Train on the three-sentence toy corpus for five iterations."""
    sp = [("The house", "Das Haus"),
          ("The book", "Das Buch"),
          ("A book", "Ein Buch")]
    em(sp, 5)


if __name__ == '__main__':
    main()