summary refs log tree commit diff
path: root/algorithms/ibm1_em.py
blob: 09b7d78f11ebac71c6703d9f77a57dddf386563f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
ibm1_em.py

Patrick Simianer <p@simianer.de>
2010-11-01
"""


def print_probabilities(e_words, f_words, t):
  """
  Print the translation table t as a tab-separated grid.

  The header line lists every entry of f_words, each preceded by a tab.
  Each following line starts with one entry of e_words and is followed by
  the values t[i][j] formatted with two decimals, where i is the position
  in f_words (column) and j is the position in e_words (row).  The grid is
  followed by one empty line.

  e_words -- list of row labels
  f_words -- list of column labels
  t       -- nested list indexed as t[f_position][e_position]
  """
  f_range = range(len(f_words))

  lines = ["".join("\t" + fw for fw in f_words)]
  for j, ew in enumerate(e_words):
    lines.append(ew + "".join("\t%.2f" % t[i][j] for i in f_range))

  # Every line (header included) ends in "\n"; print() then adds the blank
  # separator line.  A single parenthesised argument prints identically
  # under Python 2 and Python 3.
  print("\n".join(lines) + "\n")


def em(sentence_pairs, iterations):
  """
  Estimate IBM Model 1 word translation probabilities with EM.

  sentence_pairs -- sequence of (e_sentence, f_sentence) string pairs; each
                    sentence is lower-cased and split on whitespace
  iterations     -- number of EM iterations to run (none if <= 0)

  The table is indexed as t[fi][ei], where fi/ei are the positions of the
  words in the f/e vocabularies (order of first appearance).  Each row
  t[fi] is normalised over the e vocabulary.  The table is printed once
  after the uniform initialisation and once after every iteration, followed
  by the number of iterations that were still pending.

  Returns the final table t (nested list).
  """
  pairs = [(p[0].lower().split(), p[1].lower().split()) for p in sentence_pairs]

  # Vocabularies in order of first appearance.  The dicts give O(1)
  # word -> index lookups and guarantee that a word repeated inside a
  # single sentence is only added once.
  e_vocab = []
  f_vocab = []
  e_index = {}
  f_index = {}
  for (e, f) in pairs:
    for w in e:
      if w not in e_index:
        e_index[w] = len(e_vocab)
        e_vocab.append(w)
    for w in f:
      if w not in f_index:
        f_index[w] = len(f_vocab)
        f_vocab.append(w)

  e_len = len(e_vocab)
  f_len = len(f_vocab)

  # uniform init
  uniform = 1./e_len if e_len else 0.
  t = [[uniform] * e_len for _ in range(f_len)]

  print("Initial:")
  print_probabilities(e_vocab, f_vocab, t)

  while iterations > 0:
    # initialize
    count = [[0.] * e_len for _ in range(f_len)]
    total = [0.] * f_len

    for (e, f) in pairs:
      e_ids = [e_index[w] for w in e]
      f_ids = [f_index[w] for w in f]

      # normalize: the normaliser of an e word must be computed from this
      # sentence pair only, so it is rebuilt for every pair instead of
      # being carried over from earlier pairs.
      s_total = {}
      for ei in e_ids:
        s_total[ei] = sum(t[fi][ei] for fi in f_ids)

      # count
      for ei in e_ids:
        norm = s_total[ei]
        if not norm:
          # nothing to distribute (e.g. all probabilities underflowed)
          continue
        for fi in f_ids:
          delta = t[fi][ei] / norm
          count[fi][ei] += delta
          total[fi] += delta

    # estimate
    for i in range(f_len):
      if not total[i]:
        # f word collected no counts (it only occurs opposite an empty
        # e sentence); keep its previous row instead of dividing by zero
        continue
      for j in range(e_len):
        t[i][j] = count[i][j] / total[i]

    print("Step")
    print_probabilities(e_vocab, f_vocab, t)
    print(iterations)
    iterations = iterations - 1

  return t

def main():
  """Run five EM iterations on a three-sentence toy corpus."""
  corpus = [
    ("The house", "Das Haus"),
    ("The book", "Das Buch"),
    ("A book", "Ein Buch"),
  ]
  em(corpus, 5)

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
  main()