summary refs log tree commit diff
path: root/python/cdec/sa
diff options
context:
space:
mode:
author mjdenkowski <michael.j.denkowski@gmail.com> 2014-03-10 18:32:53 -0400
committer mjdenkowski <michael.j.denkowski@gmail.com> 2014-03-10 18:32:53 -0400
commit fcd6fe0123e5a12c926e344ed93e17f021674edc (patch)
tree c9ca3a108f0025d24a73d0c7f80c23fc2a5a1369 /python/cdec/sa
parent 33bc1b6cdd76901e5c64f3871241b3106048e84b (diff)
Tuples are faster than bidirectional dicts
Diffstat (limited to 'python/cdec/sa')
-rw-r--r-- python/cdec/sa/online.py 56
1 files changed, 20 insertions, 36 deletions
diff --git a/python/cdec/sa/online.py b/python/cdec/sa/online.py
index f3054212..d3f967e8 100644
--- a/python/cdec/sa/online.py
+++ b/python/cdec/sa/online.py
@@ -29,11 +29,11 @@ class Bilex:
def __init__(self, in_f=None):
self.f = collections.defaultdict(int)
self.e = collections.defaultdict(int)
- self.fe = collections.defaultdict(lambda: collections.defaultdict(int))
- self.ef = collections.defaultdict(lambda: collections.defaultdict(int))
+ self.fe = collections.defaultdict(int)
if in_f:
self.read(in_f)
+ # Compatibility with Cython implementation
def get_score(self, f, e, dir):
if dir == 0:
p = self.p_fe(f, e)
@@ -42,22 +42,22 @@ class Bilex:
return p
def p_fe(self, f, e):
- d = self.fe.get(f, None)
- if not d:
+ denom = self.f.get(f, None)
+ if not denom:
return None
- val = d.get(e, None)
- if not val:
+ num = self.fe.get((f, e), None)
+ if not num:
return None
- return val / self.f.get(f)
+ return num / denom
def p_ef(self, e, f):
- d = self.ef.get(e, None)
- if not d:
+ denom = self.e.get(e, None)
+ if not denom:
return None
- val = d.get(f, None)
- if not val:
+ num = self.fe.get((f, e), None)
+ if not num:
return None
- return val / self.e.get(e)
+ return num / denom
# Update counts from aligned sentence
def update(self, f_words, e_words, links):
@@ -68,22 +68,19 @@ class Bilex:
covered_e.add(j)
self.f[f_words[i]] += 1
self.e[e_words[j]] += 1
- self.fe[f_words[i]][e_words[j]] += 1
- self.ef[e_words[j]][f_words[i]] += 1
+ self.fe[(f_words[i], e_words[j])] += 1
# e being covered corresponds to f->e
for j in range(len(e_words)):
if j not in covered_e:
self.f[NULL_WORD] += 1
self.e[e_words[j]] += 1
- self.fe[NULL_WORD][e_words[j]] += 1
- self.ef[e_words[j]][NULL_WORD] += 1
+ self.fe[(NULL_WORD, e_words[j])] += 1
# e->f
for i in range(len(f_words)):
if i not in covered_f:
self.f[f_words[i]] += 1
self.e[NULL_WORD] += 1
- self.ef[NULL_WORD][f_words[i]] += 1
- self.fe[f_words[i]][NULL_WORD] += 1
+ self.fe[(f_words[i], NULL_WORD)] += 1
# Update counts from alignd bitext
def add_bitext(self, alignment_f, text_f, target_text_f=None):
@@ -98,22 +95,15 @@ class Bilex:
self.update(source.split(), target.split(), links)
def write(self, out_f):
- fv = sorted(self.f)
- ev = sorted(self.e)
with gzip.open(out_f, 'wb') as out:
- for f in fv:
+ for f in sorted(self.f):
out.write('{} {}\n'.format(f, self.f[f]))
out.write('\n')
- for e in ev:
+ for e in sorted(self.e):
out.write('{} {}\n'.format(e, self.e[e]))
out.write('\n')
- for f in fv:
- for (e, c) in sorted(self.fe[f].iteritems()):
- out.write('{} {} {}\n'.format(f, e, c))
- out.write('\n')
- for e in ev:
- for (f, c) in sorted(self.ef[e].iteritems()):
- out.write('{} {} {}\n'.format(e, f, c))
+ for (f, e) in sorted(self.fe):
+ out.write('{} {} {}\n'.format(f, e, self.fe[(f, e)]))
out.write('\n')
def read(self, in_f):
@@ -135,10 +125,4 @@ class Bilex:
if not line:
break
(f, e, c) = line.split()
- self.fe[f][e] = float(c)
- while True:
- line = inp.readline().strip()
- if not line:
- break
- (e, f, c) = line.split()
- self.ef[e][f] = float(c)
+ self.fe[(f, e)] = float(c)