diff options
author | mjdenkowski <michael.j.denkowski@gmail.com> | 2014-03-10 18:32:53 -0400 |
---|---|---|
committer | mjdenkowski <michael.j.denkowski@gmail.com> | 2014-03-10 18:32:53 -0400 |
commit | 8c2d8217f068d8f107f95171496a25013d4e35fe (patch) | |
tree | f2b8e400f75a76c40a361a5165aa524cfb9c7fec | |
parent | 82a9bf6824faf52d3a998a89db087e8dc022c252 (diff) |
Tuples are faster than bidirectional dicts
-rw-r--r-- | python/cdec/sa/online.py | 56 |
1 file changed, 20 insertions, 36 deletions
diff --git a/python/cdec/sa/online.py b/python/cdec/sa/online.py index f3054212..d3f967e8 100644 --- a/python/cdec/sa/online.py +++ b/python/cdec/sa/online.py @@ -29,11 +29,11 @@ class Bilex: def __init__(self, in_f=None): self.f = collections.defaultdict(int) self.e = collections.defaultdict(int) - self.fe = collections.defaultdict(lambda: collections.defaultdict(int)) - self.ef = collections.defaultdict(lambda: collections.defaultdict(int)) + self.fe = collections.defaultdict(int) if in_f: self.read(in_f) + # Compatibility with Cython implementation def get_score(self, f, e, dir): if dir == 0: p = self.p_fe(f, e) @@ -42,22 +42,22 @@ class Bilex: return p def p_fe(self, f, e): - d = self.fe.get(f, None) - if not d: + denom = self.f.get(f, None) + if not denom: return None - val = d.get(e, None) - if not val: + num = self.fe.get((f, e), None) + if not num: return None - return val / self.f.get(f) + return num / denom def p_ef(self, e, f): - d = self.ef.get(e, None) - if not d: + denom = self.e.get(e, None) + if not denom: return None - val = d.get(f, None) - if not val: + num = self.fe.get((f, e), None) + if not num: return None - return val / self.e.get(e) + return num / denom # Update counts from aligned sentence def update(self, f_words, e_words, links): @@ -68,22 +68,19 @@ class Bilex: covered_e.add(j) self.f[f_words[i]] += 1 self.e[e_words[j]] += 1 - self.fe[f_words[i]][e_words[j]] += 1 - self.ef[e_words[j]][f_words[i]] += 1 + self.fe[(f_words[i], e_words[j])] += 1 # e being covered corresponds to f->e for j in range(len(e_words)): if j not in covered_e: self.f[NULL_WORD] += 1 self.e[e_words[j]] += 1 - self.fe[NULL_WORD][e_words[j]] += 1 - self.ef[e_words[j]][NULL_WORD] += 1 + self.fe[(NULL_WORD, e_words[j])] += 1 # e->f for i in range(len(f_words)): if i not in covered_f: self.f[f_words[i]] += 1 self.e[NULL_WORD] += 1 - self.ef[NULL_WORD][f_words[i]] += 1 - self.fe[f_words[i]][NULL_WORD] += 1 + self.fe[(f_words[i], NULL_WORD)] += 1 # Update counts from alignd 
bitext def add_bitext(self, alignment_f, text_f, target_text_f=None): @@ -98,22 +95,15 @@ class Bilex: self.update(source.split(), target.split(), links) def write(self, out_f): - fv = sorted(self.f) - ev = sorted(self.e) with gzip.open(out_f, 'wb') as out: - for f in fv: + for f in sorted(self.f): out.write('{} {}\n'.format(f, self.f[f])) out.write('\n') - for e in ev: + for e in sorted(self.e): out.write('{} {}\n'.format(e, self.e[e])) out.write('\n') - for f in fv: - for (e, c) in sorted(self.fe[f].iteritems()): - out.write('{} {} {}\n'.format(f, e, c)) - out.write('\n') - for e in ev: - for (f, c) in sorted(self.ef[e].iteritems()): - out.write('{} {} {}\n'.format(e, f, c)) + for (f, e) in sorted(self.fe): + out.write('{} {} {}\n'.format(f, e, self.fe[(f, e)])) out.write('\n') def read(self, in_f): @@ -135,10 +125,4 @@ class Bilex: if not line: break (f, e, c) = line.split() - self.fe[f][e] = float(c) - while True: - line = inp.readline().strip() - if not line: - break - (e, f, c) = line.split() - self.ef[e][f] = float(c) + self.fe[(f, e)] = float(c) |