diff options
author | Michael Denkowski <mdenkows@cs.cmu.edu> | 2014-03-03 07:58:11 -0800 |
---|---|---|
committer | Michael Denkowski <mdenkows@cs.cmu.edu> | 2014-03-03 07:58:11 -0800 |
commit | 825d766c681827c83db9b473cb750a911644f6fd (patch) | |
tree | bd82faa3875ef9ac2742d0b02b931e78272d704a /python/cdec/sa/rulefactory.pxi | |
parent | 324a978e7d766a3864e42efc0938fb6c9ef5a01c (diff) |
Online bilex counts
Diffstat (limited to 'python/cdec/sa/rulefactory.pxi')
-rw-r--r-- | python/cdec/sa/rulefactory.pxi | 31 |
1 files changed, 23 insertions, 8 deletions
diff --git a/python/cdec/sa/rulefactory.pxi b/python/cdec/sa/rulefactory.pxi index 78a23196..37dd6d3b 100644 --- a/python/cdec/sa/rulefactory.pxi +++ b/python/cdec/sa/rulefactory.pxi @@ -33,7 +33,8 @@ OnlineFeatureContext = namedtuple('OnlineFeatureContext', 'paircount', 'bilex_f', 'bilex_e', - 'bilex_fe' + 'bilex_fe', + 'bilex_ef' ]) cdef class OnlineStats: @@ -45,6 +46,7 @@ cdef class OnlineStats: cdef public bilex_f cdef public bilex_e cdef public bilex_fe + cdef public bilex_ef def __cinit__(self): # Keep track of everything that can be sampled: @@ -60,6 +62,7 @@ cdef class OnlineStats: self.bilex_f = defaultdict(int) self.bilex_e = defaultdict(int) self.bilex_fe = defaultdict(lambda: defaultdict(int)) + self.bilex_ef = defaultdict(lambda: defaultdict(int)) cdef int PRECOMPUTE = 0 cdef int MERGE = 1 @@ -2052,13 +2055,25 @@ cdef class HieroCachingRuleFactory: stats.phrases_al[f_ph][e_ph] = al # Update Bilexical counts - # TODO: use alignments instead of cooc - for e_w in e_words: - stats.bilex_e[e_w] += 1 - for f_w in f_words: - stats.bilex_f[f_w] += 1 - for e_w in e_words: - stats.bilex_fe[f_w][e_w] += 1 + aligned_fe = [list() for _ in range(len(f_words))] + aligned_ef = [list() for _ in range(len(e_words))] + for (i, j) in alignment: + aligned_fe[i].append(j) + aligned_ef[j].append(i) + for f_i in range(len(f_words)): + e_i_aligned = aligned_fe[f_i] + lc = len(e_i_aligned) + if lc > 0: + stats.bilex_f[f_words[f_i]] += 1 + for e_i in e_i_aligned: + stats.bilex_fe[f_words[f_i]][e_words[e_i]] += (1.0) / lc + for e_i in range(len(e_words)): + f_i_aligned = aligned_ef[e_i] + lc = len(f_i_aligned) + if lc > 1: + stats.bilex_e[e_words[e_i]] += 1 + for f_i in f_i_aligned: + stats.bilex_ef[e_words[e_i]][f_words[f_i]] += (1.0) / lc # Create a rule from source, target, non-terminals, and alignments def form_rule(self, f_i, e_i, f_span, e_span, nt, al): |