diff options
author | Michael Denkowski <mdenkows@cs.cmu.edu> | 2014-03-07 01:52:09 -0800 |
---|---|---|
committer | Michael Denkowski <mdenkows@cs.cmu.edu> | 2014-03-07 01:52:09 -0800 |
commit | fcf92357806816bb65dade6f761fa08448f88f71 (patch) | |
tree | 681aa2f4319ebfe1bf9e1d6a748e504cfae72790 /python/cdec/sa/rulefactory.pxi | |
parent | 256070339f557e295c1e90e9a451c82731d5c223 (diff) |
More online bilex updates
Diffstat (limited to 'python/cdec/sa/rulefactory.pxi')
-rw-r--r-- | python/cdec/sa/rulefactory.pxi | 42 |
1 files changed, 1 insertions, 41 deletions
```diff
diff --git a/python/cdec/sa/rulefactory.pxi b/python/cdec/sa/rulefactory.pxi
index 86b994b9..ca3321a4 100644
--- a/python/cdec/sa/rulefactory.pxi
+++ b/python/cdec/sa/rulefactory.pxi
@@ -31,10 +31,6 @@ OnlineFeatureContext = namedtuple('OnlineFeatureContext',
     ['fcount',
      'fsample_count',
      'paircount',
-     'bilex_f',
-     'bilex_e',
-     'bilex_fe',
-     'bilex_ef'
     ])
 
 cdef class OnlineStats:
@@ -43,10 +39,6 @@ cdef class OnlineStats:
     cdef public phrases_e
     cdef public phrases_fe
     cdef public phrases_al
-    cdef public bilex_f
-    cdef public bilex_e
-    cdef public bilex_fe
-    cdef public bilex_ef
 
     def __cinit__(self):
         # Keep track of everything that can be sampled:
@@ -58,12 +50,6 @@ cdef class OnlineStats:
         self.phrases_fe = defaultdict(lambda: defaultdict(int))
         self.phrases_al = defaultdict(lambda: defaultdict(tuple))
 
-        # Bilexical counts
-        self.bilex_f = defaultdict(int)
-        self.bilex_e = defaultdict(int)
-        self.bilex_fe = defaultdict(lambda: defaultdict(int))
-        self.bilex_ef = defaultdict(lambda: defaultdict(int))
-
 cdef int PRECOMPUTE = 0
 cdef int MERGE = 1
 cdef int BAEZA_YATES = 2
@@ -2054,32 +2040,6 @@ cdef class HieroCachingRuleFactory:
             if not stats.phrases_al[f_ph][e_ph]:
                 stats.phrases_al[f_ph][e_ph] = al
 
-        # Update Bilexical counts
-        aligned_fe = [list() for _ in range(len(f_words))]
-        aligned_ef = [list() for _ in range(len(e_words))]
-        null_word = sym_fromstring('NULL', True)
-        for (i, j) in alignment:
-            aligned_fe[i].append(j)
-            aligned_ef[j].append(i)
-        for f_i in range(len(f_words)):
-            stats.bilex_f[f_words[f_i]] += 1
-            e_i_aligned = aligned_fe[f_i]
-            lc = len(e_i_aligned)
-            if lc > 0:
-                for e_i in e_i_aligned:
-                    stats.bilex_fe[f_words[f_i]][e_words[e_i]] += (1.0) / lc
-            else:
-                stats.bilex_fe[f_words[f_i]][null_word] += 1
-        for e_i in range(len(e_words)):
-            stats.bilex_e[e_words[e_i]] += 1
-            f_i_aligned = aligned_ef[e_i]
-            lc = len(f_i_aligned)
-            if lc > 0:
-                for f_i in f_i_aligned:
-                    stats.bilex_ef[e_words[e_i]][f_words[f_i]] += (1.0) / lc
-            else:
-                stats.bilex_ef[e_words[e_i]][null_word] += 1
-
     # Create a rule from source, target, non-terminals, and alignments
     def form_rule(self, f_i, e_i, f_span, e_span, nt, al):
 
@@ -2159,7 +2119,7 @@ cdef class HieroCachingRuleFactory:
             fsample_count = stats.samples_f.get(f, 0)
             d = stats.phrases_fe.get(f, None)
             paircount = d.get(e, 0) if d else 0
-            return OnlineFeatureContext(fcount, fsample_count, paircount, stats.bilex_f, stats.bilex_e, stats.bilex_fe)
+            return OnlineFeatureContext(fcount, fsample_count, paircount)
         return None
 
     # Find all phrases that we might try to extract
```