diff options
Diffstat (limited to 'python/cdec/sa/rulefactory.pxi')
| -rw-r--r-- | python/cdec/sa/rulefactory.pxi | 31 | 
1 files changed, 23 insertions, 8 deletions
| diff --git a/python/cdec/sa/rulefactory.pxi b/python/cdec/sa/rulefactory.pxi index 78a23196..37dd6d3b 100644 --- a/python/cdec/sa/rulefactory.pxi +++ b/python/cdec/sa/rulefactory.pxi @@ -33,7 +33,8 @@ OnlineFeatureContext = namedtuple('OnlineFeatureContext',       'paircount',       'bilex_f',       'bilex_e', -     'bilex_fe' +     'bilex_fe', +     'bilex_ef'      ])  cdef class OnlineStats: @@ -45,6 +46,7 @@ cdef class OnlineStats:      cdef public bilex_f      cdef public bilex_e      cdef public bilex_fe +    cdef public bilex_ef      def __cinit__(self):          # Keep track of everything that can be sampled: @@ -60,6 +62,7 @@ cdef class OnlineStats:          self.bilex_f = defaultdict(int)          self.bilex_e = defaultdict(int)          self.bilex_fe = defaultdict(lambda: defaultdict(int)) +        self.bilex_ef = defaultdict(lambda: defaultdict(int))  cdef int PRECOMPUTE = 0  cdef int MERGE = 1 @@ -2052,13 +2055,25 @@ cdef class HieroCachingRuleFactory:                  stats.phrases_al[f_ph][e_ph] = al          # Update Bilexical counts -        # TODO: use alignments instead of cooc -        for e_w in e_words: -            stats.bilex_e[e_w] += 1 -        for f_w in f_words: -            stats.bilex_f[f_w] += 1 -            for e_w in e_words: -                stats.bilex_fe[f_w][e_w] += 1 +        aligned_fe = [list() for _ in range(len(f_words))] +        aligned_ef = [list() for _ in range(len(e_words))] +        for (i, j) in alignment: +            aligned_fe[i].append(j) +            aligned_ef[j].append(i) +        for f_i in range(len(f_words)): +            e_i_aligned = aligned_fe[f_i] +            lc = len(e_i_aligned) +            if lc > 0: +                stats.bilex_f[f_words[f_i]] += 1 +                for e_i in e_i_aligned: +                    stats.bilex_fe[f_words[f_i]][e_words[e_i]] += (1.0) / lc +        for e_i in range(len(e_words)): +            f_i_aligned = aligned_ef[e_i] +            lc = len(f_i_aligned) +            if lc > 0: +                stats.bilex_e[e_words[e_i]] += 1 +                for f_i in f_i_aligned: +                    stats.bilex_ef[e_words[e_i]][f_words[f_i]] += (1.0) / lc      # Create a rule from source, target, non-terminals, and alignments      def form_rule(self, f_i, e_i, f_span, e_span, nt, al): | 
