summary | refs | log | tree | commit | diff
path: root/python/cdec/sa/rulefactory.pxi
diff options
context:
space:
mode:
author: Michael Denkowski <mdenkows@cs.cmu.edu> 2014-03-06 15:35:10 -0800
committer: Michael Denkowski <mdenkows@cs.cmu.edu> 2014-03-06 15:35:10 -0800
commit: 256070339f557e295c1e90e9a451c82731d5c223 (patch)
tree: 7c34686446b2bcf0b6eb70114a6360c6ca14831b /python/cdec/sa/rulefactory.pxi
parent: 3b00351ff2047c226cde750fe67eae5b34388373 (diff)
Compile count-based bilex table for online grammar extraction.
Diffstat (limited to 'python/cdec/sa/rulefactory.pxi')
-rw-r--r-- python/cdec/sa/rulefactory.pxi 9
1 files changed, 7 insertions, 2 deletions
diff --git a/python/cdec/sa/rulefactory.pxi b/python/cdec/sa/rulefactory.pxi
index 044a78c8..86b994b9 100644
--- a/python/cdec/sa/rulefactory.pxi
+++ b/python/cdec/sa/rulefactory.pxi
@@ -2057,23 +2057,28 @@ cdef class HieroCachingRuleFactory:
# Update Bilexical counts
aligned_fe = [list() for _ in range(len(f_words))]
aligned_ef = [list() for _ in range(len(e_words))]
+ null_word = sym_fromstring('NULL', True)
for (i, j) in alignment:
aligned_fe[i].append(j)
aligned_ef[j].append(i)
for f_i in range(len(f_words)):
+ stats.bilex_f[f_words[f_i]] += 1
e_i_aligned = aligned_fe[f_i]
lc = len(e_i_aligned)
if lc > 0:
- stats.bilex_f[f_words[f_i]] += 1
for e_i in e_i_aligned:
stats.bilex_fe[f_words[f_i]][e_words[e_i]] += (1.0) / lc
+ else:
+ stats.bilex_fe[f_words[f_i]][null_word] += 1
for e_i in range(len(e_words)):
+ stats.bilex_e[e_words[e_i]] += 1
f_i_aligned = aligned_ef[e_i]
lc = len(f_i_aligned)
if lc > 0:
- stats.bilex_e[e_words[e_i]] += 1
for f_i in f_i_aligned:
stats.bilex_ef[e_words[e_i]][f_words[f_i]] += (1.0) / lc
+ else:
+ stats.bilex_ef[e_words[e_i]][null_word] += 1
# Create a rule from source, target, non-terminals, and alignments
def form_rule(self, f_i, e_i, f_span, e_span, nt, al):