From 9c9213239263e8e8de2f154068cc3ad44e0c2100 Mon Sep 17 00:00:00 2001 From: Victor Chahuneau Date: Tue, 14 Aug 2012 22:50:37 -0400 Subject: [cdec.sa] Explicit feature names in grammar extractor output + sparse features in extractor + hg.intersect(string) + basestring = str|unicode --- python/src/sa/rulefactory.pxi | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) (limited to 'python/src/sa/rulefactory.pxi') diff --git a/python/src/sa/rulefactory.pxi b/python/src/sa/rulefactory.pxi index 1c8d25a4..34a002c5 100644 --- a/python/src/sa/rulefactory.pxi +++ b/python/src/sa/rulefactory.pxi @@ -8,6 +8,8 @@ from libc.stdlib cimport malloc, realloc, free from libc.string cimport memset, memcpy from libc.math cimport fmod, ceil, floor, log +from collections import defaultdict, Counter + cdef int PRECOMPUTE = 0 cdef int MERGE = 1 cdef int BAEZA_YATES = 2 @@ -73,8 +75,7 @@ cdef class PhraseLocation: self.arr_high = arr_high self.arr = arr self.num_subpatterns = num_subpatterns - - + cdef class Sampler: '''A Sampler implements a logic for choosing @@ -208,6 +209,7 @@ cdef class HieroCachingRuleFactory: cdef TrieTable rules cdef Sampler sampler + cdef Scorer scorer cdef int max_chunks cdef int max_target_chunks @@ -359,7 +361,8 @@ cdef class HieroCachingRuleFactory: self.findexes = IntList(initial_len=10) self.findexes1 = IntList(initial_len=10) - def configure(self, SuffixArray fsarray, DataArray edarray, Sampler sampler): + def configure(self, SuffixArray fsarray, DataArray edarray, + Sampler sampler, Scorer scorer): '''This gives the RuleFactory access to the Context object. Here we also use it to precompute the most expensive intersections in the corpus quickly.''' @@ -370,6 +373,7 @@ cdef class HieroCachingRuleFactory: self.eid2symid = self.set_idmap(self.eda) self.precompute() self.sampler = sampler + self.scorer = scorer cdef set_idmap(self, DataArray darray): cdef int word_id, new_word_id, N @@ -916,7 +920,7 @@ cdef class HieroCachingRuleFactory: candidate.append([next_id,curr[1]+jump]) return sorted(result); - def input(self, fwords, models): + def input(self, fwords): '''When this function is called on the RuleFactory, it looks up all of the rules that can be used to translate the input sentence''' @@ -1074,18 +1078,12 @@ cdef class HieroCachingRuleFactory: extract_stop = monitor_cpu() self.extract_time = self.extract_time + extract_stop - extract_start if len(extracts) > 0: - fphrases = {} - fals = {} - fcount = {} + fcount = Counter() + fphrases = defaultdict(lambda: defaultdict(Counter)) for f, e, count, als in extracts: - fcount.setdefault(f, 0.0) - fcount[f] = fcount[f] + count - fphrases.setdefault(f, {}) - fphrases[f].setdefault(e, {}) - fphrases[f][e].setdefault(als,0.0) - fphrases[f][e][als] = fphrases[f][e][als] + count + fcount[f] += count + fphrases[f][e][als] += count for f, elist in fphrases.iteritems(): - f_margin = fcount[f] for e, alslist in elist.iteritems(): alignment = None count = 0 @@ -1093,11 +1091,9 @@ cdef class HieroCachingRuleFactory: if currcount > count: alignment = als count = currcount - scores = [] - for model in models: - scores.append(model(f, e, count, fcount[f], num_samples)) - yield Rule(self.category, f, e, - scores=scores, word_alignments=alignment) + scores = self.scorer.score(f, e, count, + fcount[f], num_samples) + yield Rule(self.category, f, e, scores, alignment) if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+1 <= self.max_initial_size: for alt_id in range(len(fwords[i+spanlen])): @@ -1377,9 +1373,9 @@ cdef class HieroCachingRuleFactory: free(e_gap_order) return result - cdef create_alignments(self, int* sent_links, int num_links, findexes, eindexes): + cdef IntList create_alignments(self, int* sent_links, int num_links, findexes, eindexes): cdef unsigned i - ret = IntList() + cdef IntList ret = IntList() for i in range(len(findexes)): s = findexes[i] if (s<0): -- cgit v1.2.3