diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-07-27 17:31:00 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-07-27 17:31:00 -0400 |
commit | b317e0efd2398d75d70e027bb1e2cf442e683981 (patch) | |
tree | ec34aff0ce4e8fb9704d1cd2b7abf00cb9a25b9a /sa-extract/rulefactory.pyx | |
parent | b2a8bccb2bd713d9ec081cf3dad0162c2cb492d8 (diff) |
remove old suffix array extractor (use the one in python/ instead)
Diffstat (limited to 'sa-extract/rulefactory.pyx')
-rw-r--r-- | sa-extract/rulefactory.pyx | 2361 |
1 files changed, 0 insertions, 2361 deletions
diff --git a/sa-extract/rulefactory.pyx b/sa-extract/rulefactory.pyx deleted file mode 100644 index 792489c4..00000000 --- a/sa-extract/rulefactory.pyx +++ /dev/null @@ -1,2361 +0,0 @@ -# Pyrex implementation of the algorithms described in -# Lopez, EMNLP-CoNLL 2007 -# Much faster than the Python numbers reported there. -# Note to reader: this code is closer to C than Python -import sys -import sym -import log -import rule -import monitor -import cintlist -import csuf -import cdat -import cveb -import precomputation -import gc -import cn -import sgml - -cimport cmath -cimport csuf -cimport cdat -cimport cintlist -cimport rule -cimport cveb -cimport precomputation -cimport calignment - -from libc.stdlib cimport malloc, realloc, free -from libc.string cimport memset, memcpy -from libc.math cimport fmod, ceil, floor - -cdef int PRECOMPUTE -cdef int MERGE -cdef int BAEZA_YATES - -PRECOMPUTE = 0 -MERGE = 1 -BAEZA_YATES = 2 - -#cdef int node_count -#node_count = 0 - -cdef class TrieNode: - cdef public children - #cdef int id - - def __init__(self): - self.children = {} - #self.id = node_count - #node_count += 1 - - -cdef class ExtendedTrieNode(TrieNode): - cdef public phrase - cdef public phrase_location - cdef public suffix_link - - def __init__(self, phrase=None, phrase_location=None, suffix_link=None): - TrieNode.__init__(self) - self.phrase = phrase - self.phrase_location = phrase_location - self.suffix_link = suffix_link - - -cdef class TrieTable: - cdef public int extended - cdef public int count - cdef public root - def __cinit__(self, extended=False): - self.count = 0 - self.extended = extended - if extended: - self.root = ExtendedTrieNode() - else: - self.root = TrieNode() - -# linked list structure for storing matches in BaselineRuleFactory -cdef struct match_node: - int* match - match_node* next - -cdef class BaselineRuleFactory: - - cdef grammar, context_manager - cdef int max_terminals, max_nonterminals - cdef int max_initial_size, train_max_initial_size - cdef int min_gap_size, train_min_gap_size - cdef int category - cdef int visit - cdef float intersect_time, extract_time - cdef ruleFile, timingFile - cdef int* last_visit1 - cdef int* last_visit2 - cdef match_node** intersector1 - cdef match_node** intersector2 - cdef csuf.SuffixArray sa - cdef cintlist.CIntList sent_id - - def input(self, fwords): - flen = len(fwords) - start_time = monitor.cpu() - self.intersect_time = 0.0 - self.extract_time = 0.0 - - pyro_phrase_count = 0 - hiero_phrase_count = 0 - - frontier = [] - for i in xrange(len(fwords)): - frontier.append((i, (), False)) - - while len(frontier) > 0: - this_iter_intersect_time = self.intersect_time - new_frontier = [] - for i, prefix, is_shadow_path in frontier: - - word_id = fwords[i][0][0] - #print "word_id = %i" % word_id - phrase = prefix + (word_id,) - str_phrase = map(sym.tostring, phrase) - hiero_phrase = rule.Phrase(phrase) - - #log.writeln("pos %2d, '%s'" % (i, hiero_phrase)) - self.lookup(hiero_phrase) - if hiero_phrase.arity() == 0: - pyro_phrase_count = pyro_phrase_count + 1 - else: - hiero_phrase_count = hiero_phrase_count + 1 - - if len(phrase) - hiero_phrase.arity() < self.max_terminals and i+1 < len(fwords): - new_frontier.append((i+1, phrase, is_shadow_path)) - if hiero_phrase.arity() < self.max_nonterminals: - xcat = sym.setindex(self.category, hiero_phrase.arity()+1) - for j in xrange(i+1+self.min_gap_size, min(i+self.max_initial_size, len(fwords))): - new_frontier.append((j, phrase+(xcat,), is_shadow_path)) - log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time)) - frontier = new_frontier - stop_time = monitor.cpu() - log.writeln("COUNT %d %d" % (pyro_phrase_count, hiero_phrase_count)) - - - def lookup(self, phrase): - cdef int j, g, start, stop, sent_id, num_ranges, arity - cdef match_node** cur_intersector - cdef match_node** next_intersector - cdef match_node** tmp_intersector - cdef match_node* node - cdef match_node* cur_node - cdef match_node* prev_node - cdef match_node** node_ptr - cdef int* cur_visit - cdef int* next_visit - cdef int* tmp_visit - cdef int* chunklen - - #print "\n\nLOOKUP\n\n" - ranges = [] - sizes = [] - arity = phrase.arity() - chunklen = <int *> malloc(arity*sizeof(int)) - for i from 0 <= i < arity+1: - chunk = phrase.getchunk(i) - chunklen[i] = len(chunk) - sa_range = None - phr = () - for offset, word_id in enumerate(chunk): - word = sym.tostring(word_id) - sa_range = self.context_manager.fsarray.lookup(word, offset, sa_range[0], sa_range[1]) - if sa_range is None: - #log.writeln("Returned for phrase %s" % rule.Phrase(phr)) - return - #log.writeln("Found range %s for phrase %s" % (sa_range, rule.Phrase(phr))) - ranges.append(sa_range) - sizes.append(sa_range[1]-sa_range[0]) - if phrase.arity() == 0: - return - - cur_intersector = self.intersector1 - next_intersector = self.intersector2 - cur_visit = self.last_visit1 - next_visit = self.last_visit2 - - num_ranges = len(ranges) - for i from 0 <= i < num_ranges: - sa_range = ranges[i] - start_time = monitor.cpu() - self.visit = self.visit + 1 - intersect_count = 0 - - start = sa_range[0] - stop = sa_range[1] - for j from start <= j < stop: - g = self.sa.sa.arr[j] - sent_id = self.sent_id.arr[g] - if i==0: - if next_visit[sent_id] != self.visit: - # clear intersector - node = next_intersector[sent_id] - next_intersector[sent_id] = NULL - while node != NULL: - prev_node = node - node = node.next - free(prev_node.match) - free(prev_node) - next_visit[sent_id] = self.visit - node_ptr = &(next_intersector[sent_id]) - while node_ptr[0] != NULL: - node_ptr = &(node_ptr[0].next) - node_ptr[0] = <match_node*> malloc(sizeof(match_node)) - node_ptr[0].match = <int *> malloc(sizeof(int)) - node_ptr[0].match[0] = g - node_ptr[0].next = NULL - intersect_count = intersect_count + 1 - else: - if cur_visit[sent_id] == self.visit-1: - cur_node = cur_intersector[sent_id] - while cur_node != NULL: - if g - cur_node.match[0] + chunklen[i] <= self.train_max_initial_size and g - cur_node.match[i-1] - chunklen[i-1] >= self.train_min_gap_size: - if next_visit[sent_id] != self.visit: - # clear intersector -- note that we only do this if we've got something to put there - node = next_intersector[sent_id] - next_intersector[sent_id] = NULL - while node != NULL: - prev_node = node - node = node.next - free(prev_node.match) - free(prev_node) - next_visit[sent_id] = self.visit - node_ptr = &(next_intersector[sent_id]) - while node_ptr[0] != NULL: - node_ptr = &(node_ptr[0].next) - node_ptr[0] = <match_node*> malloc(sizeof(match_node)) - node_ptr[0].match = <int *> malloc((i+1) * sizeof(int)) - memcpy(node_ptr[0].match, cur_node.match, i*sizeof(int)) - node_ptr[0].match[i] = g - node_ptr[0].next = NULL - intersect_count = intersect_count + 1 - cur_node = cur_node.next - tmp_intersector = cur_intersector - cur_intersector = next_intersector - next_intersector = tmp_intersector - - tmp_visit = cur_visit - cur_visit = next_visit - next_visit = tmp_visit - - intersect_time = monitor.cpu() - start_time - if i > 0: - log.writeln("INT %d %d %d %d %f baseline" % - (arity, prev_intersect_count, sa_range[1]-sa_range[0], - intersect_count, intersect_time)) - if intersect_count == 0: - return None - prev_intersect_count = intersect_count - free(chunklen) - - - - def __init__(self, max_terminals=5, max_nonterminals=2, - max_initial_size=10, train_max_initial_size=10, - min_gap_size=1, train_min_gap_size=2, - category='[PHRASE]', grammar=None, - ruleFile=None, timingFile=None): - self.grammar = grammar - self.max_terminals = max_terminals - self.max_nonterminals = max_nonterminals - self.max_initial_size = max_initial_size - self.train_max_initial_size = train_max_initial_size - self.min_gap_size = min_gap_size - self.train_min_gap_size = train_min_gap_size - self.category = sym.fromstring(category) - self.ruleFile = ruleFile - self.timingFile = timingFile - self.visit = 0 - - - def registerContext(self, context_manager): - cdef int num_sents - self.context_manager = context_manager - self.sa = context_manager.fsarray - self.sent_id = self.sa.darray.sent_id - - num_sents = len(self.sa.darray.sent_index) - self.last_visit1 = <int *> malloc(num_sents * sizeof(int)) - memset(self.last_visit1, 0, num_sents * sizeof(int)) - - self.last_visit2 = <int *> malloc(num_sents * sizeof(int)) - memset(self.last_visit2, 0, num_sents * sizeof(int)) - - self.intersector1 = <match_node **> malloc(num_sents * sizeof(match_node*)) - memset(self.intersector1, 0, num_sents * sizeof(match_node*)) - - self.intersector2 = <match_node **> malloc(num_sents * sizeof(match_node*)) - memset(self.intersector2, 0, num_sents * sizeof(match_node*)) - - -# encodes information needed to find a (hierarchical) phrase -# in the text. If phrase is contiguous, that's just a range -# in the suffix array; if discontiguous, it is the set of -# actual locations (packed into an array) -cdef class PhraseLocation: - cdef int sa_low - cdef int sa_high - cdef int arr_low - cdef int arr_high - cdef cintlist.CIntList arr - cdef int num_subpatterns - - # returns true if sent_id is contained - cdef int contains(self, int sent_id): - return 1 - - def __init__(self, sa_low=-1, sa_high=-1, arr_low=-1, arr_high=-1, arr=None, num_subpatterns=1): - self.sa_low = sa_low - self.sa_high = sa_high - self.arr_low = arr_low - self.arr_high = arr_high - self.arr = arr - self.num_subpatterns = num_subpatterns - - - -cdef class Sampler: - '''A Sampler implements a logic for choosing - samples from a population range''' - - cdef int sampleSize - cdef context_manager - cdef cintlist.CIntList sa - - def __init__(self, sampleSize=0): - self.sampleSize = sampleSize - if sampleSize > 0: - log.writeln("Sampling strategy: uniform, max sample size = %d" % sampleSize, 1) - else: - log.writeln("Sampling strategy: no sampling", 1) - - def registerContext(self, context_manager): - self.context_manager = context_manager - self.sa = (<csuf.SuffixArray> context_manager.fsarray).sa - - - def sample(self, PhraseLocation phrase_location): - '''Returns a sample of the locations for - the phrase. If there are less than self.sampleSize - locations, return all of them; otherwise, return - up to self.sampleSize locations. In the latter case, - we choose to sample UNIFORMLY -- that is, the locations - are chosen at uniform intervals over the entire set, rather - than randomly. This makes the algorithm deterministic, which - is good for things like MERT''' - cdef cintlist.CIntList sample - cdef double i, stepsize - cdef int num_locations, val, j - - sample = cintlist.CIntList() - if phrase_location.arr is None: - num_locations = phrase_location.sa_high - phrase_location.sa_low - if self.sampleSize == -1 or num_locations <= self.sampleSize: - sample._extend_arr(self.sa.arr + phrase_location.sa_low, num_locations) - else: - stepsize = float(num_locations)/float(self.sampleSize) - i = phrase_location.sa_low - while i < phrase_location.sa_high and sample.len < self.sampleSize: - '''Note: int(i) not guaranteed to have the desired - effect, according to the python documentation''' - if fmod(i,1.0) > 0.5: - val = int(ceil(i)) - else: - val = int(floor(i)) - sample._append(self.sa.arr[val]) - i = i + stepsize - else: - num_locations = (phrase_location.arr_high - phrase_location.arr_low) / phrase_location.num_subpatterns - if self.sampleSize == -1 or num_locations <= self.sampleSize: - sample = phrase_location.arr - else: - stepsize = float(num_locations)/float(self.sampleSize) - i = phrase_location.arr_low - while i < num_locations and sample.len < self.sampleSize * phrase_location.num_subpatterns: - '''Note: int(i) not guaranteed to have the desired - effect, according to the python documentation''' - if fmod(i,1.0) > 0.5: - val = int(ceil(i)) - else: - val = int(floor(i)) - j = phrase_location.arr_low + (val*phrase_location.num_subpatterns) - sample._extend_arr(phrase_location.arr.arr + j, phrase_location.num_subpatterns) - i = i + stepsize - return sample - - -cdef long nGramCount(PhraseLocation loc): - return (loc.arr_high - loc.arr_low)/ loc.num_subpatterns - - -# struct used to encapsulate a single matching -cdef struct Matching: - int* arr - int start - int end - int sent_id - int size - - -cdef void assign_matching(Matching* m, int* arr, int start, int step, int* sent_id_arr): - m.arr = arr - m.start = start - m.end = start + step - m.sent_id = sent_id_arr[arr[start]] - m.size = step - - -cdef int* append_combined_matching(int* arr, Matching* loc1, Matching* loc2, - int offset_by_one, int num_subpatterns, int* result_len): - cdef int i, new_len - - new_len = result_len[0] + num_subpatterns - arr = <int*> realloc(arr, new_len*sizeof(int)) - - for i from 0 <= i < loc1.size: - arr[result_len[0]+i] = loc1.arr[loc1.start+i] - if num_subpatterns > loc1.size: - arr[new_len-1] = loc2.arr[loc2.end-1] - result_len[0] = new_len - return arr - - -cdef int* extend_arr(int* arr, int* arr_len, int* appendix, int appendix_len): - cdef int new_len - - new_len = arr_len[0] + appendix_len - arr = <int*> realloc(arr, new_len*sizeof(int)) - memcpy(arr+arr_len[0], appendix, appendix_len*sizeof(int)) - arr_len[0] = new_len - return arr - - -#cdef matching2str(Matching* m): -# cdef int i -# cdef result - -# result = "(" -# for i from m.start <= i < m.end: -# result = result + str(m.arr[i]) + " " -# result = result + ")" -# return result - - -cdef int median(int low, int high, int step): - return low + (((high - low)/step)/2)*step - - -cdef void findComparableMatchings(int low, int high, int* arr, int step, int loc, int* loc_minus, int* loc_plus): - # Returns (minus, plus) indices for the portion of the array - # in which all matchings have the same first index as the one - # starting at loc - loc_plus[0] = loc + step - while loc_plus[0] < high and arr[loc_plus[0]] == arr[loc]: - loc_plus[0] = loc_plus[0] + step - loc_minus[0] = loc - while loc_minus[0]-step >= low and arr[loc_minus[0]-step] == arr[loc]: - loc_minus[0] = loc_minus[0] - step - - -cdef class HieroCachingRuleFactory: - '''This RuleFactory implements a caching - method using TrieTable, which makes phrase - generation somewhat speedier -- phrases only - need to be extracted once (however, it is - quite possible they need to be scored - for each input sentence, for contextual models)''' - - cdef rules, grammar, context_manager - - cdef int max_chunks - cdef int max_target_chunks - cdef int max_length - cdef int max_target_length - cdef int max_nonterminals - cdef int max_initial_size - cdef int train_max_initial_size - cdef int min_gap_size - cdef int train_min_gap_size - cdef int category - - cdef cacheBetweenSents - cdef precomputed_index - cdef precomputed_collocations - cdef precompute_file - cdef max_rank - cdef int precompute_rank, precompute_secondary_rank - cdef useBaezaYates - cdef use_index - cdef use_collocations - cdef float by_slack_factor - - cdef per_sentence_grammar - cdef rule_filehandler - cdef rule_file - cdef pruned_rule_file - cdef extract_file - cdef sample_file - cdef search_file - cdef timingFile - cdef log_int_stats - cdef prev_norm_prefix - cdef float intersect_time, extract_time - cdef csuf.SuffixArray fsa - cdef cdat.DataArray fda - cdef cdat.DataArray eda - - cdef calignment.Alignment alignment - cdef cintlist.CIntList eid2symid - cdef cintlist.CIntList fid2symid - cdef int tight_phrases - cdef int require_aligned_terminal - cdef int require_aligned_chunks - - cdef cintlist.CIntList findexes - cdef cintlist.CIntList findexes1 - - cdef int excluded_sent_id # exclude a sentence id - - def __init__(self, - alignment=None, # compiled alignment object (REQUIRED) - by_slack_factor=1.0, # parameter for double-binary search; doesn't seem to matter much - category="[PHRASE]", # name of generic nonterminal used by Hiero - cacheBetweenSents=False, # prevent flushing of tree between sents; use carefully or you'll run out of memory - extract_file=None, # print raw extracted rules to this file - grammar=None, # empty grammar object -- must be supplied from outside (REQUIRED) - log_int_stats=False, # prints timing data on intersections to stderr - max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 - max_initial_size=10, # maximum span of a grammar rule in TEST DATA - max_length=5, # maximum number of symbols (both T and NT) allowed in a rule - max_nonterminals=2, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) - max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1 - max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size - min_gap_size=2, # minimum span of a nonterminal in the RHS of a rule in TEST DATA - precompute_file=None, # filename of file containing precomputed collocations - precompute_secondary_rank=20, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). - precompute_rank=100, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) - pruned_rule_file=None, # if specified, pruned grammars will be written to this filename - require_aligned_terminal=True, # require extracted rules to have at least one aligned word - require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word - per_sentence_grammar=True, # generate grammar files for each input segment - rule_file=None, # UNpruned grammars will be written to this filename - sample_file=None, # Sampling statistics will be written to this filename - search_file=None, # lookup statistics will be written to this filename - train_max_initial_size=10, # maximum span of a grammar rule extracted from TRAINING DATA - train_min_gap_size=2, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA - tight_phrases=False, # True if phrases should be tight, False otherwise (False == slower but better results) - timingFile=None, # timing statistics will be written to this filename - useBaezaYates=True, # True to require use of double-binary alg, false otherwise - use_collocations=True, # True to enable used of precomputed collocations - use_index=True # True to enable use of precomputed inverted indices - ): - '''Note: we make a distinction between the min_gap_size - and max_initial_size used in test and train. The latter - are represented by train_min_gap_size and train_max_initial_size, - respectively. This is because Chiang's model does not require - them to be the same, therefore we don't either.''' - self.rules = TrieTable(True) # cache - self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) - self.grammar = grammar - if alignment is None: - raise Exception("Must specify an alignment object") - self.alignment = alignment - - self.excluded_sent_id = -1 - - # grammar parameters and settings - # NOTE: setting max_nonterminals > 2 is not currently supported in Hiero - self.max_length = max_length - self.max_nonterminals = max_nonterminals - self.max_initial_size = max_initial_size - self.train_max_initial_size = train_max_initial_size - self.min_gap_size = min_gap_size - self.train_min_gap_size = train_min_gap_size - self.category = sym.fromstring(category) - - if max_chunks is None: - self.max_chunks = self.max_nonterminals + 1 - else: - self.max_chunks = max_chunks - - if max_target_chunks is None: - self.max_target_chunks = self.max_nonterminals + 1 - else: - self.max_target_chunks = max_target_chunks - - if max_target_length is None: - self.max_target_length = max_initial_size - else: - self.max_target_length = max_target_length - - # algorithmic parameters and settings - self.cacheBetweenSents = not per_sentence_grammar - self.precomputed_collocations = {} - self.precomputed_index = {} - self.use_index = use_index - self.use_collocations = use_collocations - self.max_rank = {} - self.precompute_file = precompute_file - self.precompute_rank = precompute_rank - self.precompute_secondary_rank = precompute_secondary_rank - self.useBaezaYates = useBaezaYates - self.by_slack_factor = by_slack_factor - if tight_phrases: - self.tight_phrases = 1 - else: - self.tight_phrases = 0 - - if require_aligned_chunks: - # one condition is a stronger version of the other. - self.require_aligned_chunks = 1 - self.require_aligned_terminal = 1 - elif require_aligned_terminal: - self.require_aligned_chunks = 0 - self.require_aligned_terminal = 1 - else: - self.require_aligned_chunks = 0 - self.require_aligned_terminal = 0 - - - self.per_sentence_grammar = per_sentence_grammar - if not self.per_sentence_grammar: - self.rule_filehandler = open(rule_file, "w") - # diagnostics - #if rule_file is None: - # self.rule_file = None - self.rule_file = rule_file - if extract_file is None: - self.extract_file = None - else: - self.extract_file = open(extract_file, "w") - if sample_file is None: - self.sample_file = None - else: - self.sample_file = open(sample_file, "w") - if search_file is None: - self.search_file = None - else: - self.search_file = open(search_file, "w") - self.pruned_rule_file = pruned_rule_file - self.timingFile = timingFile - self.log_int_stats = log_int_stats - self.prev_norm_prefix = () - - self.findexes = cintlist.CIntList(initial_len=10) - self.findexes1 = cintlist.CIntList(initial_len=10) - - def registerContext(self, context_manager): - '''This gives the RuleFactory access to the Context object. - Here we also use it to precompute the most expensive intersections - in the corpus quickly.''' - self.context_manager = context_manager - self.fsa = context_manager.fsarray - self.fda = self.fsa.darray - self.eda = context_manager.edarray - self.fid2symid = self.set_idmap(self.fda) - self.eid2symid = self.set_idmap(self.eda) - self.precompute() - - cdef set_idmap(self, cdat.DataArray darray): - cdef int word_id, new_word_id, N - cdef cintlist.CIntList idmap - - N = len(darray.id2word) - idmap = cintlist.CIntList(initial_len=N) - for word_id from 0 <= word_id < N: - new_word_id = sym.fromstring(darray.id2word[word_id], terminal=True) - idmap.arr[word_id] = new_word_id - return idmap - - - def pattern2phrase(self, pattern): - # pattern is a tuple, which we must convert to a hiero rule.Phrase - result = () - arity = 0 - for word_id in pattern: - if word_id == -1: - arity = arity + 1 - new_id = sym.setindex(self.category, arity) - else: - new_id = sym.fromstring(self.fda.id2word[word_id]) - result = result + (new_id,) - return rule.Phrase(result) - - def pattern2phrase_plus(self, pattern): - # returns a list containing both the pattern, and pattern - # suffixed/prefixed with the NT category. - patterns = [] - result = () - arity = 0 - for word_id in pattern: - if word_id == -1: - arity = arity + 1 - new_id = sym.setindex(self.category, arity) - else: - new_id = sym.fromstring(self.fda.id2word[word_id]) - result = result + (new_id,) - patterns.append(rule.Phrase(result)) - patterns.append(rule.Phrase(result + (sym.setindex(self.category, 1),))) - patterns.append(rule.Phrase((sym.setindex(self.category, 1),) + result)) - return patterns - - def precompute(self): - cdef precomputation.Precomputation pre - - if self.precompute_file is not None: - start_time = monitor.cpu() - log.write("Reading precomputed data from file %s... " % self.precompute_file, 1) - pre = precomputation.Precomputation(self.precompute_file, from_binary=True) - # check parameters of precomputation -- some are critical and some are not - if pre.max_nonterminals != self.max_nonterminals: - log.writeln("\nWARNING: Precomputation done with max nonterminals %d, decoder uses %d" % (pre.max_nonterminals, self.max_nonterminals)) - if pre.max_length != self.max_length: - log.writeln("\nWARNING: Precomputation done with max terminals %d, decoder uses %d" % (pre.max_length, self.max_length)) - if pre.train_max_initial_size != self.train_max_initial_size: - log.writeln("\nERROR: Precomputation done with max initial size %d, decoder uses %d" % (pre.train_max_initial_size, self.train_max_initial_size)) - raise Exception("Parameter mismatch with precomputed data") - if pre.train_min_gap_size != self.train_min_gap_size: - log.writeln("\nERROR: Precomputation done with min gap size %d, decoder uses %d" % (pre.train_min_gap_size, self.train_min_gap_size)) - raise Exception("Parameter mismatch with precomputed data") - log.writeln("done.", 1) - if self.use_index: - log.write("Converting %d hash keys on precomputed inverted index... " % (len(pre.precomputed_index)), 1) - for pattern, arr in pre.precomputed_index.iteritems(): - phrases = self.pattern2phrase_plus(pattern) - for phrase in phrases: - self.precomputed_index[phrase] = arr - log.writeln("done.", 1) - if self.use_collocations: - log.write("Converting %d hash keys on precomputed collocations... " % (len(pre.precomputed_collocations)), 1) - for pattern, arr in pre.precomputed_collocations.iteritems(): - phrase = self.pattern2phrase(pattern) - self.precomputed_collocations[phrase] = arr - log.writeln("done.", 1) - stop_time = monitor.cpu() - log.writeln("Processing precomputations took %f seconds" % (stop_time - start_time), 1) - - - def getPrecomputedCollocation(self, phrase): - if phrase in self.precomputed_collocations: - arr = self.precomputed_collocations[phrase] - return PhraseLocation(arr=arr, arr_low=0, arr_high=len(arr), num_subpatterns=phrase.arity()+1) - return None - - - cdef int* baezaYatesHelper(self, int low1, int high1, int* arr1, int step1, - int low2, int high2, int* arr2, int step2, - int offset_by_one, int len_last, int num_subpatterns, int* result_len): - cdef int i1, i2, j1, j2, med1, med2, med1_plus, med1_minus, med2_minus, med2_plus - cdef int d_first, qsetsize, dsetsize, tmp, search_low, search_high - cdef int med_result_len, low_result_len, high_result_len - cdef long comparison - cdef int* result - cdef int* low_result - cdef int* med_result - cdef int* high_result - cdef Matching loc1, loc2 - - result = <int*> malloc(0*sizeof(int*)) -# log.writeln("%sBY: [%d, %d, %d] [%d, %d, %d]" % (pad, low1, high1, step1, low2, high2, step2,), 5) - - d_first = 0 - if high1 - low1 > high2 - low2: -# log.writeln("%sD first" % (pad), 5) - d_first = 1 -# else: -# log.writeln("%sQ first" % (pad), 5) - -# '''First, check to see if we are at any of the -# recursive base cases''' -# -# '''Case 1: one of the sets is empty''' - if low1 >= high1 or low2 >= high2: -# log.writeln("%sRETURN: set is empty" % (pad), 5) - return result - -# '''Case 2: sets are non-overlapping''' - assign_matching(&loc1, arr1, high1-step1, step1, self.fda.sent_id.arr) - assign_matching(&loc2, arr2, low2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == -1: -# log.writeln("%s %s < %s" % (pad, tuple(arr1[high1-step1:high1]), tuple(arr2[low2:low2+step2])),5) -# log.writeln("%sRETURN: non-overlapping sets" % (pad), 5) - return result - - assign_matching(&loc1, arr1, low1, step1, self.fda.sent_id.arr) - assign_matching(&loc2, arr2, high2-step2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: -# log.writeln("%s %s > %s" % (pad, tuple(arr1[low1:low1+step1]), tuple(arr2[high2-step2:high2])),5) -# log.writeln("%sRETURN: non-overlapping sets" % (pad), 5) - return result - - # Case 3: query set and data set do not meet size mismatch constraints; - # We use mergesort instead in this case - qsetsize = (high1-low1) / step1 - dsetsize = (high2-low2) / step2 - if d_first: - tmp = qsetsize - qsetsize = dsetsize - dsetsize = tmp - - if self.by_slack_factor * qsetsize * cmath.log(dsetsize) / cmath.log(2) > dsetsize: - free(result) - return self.mergeHelper(low1, high1, arr1, step1, low2, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, result_len) - - # binary search. There are two flavors, depending on - # whether the queryset or dataset is first - if d_first: - med2 = median(low2, high2, step2) - assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) - - search_low = low1 - search_high = high1 - while search_low < search_high: - med1 = median(search_low, search_high, step1) - findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) - comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) - if comparison == -1: - search_low = med1_plus - elif comparison == 1: - search_high = med1_minus - else: - break - else: - med1 = median(low1, high1, step1) - findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) - - search_low = low2 - search_high = high2 - while search_low < search_high: - med2 = median(search_low, search_high, step2) - assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) - comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) - if comparison == -1: - search_high = med2 - elif comparison == 1: - search_low = med2 + step2 - else: - break - - med_result_len = 0 - med_result = <int*> malloc(0*sizeof(int*)) - if search_high > search_low: -# '''Then there is a match for the median element of Q''' -# -# '''What we want to find is the group of all bindings in the first set -# s.t. their first element == the first element of med1. Then we -# want to store the bindings for all of those elements. We can -# subsequently throw all of them away.''' - med2_minus = med2 - med2_plus = med2 + step2 - i1 = med1_minus - while i1 < med1_plus: - assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) - while med2_minus-step2 >= low2: - assign_matching(&loc2, arr2, med2_minus-step2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) < 1: - med2_minus = med2_minus - step2 - else: - break - i2 = med2_minus - while i2 < high2: - assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) - comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) - if comparison == 0: - pass - med_result = append_combined_matching(med_result, &loc1, &loc2, offset_by_one, num_subpatterns, &med_result_len) - if comparison == -1: - break - i2 = i2 + step2 - if i2 > med2_plus: - med2_plus = i2 - i1 = i1 + step1 - - tmp = med1_minus - med1_minus = med1_plus - med1_plus = tmp - else: - # No match; need to figure out the point of division in D and Q - med2_minus = med2 - med2_plus = med2 - if d_first: - med2_minus = med2_minus + step2 - if comparison == -1: - med1_minus = med1_plus - if comparison == 1: - med1_plus = med1_minus - else: - tmp = med1_minus - med1_minus = med1_plus - med1_plus = tmp - if comparison == 1: - med2_minus = med2_minus + step2 - med2_plus = med2_plus + step2 - - low_result_len = 0 - low_result = self.baezaYatesHelper(low1, med1_plus, arr1, step1, low2, med2_plus, arr2, step2, offset_by_one, len_last, num_subpatterns, &low_result_len) - high_result_len = 0 - high_result = self.baezaYatesHelper(med1_minus, high1, arr1, step1, med2_minus, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, &high_result_len) - - result = extend_arr(result, result_len, low_result, low_result_len) - result = extend_arr(result, result_len, med_result, med_result_len) - result = extend_arr(result, result_len, high_result, high_result_len) - free(low_result) - free(med_result) - free(high_result) - - return result - - - - cdef long compareMatchingsSet(self, int i1_minus, int i1_plus, int* arr1, int step1, - Matching* loc2, int offset_by_one, int len_last): -# '''Compares a *set* of bindings, all with the same first element, -# to a single binding. Returns -1 if all comparisons == -1, 1 if all -# comparisons == 1, and 0 otherwise.''' - cdef int i1, comparison, prev_comparison - cdef Matching l1_stack - cdef Matching* loc1 - - loc1 = &l1_stack - - i1 = i1_minus - while i1 < i1_plus: - assign_matching(loc1, arr1, i1, step1, self.fda.sent_id.arr) - comparison = self.compare_matchings(loc1, loc2, offset_by_one, len_last) - if comparison == 0: - prev_comparison = 0 - break - elif i1 == i1_minus: - prev_comparison = comparison - else: - if comparison != prev_comparison: - prev_comparison = 0 - break - i1 = i1 + step1 - return prev_comparison - - - cdef long compare_matchings(self, Matching* loc1, Matching* loc2, int offset_by_one, int len_last): - cdef int i - - if loc1.sent_id > loc2.sent_id: - return 1 - if loc2.sent_id > loc1.sent_id: - return -1 - - if loc1.size == 1 and loc2.size == 1: - if loc2.arr[loc2.start] - loc1.arr[loc1.start] <= self.train_min_gap_size: - return 1 - - elif offset_by_one: - for i from 1 <= i < loc1.size: - if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i-1]: - return 1 - if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i-1]: - return -1 - - else: - if loc1.arr[loc1.start]+1 > loc2.arr[loc2.start]: - return 1 - if loc1.arr[loc1.start]+1 < loc2.arr[loc2.start]: - return -1 - - for i from 1 <= i < loc1.size: - if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i]: - return 1 - if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i]: - return -1 - - if loc2.arr[loc2.end-1] + len_last - loc1.arr[loc1.start] > self.train_max_initial_size: - return -1 - return 0 - - - cdef int* mergeHelper(self, int low1, int high1, int* arr1, int step1, - int low2, int high2, int* arr2, int step2, - int offset_by_one, int len_last, int num_subpatterns, int* result_len): - cdef int i1, i2, j1, j2 - cdef long comparison - cdef int* result - cdef Matching loc1, loc2 -# cdef int i - -# pad = " " -# log.writeln("->mergeHelper", 5) - - result_len[0] = 0 - result = <int*> malloc(0*sizeof(int)) - - i1 = low1 - i2 = low2 -# if log.level==5: -# log.writeln("%sMERGE lists [%d,%d,%d] and [%d,%d,%d]" % (pad,low1,high1,step1,low2,high2,step2), 5) -# log.writeln("%soffset_by_one: %d, len_last: %d" % (pad, offset_by_one, len_last), 5) -# log.write("[") -# for i from low1 <= i < high1: -# log.write("%d, " % arr1.arr[i],5) -# log.writeln("]") -# log.write("[") -# for i from low2 <= i < high2: -# log.write("%d, " % arr2.arr[i],5) -# log.writeln("]") - while i1 < high1 and i2 < high2: - -# '''First, pop all unneeded loc2's off the stack''' - assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) -# if log.level==5: -# log.writeln("%s TOP1 %s" % (pad,matching2str(loc1)),5) - while i2 < high2: - assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: -# if log.level==5: -# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) -# log.writeln("%s POP2 %s" % (pad,matching2str(loc2)),5) - i2 = i2 + step2 - else: - break - -# '''Next: process all loc1's with the same starting val''' - j1 = i1 - while i1 < high1 and arr1[j1] == arr1[i1]: - assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) - j2 = i2 - while j2 < high2: - assign_matching(&loc2, arr2, j2, step2, self.fda.sent_id.arr) - comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) - if comparison == 0: -# if log.level==5: -# log.writeln("%s %s == %s" % (pad,matching2str(loc1),matching2str(loc2)),5) - result = append_combined_matching(result, &loc1, &loc2, offset_by_one, num_subpatterns, result_len) - if comparison == 1: -# if log.level==5: -# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) - pass - if comparison == -1: -# if log.level==5: -# log.writeln("%s %s < %s" % (pad,matching2str(loc1),matching2str(loc2)),5) - break - else: - j2 = j2 + step2 -# if log.level==5: -# log.writeln("%s POP1 %s" % (pad,matching2str(loc1)),5) - i1 = i1 + step1 - -# log.writeln("<-mergeHelper", 5) - return result - - - cdef void sortPhraseLoc(self, cintlist.CIntList arr, PhraseLocation loc, rule.Phrase phrase): - cdef int i, j - cdef cveb.VEB veb - cdef cintlist.CIntList result - - if phrase in self.precomputed_index: - loc.arr = self.precomputed_index[phrase] - else: - loc.arr = cintlist.CIntList(initial_len=loc.sa_high-loc.sa_low) - veb = cveb.VEB(arr.len) - for i from loc.sa_low <= i < loc.sa_high: - veb._insert(arr.arr[i]) - i = veb.veb.min_val - for j from 0 <= j < loc.sa_high-loc.sa_low: - loc.arr.arr[j] = i - i = veb._findsucc(i) - loc.arr_low = 0 - loc.arr_high = loc.arr.len - - - cdef intersectHelper(self, rule.Phrase prefix, rule.Phrase suffix, - PhraseLocation prefix_loc, PhraseLocation suffix_loc, int algorithm): - - cdef cintlist.CIntList arr1, arr2, result - cdef int low1, high1, step1, low2, high2, step2, offset_by_one, len_last, num_subpatterns, result_len - cdef int* result_ptr - cdef csuf.SuffixArray suf - - result_len = 0 - - if sym.isvar(suffix[0]): - offset_by_one = 1 - else: - offset_by_one = 0 - - len_last = len(suffix.getchunk(suffix.arity())) - - if prefix_loc.arr is None: - suf = self.context_manager.fsarray - self.sortPhraseLoc(suf.sa, prefix_loc, prefix) - arr1 = prefix_loc.arr - low1 = prefix_loc.arr_low - high1 = prefix_loc.arr_high - step1 = prefix_loc.num_subpatterns - - if suffix_loc.arr is None: - suf = self.context_manager.fsarray - self.sortPhraseLoc(suf.sa, suffix_loc, suffix) - arr2 = suffix_loc.arr - low2 = suffix_loc.arr_low - high2 = suffix_loc.arr_high - step2 = suffix_loc.num_subpatterns - - num_subpatterns = prefix.arity()+1 - - if algorithm == MERGE: - result_ptr = self.mergeHelper(low1, high1, arr1.arr, step1, - low2, high2, arr2.arr, step2, - offset_by_one, len_last, num_subpatterns, &result_len) - else: - result_ptr = self.baezaYatesHelper(low1, high1, arr1.arr, step1, - low2, high2, arr2.arr, step2, - offset_by_one, len_last, num_subpatterns, &result_len) - - if result_len == 0: - free(result_ptr) - return None - else: - result = cintlist.CIntList() - free(result.arr) - result.arr = result_ptr - result.len = result_len - result.size = result_len - return PhraseLocation(arr_low=0, arr_high=result_len, arr=result, num_subpatterns=num_subpatterns) - - cdef loc2str(self, PhraseLocation loc): - cdef int i, j - result = "{" - i = 0 - while i < loc.arr_high: - result = result + "(" - for j from i <= j < i + loc.num_subpatterns: - result = result + ("%d " %loc.arr[j]) - result = result + ")" - i = i + loc.num_subpatterns - result = result + "}" - return result - -# cdef compareResults(self, PhraseLocation loc1, PhraseLocation loc2, phrase, type1, type2): -# cdef i -# if loc1 is None and type1=="pre": -# return -# if loc1 is None: -# if loc2 is None or loc2.arr_high == 0: -# return -# if loc2 is None: -# if loc1.arr_high == 0: -# return -# if loc1.arr_high != loc2.arr_high: -# log.writeln("ERROR: %d vs %d (%s vs %s)" % (loc1.arr_high, loc2.arr_high, type1, type2)) -# #log.writeln(" %s" % self.loc2str(loc2)) -# if loc1.arr_high == 0: -# return -# elif loc1.num_subpatterns != loc2.num_subpatterns: -# log.writeln("ERROR 2: %d vs %d (%d v %d) %s" % (loc1.num_subpatterns, loc2.num_subpatterns, loc1.arr_high, loc2.arr_high, phrase)) -# for i from 0 <= i < loc1.arr_high: -# if loc1.arr[i] != loc2.arr[i]: -# log.writeln("ERROR 3") -# - cdef PhraseLocation intersect(self, prefix_node, suffix_node, rule.Phrase phrase): - cdef rule.Phrase prefix, suffix - cdef PhraseLocation prefix_loc, suffix_loc, result - - start_time = monitor.cpu() - prefix = prefix_node.phrase - suffix = suffix_node.phrase - prefix_loc = prefix_node.phrase_location - suffix_loc = suffix_node.phrase_location - - result = self.getPrecomputedCollocation(phrase) - if result is not None: - intersect_method = "precomputed" - - if result is None: - if self.useBaezaYates: - result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, BAEZA_YATES) - intersect_method="double binary" - else: - result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, MERGE) - intersect_method="merge" - stop_time = monitor.cpu() - intersect_time = stop_time - start_time - if self.log_int_stats: - if intersect_method == "precomputed": - sort1 = "none" - sort2 = "none" - else: - if prefix in self.precomputed_index: - sort1 = "index" - else: - sort1 = "veb" - if suffix in self.precomputed_index: - sort2 = "index" - else: - sort2 = "veb" - result_len=0 - if result is not None: - result_len = len(result.arr)/result.num_subpatterns - rank = 0 -# if phrase in self.max_rank: -# rank = self.max_rank[phrase] -# else: -# rank = self.precompute_rank + 10 - log.writeln("INT %d %d %d %d %d %f %d %s %s %s" % - (len(prefix)+1 - prefix.arity(), prefix.arity(), - nGramCount(prefix_node.phrase_location), - nGramCount(suffix_node.phrase_location), - result_len, intersect_time, rank, intersect_method, sort1, sort2)) - return result - - def advance(self, frontier, res, fwords): - nf = [] - for (toskip, (i, alt, pathlen)) in frontier: - spanlen = fwords[i][alt][2] - if (toskip == 0): - #log.writeln("RES: (%d %d %d)" % (i, alt, pathlen), 3) - res.append((i, alt, pathlen)) - ni = i + spanlen - #log.writeln("proc: %d (%d %d %d) sl=%d ni=%d len(fwords)=%d" % (toskip, i, alt, pathlen, spanlen, ni, len(fwords)), 3) - if (ni < len(fwords) and (pathlen + 1) < self.max_initial_size): - for na in xrange(len(fwords[ni])): - nf.append((toskip - 1, (ni, na, pathlen + 1))) - if (len(nf) > 0): - return self.advance(nf, res, fwords) - else: - return res - - def get_all_nodes_isteps_away(self, skip, i, spanlen, pathlen, fwords, next_states, reachable_buffer): - frontier = [] - if (i+spanlen+skip >= len(next_states)): - return frontier - #print "get_all_nodes_isteps_away from %i" % (i) - key = tuple([i,spanlen]) - reachable = [] - if (key in reachable_buffer): - reachable = reachable_buffer[key] - else: - reachable = self.reachable(fwords, i, spanlen) - reachable_buffer[key] = reachable - #print "reachable(from=%i,dist=%i) = " % (i,spanlen) - #print reachable - for nextreachable in reachable: - for next_id in next_states[nextreachable]: - jump = self.shortest(fwords,i,next_id) - #print "checking next_id = %i, pathlen[sofar] = %i, jump = %i" % (next_id,pathlen,jump) - #if (next_id - (i+spanlen)) < skip: - if jump < skip: - continue - #if next_id-(i-pathlen) < self.max_initial_size: - if pathlen+jump <= self.max_initial_size: - for alt_id in xrange(len(fwords[next_id])): - if (fwords[next_id][alt_id][0] != cn.epsilon): - #frontier.append((next_id,alt_id,next_id-(i-pathlen))); - #print "finding the shortest from %i to %i" % (i, next_id) - newel = (next_id,alt_id,pathlen+jump) - if newel not in frontier: - frontier.append((next_id,alt_id,pathlen+jump)) - #print "appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) - #else: - #print "NOT appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) - #else: - #print "next_id = %s is aborted\n" % next_id - #print "returning frontier" - #print frontier - return frontier - - def reachable(self, fwords, ifrom, dist): - #print "inside reachable(%i,%i)" % (ifrom,dist) - ret = [] - if (ifrom >= len(fwords)): - return ret - for alt_id in xrange(len(fwords[ifrom])): - if (fwords[ifrom][alt_id][0] == cn.epsilon): - ret.extend(self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist)) - else: - if (dist==0): - if (ifrom not in ret): - ret.append(ifrom) - else: - for ifromchild in self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist-1): - if (ifromchild not in ret): - ret.append(ifromchild) - - return ret - - def shortest(self, fwords, ifrom, ito): - min = 1000 - #print "shortest ifrom=%i, ito=%i" % (ifrom,ito) - if (ifrom > ito): - return min - if (ifrom == ito): - return 0 - for alt_id in xrange(len(fwords[ifrom])): - currmin = self.shortest(fwords,ifrom+fwords[ifrom][alt_id][2],ito) - if (fwords[ifrom][alt_id][0] != cn.epsilon): - currmin += 1 - if (currmin<min): - min = currmin - return min - - def get_next_states(self, _columns, curr_idx, min_dist=2): - result = [] - candidate = [[curr_idx,0]] - - while len(candidate) > 0: - curr = candidate.pop() - if curr[0] >= len(_columns): - continue - if curr[0] not in result and min_dist <= curr[1] <= self.max_initial_size: - result.append(curr[0]); - curr_col = _columns[curr[0]] - for alt in curr_col: - next_id = curr[0]+alt[2] - jump = 1 - if (alt[0] == cn.epsilon): - jump = 0 - if next_id not in result and min_dist <= curr[1]+jump <= self.max_initial_size+1: - candidate.append([next_id,curr[1]+jump]) - return sorted(result); - - def input(self, fwords, meta=None, output=None): - '''When this function is called on the RuleFactory, - it looks up all of the rules that can be used to translate - the input sentence''' - cdef int i, j, k, flen, arity, num_subpatterns, num_samples - cdef float start_time - cdef PhraseLocation phrase_location - cdef cintlist.CIntList sample, chunklen - cdef Matching matching - cdef rule.Phrase hiero_phrase - - #fwords = [ ((1,0.0,1),), fwords1 ] #word id for <s> = 1, cost = 0.0, next = 1 - #print fwords - flen = len(fwords) - #print "length = %i" % flen - start_time = monitor.cpu() - self.intersect_time = 0.0 - self.extract_time = 0.0 - nodes_isteps_away_buffer = {} - hit = 0 - reachable_buffer = {} - if meta: - dattrs = sgml.attrs_to_dict(meta) - id = dattrs.get('id', 'NOID') - self.excluded_sent_id = int(dattrs.get('exclude', '-1')) - if output: - self.rule_filehandler = output - elif self.per_sentence_grammar: - self.rule_filehandler = open(self.rule_file+'.'+id, 'w') - - #print "max_initial_size = %i" % self.max_initial_size - - if not self.cacheBetweenSents: - self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) - self.grammar.root = [None, {}] - - frontier = [] - for i in xrange(len(fwords)): - for alt in xrange(0, len(fwords[i])): - if fwords[i][alt][0] != cn.epsilon: - frontier.append((i, i, alt, 0, self.rules.root, (), False)) - - xroot = None - x1 = sym.setindex(self.category, 1) - if x1 in self.rules.root.children: - xroot = self.rules.root.children[x1] - else: - xroot = ExtendedTrieNode(suffix_link=self.rules.root, phrase_location=PhraseLocation()) - self.rules.root.children[x1] = xroot - - for i in xrange(self.min_gap_size, len(fwords)): - for alt in xrange(0, len(fwords[i])): - if fwords[i][alt][0] != cn.epsilon: - frontier.append((i-self.min_gap_size, i, alt, self.min_gap_size, xroot, (x1,), True)) - '''for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: - if len(prefix)>0: - print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path - else: - print k, i, alt, pathlen, node, prefix, is_shadow_path''' - - #for wid in xrange(1000): - # print "%i = %s" % (wid, sym.tostring(wid)) - next_states = [] - for i in xrange(len(fwords)): - next_states.append(self.get_next_states(fwords,i,self.min_gap_size)) - #print "next state of %i" % i - #print next_states[i] - - while len(frontier) > 0: - #print "frontier = %i" % len(frontier) - this_iter_intersect_time = self.intersect_time - new_frontier = [] - for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: - #print "looking at: " - #if len(prefix)>0: - # print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path - #else: - # print k, i, alt, pathlen, node, prefix, is_shadow_path - word_id = fwords[i][alt][0] - spanlen = fwords[i][alt][2] - #print "word_id = %i, %s" % (word_id, sym.tostring(word_id)) - # to prevent .. [X] </S> - #print "prefix = ",prefix - #if word_id == 2 and len(prefix)>=2: - #print "at the end: %s" % (prefix[len(prefix)-1]) - #if prefix[len(prefix)-1]<0: - #print "break" - #continue - #print "continuing" - #if pathlen + spanlen > self.max_initial_size: - #continue - # TODO get rid of k -- pathlen is replacing it - if word_id == cn.epsilon: - #print "skipping because word_id is epsilon" - if i+spanlen >= len(fwords): - continue - for nualt in xrange(0,len(fwords[i+spanlen])): - frontier.append((k, i+spanlen, nualt, pathlen, node, prefix, is_shadow_path)) - continue - - phrase = prefix + (word_id,) - str_phrase = map(sym.tostring, phrase) - hiero_phrase = rule.Phrase(phrase) - arity = hiero_phrase.arity() - - #print "pos %2d, node %5d, '%s'" % (i, node.id, hiero_phrase) - if self.search_file is not None: - self.search_file.write("%s\n" % hiero_phrase) - - lookup_required = False - if word_id in node.children: - if node.children[word_id] is None: - #print "Path dead-ends at this node\n" - continue - else: - #print "Path continues at this node\n" - node = node.children[word_id] - else: - if node.suffix_link is None: - #print "Current node is root; lookup required\n" - lookup_required = True - else: - if word_id in node.suffix_link.children: - if node.suffix_link.children[word_id] is None: - #print "Suffix link reports path is dead end\n" - node.children[word_id] = None - continue - else: - #print "Suffix link indicates lookup is reqired\n" - lookup_required = True - else: - #print "ERROR: We never get here\n" - raise Exception("Keyword trie error") - #new_frontier.append((k, i, alt, pathlen, node, prefix, is_shadow_path)) - #print "checking whether lookup_required\n" - if lookup_required: - new_node = None - if is_shadow_path: - #print "Extending shadow path for %s \n" - # on the shadow path we don't do any search, we just use info from suffix link - new_node = ExtendedTrieNode(phrase_location=node.suffix_link.children[word_id].phrase_location, - suffix_link=node.suffix_link.children[word_id], - phrase=hiero_phrase) - else: - if arity > 0: - #print "Intersecting for %s because of arity > 0\n" % hiero_phrase - phrase_location = self.intersect(node, node.suffix_link.children[word_id], hiero_phrase) - else: - #print "Suffix array search for %s" % hiero_phrase - phrase_location = node.phrase_location - sa_range = self.context_manager.fsarray.lookup(str_phrase[-1], len(str_phrase)-1, phrase_location.sa_low, phrase_location.sa_high) - if sa_range is not None: - phrase_location = PhraseLocation(sa_low=sa_range[0], sa_high=sa_range[1]) - else: - phrase_location = None - - if phrase_location is None: - node.children[word_id] = None - #print "Search failed\n" - continue - #print "Search succeeded\n" - suffix_link = self.rules.root - if node.suffix_link is not None: - suffix_link = node.suffix_link.children[word_id] - new_node = ExtendedTrieNode(phrase_location=phrase_location, - suffix_link=suffix_link, - phrase=hiero_phrase) - node.children[word_id] = new_node - node = new_node - #print "Added node %d with suffix link %d\n" % (node.id, node.suffix_link.id) - - '''Automatically add a trailing X node, if allowed -- - This should happen before we get to extraction (so that - the node will exist if needed)''' - if arity < self.max_nonterminals: - xcat_index = arity+1 - xcat = sym.setindex(self.category, xcat_index) - suffix_link_xcat_index = xcat_index - if is_shadow_path: - suffix_link_xcat_index = xcat_index-1 - suffix_link_xcat = sym.setindex(self.category, suffix_link_xcat_index) - node.children[xcat] = ExtendedTrieNode(phrase_location=node.phrase_location, - suffix_link=node.suffix_link.children[suffix_link_xcat], - phrase= rule.Phrase(phrase + (xcat,))) - #log.writeln("Added node %d with suffix link %d (for X)" % (node.children[xcat].id, node.children[xcat].suffix_link.id), 4) - - # sample from range - if not is_shadow_path: - #print "is_not_shadow_path" - sample = self.context_manager.sampler.sample(node.phrase_location) - #print "node.phrase_location %s" % str(node.phrase_location) - #print "sample.len = %i" % len(sample) - num_subpatterns = (<PhraseLocation> node.phrase_location).num_subpatterns - chunklen = cintlist.CIntList(initial_len=num_subpatterns) - for j from 0 <= j < num_subpatterns: - chunklen.arr[j] = hiero_phrase.chunklen(j) - extracts = [] - j = 0 - extract_start = monitor.cpu() - '''orig_tight_phrases = self.tight_phrases - orig_require_aligned_terminal = self.require_aligned_terminal - orig_require_aligned_chunks = self.require_aligned_chunks - if k==0 or i==len(fwords)-1: - self.tight_phrases = 0 - self.require_aligned_terminal = 0 - self.require_aligned_chunks = 0''' - while j < sample.len: - extract = [] - - assign_matching(&matching, sample.arr, j, num_subpatterns, self.fda.sent_id.arr) - '''print "tight_phrase = " - print self.tight_phrases - print "require_aligned_terminal = " - print self.require_aligned_terminal - print "require_aligned_chunks = " - print self.require_aligned_chunks''' - - extract = self.extract(hiero_phrase, &matching, chunklen.arr, num_subpatterns) - extracts.extend(extract) - j = j + num_subpatterns - '''self.tight_phrases = orig_tight_phrases - sttice+sa.nw.normelf.require_aligned_terminal = orig_require_aligned_terminal - self.require_aligned_chunks = orig_require_aligned_chunks''' - num_samples = sample.len/num_subpatterns - extract_stop = monitor.cpu() - self.extract_time = self.extract_time + extract_stop - extract_start - #print "extract.size = %i" % len(extracts) - if len(extracts) > 0: - fphrases = {} - fals = {} - fcount = {} - for f, e, count, als in extracts: - fcount.setdefault(f, 0.0) - fcount[f] = fcount[f] + count - fphrases.setdefault(f, {}) - fphrases[f].setdefault(e, {}) - #fphrases[f][e] = fphrases[f][e] + count - fphrases[f][e].setdefault(als,0.0) - fphrases[f][e][als] = fphrases[f][e][als] + count - #print "f,e,als ",f," : ",e," : ",als," count = ",fphrases[f][e][als] - #fals[str(f)+" ||| "+str(e)] = als - for f, elist in fphrases.iteritems(): - #print "f = '%s'" % f - #if (str(f) in ['<s>','</s>','<s> [X,1]','[X,1] </s>']): - # print "rejected" - # continue - f_margin = fcount[f] - for e, alslist in elist.iteritems(): - alignment = None - count = 0 - for als, currcount in alslist.iteritems(): - #print "als = ",als,", count = ",currcount - if currcount > count: - alignment = als - count = currcount - #alignment = fals[str(f)+" ||| "+str(e)] - #print "selected = ",alignment," with count = ",count - scores = [] - for m in self.context_manager.models: - scores.append(m.compute_contextless_score(f, e, count, fcount[f], num_samples)) - r = rule.Rule(self.category, f, e, scores=scores, owner="context", word_alignments = alignment) - self.grammar.add(r) - if self.rule_filehandler is not None: - self.rule_filehandler.write("%s\n" % r.to_line()) - #print "adding a rule = %s" % r - - #if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+spanlen < self.max_initial_size: - if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+1 <= self.max_initial_size: - #to prevent [X] </s> - #print "lexicalized" - for alt_id in xrange(len(fwords[i+spanlen])): - #if (fwords[i+spanlen][alt_id][2]+pathlen+spanlen <= self.max_initial_size): - #new_frontier.append((k, i+spanlen, alt_id, pathlen + spanlen, node, phrase, is_shadow_path)) - #print "alt_id = %d\n" % alt_id - new_frontier.append((k, i+spanlen, alt_id, pathlen + 1, node, phrase, is_shadow_path)) - #print (k, i+spanlen, alt_id, pathlen + spanlen, node, map(sym.tostring,phrase), is_shadow_path) - #print "end lexicalized" - num_subpatterns = arity - if not is_shadow_path: - num_subpatterns = num_subpatterns + 1 - #to avoid <s> X ... we want <s> next to a lexicalized item - #if k>0 and i<len(fwords)-1 and len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks: - if len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks: - #print "masuk kondisi" - xcat = sym.setindex(self.category, arity+1) - xnode = node.children[xcat] - #frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, spanlen, pathlen, fwords, next_states) - # I put spanlen=1 below - key = tuple([self.min_gap_size, i, 1, pathlen]) - frontier_nodes = [] - if (key in nodes_isteps_away_buffer): - frontier_nodes = nodes_isteps_away_buffer[key] - else: - frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, 1, pathlen, fwords, next_states, reachable_buffer) - nodes_isteps_away_buffer[key] = frontier_nodes - - #print "new frontier:\n" - for (i, alt, pathlen) in frontier_nodes: - #if (pathlen+fwords[i][alt][2] <= self.max_initial_size): - new_frontier.append((k, i, alt, pathlen, xnode, phrase +(xcat,), is_shadow_path)) - #print k, i, alt, pathlen, node, map(sym.tostring,phrase +(xcat,)), is_shadow_path - #print "all end\n"; - #else: - #print "no new frontier1\n"; - #else : - #print "no new frontier2\n" - if self.log_int_stats: - log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time)) - frontier = new_frontier - - stop_time = monitor.cpu() - log.writeln("Total time for rule lookup, extraction, and scoring = %f seconds" % (stop_time - start_time)) - #log.writeln(" Intersect time = %f seconds" % self.intersect_time) - gc.collect() - log.writeln(" Extract time = %f seconds" % self.extract_time) - if self.pruned_rule_file: - self.grammar.dump(self.pruned_rule_file) - if self.per_sentence_grammar: - self.rule_filehandler.close() - -# else: -# self.rule_filehandler.write("###EOS_"+ id +"\n") - - - cdef int find_fixpoint(self, - int f_low, f_high, - int* f_links_low, int* f_links_high, - int* e_links_low, int* e_links_high, - int e_in_low, int e_in_high, - int* e_low, int* e_high, - int* f_back_low, int* f_back_high, - int f_sent_len, int e_sent_len, - int max_f_len, int max_e_len, - int min_fx_size, int min_ex_size, - int max_new_x, - int allow_low_x, int allow_high_x, - int allow_arbitrary_x, int write_log): - cdef int e_low_prev, e_high_prev, f_low_prev, f_high_prev, new_x, new_low_x, new_high_x - - e_low[0] = e_in_low - e_high[0] = e_in_high - self.find_projection(f_low, f_high, f_links_low, f_links_high, e_low, e_high) - if e_low[0] == -1: - # low-priority corner case: if phrase w is unaligned, - # but we don't require aligned terminals, then returning - # an error here might prevent extraction of allowed - # rule X -> X_1 w X_2 / X_1 X_2. This is probably - # not worth the bother, though. - #print "find_fixpoint0" - return 0 - elif e_in_low != -1 and e_low[0] != e_in_low: - if e_in_low - e_low[0] < min_ex_size: - e_low[0] = e_in_low - min_ex_size - if e_low[0] < 0: - #print "find_fixpoint1" - return 0 - - if e_high[0] - e_low[0] > max_e_len: - #print "find_fixpoint2" - return 0 - elif e_in_high != -1 and e_high[0] != e_in_high: - if e_high[0] - e_in_high < min_ex_size: - e_high[0] = e_in_high + min_ex_size - if e_high[0] > e_sent_len: - #print "find_fixpoint3" - return 0 - - f_back_low[0] = -1 - f_back_high[0] = -1 - f_low_prev = f_low - f_high_prev = f_high - new_x = 0 - new_low_x = 0 - new_high_x = 0 - - while True: - - if f_back_low[0] == -1: - self.find_projection(e_low[0], e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) - else: - self.find_projection(e_low[0], e_low_prev, e_links_low, e_links_high, f_back_low, f_back_high) - self.find_projection(e_high_prev, e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) - - if f_back_low[0] > f_low: - f_back_low[0] = f_low - - if f_back_high[0] < f_high: - f_back_high[0] = f_high - - if f_back_low[0] == f_low_prev and f_back_high[0] == f_high_prev: - return 1 - - if allow_low_x == 0 and f_back_low[0] < f_low: -# log.writeln(" FAIL: f phrase is not tight") - #print " FAIL: f phrase is not tight" - return 0 - - if f_back_high[0] - f_back_low[0] > max_f_len: -# log.writeln(" FAIL: f back projection is too wide") - #print " FAIL: f back projection is too wide" - return 0 - - if allow_high_x == 0 and f_back_high[0] > f_high: -# log.writeln(" FAIL: extension on high side not allowed") - #print " FAIL: extension on high side not allowed" - return 0 - - if f_low != f_back_low[0]: - if new_low_x == 0: - if new_x >= max_new_x: -# log.writeln(" FAIL: extension required on low side violates max # of gaps") - #print " FAIL: extension required on low side violates max # of gaps" - return 0 - else: - new_x = new_x + 1 - new_low_x = 1 - if f_low - f_back_low[0] < min_fx_size: - f_back_low[0] = f_low - min_fx_size - if f_back_high[0] - f_back_low[0] > max_f_len: -# log.writeln(" FAIL: extension required on low side violates max initial length") - #print " FAIL: extension required on low side violates max initial length" - return 0 - if f_back_low[0] < 0: -# log.writeln(" FAIL: extension required on low side violates sentence boundary") - #print " FAIL: extension required on low side violates sentence boundary" - return 0 - - if f_high != f_back_high[0]: - if new_high_x == 0: - if new_x >= max_new_x: -# log.writeln(" FAIL: extension required on high side violates max # of gaps") - #print " FAIL: extension required on high side violates max # of gaps" - return 0 - else: - new_x = new_x + 1 - new_high_x = 1 - if f_back_high[0] - f_high < min_fx_size: - f_back_high[0] = f_high + min_fx_size - if f_back_high[0] - f_back_low[0] > max_f_len: -# log.writeln(" FAIL: extension required on high side violates max initial length") - #print " FAIL: extension required on high side violates max initial length" - return 0 - if f_back_high[0] > f_sent_len: -# log.writeln(" FAIL: extension required on high side violates sentence boundary") - #print " FAIL: extension required on high side violates sentence boundary" - return 0 - - e_low_prev = e_low[0] - e_high_prev = e_high[0] - - self.find_projection(f_back_low[0], f_low_prev, f_links_low, f_links_high, e_low, e_high) - self.find_projection(f_high_prev, f_back_high[0], f_links_low, f_links_high, e_low, e_high) - if e_low[0] == e_low_prev and e_high[0] == e_high_prev: - return 1 - if allow_arbitrary_x == 0: -# log.writeln(" FAIL: arbitrary expansion not permitted") - #print " FAIL: arbitrary expansion not permitted" - return 0 - if e_high[0] - e_low[0] > max_e_len: -# log.writeln(" FAIL: re-projection violates sentence max phrase length") - #print " FAIL: re-projection violates sentence max phrase length" - return 0 - f_low_prev = f_back_low[0] - f_high_prev = f_back_high[0] - - - cdef find_projection(self, int in_low, int in_high, int* in_links_low, int* in_links_high, - int* out_low, int* out_high): - cdef int i - for i from in_low <= i < in_high: - if in_links_low[i] != -1: - if out_low[0] == -1 or in_links_low[i] < out_low[0]: - out_low[0] = in_links_low[i] - if out_high[0] == -1 or in_links_high[i] > out_high[0]: - out_high[0] = in_links_high[i] - - - cdef int* int_arr_extend(self, int* arr, int* arr_len, int* data, int data_len): - cdef int new_len - new_len = arr_len[0] + data_len - arr = <int*> realloc(arr, new_len*sizeof(int)) - memcpy(arr+arr_len[0], data, data_len*sizeof(int)) - arr_len[0] = new_len - return arr - - - cdef extract_phrases(self, int e_low, int e_high, int* e_gap_low, int* e_gap_high, int* e_links_low, int num_gaps, - int f_low, int f_high, int* f_gap_low, int* f_gap_high, int* f_links_low, - int sent_id, int e_sent_len, int e_sent_start): - cdef int i, j, k, m, n, *e_gap_order, e_x_low, e_x_high, e_x_gap_low, e_x_gap_high - cdef int *e_gaps1, *e_gaps2, len1, len2, step, num_chunks - cdef cintlist.CIntList ephr_arr - cdef result - - #print "inside extract_phrases" - #print "f_low=%d, f_high=%d" % (f_low,f_high) - result = [] - len1 = 0 - e_gaps1 = <int*> malloc(0) - ephr_arr = cintlist.CIntList() - - e_gap_order = <int*> malloc(num_gaps*sizeof(int)) - if num_gaps > 0: - e_gap_order[0] = 0 - for i from 1 <= i < num_gaps: - for j from 0 <= j < i: - if e_gap_low[i] < e_gap_low[j]: - for k from j <= k < i: - e_gap_order[k+1] = e_gap_order[k] - e_gap_order[j] = i - break - else: - e_gap_order[i] = i - - e_x_low = e_low - e_x_high = e_high - if self.tight_phrases == 0: - while e_x_low > 0 and e_high - e_x_low < self.train_max_initial_size and e_links_low[e_x_low-1] == -1: - e_x_low = e_x_low - 1 - while e_x_high < e_sent_len and e_x_high - e_low < self.train_max_initial_size and e_links_low[e_x_high] == -1: - e_x_high = e_x_high + 1 - - for i from e_x_low <= i <= e_low: - e_gaps1 = self.int_arr_extend(e_gaps1, &len1, &i, 1) - - for i from 0 <= i < num_gaps: - e_gaps2 = <int*> malloc(0) - len2 = 0 - - j = e_gap_order[i] - e_x_gap_low = e_gap_low[j] - e_x_gap_high = e_gap_high[j] - if self.tight_phrases == 0: - while e_x_gap_low > e_x_low and e_links_low[e_x_gap_low-1] == -1: - e_x_gap_low = e_x_gap_low - 1 - while e_x_gap_high < e_x_high and e_links_low[e_x_gap_high] == -1: - e_x_gap_high = e_x_gap_high + 1 - - k = 0 - step = 1+(i*2) - while k < len1: - for m from e_x_gap_low <= m <= e_gap_low[j]: - if m >= e_gaps1[k+step-1]: - for n from e_gap_high[j] <= n <= e_x_gap_high: - if n-m >= 1: # extractor.py doesn't restrict target-side gap length - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+k, step) - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &m, 1) - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &n, 1) - k = k + step - free(e_gaps1) - e_gaps1 = e_gaps2 - len1 = len2 - - step = 1+(num_gaps*2) - e_gaps2 = <int*> malloc(0) - len2 = 0 - for i from e_high <= i <= e_x_high: - j = 0 - while j < len1: - if i - e_gaps1[j] <= self.train_max_initial_size and i >= e_gaps1[j+step-1]: - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+j, step) - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &i, 1) - j = j + step - free(e_gaps1) - e_gaps1 = e_gaps2 - len1 = len2 - - step = (num_gaps+1)*2 - i = 0 - - while i < len1: - ephr_arr._clear() - num_chunks = 0 - indexes = [] - for j from 0 <= j < num_gaps+1: - if e_gaps1[i+2*j] < e_gaps1[i+(2*j)+1]: - num_chunks = num_chunks + 1 - for k from e_gaps1[i+2*j] <= k < e_gaps1[i+(2*j)+1]: - indexes.append(k) - ephr_arr._append(self.eid2symid[self.eda.data.arr[e_sent_start+k]]) - if j < num_gaps: - indexes.append(sym.setindex(self.category, e_gap_order[j]+1)) - ephr_arr._append(sym.setindex(self.category, e_gap_order[j]+1)) - i = i + step - if ephr_arr.len <= self.max_target_length and num_chunks <= self.max_target_chunks: - result.append((rule.Phrase(ephr_arr),indexes)) - - free(e_gaps1) - free(e_gap_order) - return result - - cdef create_alignments(self, int* sent_links, int num_links, findexes, eindexes): - #print "create_alignments" - #s = "sent_links = " - #i = 0 - #while (i < num_links*2): - # s = s+"%d-%d " % (sent_links[i],sent_links[i+1]) - # i += 2 - #print s - #print findexes - #print eindexes - - ret = cintlist.CIntList() - for i in xrange(len(findexes)): - s = findexes[i] - if (s<0): - continue - idx = 0 - while (idx < num_links*2): - if (sent_links[idx] == s): - j = eindexes.index(sent_links[idx+1]) - ret.append(i*65536+j) - idx += 2 - return ret - - cdef extract(self, rule.Phrase phrase, Matching* matching, int* chunklen, int num_chunks): - cdef int* sent_links, *e_links_low, *e_links_high, *f_links_low, *f_links_high - cdef int *f_gap_low, *f_gap_high, *e_gap_low, *e_gap_high, num_gaps, gap_start - cdef int i, j, k, e_i, f_i, num_links, num_aligned_chunks, met_constraints - cdef int f_low, f_high, e_low, e_high, f_back_low, f_back_high - cdef int e_sent_start, e_sent_end, f_sent_start, f_sent_end, e_sent_len, f_sent_len - cdef int e_word_count, f_x_low, f_x_high, e_x_low, e_x_high, phrase_len - cdef float pair_count - cdef float available_mass - cdef extracts, phrase_list - cdef cintlist.CIntList fphr_arr - cdef rule.Phrase fphr - cdef reason_for_failure - - fphr_arr = cintlist.CIntList() - phrase_len = phrase.n - extracts = [] - sent_links = self.alignment._get_sent_links(matching.sent_id, &num_links) - - e_sent_start = self.eda.sent_index.arr[matching.sent_id] - e_sent_end = self.eda.sent_index.arr[matching.sent_id+1] - e_sent_len = e_sent_end - e_sent_start - 1 - f_sent_start = self.fda.sent_index.arr[matching.sent_id] - f_sent_end = self.fda.sent_index.arr[matching.sent_id+1] - f_sent_len = f_sent_end - f_sent_start - 1 - available_mass = 1.0 - if matching.sent_id == self.excluded_sent_id: - available_mass = 0.0 - - self.findexes1.reset() - sofar = 0 - for i in xrange(num_chunks): - for j in xrange(chunklen[i]): - self.findexes1.append(matching.arr[matching.start+i]+j-f_sent_start); - sofar += 1 - if (i+1<num_chunks): - self.findexes1.append(phrase[sofar]) - sofar += 1 - - - e_links_low = <int*> malloc(e_sent_len*sizeof(int)) - e_links_high = <int*> malloc(e_sent_len*sizeof(int)) - f_links_low = <int*> malloc(f_sent_len*sizeof(int)) - f_links_high = <int*> malloc(f_sent_len*sizeof(int)) - f_gap_low = <int*> malloc((num_chunks+1)*sizeof(int)) - f_gap_high = <int*> malloc((num_chunks+1)*sizeof(int)) - e_gap_low = <int*> malloc((num_chunks+1)*sizeof(int)) - e_gap_high = <int*> malloc((num_chunks+1)*sizeof(int)) - memset(f_gap_low, 0, (num_chunks+1)*sizeof(int)) - memset(f_gap_high, 0, (num_chunks+1)*sizeof(int)) - memset(e_gap_low, 0, (num_chunks+1)*sizeof(int)) - memset(e_gap_high, 0, (num_chunks+1)*sizeof(int)) - - reason_for_failure = "" - - for i from 0 <= i < e_sent_len: - e_links_low[i] = -1 - e_links_high[i] = -1 - for i from 0 <= i < f_sent_len: - f_links_low[i] = -1 - f_links_high[i] = -1 - - # this is really inefficient -- might be good to - # somehow replace with binary search just for the f - # links that we care about (but then how to look up - # when we want to check something on the e side?) - i = 0 - while i < num_links*2: - f_i = sent_links[i] - e_i = sent_links[i+1] - if f_links_low[f_i] == -1 or f_links_low[f_i] > e_i: - f_links_low[f_i] = e_i - if f_links_high[f_i] == -1 or f_links_high[f_i] < e_i + 1: - f_links_high[f_i] = e_i + 1 - if e_links_low[e_i] == -1 or e_links_low[e_i] > f_i: - e_links_low[e_i] = f_i - if e_links_high[e_i] == -1 or e_links_high[e_i] < f_i + 1: - e_links_high[e_i] = f_i + 1 - i = i + 2 - - als = [] - for x in xrange(matching.start,matching.end): - al = (matching.arr[x]-f_sent_start,f_links_low[matching.arr[x]-f_sent_start]) - als.append(al) - # check all source-side alignment constraints - met_constraints = 1 - if self.require_aligned_terminal: - num_aligned_chunks = 0 - for i from 0 <= i < num_chunks: - for j from 0 <= j < chunklen[i]: - if f_links_low[matching.arr[matching.start+i]+j-f_sent_start] > -1: - num_aligned_chunks = num_aligned_chunks + 1 - break - if num_aligned_chunks == 0: - reason_for_failure = "No aligned terminals" - met_constraints = 0 - if self.require_aligned_chunks and num_aligned_chunks < num_chunks: - reason_for_failure = "Unaligned chunk" - met_constraints = 0 - - if met_constraints and self.tight_phrases: - # outside edge constraints are checked later - for i from 0 <= i < num_chunks-1: - if f_links_low[matching.arr[matching.start+i]+chunklen[i]-f_sent_start] == -1: - reason_for_failure = "Gaps are not tight phrases" - met_constraints = 0 - break - if f_links_low[matching.arr[matching.start+i+1]-1-f_sent_start] == -1: - reason_for_failure = "Gaps are not tight phrases" - met_constraints = 0 - break - - f_low = matching.arr[matching.start] - f_sent_start - f_high = matching.arr[matching.start + matching.size - 1] + chunklen[num_chunks-1] - f_sent_start - if met_constraints: - - if self.find_fixpoint(f_low, f_high, f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, &e_low, &e_high, &f_back_low, &f_back_high, f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - self.train_min_gap_size, 0, - self.max_nonterminals - num_chunks + 1, 1, 1, 0, 0): - gap_error = 0 - num_gaps = 0 - - if f_back_low < f_low: - f_gap_low[0] = f_back_low - f_gap_high[0] = f_low - num_gaps = 1 - gap_start = 0 - phrase_len = phrase_len+1 - if phrase_len > self.max_length: - gap_error = 1 - if self.tight_phrases: - if f_links_low[f_back_low] == -1 or f_links_low[f_low-1] == -1: - gap_error = 1 - reason_for_failure = "Inside edges of preceding subphrase are not tight" - else: - gap_start = 1 - if self.tight_phrases and f_links_low[f_low] == -1: - # this is not a hard error. we can't extract this phrase - # but we still might be able to extract a superphrase - met_constraints = 0 - - for i from 0 <= i < matching.size - 1: - f_gap_low[1+i] = matching.arr[matching.start+i] + chunklen[i] - f_sent_start - f_gap_high[1+i] = matching.arr[matching.start+i+1] - f_sent_start - num_gaps = num_gaps + 1 - - if f_high < f_back_high: - f_gap_low[gap_start+num_gaps] = f_high - f_gap_high[gap_start+num_gaps] = f_back_high - num_gaps = num_gaps + 1 - phrase_len = phrase_len+1 - if phrase_len > self.max_length: - gap_error = 1 - if self.tight_phrases: - if f_links_low[f_back_high-1] == -1 or f_links_low[f_high] == -1: - gap_error = 1 - reason_for_failure = "Inside edges of following subphrase are not tight" - else: - if self.tight_phrases and f_links_low[f_high-1] == -1: - met_constraints = 0 - - if gap_error == 0: - e_word_count = e_high - e_low - for i from 0 <= i < num_gaps: # check integrity of subphrases - if self.find_fixpoint(f_gap_low[gap_start+i], f_gap_high[gap_start+i], - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low+gap_start+i, e_gap_high+gap_start+i, - f_gap_low+gap_start+i, f_gap_high+gap_start+i, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0) == 0: - gap_error = 1 - reason_for_failure = "Subphrase [%d, %d] failed integrity check" % (f_gap_low[gap_start+i], f_gap_high[gap_start+i]) - break - - if gap_error == 0: - i = 1 - self.findexes.reset() - if f_back_low < f_low: - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.append(sym.setindex(self.category, i)) - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - if f_back_high > f_high: - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - - fphr = rule.Phrase(fphr_arr) - if met_constraints: - phrase_list = self.extract_phrases(e_low, e_high, e_gap_low + gap_start, e_gap_high + gap_start, e_links_low, num_gaps, - f_back_low, f_back_high, f_gap_low + gap_start, f_gap_high + gap_start, f_links_low, - matching.sent_id, e_sent_len, e_sent_start) - #print "e_low=%d, e_high=%d, gap_start=%d, num_gaps=%d, f_back_low=%d, f_back_high=%d" & (e_low, e_high, gap_start, num_gaps, f_back_low, f_back_high) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - reason_for_failure = "Didn't extract anything from [%d, %d] -> [%d, %d]" % (f_back_low, f_back_high, e_low, e_high) - for (phrase2,eindexes) in phrase_list: - als1 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als1))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high)) - #print "extract_phrases1: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high) - - if (num_gaps < self.max_nonterminals and - phrase_len < self.max_length and - f_back_high - f_back_low + self.train_min_gap_size <= self.train_max_initial_size): - if (f_back_low == f_low and - f_low >= self.train_min_gap_size and - ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_back_high-1] != -1))): - f_x_low = f_low-self.train_min_gap_size - met_constraints = 1 - if self.tight_phrases: - while f_x_low >= 0 and f_links_low[f_x_low] == -1: - f_x_low = f_x_low - 1 - if f_x_low < 0 or f_back_high - f_x_low > self.train_max_initial_size: - met_constraints = 0 - - if (met_constraints and - self.find_fixpoint(f_x_low, f_back_high, - f_links_low, f_links_high, e_links_low, e_links_high, - e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 1, 1, 1, 1, 0, 1, 0) and - ((not self.tight_phrases) or f_links_low[f_x_low] != -1) and - self.find_fixpoint(f_x_low, f_low, # check integrity of new subphrase - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0)): - fphr_arr._clear() - i = 1 - self.findexes.reset() - self.findexes.append(sym.setindex(self.category, i)) - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - if f_back_high > f_high: - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - fphr = rule.Phrase(fphr_arr) - phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+1, - f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, matching.sent_id, - e_sent_len, e_sent_start) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - for phrase2,eindexes in phrase_list: - als2 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als2))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) - #print "extract_phrases2: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) - - if (f_back_high == f_high and - f_sent_len - f_high >= self.train_min_gap_size and - ((not self.tight_phrases) or (f_links_low[f_high] != -1 and f_links_low[f_back_low] != -1))): - f_x_high = f_high+self.train_min_gap_size - met_constraints = 1 - if self.tight_phrases: - while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: - f_x_high = f_x_high + 1 - if f_x_high > f_sent_len or f_x_high - f_back_low > self.train_max_initial_size: - met_constraints = 0 - - if (met_constraints and - self.find_fixpoint(f_back_low, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 1, 1, 1, 0, 1, 1, 0) and - ((not self.tight_phrases) or f_links_low[f_x_high-1] != -1) and - self.find_fixpoint(f_high, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low+gap_start+num_gaps, e_gap_high+gap_start+num_gaps, - f_gap_low+gap_start+num_gaps, f_gap_high+gap_start+num_gaps, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0)): - fphr_arr._clear() - i = 1 - self.findexes.reset() - if f_back_low < f_low: - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.append(sym.setindex(self.category, i)) - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - fphr = rule.Phrase(fphr_arr) - phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low+gap_start, e_gap_high+gap_start, e_links_low, num_gaps+1, - f_x_low, f_x_high, f_gap_low+gap_start, f_gap_high+gap_start, f_links_low, - matching.sent_id, e_sent_len, e_sent_start) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - for phrase2, eindexes in phrase_list: - als3 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als3))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) - #print "extract_phrases3: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) - if (num_gaps < self.max_nonterminals - 1 and - phrase_len+1 < self.max_length and - f_back_high == f_high and - f_back_low == f_low and - f_back_high - f_back_low + (2*self.train_min_gap_size) <= self.train_max_initial_size and - f_low >= self.train_min_gap_size and - f_high <= f_sent_len - self.train_min_gap_size and - ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_high] != -1))): - - met_constraints = 1 - f_x_low = f_low-self.train_min_gap_size - if self.tight_phrases: - while f_x_low >= 0 and f_links_low[f_x_low] == -1: - f_x_low = f_x_low - 1 - if f_x_low < 0: - met_constraints = 0 - - f_x_high = f_high+self.train_min_gap_size - if self.tight_phrases: - while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: - f_x_high = f_x_high + 1 - if f_x_high > f_sent_len or f_x_high - f_x_low > self.train_max_initial_size: - met_constraints = 0 - - if (met_constraints and - self.find_fixpoint(f_x_low, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 1, 1, 2, 1, 1, 1, 1) and - ((not self.tight_phrases) or (f_links_low[f_x_low] != -1 and f_links_low[f_x_high-1] != -1)) and - self.find_fixpoint(f_x_low, f_low, - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0) and - self.find_fixpoint(f_high, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low+1+num_gaps, e_gap_high+1+num_gaps, - f_gap_low+1+num_gaps, f_gap_high+1+num_gaps, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0)): - fphr_arr._clear() - i = 1 - self.findexes.reset() - self.findexes.append(sym.setindex(self.category, i)) - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - fphr = rule.Phrase(fphr_arr) - phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+2, - f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, - matching.sent_id, e_sent_len, e_sent_start) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - for phrase2, eindexes in phrase_list: - als4 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als4))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) - #print "extract_phrases4 %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) - else: - reason_for_failure = "Unable to extract basic phrase" - - free(sent_links) - free(f_links_low) - free(f_links_high) - free(e_links_low) - free(e_links_high) - free(f_gap_low) - free(f_gap_high) - free(e_gap_low) - free(e_gap_high) - - if self.sample_file is not None: - self.sample_file.write("%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure)) - - #print "%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure) - - - return extracts - |