| author | Kenneth Heafield <github@kheafield.com> | 2012-08-03 07:46:54 -0400 |
|---|---|---|
| committer | Kenneth Heafield <github@kheafield.com> | 2012-08-03 07:46:54 -0400 |
| commit | 122f46c31102b683eaab3ad81a3a98accbc694bb (patch) | |
| tree | 8d499d789b159ebed25bb23b6983813d064a6296 /sa-extract/rulefactory.pyx | |
| parent | ac664bdb0e481539cf77098a7dd0e1ec8d937ba0 (diff) | |
| parent | 193d137056c3c4f73d66f8db84691d63307de894 (diff) | |
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'sa-extract/rulefactory.pyx')
-rw-r--r-- | sa-extract/rulefactory.pyx | 2360
1 files changed, 0 insertions, 2360 deletions
diff --git a/sa-extract/rulefactory.pyx b/sa-extract/rulefactory.pyx deleted file mode 100644 index 20ea80d2..00000000 --- a/sa-extract/rulefactory.pyx +++ /dev/null @@ -1,2360 +0,0 @@ -# Pyrex implementation of the algorithms described in -# Lopez, EMNLP-CoNLL 2007 -# Much faster than the Python numbers reported there. -# Note to reader: this code is closer to C than Python -import sys -import sym -import log -import rule -import monitor -import cintlist -import csuf -import cdat -import cveb -import precomputation -import gc -import cn -import sgml - -cimport cmath -cimport csuf -cimport cdat -cimport cintlist -cimport rule -cimport cveb -cimport precomputation -cimport calignment - -from libc.stdlib cimport malloc, realloc, free -from libc.string cimport memset, memcpy -from libc.math cimport fmod, ceil, floor - -cdef int PRECOMPUTE -cdef int MERGE -cdef int BAEZA_YATES - -PRECOMPUTE = 0 -MERGE = 1 -BAEZA_YATES = 2 - -#cdef int node_count -#node_count = 0 - -cdef class TrieNode: - cdef public children - #cdef int id - - def __init__(self): - self.children = {} - #self.id = node_count - #node_count += 1 - - -cdef class ExtendedTrieNode(TrieNode): - cdef public phrase - cdef public phrase_location - cdef public suffix_link - - def __init__(self, phrase=None, phrase_location=None, suffix_link=None): - TrieNode.__init__(self) - self.phrase = phrase - self.phrase_location = phrase_location - self.suffix_link = suffix_link - - -cdef class TrieTable: - cdef public int extended - cdef public int count - cdef public root - def __cinit__(self, extended=False): - self.count = 0 - self.extended = extended - if extended: - self.root = ExtendedTrieNode() - else: - self.root = TrieNode() - -# linked list structure for storing matches in BaselineRuleFactory -cdef struct match_node: - int* match - match_node* next - -cdef class BaselineRuleFactory: - - cdef grammar, context_manager - cdef int max_terminals, max_nonterminals - cdef int max_initial_size, train_max_initial_size - cdef int min_gap_size, train_min_gap_size - cdef int category - cdef int visit - cdef float intersect_time, extract_time - cdef ruleFile, timingFile - cdef int* last_visit1 - cdef int* last_visit2 - cdef match_node** intersector1 - cdef match_node** intersector2 - cdef csuf.SuffixArray sa - cdef cintlist.CIntList sent_id - - def input(self, fwords): - flen = len(fwords) - start_time = monitor.cpu() - self.intersect_time = 0.0 - self.extract_time = 0.0 - - pyro_phrase_count = 0 - hiero_phrase_count = 0 - - frontier = [] - for i in xrange(len(fwords)): - frontier.append((i, (), False)) - - while len(frontier) > 0: - this_iter_intersect_time = self.intersect_time - new_frontier = [] - for i, prefix, is_shadow_path in frontier: - - word_id = fwords[i][0][0] - #print "word_id = %i" % word_id - phrase = prefix + (word_id,) - str_phrase = map(sym.tostring, phrase) - hiero_phrase = rule.Phrase(phrase) - - #log.writeln("pos %2d, '%s'" % (i, hiero_phrase)) - self.lookup(hiero_phrase) - if hiero_phrase.arity() == 0: - pyro_phrase_count = pyro_phrase_count + 1 - else: - hiero_phrase_count = hiero_phrase_count + 1 - - if len(phrase) - hiero_phrase.arity() < self.max_terminals and i+1 < len(fwords): - new_frontier.append((i+1, phrase, is_shadow_path)) - if hiero_phrase.arity() < self.max_nonterminals: - xcat = sym.setindex(self.category, hiero_phrase.arity()+1) - for j in xrange(i+1+self.min_gap_size, min(i+self.max_initial_size, len(fwords))): - new_frontier.append((j, phrase+(xcat,), is_shadow_path)) - log.writeln("This iteration intersect 
time = %f" % (self.intersect_time - this_iter_intersect_time)) - frontier = new_frontier - stop_time = monitor.cpu() - log.writeln("COUNT %d %d" % (pyro_phrase_count, hiero_phrase_count)) - - - def lookup(self, phrase): - cdef int j, g, start, stop, sent_id, num_ranges, arity - cdef match_node** cur_intersector - cdef match_node** next_intersector - cdef match_node** tmp_intersector - cdef match_node* node - cdef match_node* cur_node - cdef match_node* prev_node - cdef match_node** node_ptr - cdef int* cur_visit - cdef int* next_visit - cdef int* tmp_visit - cdef int* chunklen - - #print "\n\nLOOKUP\n\n" - ranges = [] - sizes = [] - arity = phrase.arity() - chunklen = <int *> malloc(arity*sizeof(int)) - for i from 0 <= i < arity+1: - chunk = phrase.getchunk(i) - chunklen[i] = len(chunk) - sa_range = None - phr = () - for offset, word_id in enumerate(chunk): - word = sym.tostring(word_id) - sa_range = self.context_manager.fsarray.lookup(word, offset, sa_range[0], sa_range[1]) - if sa_range is None: - #log.writeln("Returned for phrase %s" % rule.Phrase(phr)) - return - #log.writeln("Found range %s for phrase %s" % (sa_range, rule.Phrase(phr))) - ranges.append(sa_range) - sizes.append(sa_range[1]-sa_range[0]) - if phrase.arity() == 0: - return - - cur_intersector = self.intersector1 - next_intersector = self.intersector2 - cur_visit = self.last_visit1 - next_visit = self.last_visit2 - - num_ranges = len(ranges) - for i from 0 <= i < num_ranges: - sa_range = ranges[i] - start_time = monitor.cpu() - self.visit = self.visit + 1 - intersect_count = 0 - - start = sa_range[0] - stop = sa_range[1] - for j from start <= j < stop: - g = self.sa.sa.arr[j] - sent_id = self.sent_id.arr[g] - if i==0: - if next_visit[sent_id] != self.visit: - # clear intersector - node = next_intersector[sent_id] - next_intersector[sent_id] = NULL - while node != NULL: - prev_node = node - node = node.next - free(prev_node.match) - free(prev_node) - next_visit[sent_id] = self.visit - node_ptr = &(next_intersector[sent_id]) - while node_ptr[0] != NULL: - node_ptr = &(node_ptr[0].next) - node_ptr[0] = <match_node*> malloc(sizeof(match_node)) - node_ptr[0].match = <int *> malloc(sizeof(int)) - node_ptr[0].match[0] = g - node_ptr[0].next = NULL - intersect_count = intersect_count + 1 - else: - if cur_visit[sent_id] == self.visit-1: - cur_node = cur_intersector[sent_id] - while cur_node != NULL: - if g - cur_node.match[0] + chunklen[i] <= self.train_max_initial_size and g - cur_node.match[i-1] - chunklen[i-1] >= self.train_min_gap_size: - if next_visit[sent_id] != self.visit: - # clear intersector -- note that we only do this if we've got something to put there - node = next_intersector[sent_id] - next_intersector[sent_id] = NULL - while node != NULL: - prev_node = node - node = node.next - free(prev_node.match) - free(prev_node) - next_visit[sent_id] = self.visit - node_ptr = &(next_intersector[sent_id]) - while node_ptr[0] != NULL: - node_ptr = &(node_ptr[0].next) - node_ptr[0] = <match_node*> malloc(sizeof(match_node)) - node_ptr[0].match = <int *> malloc((i+1) * sizeof(int)) - memcpy(node_ptr[0].match, cur_node.match, i*sizeof(int)) - node_ptr[0].match[i] = g - node_ptr[0].next = NULL - intersect_count = intersect_count + 1 - cur_node = cur_node.next - tmp_intersector = cur_intersector - cur_intersector = next_intersector - next_intersector = tmp_intersector - - tmp_visit = cur_visit - cur_visit = next_visit - next_visit = tmp_visit - - intersect_time = monitor.cpu() - start_time - if i > 0: - log.writeln("INT %d %d %d %d 
%f baseline" % - (arity, prev_intersect_count, sa_range[1]-sa_range[0], - intersect_count, intersect_time)) - if intersect_count == 0: - return None - prev_intersect_count = intersect_count - free(chunklen) - - - - def __init__(self, max_terminals=5, max_nonterminals=2, - max_initial_size=10, train_max_initial_size=10, - min_gap_size=1, train_min_gap_size=2, - category='[PHRASE]', grammar=None, - ruleFile=None, timingFile=None): - self.grammar = grammar - self.max_terminals = max_terminals - self.max_nonterminals = max_nonterminals - self.max_initial_size = max_initial_size - self.train_max_initial_size = train_max_initial_size - self.min_gap_size = min_gap_size - self.train_min_gap_size = train_min_gap_size - self.category = sym.fromstring(category) - self.ruleFile = ruleFile - self.timingFile = timingFile - self.visit = 0 - - - def registerContext(self, context_manager): - cdef int num_sents - self.context_manager = context_manager - self.sa = context_manager.fsarray - self.sent_id = self.sa.darray.sent_id - - num_sents = len(self.sa.darray.sent_index) - self.last_visit1 = <int *> malloc(num_sents * sizeof(int)) - memset(self.last_visit1, 0, num_sents * sizeof(int)) - - self.last_visit2 = <int *> malloc(num_sents * sizeof(int)) - memset(self.last_visit2, 0, num_sents * sizeof(int)) - - self.intersector1 = <match_node **> malloc(num_sents * sizeof(match_node*)) - memset(self.intersector1, 0, num_sents * sizeof(match_node*)) - - self.intersector2 = <match_node **> malloc(num_sents * sizeof(match_node*)) - memset(self.intersector2, 0, num_sents * sizeof(match_node*)) - - -# encodes information needed to find a (hierarchical) phrase -# in the text. If phrase is contiguous, that's just a range -# in the suffix array; if discontiguous, it is the set of -# actual locations (packed into an array) -cdef class PhraseLocation: - cdef int sa_low - cdef int sa_high - cdef int arr_low - cdef int arr_high - cdef cintlist.CIntList arr - cdef int num_subpatterns - - # returns true if sent_id is contained - cdef int contains(self, int sent_id): - return 1 - - def __init__(self, sa_low=-1, sa_high=-1, arr_low=-1, arr_high=-1, arr=None, num_subpatterns=1): - self.sa_low = sa_low - self.sa_high = sa_high - self.arr_low = arr_low - self.arr_high = arr_high - self.arr = arr - self.num_subpatterns = num_subpatterns - - - -cdef class Sampler: - '''A Sampler implements a logic for choosing - samples from a population range''' - - cdef int sampleSize - cdef context_manager - cdef cintlist.CIntList sa - - def __init__(self, sampleSize=0): - self.sampleSize = sampleSize - if sampleSize > 0: - log.writeln("Sampling strategy: uniform, max sample size = %d" % sampleSize, 1) - else: - log.writeln("Sampling strategy: no sampling", 1) - - def registerContext(self, context_manager): - self.context_manager = context_manager - self.sa = (<csuf.SuffixArray> context_manager.fsarray).sa - - - def sample(self, PhraseLocation phrase_location): - '''Returns a sample of the locations for - the phrase. If there are less than self.sampleSize - locations, return all of them; otherwise, return - up to self.sampleSize locations. In the latter case, - we choose to sample UNIFORMLY -- that is, the locations - are chosen at uniform intervals over the entire set, rather - than randomly. 
This makes the algorithm deterministic, which - is good for things like MERT''' - cdef cintlist.CIntList sample - cdef double i, stepsize - cdef int num_locations, val, j - - sample = cintlist.CIntList() - if phrase_location.arr is None: - num_locations = phrase_location.sa_high - phrase_location.sa_low - if self.sampleSize == -1 or num_locations <= self.sampleSize: - sample._extend_arr(self.sa.arr + phrase_location.sa_low, num_locations) - else: - stepsize = float(num_locations)/float(self.sampleSize) - i = phrase_location.sa_low - while i < phrase_location.sa_high and sample.len < self.sampleSize: - '''Note: int(i) not guaranteed to have the desired - effect, according to the python documentation''' - if fmod(i,1.0) > 0.5: - val = int(ceil(i)) - else: - val = int(floor(i)) - sample._append(self.sa.arr[val]) - i = i + stepsize - else: - num_locations = (phrase_location.arr_high - phrase_location.arr_low) / phrase_location.num_subpatterns - if self.sampleSize == -1 or num_locations <= self.sampleSize: - sample = phrase_location.arr - else: - stepsize = float(num_locations)/float(self.sampleSize) - i = phrase_location.arr_low - while i < num_locations and sample.len < self.sampleSize * phrase_location.num_subpatterns: - '''Note: int(i) not guaranteed to have the desired - effect, according to the python documentation''' - if fmod(i,1.0) > 0.5: - val = int(ceil(i)) - else: - val = int(floor(i)) - j = phrase_location.arr_low + (val*phrase_location.num_subpatterns) - sample._extend_arr(phrase_location.arr.arr + j, phrase_location.num_subpatterns) - i = i + stepsize - return sample - - -cdef long nGramCount(PhraseLocation loc): - return (loc.arr_high - loc.arr_low)/ loc.num_subpatterns - - -# struct used to encapsulate a single matching -cdef struct Matching: - int* arr - int start - int end - int sent_id - int size - - -cdef void assign_matching(Matching* m, int* arr, int start, int step, int* sent_id_arr): - m.arr = arr - m.start = start - m.end = start + step - m.sent_id = sent_id_arr[arr[start]] - m.size = step - - -cdef int* append_combined_matching(int* arr, Matching* loc1, Matching* loc2, - int offset_by_one, int num_subpatterns, int* result_len): - cdef int i, new_len - - new_len = result_len[0] + num_subpatterns - arr = <int*> realloc(arr, new_len*sizeof(int)) - - for i from 0 <= i < loc1.size: - arr[result_len[0]+i] = loc1.arr[loc1.start+i] - if num_subpatterns > loc1.size: - arr[new_len-1] = loc2.arr[loc2.end-1] - result_len[0] = new_len - return arr - - -cdef int* extend_arr(int* arr, int* arr_len, int* appendix, int appendix_len): - cdef int new_len - - new_len = arr_len[0] + appendix_len - arr = <int*> realloc(arr, new_len*sizeof(int)) - memcpy(arr+arr_len[0], appendix, appendix_len*sizeof(int)) - arr_len[0] = new_len - return arr - - -#cdef matching2str(Matching* m): -# cdef int i -# cdef result - -# result = "(" -# for i from m.start <= i < m.end: -# result = result + str(m.arr[i]) + " " -# result = result + ")" -# return result - - -cdef int median(int low, int high, int step): - return low + (((high - low)/step)/2)*step - - -cdef void findComparableMatchings(int low, int high, int* arr, int step, int loc, int* loc_minus, int* loc_plus): - # Returns (minus, plus) indices for the portion of the array - # in which all matchings have the same first index as the one - # starting at loc - loc_plus[0] = loc + step - while loc_plus[0] < high and arr[loc_plus[0]] == arr[loc]: - loc_plus[0] = loc_plus[0] + step - loc_minus[0] = loc - while loc_minus[0]-step >= low and 
arr[loc_minus[0]-step] == arr[loc]: - loc_minus[0] = loc_minus[0] - step - - -cdef class HieroCachingRuleFactory: - '''This RuleFactory implements a caching - method using TrieTable, which makes phrase - generation somewhat speedier -- phrases only - need to be extracted once (however, it is - quite possible they need to be scored - for each input sentence, for contextual models)''' - - cdef rules, grammar, context_manager - - cdef int max_chunks - cdef int max_target_chunks - cdef int max_length - cdef int max_target_length - cdef int max_nonterminals - cdef int max_initial_size - cdef int train_max_initial_size - cdef int min_gap_size - cdef int train_min_gap_size - cdef int category - - cdef cacheBetweenSents - cdef precomputed_index - cdef precomputed_collocations - cdef precompute_file - cdef max_rank - cdef int precompute_rank, precompute_secondary_rank - cdef useBaezaYates - cdef use_index - cdef use_collocations - cdef float by_slack_factor - - cdef per_sentence_grammar - cdef rule_filehandler - cdef rule_file - cdef pruned_rule_file - cdef extract_file - cdef sample_file - cdef search_file - cdef timingFile - cdef log_int_stats - cdef prev_norm_prefix - cdef float intersect_time, extract_time - cdef csuf.SuffixArray fsa - cdef cdat.DataArray fda - cdef cdat.DataArray eda - - cdef calignment.Alignment alignment - cdef cintlist.CIntList eid2symid - cdef cintlist.CIntList fid2symid - cdef int tight_phrases - cdef int require_aligned_terminal - cdef int require_aligned_chunks - - cdef cintlist.CIntList findexes - cdef cintlist.CIntList findexes1 - - cdef int excluded_sent_id # exclude a sentence id - - def __init__(self, - alignment=None, # compiled alignment object (REQUIRED) - by_slack_factor=1.0, # parameter for double-binary search; doesn't seem to matter much - category="[PHRASE]", # name of generic nonterminal used by Hiero - cacheBetweenSents=False, # prevent flushing of tree between sents; use carefully or you'll run out of memory - extract_file=None, # print raw extracted rules to this file - grammar=None, # empty grammar object -- must be supplied from outside (REQUIRED) - log_int_stats=False, # prints timing data on intersections to stderr - max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 - max_initial_size=10, # maximum span of a grammar rule in TEST DATA - max_length=5, # maximum number of symbols (both T and NT) allowed in a rule - max_nonterminals=2, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) - max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1 - max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size - min_gap_size=2, # minimum span of a nonterminal in the RHS of a rule in TEST DATA - precompute_file=None, # filename of file containing precomputed collocations - precompute_secondary_rank=20, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). 
- precompute_rank=100, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) - pruned_rule_file=None, # if specified, pruned grammars will be written to this filename - require_aligned_terminal=True, # require extracted rules to have at least one aligned word - require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word - per_sentence_grammar=True, # generate grammar files for each input segment - rule_file=None, # UNpruned grammars will be written to this filename - sample_file=None, # Sampling statistics will be written to this filename - search_file=None, # lookup statistics will be written to this filename - train_max_initial_size=10, # maximum span of a grammar rule extracted from TRAINING DATA - train_min_gap_size=2, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA - tight_phrases=False, # True if phrases should be tight, False otherwise (False == slower but better results) - timingFile=None, # timing statistics will be written to this filename - useBaezaYates=True, # True to require use of double-binary alg, false otherwise - use_collocations=True, # True to enable used of precomputed collocations - use_index=True # True to enable use of precomputed inverted indices - ): - '''Note: we make a distinction between the min_gap_size - and max_initial_size used in test and train. The latter - are represented by train_min_gap_size and train_max_initial_size, - respectively. This is because Chiang's model does not require - them to be the same, therefore we don't either.''' - self.rules = TrieTable(True) # cache - self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) - self.grammar = grammar - if alignment is None: - raise Exception("Must specify an alignment object") - self.alignment = alignment - - self.excluded_sent_id = -1 - - # grammar parameters and settings - # NOTE: setting max_nonterminals > 2 is not currently supported in Hiero - self.max_length = max_length - self.max_nonterminals = max_nonterminals - self.max_initial_size = max_initial_size - self.train_max_initial_size = train_max_initial_size - self.min_gap_size = min_gap_size - self.train_min_gap_size = train_min_gap_size - self.category = sym.fromstring(category) - - if max_chunks is None: - self.max_chunks = self.max_nonterminals + 1 - else: - self.max_chunks = max_chunks - - if max_target_chunks is None: - self.max_target_chunks = self.max_nonterminals + 1 - else: - self.max_target_chunks = max_target_chunks - - if max_target_length is None: - self.max_target_length = max_initial_size - else: - self.max_target_length = max_target_length - - # algorithmic parameters and settings - self.cacheBetweenSents = not per_sentence_grammar - self.precomputed_collocations = {} - self.precomputed_index = {} - self.use_index = use_index - self.use_collocations = use_collocations - self.max_rank = {} - self.precompute_file = precompute_file - self.precompute_rank = precompute_rank - self.precompute_secondary_rank = precompute_secondary_rank - self.useBaezaYates = useBaezaYates - self.by_slack_factor = by_slack_factor - if tight_phrases: - self.tight_phrases = 1 - else: - self.tight_phrases = 0 - - if require_aligned_chunks: - # one condition is a stronger version of the other. 
- self.require_aligned_chunks = 1 - self.require_aligned_terminal = 1 - elif require_aligned_terminal: - self.require_aligned_chunks = 0 - self.require_aligned_terminal = 1 - else: - self.require_aligned_chunks = 0 - self.require_aligned_terminal = 0 - - - self.per_sentence_grammar = per_sentence_grammar - if not self.per_sentence_grammar: - self.rule_filehandler = open(rule_file, "w") - # diagnostics - #if rule_file is None: - # self.rule_file = None - self.rule_file = rule_file - if extract_file is None: - self.extract_file = None - else: - self.extract_file = open(extract_file, "w") - if sample_file is None: - self.sample_file = None - else: - self.sample_file = open(sample_file, "w") - if search_file is None: - self.search_file = None - else: - self.search_file = open(search_file, "w") - self.pruned_rule_file = pruned_rule_file - self.timingFile = timingFile - self.log_int_stats = log_int_stats - self.prev_norm_prefix = () - - self.findexes = cintlist.CIntList(initial_len=10) - self.findexes1 = cintlist.CIntList(initial_len=10) - - def registerContext(self, context_manager): - '''This gives the RuleFactory access to the Context object. - Here we also use it to precompute the most expensive intersections - in the corpus quickly.''' - self.context_manager = context_manager - self.fsa = context_manager.fsarray - self.fda = self.fsa.darray - self.eda = context_manager.edarray - self.fid2symid = self.set_idmap(self.fda) - self.eid2symid = self.set_idmap(self.eda) - self.precompute() - - cdef set_idmap(self, cdat.DataArray darray): - cdef int word_id, new_word_id, N - cdef cintlist.CIntList idmap - - N = len(darray.id2word) - idmap = cintlist.CIntList(initial_len=N) - for word_id from 0 <= word_id < N: - new_word_id = sym.fromstring(darray.id2word[word_id], terminal=True) - idmap.arr[word_id] = new_word_id - return idmap - - - def pattern2phrase(self, pattern): - # pattern is a tuple, which we must convert to a hiero rule.Phrase - result = () - arity = 0 - for word_id in pattern: - if word_id == -1: - arity = arity + 1 - new_id = sym.setindex(self.category, arity) - else: - new_id = sym.fromstring(self.fda.id2word[word_id]) - result = result + (new_id,) - return rule.Phrase(result) - - def pattern2phrase_plus(self, pattern): - # returns a list containing both the pattern, and pattern - # suffixed/prefixed with the NT category. - patterns = [] - result = () - arity = 0 - for word_id in pattern: - if word_id == -1: - arity = arity + 1 - new_id = sym.setindex(self.category, arity) - else: - new_id = sym.fromstring(self.fda.id2word[word_id]) - result = result + (new_id,) - patterns.append(rule.Phrase(result)) - patterns.append(rule.Phrase(result + (sym.setindex(self.category, 1),))) - patterns.append(rule.Phrase((sym.setindex(self.category, 1),) + result)) - return patterns - - def precompute(self): - cdef precomputation.Precomputation pre - - if self.precompute_file is not None: - start_time = monitor.cpu() - log.write("Reading precomputed data from file %s... 
" % self.precompute_file, 1) - pre = precomputation.Precomputation(self.precompute_file, from_binary=True) - # check parameters of precomputation -- some are critical and some are not - if pre.max_nonterminals != self.max_nonterminals: - log.writeln("\nWARNING: Precomputation done with max nonterminals %d, decoder uses %d" % (pre.max_nonterminals, self.max_nonterminals)) - if pre.max_length != self.max_length: - log.writeln("\nWARNING: Precomputation done with max terminals %d, decoder uses %d" % (pre.max_length, self.max_length)) - if pre.train_max_initial_size != self.train_max_initial_size: - log.writeln("\nERROR: Precomputation done with max initial size %d, decoder uses %d" % (pre.train_max_initial_size, self.train_max_initial_size)) - raise Exception("Parameter mismatch with precomputed data") - if pre.train_min_gap_size != self.train_min_gap_size: - log.writeln("\nERROR: Precomputation done with min gap size %d, decoder uses %d" % (pre.train_min_gap_size, self.train_min_gap_size)) - raise Exception("Parameter mismatch with precomputed data") - log.writeln("done.", 1) - if self.use_index: - log.write("Converting %d hash keys on precomputed inverted index... " % (len(pre.precomputed_index)), 1) - for pattern, arr in pre.precomputed_index.iteritems(): - phrases = self.pattern2phrase_plus(pattern) - for phrase in phrases: - self.precomputed_index[phrase] = arr - log.writeln("done.", 1) - if self.use_collocations: - log.write("Converting %d hash keys on precomputed collocations... " % (len(pre.precomputed_collocations)), 1) - for pattern, arr in pre.precomputed_collocations.iteritems(): - phrase = self.pattern2phrase(pattern) - self.precomputed_collocations[phrase] = arr - log.writeln("done.", 1) - stop_time = monitor.cpu() - log.writeln("Processing precomputations took %f seconds" % (stop_time - start_time), 1) - - - def getPrecomputedCollocation(self, phrase): - if phrase in self.precomputed_collocations: - arr = self.precomputed_collocations[phrase] - return PhraseLocation(arr=arr, arr_low=0, arr_high=len(arr), num_subpatterns=phrase.arity()+1) - return None - - - cdef int* baezaYatesHelper(self, int low1, int high1, int* arr1, int step1, - int low2, int high2, int* arr2, int step2, - int offset_by_one, int len_last, int num_subpatterns, int* result_len): - cdef int i1, i2, j1, j2, med1, med2, med1_plus, med1_minus, med2_minus, med2_plus - cdef int d_first, qsetsize, dsetsize, tmp, search_low, search_high - cdef int med_result_len, low_result_len, high_result_len - cdef long comparison - cdef int* result - cdef int* low_result - cdef int* med_result - cdef int* high_result - cdef Matching loc1, loc2 - - result = <int*> malloc(0*sizeof(int*)) -# log.writeln("%sBY: [%d, %d, %d] [%d, %d, %d]" % (pad, low1, high1, step1, low2, high2, step2,), 5) - - d_first = 0 - if high1 - low1 > high2 - low2: -# log.writeln("%sD first" % (pad), 5) - d_first = 1 -# else: -# log.writeln("%sQ first" % (pad), 5) - -# '''First, check to see if we are at any of the -# recursive base cases''' -# -# '''Case 1: one of the sets is empty''' - if low1 >= high1 or low2 >= high2: -# log.writeln("%sRETURN: set is empty" % (pad), 5) - return result - -# '''Case 2: sets are non-overlapping''' - assign_matching(&loc1, arr1, high1-step1, step1, self.fda.sent_id.arr) - assign_matching(&loc2, arr2, low2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == -1: -# log.writeln("%s %s < %s" % (pad, tuple(arr1[high1-step1:high1]), tuple(arr2[low2:low2+step2])),5) -# 
log.writeln("%sRETURN: non-overlapping sets" % (pad), 5) - return result - - assign_matching(&loc1, arr1, low1, step1, self.fda.sent_id.arr) - assign_matching(&loc2, arr2, high2-step2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: -# log.writeln("%s %s > %s" % (pad, tuple(arr1[low1:low1+step1]), tuple(arr2[high2-step2:high2])),5) -# log.writeln("%sRETURN: non-overlapping sets" % (pad), 5) - return result - - # Case 3: query set and data set do not meet size mismatch constraints; - # We use mergesort instead in this case - qsetsize = (high1-low1) / step1 - dsetsize = (high2-low2) / step2 - if d_first: - tmp = qsetsize - qsetsize = dsetsize - dsetsize = tmp - - if self.by_slack_factor * qsetsize * cmath.log(dsetsize) / cmath.log(2) > dsetsize: - free(result) - return self.mergeHelper(low1, high1, arr1, step1, low2, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, result_len) - - # binary search. There are two flavors, depending on - # whether the queryset or dataset is first - if d_first: - med2 = median(low2, high2, step2) - assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) - - search_low = low1 - search_high = high1 - while search_low < search_high: - med1 = median(search_low, search_high, step1) - findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) - comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) - if comparison == -1: - search_low = med1_plus - elif comparison == 1: - search_high = med1_minus - else: - break - else: - med1 = median(low1, high1, step1) - findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) - - search_low = low2 - search_high = high2 - while search_low < search_high: - med2 = median(search_low, search_high, step2) - assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) - comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) - if comparison == -1: - search_high = med2 - elif comparison == 1: - search_low = med2 + step2 - else: - break - - med_result_len = 0 - med_result = <int*> malloc(0*sizeof(int*)) - if search_high > search_low: -# '''Then there is a match for the median element of Q''' -# -# '''What we want to find is the group of all bindings in the first set -# s.t. their first element == the first element of med1. Then we -# want to store the bindings for all of those elements. 
We can -# subsequently throw all of them away.''' - med2_minus = med2 - med2_plus = med2 + step2 - i1 = med1_minus - while i1 < med1_plus: - assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) - while med2_minus-step2 >= low2: - assign_matching(&loc2, arr2, med2_minus-step2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) < 1: - med2_minus = med2_minus - step2 - else: - break - i2 = med2_minus - while i2 < high2: - assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) - comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) - if comparison == 0: - pass - med_result = append_combined_matching(med_result, &loc1, &loc2, offset_by_one, num_subpatterns, &med_result_len) - if comparison == -1: - break - i2 = i2 + step2 - if i2 > med2_plus: - med2_plus = i2 - i1 = i1 + step1 - - tmp = med1_minus - med1_minus = med1_plus - med1_plus = tmp - else: - # No match; need to figure out the point of division in D and Q - med2_minus = med2 - med2_plus = med2 - if d_first: - med2_minus = med2_minus + step2 - if comparison == -1: - med1_minus = med1_plus - if comparison == 1: - med1_plus = med1_minus - else: - tmp = med1_minus - med1_minus = med1_plus - med1_plus = tmp - if comparison == 1: - med2_minus = med2_minus + step2 - med2_plus = med2_plus + step2 - - low_result_len = 0 - low_result = self.baezaYatesHelper(low1, med1_plus, arr1, step1, low2, med2_plus, arr2, step2, offset_by_one, len_last, num_subpatterns, &low_result_len) - high_result_len = 0 - high_result = self.baezaYatesHelper(med1_minus, high1, arr1, step1, med2_minus, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, &high_result_len) - - result = extend_arr(result, result_len, low_result, low_result_len) - result = extend_arr(result, result_len, med_result, med_result_len) - result = extend_arr(result, result_len, high_result, high_result_len) - free(low_result) - free(med_result) - free(high_result) - - return result - - - - cdef long compareMatchingsSet(self, int i1_minus, int i1_plus, int* arr1, int step1, - Matching* loc2, int offset_by_one, int len_last): -# '''Compares a *set* of bindings, all with the same first element, -# to a single binding. 
Returns -1 if all comparisons == -1, 1 if all -# comparisons == 1, and 0 otherwise.''' - cdef int i1, comparison, prev_comparison - cdef Matching l1_stack - cdef Matching* loc1 - - loc1 = &l1_stack - - i1 = i1_minus - while i1 < i1_plus: - assign_matching(loc1, arr1, i1, step1, self.fda.sent_id.arr) - comparison = self.compare_matchings(loc1, loc2, offset_by_one, len_last) - if comparison == 0: - prev_comparison = 0 - break - elif i1 == i1_minus: - prev_comparison = comparison - else: - if comparison != prev_comparison: - prev_comparison = 0 - break - i1 = i1 + step1 - return prev_comparison - - - cdef long compare_matchings(self, Matching* loc1, Matching* loc2, int offset_by_one, int len_last): - cdef int i - - if loc1.sent_id > loc2.sent_id: - return 1 - if loc2.sent_id > loc1.sent_id: - return -1 - - if loc1.size == 1 and loc2.size == 1: - if loc2.arr[loc2.start] - loc1.arr[loc1.start] <= self.train_min_gap_size: - return 1 - - elif offset_by_one: - for i from 1 <= i < loc1.size: - if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i-1]: - return 1 - if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i-1]: - return -1 - - else: - if loc1.arr[loc1.start]+1 > loc2.arr[loc2.start]: - return 1 - if loc1.arr[loc1.start]+1 < loc2.arr[loc2.start]: - return -1 - - for i from 1 <= i < loc1.size: - if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i]: - return 1 - if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i]: - return -1 - - if loc2.arr[loc2.end-1] + len_last - loc1.arr[loc1.start] > self.train_max_initial_size: - return -1 - return 0 - - - cdef int* mergeHelper(self, int low1, int high1, int* arr1, int step1, - int low2, int high2, int* arr2, int step2, - int offset_by_one, int len_last, int num_subpatterns, int* result_len): - cdef int i1, i2, j1, j2 - cdef long comparison - cdef int* result - cdef Matching loc1, loc2 -# cdef int i - -# pad = " " -# log.writeln("->mergeHelper", 5) - - result_len[0] = 0 - result = <int*> malloc(0*sizeof(int)) - - i1 = low1 - i2 = low2 -# if log.level==5: -# log.writeln("%sMERGE lists [%d,%d,%d] and [%d,%d,%d]" % (pad,low1,high1,step1,low2,high2,step2), 5) -# log.writeln("%soffset_by_one: %d, len_last: %d" % (pad, offset_by_one, len_last), 5) -# log.write("[") -# for i from low1 <= i < high1: -# log.write("%d, " % arr1.arr[i],5) -# log.writeln("]") -# log.write("[") -# for i from low2 <= i < high2: -# log.write("%d, " % arr2.arr[i],5) -# log.writeln("]") - while i1 < high1 and i2 < high2: - -# '''First, pop all unneeded loc2's off the stack''' - assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) -# if log.level==5: -# log.writeln("%s TOP1 %s" % (pad,matching2str(loc1)),5) - while i2 < high2: - assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: -# if log.level==5: -# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) -# log.writeln("%s POP2 %s" % (pad,matching2str(loc2)),5) - i2 = i2 + step2 - else: - break - -# '''Next: process all loc1's with the same starting val''' - j1 = i1 - while i1 < high1 and arr1[j1] == arr1[i1]: - assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) - j2 = i2 - while j2 < high2: - assign_matching(&loc2, arr2, j2, step2, self.fda.sent_id.arr) - comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) - if comparison == 0: -# if log.level==5: -# log.writeln("%s %s == %s" % (pad,matching2str(loc1),matching2str(loc2)),5) - result = append_combined_matching(result, &loc1, &loc2, offset_by_one, 
num_subpatterns, result_len) - if comparison == 1: -# if log.level==5: -# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) - pass - if comparison == -1: -# if log.level==5: -# log.writeln("%s %s < %s" % (pad,matching2str(loc1),matching2str(loc2)),5) - break - else: - j2 = j2 + step2 -# if log.level==5: -# log.writeln("%s POP1 %s" % (pad,matching2str(loc1)),5) - i1 = i1 + step1 - -# log.writeln("<-mergeHelper", 5) - return result - - - cdef void sortPhraseLoc(self, cintlist.CIntList arr, PhraseLocation loc, rule.Phrase phrase): - cdef int i, j - cdef cveb.VEB veb - cdef cintlist.CIntList result - - if phrase in self.precomputed_index: - loc.arr = self.precomputed_index[phrase] - else: - loc.arr = cintlist.CIntList(initial_len=loc.sa_high-loc.sa_low) - veb = cveb.VEB(arr.len) - for i from loc.sa_low <= i < loc.sa_high: - veb._insert(arr.arr[i]) - i = veb.veb.min_val - for j from 0 <= j < loc.sa_high-loc.sa_low: - loc.arr.arr[j] = i - i = veb._findsucc(i) - loc.arr_low = 0 - loc.arr_high = loc.arr.len - - - cdef intersectHelper(self, rule.Phrase prefix, rule.Phrase suffix, - PhraseLocation prefix_loc, PhraseLocation suffix_loc, int algorithm): - - cdef cintlist.CIntList arr1, arr2, result - cdef int low1, high1, step1, low2, high2, step2, offset_by_one, len_last, num_subpatterns, result_len - cdef int* result_ptr - cdef csuf.SuffixArray suf - - result_len = 0 - - if sym.isvar(suffix[0]): - offset_by_one = 1 - else: - offset_by_one = 0 - - len_last = len(suffix.getchunk(suffix.arity())) - - if prefix_loc.arr is None: - suf = self.context_manager.fsarray - self.sortPhraseLoc(suf.sa, prefix_loc, prefix) - arr1 = prefix_loc.arr - low1 = prefix_loc.arr_low - high1 = prefix_loc.arr_high - step1 = prefix_loc.num_subpatterns - - if suffix_loc.arr is None: - suf = self.context_manager.fsarray - self.sortPhraseLoc(suf.sa, suffix_loc, suffix) - arr2 = suffix_loc.arr - low2 = suffix_loc.arr_low - high2 = suffix_loc.arr_high - step2 = suffix_loc.num_subpatterns - - num_subpatterns = prefix.arity()+1 - - if algorithm == MERGE: - result_ptr = self.mergeHelper(low1, high1, arr1.arr, step1, - low2, high2, arr2.arr, step2, - offset_by_one, len_last, num_subpatterns, &result_len) - else: - result_ptr = self.baezaYatesHelper(low1, high1, arr1.arr, step1, - low2, high2, arr2.arr, step2, - offset_by_one, len_last, num_subpatterns, &result_len) - - if result_len == 0: - free(result_ptr) - return None - else: - result = cintlist.CIntList() - free(result.arr) - result.arr = result_ptr - result.len = result_len - result.size = result_len - return PhraseLocation(arr_low=0, arr_high=result_len, arr=result, num_subpatterns=num_subpatterns) - - cdef loc2str(self, PhraseLocation loc): - cdef int i, j - result = "{" - i = 0 - while i < loc.arr_high: - result = result + "(" - for j from i <= j < i + loc.num_subpatterns: - result = result + ("%d " %loc.arr[j]) - result = result + ")" - i = i + loc.num_subpatterns - result = result + "}" - return result - -# cdef compareResults(self, PhraseLocation loc1, PhraseLocation loc2, phrase, type1, type2): -# cdef i -# if loc1 is None and type1=="pre": -# return -# if loc1 is None: -# if loc2 is None or loc2.arr_high == 0: -# return -# if loc2 is None: -# if loc1.arr_high == 0: -# return -# if loc1.arr_high != loc2.arr_high: -# log.writeln("ERROR: %d vs %d (%s vs %s)" % (loc1.arr_high, loc2.arr_high, type1, type2)) -# #log.writeln(" %s" % self.loc2str(loc2)) -# if loc1.arr_high == 0: -# return -# elif loc1.num_subpatterns != loc2.num_subpatterns: -# 
log.writeln("ERROR 2: %d vs %d (%d v %d) %s" % (loc1.num_subpatterns, loc2.num_subpatterns, loc1.arr_high, loc2.arr_high, phrase)) -# for i from 0 <= i < loc1.arr_high: -# if loc1.arr[i] != loc2.arr[i]: -# log.writeln("ERROR 3") -# - cdef PhraseLocation intersect(self, prefix_node, suffix_node, rule.Phrase phrase): - cdef rule.Phrase prefix, suffix - cdef PhraseLocation prefix_loc, suffix_loc, result - - start_time = monitor.cpu() - prefix = prefix_node.phrase - suffix = suffix_node.phrase - prefix_loc = prefix_node.phrase_location - suffix_loc = suffix_node.phrase_location - - result = self.getPrecomputedCollocation(phrase) - if result is not None: - intersect_method = "precomputed" - - if result is None: - if self.useBaezaYates: - result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, BAEZA_YATES) - intersect_method="double binary" - else: - result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, MERGE) - intersect_method="merge" - stop_time = monitor.cpu() - intersect_time = stop_time - start_time - if self.log_int_stats: - if intersect_method == "precomputed": - sort1 = "none" - sort2 = "none" - else: - if prefix in self.precomputed_index: - sort1 = "index" - else: - sort1 = "veb" - if suffix in self.precomputed_index: - sort2 = "index" - else: - sort2 = "veb" - result_len=0 - if result is not None: - result_len = len(result.arr)/result.num_subpatterns - rank = 0 -# if phrase in self.max_rank: -# rank = self.max_rank[phrase] -# else: -# rank = self.precompute_rank + 10 - log.writeln("INT %d %d %d %d %d %f %d %s %s %s" % - (len(prefix)+1 - prefix.arity(), prefix.arity(), - nGramCount(prefix_node.phrase_location), - nGramCount(suffix_node.phrase_location), - result_len, intersect_time, rank, intersect_method, sort1, sort2)) - return result - - def advance(self, frontier, res, fwords): - nf = [] - for (toskip, (i, alt, pathlen)) in frontier: - spanlen = fwords[i][alt][2] - if (toskip == 0): - #log.writeln("RES: (%d %d %d)" % (i, alt, pathlen), 3) - res.append((i, alt, pathlen)) - ni = i + spanlen - #log.writeln("proc: %d (%d %d %d) sl=%d ni=%d len(fwords)=%d" % (toskip, i, alt, pathlen, spanlen, ni, len(fwords)), 3) - if (ni < len(fwords) and (pathlen + 1) < self.max_initial_size): - for na in xrange(len(fwords[ni])): - nf.append((toskip - 1, (ni, na, pathlen + 1))) - if (len(nf) > 0): - return self.advance(nf, res, fwords) - else: - return res - - def get_all_nodes_isteps_away(self, skip, i, spanlen, pathlen, fwords, next_states, reachable_buffer): - frontier = [] - if (i+spanlen+skip >= len(next_states)): - return frontier - #print "get_all_nodes_isteps_away from %i" % (i) - key = tuple([i,spanlen]) - reachable = [] - if (key in reachable_buffer): - reachable = reachable_buffer[key] - else: - reachable = self.reachable(fwords, i, spanlen) - reachable_buffer[key] = reachable - #print "reachable(from=%i,dist=%i) = " % (i,spanlen) - #print reachable - for nextreachable in reachable: - for next_id in next_states[nextreachable]: - jump = self.shortest(fwords,i,next_id) - #print "checking next_id = %i, pathlen[sofar] = %i, jump = %i" % (next_id,pathlen,jump) - #if (next_id - (i+spanlen)) < skip: - if jump < skip: - continue - #if next_id-(i-pathlen) < self.max_initial_size: - if pathlen+jump <= self.max_initial_size: - for alt_id in xrange(len(fwords[next_id])): - if (fwords[next_id][alt_id][0] != cn.epsilon): - #frontier.append((next_id,alt_id,next_id-(i-pathlen))); - #print "finding the shortest from %i to %i" % (i, next_id) - newel = 
(next_id,alt_id,pathlen+jump) - if newel not in frontier: - frontier.append((next_id,alt_id,pathlen+jump)) - #print "appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) - #else: - #print "NOT appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) - #else: - #print "next_id = %s is aborted\n" % next_id - #print "returning frontier" - #print frontier - return frontier - - def reachable(self, fwords, ifrom, dist): - #print "inside reachable(%i,%i)" % (ifrom,dist) - ret = [] - if (ifrom >= len(fwords)): - return ret - for alt_id in xrange(len(fwords[ifrom])): - if (fwords[ifrom][alt_id][0] == cn.epsilon): - ret.extend(self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist)) - else: - if (dist==0): - if (ifrom not in ret): - ret.append(ifrom) - else: - for ifromchild in self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist-1): - if (ifromchild not in ret): - ret.append(ifromchild) - - return ret - - def shortest(self, fwords, ifrom, ito): - min = 1000 - #print "shortest ifrom=%i, ito=%i" % (ifrom,ito) - if (ifrom > ito): - return min - if (ifrom == ito): - return 0 - for alt_id in xrange(len(fwords[ifrom])): - currmin = self.shortest(fwords,ifrom+fwords[ifrom][alt_id][2],ito) - if (fwords[ifrom][alt_id][0] != cn.epsilon): - currmin += 1 - if (currmin<min): - min = currmin - return min - - def get_next_states(self, _columns, curr_idx, min_dist=2): - result = [] - candidate = [[curr_idx,0]] - - while len(candidate) > 0: - curr = candidate.pop() - if curr[0] >= len(_columns): - continue - if curr[0] not in result and min_dist <= curr[1] <= self.max_initial_size: - result.append(curr[0]); - curr_col = _columns[curr[0]] - for alt in curr_col: - next_id = curr[0]+alt[2] - jump = 1 - if (alt[0] == cn.epsilon): - jump = 0 - if next_id not in result and min_dist <= curr[1]+jump <= self.max_initial_size+1: - candidate.append([next_id,curr[1]+jump]) - return sorted(result); - - def input(self, fwords, meta): - '''When this function is called on the RuleFactory, - it looks up all of the rules that can be used to translate - the input sentence''' - cdef int i, j, k, flen, arity, num_subpatterns, num_samples - cdef float start_time - cdef PhraseLocation phrase_location - cdef cintlist.CIntList sample, chunklen - cdef Matching matching - cdef rule.Phrase hiero_phrase - - #fwords = [ ((1,0.0,1),), fwords1 ] #word id for <s> = 1, cost = 0.0, next = 1 - #print fwords - flen = len(fwords) - #print "length = %i" % flen - start_time = monitor.cpu() - self.intersect_time = 0.0 - self.extract_time = 0.0 - nodes_isteps_away_buffer = {} - hit = 0 - reachable_buffer = {} - #print "id = ",meta - #print "rule_file = ",self.rule_file - dattrs = sgml.attrs_to_dict(meta) - id = dattrs.get('id', 'NOID') - if self.per_sentence_grammar: - self.rule_filehandler = open(self.rule_file+'.'+id, 'w') - self.excluded_sent_id = int(dattrs.get('exclude', '-1')) - - #print "max_initial_size = %i" % self.max_initial_size - - if not self.cacheBetweenSents: - self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) - self.grammar.root = [None, {}] - - frontier = [] - for i in xrange(len(fwords)): - for alt in xrange(0, len(fwords[i])): - if fwords[i][alt][0] != cn.epsilon: - frontier.append((i, i, alt, 0, self.rules.root, (), False)) - - xroot = None - x1 = sym.setindex(self.category, 1) - if x1 in self.rules.root.children: - xroot = self.rules.root.children[x1] - else: - xroot = ExtendedTrieNode(suffix_link=self.rules.root, 
phrase_location=PhraseLocation()) - self.rules.root.children[x1] = xroot - - for i in xrange(self.min_gap_size, len(fwords)): - for alt in xrange(0, len(fwords[i])): - if fwords[i][alt][0] != cn.epsilon: - frontier.append((i-self.min_gap_size, i, alt, self.min_gap_size, xroot, (x1,), True)) - '''for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: - if len(prefix)>0: - print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path - else: - print k, i, alt, pathlen, node, prefix, is_shadow_path''' - - #for wid in xrange(1000): - # print "%i = %s" % (wid, sym.tostring(wid)) - next_states = [] - for i in xrange(len(fwords)): - next_states.append(self.get_next_states(fwords,i,self.min_gap_size)) - #print "next state of %i" % i - #print next_states[i] - - while len(frontier) > 0: - #print "frontier = %i" % len(frontier) - this_iter_intersect_time = self.intersect_time - new_frontier = [] - for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: - #print "looking at: " - #if len(prefix)>0: - # print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path - #else: - # print k, i, alt, pathlen, node, prefix, is_shadow_path - word_id = fwords[i][alt][0] - spanlen = fwords[i][alt][2] - #print "word_id = %i, %s" % (word_id, sym.tostring(word_id)) - # to prevent .. [X] </S> - #print "prefix = ",prefix - #if word_id == 2 and len(prefix)>=2: - #print "at the end: %s" % (prefix[len(prefix)-1]) - #if prefix[len(prefix)-1]<0: - #print "break" - #continue - #print "continuing" - #if pathlen + spanlen > self.max_initial_size: - #continue - # TODO get rid of k -- pathlen is replacing it - if word_id == cn.epsilon: - #print "skipping because word_id is epsilon" - if i+spanlen >= len(fwords): - continue - for nualt in xrange(0,len(fwords[i+spanlen])): - frontier.append((k, i+spanlen, nualt, pathlen, node, prefix, is_shadow_path)) - continue - - phrase = prefix + (word_id,) - str_phrase = map(sym.tostring, phrase) - hiero_phrase = rule.Phrase(phrase) - arity = hiero_phrase.arity() - - #print "pos %2d, node %5d, '%s'" % (i, node.id, hiero_phrase) - if self.search_file is not None: - self.search_file.write("%s\n" % hiero_phrase) - - lookup_required = False - if word_id in node.children: - if node.children[word_id] is None: - #print "Path dead-ends at this node\n" - continue - else: - #print "Path continues at this node\n" - node = node.children[word_id] - else: - if node.suffix_link is None: - #print "Current node is root; lookup required\n" - lookup_required = True - else: - if word_id in node.suffix_link.children: - if node.suffix_link.children[word_id] is None: - #print "Suffix link reports path is dead end\n" - node.children[word_id] = None - continue - else: - #print "Suffix link indicates lookup is reqired\n" - lookup_required = True - else: - #print "ERROR: We never get here\n" - raise Exception("Keyword trie error") - #new_frontier.append((k, i, alt, pathlen, node, prefix, is_shadow_path)) - #print "checking whether lookup_required\n" - if lookup_required: - new_node = None - if is_shadow_path: - #print "Extending shadow path for %s \n" - # on the shadow path we don't do any search, we just use info from suffix link - new_node = ExtendedTrieNode(phrase_location=node.suffix_link.children[word_id].phrase_location, - suffix_link=node.suffix_link.children[word_id], - phrase=hiero_phrase) - else: - if arity > 0: - #print "Intersecting for %s because of arity > 0\n" % hiero_phrase - phrase_location = self.intersect(node, node.suffix_link.children[word_id], 
hiero_phrase) - else: - #print "Suffix array search for %s" % hiero_phrase - phrase_location = node.phrase_location - sa_range = self.context_manager.fsarray.lookup(str_phrase[-1], len(str_phrase)-1, phrase_location.sa_low, phrase_location.sa_high) - if sa_range is not None: - phrase_location = PhraseLocation(sa_low=sa_range[0], sa_high=sa_range[1]) - else: - phrase_location = None - - if phrase_location is None: - node.children[word_id] = None - #print "Search failed\n" - continue - #print "Search succeeded\n" - suffix_link = self.rules.root - if node.suffix_link is not None: - suffix_link = node.suffix_link.children[word_id] - new_node = ExtendedTrieNode(phrase_location=phrase_location, - suffix_link=suffix_link, - phrase=hiero_phrase) - node.children[word_id] = new_node - node = new_node - #print "Added node %d with suffix link %d\n" % (node.id, node.suffix_link.id) - - '''Automatically add a trailing X node, if allowed -- - This should happen before we get to extraction (so that - the node will exist if needed)''' - if arity < self.max_nonterminals: - xcat_index = arity+1 - xcat = sym.setindex(self.category, xcat_index) - suffix_link_xcat_index = xcat_index - if is_shadow_path: - suffix_link_xcat_index = xcat_index-1 - suffix_link_xcat = sym.setindex(self.category, suffix_link_xcat_index) - node.children[xcat] = ExtendedTrieNode(phrase_location=node.phrase_location, - suffix_link=node.suffix_link.children[suffix_link_xcat], - phrase= rule.Phrase(phrase + (xcat,))) - #log.writeln("Added node %d with suffix link %d (for X)" % (node.children[xcat].id, node.children[xcat].suffix_link.id), 4) - - # sample from range - if not is_shadow_path: - #print "is_not_shadow_path" - sample = self.context_manager.sampler.sample(node.phrase_location) - #print "node.phrase_location %s" % str(node.phrase_location) - #print "sample.len = %i" % len(sample) - num_subpatterns = (<PhraseLocation> node.phrase_location).num_subpatterns - chunklen = cintlist.CIntList(initial_len=num_subpatterns) - for j from 0 <= j < num_subpatterns: - chunklen.arr[j] = hiero_phrase.chunklen(j) - extracts = [] - j = 0 - extract_start = monitor.cpu() - '''orig_tight_phrases = self.tight_phrases - orig_require_aligned_terminal = self.require_aligned_terminal - orig_require_aligned_chunks = self.require_aligned_chunks - if k==0 or i==len(fwords)-1: - self.tight_phrases = 0 - self.require_aligned_terminal = 0 - self.require_aligned_chunks = 0''' - while j < sample.len: - extract = [] - - assign_matching(&matching, sample.arr, j, num_subpatterns, self.fda.sent_id.arr) - '''print "tight_phrase = " - print self.tight_phrases - print "require_aligned_terminal = " - print self.require_aligned_terminal - print "require_aligned_chunks = " - print self.require_aligned_chunks''' - - extract = self.extract(hiero_phrase, &matching, chunklen.arr, num_subpatterns) - extracts.extend(extract) - j = j + num_subpatterns - '''self.tight_phrases = orig_tight_phrases - sttice+sa.nw.normelf.require_aligned_terminal = orig_require_aligned_terminal - self.require_aligned_chunks = orig_require_aligned_chunks''' - num_samples = sample.len/num_subpatterns - extract_stop = monitor.cpu() - self.extract_time = self.extract_time + extract_stop - extract_start - #print "extract.size = %i" % len(extracts) - if len(extracts) > 0: - fphrases = {} - fals = {} - fcount = {} - for f, e, count, als in extracts: - fcount.setdefault(f, 0.0) - fcount[f] = fcount[f] + count - fphrases.setdefault(f, {}) - fphrases[f].setdefault(e, {}) - #fphrases[f][e] = fphrases[f][e] + 
count - fphrases[f][e].setdefault(als,0.0) - fphrases[f][e][als] = fphrases[f][e][als] + count - #print "f,e,als ",f," : ",e," : ",als," count = ",fphrases[f][e][als] - #fals[str(f)+" ||| "+str(e)] = als - for f, elist in fphrases.iteritems(): - #print "f = '%s'" % f - #if (str(f) in ['<s>','</s>','<s> [X,1]','[X,1] </s>']): - # print "rejected" - # continue - f_margin = fcount[f] - for e, alslist in elist.iteritems(): - alignment = None - count = 0 - for als, currcount in alslist.iteritems(): - #print "als = ",als,", count = ",currcount - if currcount > count: - alignment = als - count = currcount - #alignment = fals[str(f)+" ||| "+str(e)] - #print "selected = ",alignment," with count = ",count - scores = [] - for m in self.context_manager.models: - scores.append(m.compute_contextless_score(f, e, count, fcount[f], num_samples)) - r = rule.Rule(self.category, f, e, scores=scores, owner="context", word_alignments = alignment) - self.grammar.add(r) - if self.rule_filehandler is not None: - self.rule_filehandler.write("%s\n" % r.to_line()) - #print "adding a rule = %s" % r - - #if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+spanlen < self.max_initial_size: - if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+1 <= self.max_initial_size: - #to prevent [X] </s> - #print "lexicalized" - for alt_id in xrange(len(fwords[i+spanlen])): - #if (fwords[i+spanlen][alt_id][2]+pathlen+spanlen <= self.max_initial_size): - #new_frontier.append((k, i+spanlen, alt_id, pathlen + spanlen, node, phrase, is_shadow_path)) - #print "alt_id = %d\n" % alt_id - new_frontier.append((k, i+spanlen, alt_id, pathlen + 1, node, phrase, is_shadow_path)) - #print (k, i+spanlen, alt_id, pathlen + spanlen, node, map(sym.tostring,phrase), is_shadow_path) - #print "end lexicalized" - num_subpatterns = arity - if not is_shadow_path: - num_subpatterns = num_subpatterns + 1 - #to avoid <s> X ... 
we want <s> next to a lexicalized item - #if k>0 and i<len(fwords)-1 and len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks: - if len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks: - #print "masuk kondisi" - xcat = sym.setindex(self.category, arity+1) - xnode = node.children[xcat] - #frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, spanlen, pathlen, fwords, next_states) - # I put spanlen=1 below - key = tuple([self.min_gap_size, i, 1, pathlen]) - frontier_nodes = [] - if (key in nodes_isteps_away_buffer): - frontier_nodes = nodes_isteps_away_buffer[key] - else: - frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, 1, pathlen, fwords, next_states, reachable_buffer) - nodes_isteps_away_buffer[key] = frontier_nodes - - #print "new frontier:\n" - for (i, alt, pathlen) in frontier_nodes: - #if (pathlen+fwords[i][alt][2] <= self.max_initial_size): - new_frontier.append((k, i, alt, pathlen, xnode, phrase +(xcat,), is_shadow_path)) - #print k, i, alt, pathlen, node, map(sym.tostring,phrase +(xcat,)), is_shadow_path - #print "all end\n"; - #else: - #print "no new frontier1\n"; - #else : - #print "no new frontier2\n" - if self.log_int_stats: - log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time)) - frontier = new_frontier - - stop_time = monitor.cpu() - log.writeln("Total time for rule lookup, extraction, and scoring = %f seconds" % (stop_time - start_time)) - #log.writeln(" Intersect time = %f seconds" % self.intersect_time) - gc.collect() - log.writeln(" Extract time = %f seconds" % self.extract_time) - if self.pruned_rule_file: - self.grammar.dump(self.pruned_rule_file) - if self.per_sentence_grammar: - self.rule_filehandler.close() - -# else: -# self.rule_filehandler.write("###EOS_"+ id +"\n") - - - cdef int find_fixpoint(self, - int f_low, f_high, - int* f_links_low, int* f_links_high, - int* e_links_low, int* e_links_high, - int e_in_low, int e_in_high, - int* e_low, int* e_high, - int* f_back_low, int* f_back_high, - int f_sent_len, int e_sent_len, - int max_f_len, int max_e_len, - int min_fx_size, int min_ex_size, - int max_new_x, - int allow_low_x, int allow_high_x, - int allow_arbitrary_x, int write_log): - cdef int e_low_prev, e_high_prev, f_low_prev, f_high_prev, new_x, new_low_x, new_high_x - - e_low[0] = e_in_low - e_high[0] = e_in_high - self.find_projection(f_low, f_high, f_links_low, f_links_high, e_low, e_high) - if e_low[0] == -1: - # low-priority corner case: if phrase w is unaligned, - # but we don't require aligned terminals, then returning - # an error here might prevent extraction of allowed - # rule X -> X_1 w X_2 / X_1 X_2. This is probably - # not worth the bother, though. 
- #print "find_fixpoint0" - return 0 - elif e_in_low != -1 and e_low[0] != e_in_low: - if e_in_low - e_low[0] < min_ex_size: - e_low[0] = e_in_low - min_ex_size - if e_low[0] < 0: - #print "find_fixpoint1" - return 0 - - if e_high[0] - e_low[0] > max_e_len: - #print "find_fixpoint2" - return 0 - elif e_in_high != -1 and e_high[0] != e_in_high: - if e_high[0] - e_in_high < min_ex_size: - e_high[0] = e_in_high + min_ex_size - if e_high[0] > e_sent_len: - #print "find_fixpoint3" - return 0 - - f_back_low[0] = -1 - f_back_high[0] = -1 - f_low_prev = f_low - f_high_prev = f_high - new_x = 0 - new_low_x = 0 - new_high_x = 0 - - while True: - - if f_back_low[0] == -1: - self.find_projection(e_low[0], e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) - else: - self.find_projection(e_low[0], e_low_prev, e_links_low, e_links_high, f_back_low, f_back_high) - self.find_projection(e_high_prev, e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) - - if f_back_low[0] > f_low: - f_back_low[0] = f_low - - if f_back_high[0] < f_high: - f_back_high[0] = f_high - - if f_back_low[0] == f_low_prev and f_back_high[0] == f_high_prev: - return 1 - - if allow_low_x == 0 and f_back_low[0] < f_low: -# log.writeln(" FAIL: f phrase is not tight") - #print " FAIL: f phrase is not tight" - return 0 - - if f_back_high[0] - f_back_low[0] > max_f_len: -# log.writeln(" FAIL: f back projection is too wide") - #print " FAIL: f back projection is too wide" - return 0 - - if allow_high_x == 0 and f_back_high[0] > f_high: -# log.writeln(" FAIL: extension on high side not allowed") - #print " FAIL: extension on high side not allowed" - return 0 - - if f_low != f_back_low[0]: - if new_low_x == 0: - if new_x >= max_new_x: -# log.writeln(" FAIL: extension required on low side violates max # of gaps") - #print " FAIL: extension required on low side violates max # of gaps" - return 0 - else: - new_x = new_x + 1 - new_low_x = 1 - if f_low - f_back_low[0] < min_fx_size: - f_back_low[0] = f_low - min_fx_size - if f_back_high[0] - f_back_low[0] > max_f_len: -# log.writeln(" FAIL: extension required on low side violates max initial length") - #print " FAIL: extension required on low side violates max initial length" - return 0 - if f_back_low[0] < 0: -# log.writeln(" FAIL: extension required on low side violates sentence boundary") - #print " FAIL: extension required on low side violates sentence boundary" - return 0 - - if f_high != f_back_high[0]: - if new_high_x == 0: - if new_x >= max_new_x: -# log.writeln(" FAIL: extension required on high side violates max # of gaps") - #print " FAIL: extension required on high side violates max # of gaps" - return 0 - else: - new_x = new_x + 1 - new_high_x = 1 - if f_back_high[0] - f_high < min_fx_size: - f_back_high[0] = f_high + min_fx_size - if f_back_high[0] - f_back_low[0] > max_f_len: -# log.writeln(" FAIL: extension required on high side violates max initial length") - #print " FAIL: extension required on high side violates max initial length" - return 0 - if f_back_high[0] > f_sent_len: -# log.writeln(" FAIL: extension required on high side violates sentence boundary") - #print " FAIL: extension required on high side violates sentence boundary" - return 0 - - e_low_prev = e_low[0] - e_high_prev = e_high[0] - - self.find_projection(f_back_low[0], f_low_prev, f_links_low, f_links_high, e_low, e_high) - self.find_projection(f_high_prev, f_back_high[0], f_links_low, f_links_high, e_low, e_high) - if e_low[0] == e_low_prev and e_high[0] == e_high_prev: - return 1 - if 
allow_arbitrary_x == 0: -# log.writeln(" FAIL: arbitrary expansion not permitted") - #print " FAIL: arbitrary expansion not permitted" - return 0 - if e_high[0] - e_low[0] > max_e_len: -# log.writeln(" FAIL: re-projection violates sentence max phrase length") - #print " FAIL: re-projection violates sentence max phrase length" - return 0 - f_low_prev = f_back_low[0] - f_high_prev = f_back_high[0] - - - cdef find_projection(self, int in_low, int in_high, int* in_links_low, int* in_links_high, - int* out_low, int* out_high): - cdef int i - for i from in_low <= i < in_high: - if in_links_low[i] != -1: - if out_low[0] == -1 or in_links_low[i] < out_low[0]: - out_low[0] = in_links_low[i] - if out_high[0] == -1 or in_links_high[i] > out_high[0]: - out_high[0] = in_links_high[i] - - - cdef int* int_arr_extend(self, int* arr, int* arr_len, int* data, int data_len): - cdef int new_len - new_len = arr_len[0] + data_len - arr = <int*> realloc(arr, new_len*sizeof(int)) - memcpy(arr+arr_len[0], data, data_len*sizeof(int)) - arr_len[0] = new_len - return arr - - - cdef extract_phrases(self, int e_low, int e_high, int* e_gap_low, int* e_gap_high, int* e_links_low, int num_gaps, - int f_low, int f_high, int* f_gap_low, int* f_gap_high, int* f_links_low, - int sent_id, int e_sent_len, int e_sent_start): - cdef int i, j, k, m, n, *e_gap_order, e_x_low, e_x_high, e_x_gap_low, e_x_gap_high - cdef int *e_gaps1, *e_gaps2, len1, len2, step, num_chunks - cdef cintlist.CIntList ephr_arr - cdef result - - #print "inside extract_phrases" - #print "f_low=%d, f_high=%d" % (f_low,f_high) - result = [] - len1 = 0 - e_gaps1 = <int*> malloc(0) - ephr_arr = cintlist.CIntList() - - e_gap_order = <int*> malloc(num_gaps*sizeof(int)) - if num_gaps > 0: - e_gap_order[0] = 0 - for i from 1 <= i < num_gaps: - for j from 0 <= j < i: - if e_gap_low[i] < e_gap_low[j]: - for k from j <= k < i: - e_gap_order[k+1] = e_gap_order[k] - e_gap_order[j] = i - break - else: - e_gap_order[i] = i - - e_x_low = e_low - e_x_high = e_high - if self.tight_phrases == 0: - while e_x_low > 0 and e_high - e_x_low < self.train_max_initial_size and e_links_low[e_x_low-1] == -1: - e_x_low = e_x_low - 1 - while e_x_high < e_sent_len and e_x_high - e_low < self.train_max_initial_size and e_links_low[e_x_high] == -1: - e_x_high = e_x_high + 1 - - for i from e_x_low <= i <= e_low: - e_gaps1 = self.int_arr_extend(e_gaps1, &len1, &i, 1) - - for i from 0 <= i < num_gaps: - e_gaps2 = <int*> malloc(0) - len2 = 0 - - j = e_gap_order[i] - e_x_gap_low = e_gap_low[j] - e_x_gap_high = e_gap_high[j] - if self.tight_phrases == 0: - while e_x_gap_low > e_x_low and e_links_low[e_x_gap_low-1] == -1: - e_x_gap_low = e_x_gap_low - 1 - while e_x_gap_high < e_x_high and e_links_low[e_x_gap_high] == -1: - e_x_gap_high = e_x_gap_high + 1 - - k = 0 - step = 1+(i*2) - while k < len1: - for m from e_x_gap_low <= m <= e_gap_low[j]: - if m >= e_gaps1[k+step-1]: - for n from e_gap_high[j] <= n <= e_x_gap_high: - if n-m >= 1: # extractor.py doesn't restrict target-side gap length - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+k, step) - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &m, 1) - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &n, 1) - k = k + step - free(e_gaps1) - e_gaps1 = e_gaps2 - len1 = len2 - - step = 1+(num_gaps*2) - e_gaps2 = <int*> malloc(0) - len2 = 0 - for i from e_high <= i <= e_x_high: - j = 0 - while j < len1: - if i - e_gaps1[j] <= self.train_max_initial_size and i >= e_gaps1[j+step-1]: - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+j, step) - 
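# --- Editor's note: illustrative sketch, not part of the original file. -----
# When tight_phrases is off, extract_phrases lets every target boundary slide
# across adjacent unaligned words, so one source phrase can yield several
# candidate target phrases.  A hypothetical helper `loosened_starts`,
# mirroring the e_x_low loop above (e_links_low[i] == -1 marks an unaligned
# word):
def loosened_starts(e_low, e_high, e_links_low, max_initial_size):
    # every start position reachable from e_low by stepping left over
    # unaligned words, subject to the maximum initial phrase width
    e_x_low = e_low
    while (e_x_low > 0
           and e_high - e_x_low < max_initial_size
           and e_links_low[e_x_low - 1] == -1):
        e_x_low -= 1
    return list(range(e_x_low, e_low + 1))
# The same sliding is applied to the right edge and to both edges of each gap;
# the surrounding enumeration takes the Cartesian product of the loosened
# boundaries to build the list of candidate target phrases.
# ---------------------------------------------------------------------------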
e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &i, 1) - j = j + step - free(e_gaps1) - e_gaps1 = e_gaps2 - len1 = len2 - - step = (num_gaps+1)*2 - i = 0 - - while i < len1: - ephr_arr._clear() - num_chunks = 0 - indexes = [] - for j from 0 <= j < num_gaps+1: - if e_gaps1[i+2*j] < e_gaps1[i+(2*j)+1]: - num_chunks = num_chunks + 1 - for k from e_gaps1[i+2*j] <= k < e_gaps1[i+(2*j)+1]: - indexes.append(k) - ephr_arr._append(self.eid2symid[self.eda.data.arr[e_sent_start+k]]) - if j < num_gaps: - indexes.append(sym.setindex(self.category, e_gap_order[j]+1)) - ephr_arr._append(sym.setindex(self.category, e_gap_order[j]+1)) - i = i + step - if ephr_arr.len <= self.max_target_length and num_chunks <= self.max_target_chunks: - result.append((rule.Phrase(ephr_arr),indexes)) - - free(e_gaps1) - free(e_gap_order) - return result - - cdef create_alignments(self, int* sent_links, int num_links, findexes, eindexes): - #print "create_alignments" - #s = "sent_links = " - #i = 0 - #while (i < num_links*2): - # s = s+"%d-%d " % (sent_links[i],sent_links[i+1]) - # i += 2 - #print s - #print findexes - #print eindexes - - ret = cintlist.CIntList() - for i in xrange(len(findexes)): - s = findexes[i] - if (s<0): - continue - idx = 0 - while (idx < num_links*2): - if (sent_links[idx] == s): - j = eindexes.index(sent_links[idx+1]) - ret.append(i*65536+j) - idx += 2 - return ret - - cdef extract(self, rule.Phrase phrase, Matching* matching, int* chunklen, int num_chunks): - cdef int* sent_links, *e_links_low, *e_links_high, *f_links_low, *f_links_high - cdef int *f_gap_low, *f_gap_high, *e_gap_low, *e_gap_high, num_gaps, gap_start - cdef int i, j, k, e_i, f_i, num_links, num_aligned_chunks, met_constraints - cdef int f_low, f_high, e_low, e_high, f_back_low, f_back_high - cdef int e_sent_start, e_sent_end, f_sent_start, f_sent_end, e_sent_len, f_sent_len - cdef int e_word_count, f_x_low, f_x_high, e_x_low, e_x_high, phrase_len - cdef float pair_count - cdef float available_mass - cdef extracts, phrase_list - cdef cintlist.CIntList fphr_arr - cdef rule.Phrase fphr - cdef reason_for_failure - - fphr_arr = cintlist.CIntList() - phrase_len = phrase.n - extracts = [] - sent_links = self.alignment._get_sent_links(matching.sent_id, &num_links) - - e_sent_start = self.eda.sent_index.arr[matching.sent_id] - e_sent_end = self.eda.sent_index.arr[matching.sent_id+1] - e_sent_len = e_sent_end - e_sent_start - 1 - f_sent_start = self.fda.sent_index.arr[matching.sent_id] - f_sent_end = self.fda.sent_index.arr[matching.sent_id+1] - f_sent_len = f_sent_end - f_sent_start - 1 - available_mass = 1.0 - if matching.sent_id == self.excluded_sent_id: - available_mass = 0.0 - - self.findexes1.reset() - sofar = 0 - for i in xrange(num_chunks): - for j in xrange(chunklen[i]): - self.findexes1.append(matching.arr[matching.start+i]+j-f_sent_start); - sofar += 1 - if (i+1<num_chunks): - self.findexes1.append(phrase[sofar]) - sofar += 1 - - - e_links_low = <int*> malloc(e_sent_len*sizeof(int)) - e_links_high = <int*> malloc(e_sent_len*sizeof(int)) - f_links_low = <int*> malloc(f_sent_len*sizeof(int)) - f_links_high = <int*> malloc(f_sent_len*sizeof(int)) - f_gap_low = <int*> malloc((num_chunks+1)*sizeof(int)) - f_gap_high = <int*> malloc((num_chunks+1)*sizeof(int)) - e_gap_low = <int*> malloc((num_chunks+1)*sizeof(int)) - e_gap_high = <int*> malloc((num_chunks+1)*sizeof(int)) - memset(f_gap_low, 0, (num_chunks+1)*sizeof(int)) - memset(f_gap_high, 0, (num_chunks+1)*sizeof(int)) - memset(e_gap_low, 0, (num_chunks+1)*sizeof(int)) - 
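# --- Editor's note: illustrative sketch, not part of the original file. -----
# create_alignments (defined above) stores each word-alignment point as a
# single integer, i*65536 + j: the source word index in the high 16 bits and
# the target word index in the low 16 bits.  Hypothetical pack/unpack helpers:
def pack_alignment(i, j):
    return i * 65536 + j      # equivalently (i << 16) | j for 0 <= j < 65536

def unpack_alignment(packed):
    return packed >> 16, packed & 0xFFFF

# e.g. pack_alignment(2, 5) == 131077 and unpack_alignment(131077) == (2, 5)
# ---------------------------------------------------------------------------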
memset(e_gap_high, 0, (num_chunks+1)*sizeof(int)) - - reason_for_failure = "" - - for i from 0 <= i < e_sent_len: - e_links_low[i] = -1 - e_links_high[i] = -1 - for i from 0 <= i < f_sent_len: - f_links_low[i] = -1 - f_links_high[i] = -1 - - # this is really inefficient -- might be good to - # somehow replace with binary search just for the f - # links that we care about (but then how to look up - # when we want to check something on the e side?) - i = 0 - while i < num_links*2: - f_i = sent_links[i] - e_i = sent_links[i+1] - if f_links_low[f_i] == -1 or f_links_low[f_i] > e_i: - f_links_low[f_i] = e_i - if f_links_high[f_i] == -1 or f_links_high[f_i] < e_i + 1: - f_links_high[f_i] = e_i + 1 - if e_links_low[e_i] == -1 or e_links_low[e_i] > f_i: - e_links_low[e_i] = f_i - if e_links_high[e_i] == -1 or e_links_high[e_i] < f_i + 1: - e_links_high[e_i] = f_i + 1 - i = i + 2 - - als = [] - for x in xrange(matching.start,matching.end): - al = (matching.arr[x]-f_sent_start,f_links_low[matching.arr[x]-f_sent_start]) - als.append(al) - # check all source-side alignment constraints - met_constraints = 1 - if self.require_aligned_terminal: - num_aligned_chunks = 0 - for i from 0 <= i < num_chunks: - for j from 0 <= j < chunklen[i]: - if f_links_low[matching.arr[matching.start+i]+j-f_sent_start] > -1: - num_aligned_chunks = num_aligned_chunks + 1 - break - if num_aligned_chunks == 0: - reason_for_failure = "No aligned terminals" - met_constraints = 0 - if self.require_aligned_chunks and num_aligned_chunks < num_chunks: - reason_for_failure = "Unaligned chunk" - met_constraints = 0 - - if met_constraints and self.tight_phrases: - # outside edge constraints are checked later - for i from 0 <= i < num_chunks-1: - if f_links_low[matching.arr[matching.start+i]+chunklen[i]-f_sent_start] == -1: - reason_for_failure = "Gaps are not tight phrases" - met_constraints = 0 - break - if f_links_low[matching.arr[matching.start+i+1]-1-f_sent_start] == -1: - reason_for_failure = "Gaps are not tight phrases" - met_constraints = 0 - break - - f_low = matching.arr[matching.start] - f_sent_start - f_high = matching.arr[matching.start + matching.size - 1] + chunklen[num_chunks-1] - f_sent_start - if met_constraints: - - if self.find_fixpoint(f_low, f_high, f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, &e_low, &e_high, &f_back_low, &f_back_high, f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - self.train_min_gap_size, 0, - self.max_nonterminals - num_chunks + 1, 1, 1, 0, 0): - gap_error = 0 - num_gaps = 0 - - if f_back_low < f_low: - f_gap_low[0] = f_back_low - f_gap_high[0] = f_low - num_gaps = 1 - gap_start = 0 - phrase_len = phrase_len+1 - if phrase_len > self.max_length: - gap_error = 1 - if self.tight_phrases: - if f_links_low[f_back_low] == -1 or f_links_low[f_low-1] == -1: - gap_error = 1 - reason_for_failure = "Inside edges of preceding subphrase are not tight" - else: - gap_start = 1 - if self.tight_phrases and f_links_low[f_low] == -1: - # this is not a hard error. 
we can't extract this phrase - # but we still might be able to extract a superphrase - met_constraints = 0 - - for i from 0 <= i < matching.size - 1: - f_gap_low[1+i] = matching.arr[matching.start+i] + chunklen[i] - f_sent_start - f_gap_high[1+i] = matching.arr[matching.start+i+1] - f_sent_start - num_gaps = num_gaps + 1 - - if f_high < f_back_high: - f_gap_low[gap_start+num_gaps] = f_high - f_gap_high[gap_start+num_gaps] = f_back_high - num_gaps = num_gaps + 1 - phrase_len = phrase_len+1 - if phrase_len > self.max_length: - gap_error = 1 - if self.tight_phrases: - if f_links_low[f_back_high-1] == -1 or f_links_low[f_high] == -1: - gap_error = 1 - reason_for_failure = "Inside edges of following subphrase are not tight" - else: - if self.tight_phrases and f_links_low[f_high-1] == -1: - met_constraints = 0 - - if gap_error == 0: - e_word_count = e_high - e_low - for i from 0 <= i < num_gaps: # check integrity of subphrases - if self.find_fixpoint(f_gap_low[gap_start+i], f_gap_high[gap_start+i], - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low+gap_start+i, e_gap_high+gap_start+i, - f_gap_low+gap_start+i, f_gap_high+gap_start+i, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0) == 0: - gap_error = 1 - reason_for_failure = "Subphrase [%d, %d] failed integrity check" % (f_gap_low[gap_start+i], f_gap_high[gap_start+i]) - break - - if gap_error == 0: - i = 1 - self.findexes.reset() - if f_back_low < f_low: - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.append(sym.setindex(self.category, i)) - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - if f_back_high > f_high: - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - - fphr = rule.Phrase(fphr_arr) - if met_constraints: - phrase_list = self.extract_phrases(e_low, e_high, e_gap_low + gap_start, e_gap_high + gap_start, e_links_low, num_gaps, - f_back_low, f_back_high, f_gap_low + gap_start, f_gap_high + gap_start, f_links_low, - matching.sent_id, e_sent_len, e_sent_start) - #print "e_low=%d, e_high=%d, gap_start=%d, num_gaps=%d, f_back_low=%d, f_back_high=%d" & (e_low, e_high, gap_start, num_gaps, f_back_low, f_back_high) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - reason_for_failure = "Didn't extract anything from [%d, %d] -> [%d, %d]" % (f_back_low, f_back_high, e_low, e_high) - for (phrase2,eindexes) in phrase_list: - als1 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als1))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high)) - #print "extract_phrases1: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high) - - if (num_gaps < self.max_nonterminals and - phrase_len < self.max_length and - f_back_high - f_back_low + self.train_min_gap_size <= self.train_max_initial_size): - if (f_back_low == f_low and - f_low >= self.train_min_gap_size and - ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_back_high-1] != -1))): - f_x_low = 
f_low-self.train_min_gap_size - met_constraints = 1 - if self.tight_phrases: - while f_x_low >= 0 and f_links_low[f_x_low] == -1: - f_x_low = f_x_low - 1 - if f_x_low < 0 or f_back_high - f_x_low > self.train_max_initial_size: - met_constraints = 0 - - if (met_constraints and - self.find_fixpoint(f_x_low, f_back_high, - f_links_low, f_links_high, e_links_low, e_links_high, - e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 1, 1, 1, 1, 0, 1, 0) and - ((not self.tight_phrases) or f_links_low[f_x_low] != -1) and - self.find_fixpoint(f_x_low, f_low, # check integrity of new subphrase - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0)): - fphr_arr._clear() - i = 1 - self.findexes.reset() - self.findexes.append(sym.setindex(self.category, i)) - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - if f_back_high > f_high: - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - fphr = rule.Phrase(fphr_arr) - phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+1, - f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, matching.sent_id, - e_sent_len, e_sent_start) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - for phrase2,eindexes in phrase_list: - als2 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als2))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) - #print "extract_phrases2: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) - - if (f_back_high == f_high and - f_sent_len - f_high >= self.train_min_gap_size and - ((not self.tight_phrases) or (f_links_low[f_high] != -1 and f_links_low[f_back_low] != -1))): - f_x_high = f_high+self.train_min_gap_size - met_constraints = 1 - if self.tight_phrases: - while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: - f_x_high = f_x_high + 1 - if f_x_high > f_sent_len or f_x_high - f_back_low > self.train_max_initial_size: - met_constraints = 0 - - if (met_constraints and - self.find_fixpoint(f_back_low, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 1, 1, 1, 0, 1, 1, 0) and - ((not self.tight_phrases) or f_links_low[f_x_high-1] != -1) and - self.find_fixpoint(f_high, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low+gap_start+num_gaps, e_gap_high+gap_start+num_gaps, - f_gap_low+gap_start+num_gaps, f_gap_high+gap_start+num_gaps, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0)): - fphr_arr._clear() - i = 1 - self.findexes.reset() - if f_back_low < f_low: - 
fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.append(sym.setindex(self.category, i)) - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - fphr = rule.Phrase(fphr_arr) - phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low+gap_start, e_gap_high+gap_start, e_links_low, num_gaps+1, - f_x_low, f_x_high, f_gap_low+gap_start, f_gap_high+gap_start, f_links_low, - matching.sent_id, e_sent_len, e_sent_start) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - for phrase2, eindexes in phrase_list: - als3 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als3))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) - #print "extract_phrases3: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) - if (num_gaps < self.max_nonterminals - 1 and - phrase_len+1 < self.max_length and - f_back_high == f_high and - f_back_low == f_low and - f_back_high - f_back_low + (2*self.train_min_gap_size) <= self.train_max_initial_size and - f_low >= self.train_min_gap_size and - f_high <= f_sent_len - self.train_min_gap_size and - ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_high] != -1))): - - met_constraints = 1 - f_x_low = f_low-self.train_min_gap_size - if self.tight_phrases: - while f_x_low >= 0 and f_links_low[f_x_low] == -1: - f_x_low = f_x_low - 1 - if f_x_low < 0: - met_constraints = 0 - - f_x_high = f_high+self.train_min_gap_size - if self.tight_phrases: - while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: - f_x_high = f_x_high + 1 - if f_x_high > f_sent_len or f_x_high - f_x_low > self.train_max_initial_size: - met_constraints = 0 - - if (met_constraints and - self.find_fixpoint(f_x_low, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 1, 1, 2, 1, 1, 1, 1) and - ((not self.tight_phrases) or (f_links_low[f_x_low] != -1 and f_links_low[f_x_high-1] != -1)) and - self.find_fixpoint(f_x_low, f_low, - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0) and - self.find_fixpoint(f_high, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low+1+num_gaps, e_gap_high+1+num_gaps, - f_gap_low+1+num_gaps, f_gap_high+1+num_gaps, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0)): - fphr_arr._clear() - i = 1 - self.findexes.reset() - self.findexes.append(sym.setindex(self.category, i)) - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - 
fphr_arr._append(phrase.syms[j]) - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - fphr = rule.Phrase(fphr_arr) - phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+2, - f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, - matching.sent_id, e_sent_len, e_sent_start) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - for phrase2, eindexes in phrase_list: - als4 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als4))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) - #print "extract_phrases4 %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) - else: - reason_for_failure = "Unable to extract basic phrase" - - free(sent_links) - free(f_links_low) - free(f_links_high) - free(e_links_low) - free(e_links_high) - free(f_gap_low) - free(f_gap_high) - free(e_gap_low) - free(e_gap_high) - - if self.sample_file is not None: - self.sample_file.write("%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure)) - - #print "%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure) - - - return extracts - |
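Editor's note: the extraction code above spreads each sampled occurrence evenly over its candidate target phrases (pair_count = available_mass / len(phrase_list)), and the scoring pass keeps only the most frequent word alignment for each (source, target) pair before calling the models' compute_contextless_score. A minimal Python sketch of that bookkeeping, using hypothetical helper names (accumulate, best_alignment) and leaving the project's model-scoring interface untouched:

def accumulate(fphrases, f, e, alignment, count):
    # fphrases[f][e][alignment] += count, as in the extraction pass above
    fphrases.setdefault(f, {}).setdefault(e, {}).setdefault(alignment, 0.0)
    fphrases[f][e][alignment] += count

def best_alignment(alignment_counts):
    # keep the alignment with the highest accumulated count for this (f, e) pair
    return max(alignment_counts.items(), key=lambda kv: kv[1])

For example, if one sample extracts three candidate target phrases with available_mass 1.0, each (f, e, alignment) triple receives a third of a count, while a pair that is the only extraction elsewhere adds a full 1.0, so its alignment wins the max() above.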