From 37bb4a4c537ea2f9bd916f3e031c759e61bf49f7 Mon Sep 17 00:00:00 2001
From: Victor Chahuneau <vchahune@cs.cmu.edu>
Date: Thu, 21 Jun 2012 17:27:48 -0400
Subject: Allow SA rule extraction to write to a python buffer

+ very small sa-extract cleanup
---
 sa-extract/manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'sa-extract/manager.py')

diff --git a/sa-extract/manager.py b/sa-extract/manager.py
index 767192c1..3a079c2a 100644
--- a/sa-extract/manager.py
+++ b/sa-extract/manager.py
@@ -1,5 +1,6 @@
 import csuf
 import cdat
+import cintlist
 
 class Sampler(object):
 	'''A Sampler implements a logic for choosing
@@ -15,7 +16,6 @@ class Sampler(object):
 		return cintlist.CIntList()
 
 
-
 class Extractor(object):
 	'''Extractor is responsible for extracting rules
 	from a given context; once a sentence id/location
-- 
cgit v1.2.3


From 733e1b1507d27d4f53055f740e8098f56215ab8f Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Fri, 27 Jul 2012 17:31:00 -0400
Subject: remove old suffix array extractor (use the one in python/ instead)

---
 sa-extract/Makefile                |   12 -
 sa-extract/README                  |   62 -
 sa-extract/calignment.pxd          |   10 -
 sa-extract/calignment.pyx          |  128 --
 sa-extract/cdat.pxd                |   12 -
 sa-extract/cdat.pyx                |  178 ---
 sa-extract/cfloatlist.pxd          |   10 -
 sa-extract/cfloatlist.pyx          |   93 --
 sa-extract/cintlist.pxd            |   15 -
 sa-extract/cintlist.pyx            |  196 ---
 sa-extract/clex.pyx                |  460 -------
 sa-extract/cmath.pxd               |    2 -
 sa-extract/cn.py                   |  164 ---
 sa-extract/compile_bin.py          |  148 ---
 sa-extract/context_model.py        |  234 ----
 sa-extract/cstrmap.pxd             |   12 -
 sa-extract/cstrmap.pyx             |   14 -
 sa-extract/csuf.pxd                |   12 -
 sa-extract/csuf.pyx                |  322 -----
 sa-extract/cveb.pxd                |   15 -
 sa-extract/cveb.pyx                |  390 ------
 sa-extract/escape-testset.pl       |   35 -
 sa-extract/example/README          |    8 -
 sa-extract/example/corpus.align.gz |  Bin 829334 -> 0 bytes
 sa-extract/example/corpus.de.gz    |  Bin 1724393 -> 0 bytes
 sa-extract/example/corpus.en.gz    |  Bin 1457711 -> 0 bytes
 sa-extract/example/test.de         |   10 -
 sa-extract/example/test.ref.en     |   10 -
 sa-extract/extract.ini             |  116 --
 sa-extract/extractor.py            |   60 -
 sa-extract/lcp.pyx                 |  113 --
 sa-extract/lcp_ops.py              |   52 -
 sa-extract/log.py                  |   18 -
 sa-extract/manager.py              |  100 --
 sa-extract/model.py                |    7 -
 sa-extract/monitor.py              |   48 -
 sa-extract/precomputation.pxd      |   13 -
 sa-extract/precomputation.pyx      |  478 --------
 sa-extract/rule.pxd                |   13 -
 sa-extract/rule.pyx                |  286 -----
 sa-extract/rulefactory.pyx         | 2361 ------------------------------------
 sa-extract/sa-compile.pl           |  322 -----
 sa-extract/sa2cdec.py              |   19 -
 sa-extract/sa_feat_names.txt       |    7 -
 sa-extract/setup.cfg               |    2 -
 sa-extract/setup.py                |   45 -
 sa-extract/sgml.py                 |  194 ---
 sa-extract/strmap.cc               |  232 ----
 sa-extract/strmap.h                |   22 -
 sa-extract/strutil.c               |   63 -
 sa-extract/strutil.h               |    8 -
 sa-extract/sym.pxd                 |   17 -
 sa-extract/sym.pyx                 |  155 ---
 sa-extract/wrap_input.py           |   37 -
 54 files changed, 7340 deletions(-)
 delete mode 100644 sa-extract/Makefile
 delete mode 100644 sa-extract/README
 delete mode 100644 sa-extract/calignment.pxd
 delete mode 100644 sa-extract/calignment.pyx
 delete mode 100644 sa-extract/cdat.pxd
 delete mode 100644 sa-extract/cdat.pyx
 delete mode 100644 sa-extract/cfloatlist.pxd
 delete mode 100644 sa-extract/cfloatlist.pyx
 delete mode 100644 sa-extract/cintlist.pxd
 delete mode 100644 sa-extract/cintlist.pyx
 delete mode 100644 sa-extract/clex.pyx
 delete mode 100644 sa-extract/cmath.pxd
 delete mode 100644 sa-extract/cn.py
 delete mode 100755 sa-extract/compile_bin.py
 delete mode 100644 sa-extract/context_model.py
 delete mode 100644 sa-extract/cstrmap.pxd
 delete mode 100644 sa-extract/cstrmap.pyx
 delete mode 100644 sa-extract/csuf.pxd
 delete mode 100644 sa-extract/csuf.pyx
 delete mode 100644 sa-extract/cveb.pxd
 delete mode 100644 sa-extract/cveb.pyx
 delete mode 100755 sa-extract/escape-testset.pl
 delete mode 100644 sa-extract/example/README
 delete mode 100644 sa-extract/example/corpus.align.gz
 delete mode 100644 sa-extract/example/corpus.de.gz
 delete mode 100644 sa-extract/example/corpus.en.gz
 delete mode 100644 sa-extract/example/test.de
 delete mode 100644 sa-extract/example/test.ref.en
 delete mode 100644 sa-extract/extract.ini
 delete mode 100755 sa-extract/extractor.py
 delete mode 100644 sa-extract/lcp.pyx
 delete mode 100755 sa-extract/lcp_ops.py
 delete mode 100644 sa-extract/log.py
 delete mode 100644 sa-extract/manager.py
 delete mode 100644 sa-extract/model.py
 delete mode 100644 sa-extract/monitor.py
 delete mode 100644 sa-extract/precomputation.pxd
 delete mode 100644 sa-extract/precomputation.pyx
 delete mode 100644 sa-extract/rule.pxd
 delete mode 100644 sa-extract/rule.pyx
 delete mode 100644 sa-extract/rulefactory.pyx
 delete mode 100755 sa-extract/sa-compile.pl
 delete mode 100755 sa-extract/sa2cdec.py
 delete mode 100644 sa-extract/sa_feat_names.txt
 delete mode 100644 sa-extract/setup.cfg
 delete mode 100644 sa-extract/setup.py
 delete mode 100644 sa-extract/sgml.py
 delete mode 100644 sa-extract/strmap.cc
 delete mode 100644 sa-extract/strmap.h
 delete mode 100644 sa-extract/strutil.c
 delete mode 100644 sa-extract/strutil.h
 delete mode 100644 sa-extract/sym.pxd
 delete mode 100644 sa-extract/sym.pyx
 delete mode 100755 sa-extract/wrap_input.py

(limited to 'sa-extract/manager.py')

diff --git a/sa-extract/Makefile b/sa-extract/Makefile
deleted file mode 100644
index 10d14521..00000000
--- a/sa-extract/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-%.c: %.pyx
-	cython -X cdivision=True $< -o $@
-
-%.o: %.cc
-	g++ -O6 -g -fPIC -c $<
-
-all: cstrmap.c strmap.cc rule.c sym.c cdat.c cintlist.c cfloatlist.c calignment.c csuf.c clex.c rulefactory.c cveb.c lcp.c precomputation.c
-	python setup.py build
-
-clean:
-	rm -f cdat.c cstrmap.c sym.c rule.c cintlist.c cfloatlist.c calignment.c csuf.c clex.c rulefactory.c cveb.c lcp.c precomputation.c *.so *.o *.cxx *~ *.pyc
-	rm -rf build
diff --git a/sa-extract/README b/sa-extract/README
deleted file mode 100644
index e4022c7e..00000000
--- a/sa-extract/README
+++ /dev/null
@@ -1,62 +0,0 @@
-SUFFIX-ARRAY-EXTRACT README
-  Feb 1, 2012
-
-Written by Adam Lopez, repackaged by Chris Dyer.
-
-Originally based on parts of Hiero, by David Chiang, but these dependencies
-have been removed or rewritten.
-
-
-BUILD INSTRUCTIONS
-==============================================================================
-
-Requirements:
-  Python 2.7 or later (http://www.python.org)
-  Cython 0.14.1 or later (http://cython.org/)
-
-- Edit Makefile to set the location of Python/Cython then do:
-
-  make
-
-
-COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT
-==============================================================================
-- Run sa-compile.pl to compile the training data and generate an extract.ini
-  file (which is written to STDOUT):
-
-  sa-compile.pl -b bitext_name=source.fr,target.en \
-                -a alignment_name=alignment.txt > extract.ini
-
-
-  The training data should be in two parallel text files (source.fr,source.en)
-  and the alignments are expected in "0-0 1-2 2-1 ..." format produced by
-  most alignment toolkits. The text files should NOT be escaped for non-XML
-  characters.
-
-
-EXTRACTION OF PER-SENTENCE GRAMMARS
-==============================================================================
-The most common use-case we support is extraction of "per-sentence" grammars
-for each segment in a testset. You may run the extractor on test set, but it
-will try to interpret tags as SGML markup, so we provide a script that does
-escaping: ./escape-testset.pl.
-
-- Example:
-
-  cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini
-
-
-EXTRACTION OF COMPLETE TEST-SET GRAMMARS
-==============================================================================
-Edit the generated extract.ini file a change per_sentence_grammar
-to False. Then, run extraction as normal.
-
-Note: extracting a single grammar for an entire test set will consume more
-memory during extraction and (probably) during decoding.
-
-
-EXAMPLE
-==============================================================================
-- See example/ and the README therein.
-
-
diff --git a/sa-extract/calignment.pxd b/sa-extract/calignment.pxd
deleted file mode 100644
index a7d3001f..00000000
--- a/sa-extract/calignment.pxd
+++ /dev/null
@@ -1,10 +0,0 @@
-cimport cintlist
-from libc.stdio cimport FILE
-
-cdef class Alignment:
-
-	cdef cintlist.CIntList links
-	cdef cintlist.CIntList sent_index
-	cdef int link(self, int i, int j)
-	cdef _unlink(self, int link, int* f, int* e)
-	cdef int* _get_sent_links(self, int sent_id, int* num_links)
diff --git a/sa-extract/calignment.pyx b/sa-extract/calignment.pyx
deleted file mode 100644
index 976fcd66..00000000
--- a/sa-extract/calignment.pyx
+++ /dev/null
@@ -1,128 +0,0 @@
-import log
-import gzip
-import cintlist
-
-from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
-from libc.stdlib cimport malloc, realloc, free
-
-# Note: Callison-Burch uses short instead of int.  
-# We have the space for our corpus, so this is not a problem;
-# May need to revisit if things get really tight, though.
-cdef class Alignment:
-
-
-	cdef int link(self, int i, int j):
-		'''Integerizes an alignment link pair'''
-		return i*65536 + j
-
-
-	def unlink(self, link):
-		'''De-integerizes an alignment link pair'''
-		return (link/65536, link%65536)
-
-
-	cdef _unlink(self, int link, int* f, int* e):
-		f[0] = link/65536
-		e[0] = link%65536
-
-
-	def get_sent_links(self, int sent_id):
-		cdef cintlist.CIntList sent_links
-		cdef int* arr
-		cdef int arr_len
-
-		sent_links = cintlist.CIntList()
-		arr = self._get_sent_links(sent_id, &arr_len)
-		sent_links._extend_arr(arr, arr_len*2)
-		free(arr)
-		return sent_links
-
-
-	cdef int* _get_sent_links(self, int sent_id, int* num_links):
-		cdef int* sent_links
-		cdef int i, start, end
-
-		start = self.sent_index.arr[sent_id]
-		end = self.sent_index.arr[sent_id+1]
-		num_links[0] = end - start
-		sent_links = <int*> malloc(2*num_links[0]*sizeof(int))
-		for i from 0 <= i < num_links[0]:
-			self._unlink(self.links.arr[start + i], sent_links + (2*i), sent_links + (2*i) + 1)
-		return sent_links
-
-
-	def __cinit__(self, filename, from_binary=False):
-		self.links = cintlist.CIntList(1000,1000)
-		self.sent_index = cintlist.CIntList(1000,1000)
-		log.writeln("Reading alignment from file %s" % filename)
-		if from_binary:
-			self.read_binary(filename)
-		else:
-			self.read_text(filename)
-
-
-	def read_text(self, filename):
-		if filename[-2:] == "gz":
-			f = gzip.GzipFile(filename)
-		else:
-			f = open(filename)
-		for line in f:
-			self.sent_index.append(len(self.links))
-			pairs = line.split()
-			for pair in pairs:
-				(i, j) = map(int, pair.split('-'))
-				self.links.append(self.link(i, j))
-		self.sent_index.append(len(self.links))
-
-
-	def read_binary(self, filename):
-		cdef FILE* f
-		cdef bytes bfilename = filename
-		cdef char* cfilename = bfilename
-		f = fopen(cfilename, "r")
-		self.links.read_handle(f)
-		self.sent_index.read_handle(f)
-		fclose(f)
-
-
-	def write_text(self, filename):
-		f = open(filename, "w")
-		sent_num = 0
-		for i, link in enumerate(self.links):
-			while i >= self.sent_index[sent_num]:
-				f.write("\n")
-				sent_num = sent_num + 1
-			f.write("%d-%d " % self.unlink(link))
-		f.write("\n")
-
-
-	def write_binary(self, filename):
-		cdef FILE* f
-		cdef bytes bfilename = filename
-		cdef char* cfilename = bfilename
-		f = fopen(cfilename, "w")
-		self.links.write_handle(f)
-		self.sent_index.write_handle(f)
-		fclose(f)
-
-
-	def write_enhanced(self, filename):
-		f = open(filename, "w")
-		sent_num = 1
-		for link in self.links:
-			f.write("%d " % link)
-		f.write("\n")
-		for i in self.sent_index:
-			f.write("%d " % i)
-		f.write("\n")
-
-
-	def alignment(self, i):
-		'''Return all (e,f) pairs for sentence i'''
-		cdef int j, start, end
-		result = []
-		start = self.sent_index.arr[i]
-		end = self.sent_index.arr[i+1]
-		for j from start <= j < end:
-			result.append(self.unlink(self.links.arr[j]))
-		return result
diff --git a/sa-extract/cdat.pxd b/sa-extract/cdat.pxd
deleted file mode 100644
index b686f611..00000000
--- a/sa-extract/cdat.pxd
+++ /dev/null
@@ -1,12 +0,0 @@
-cimport cintlist
-from libc.stdio cimport FILE
-
-cdef class DataArray:
-	cdef word2id
-	cdef id2word
-	cdef cintlist.CIntList data
-	cdef cintlist.CIntList sent_id
-	cdef cintlist.CIntList sent_index
-	cdef use_sent_id
-	cdef void write_handle(self, FILE* f)
-	cdef void read_handle(self, FILE* f)
diff --git a/sa-extract/cdat.pyx b/sa-extract/cdat.pyx
deleted file mode 100644
index 57d3ad63..00000000
--- a/sa-extract/cdat.pyx
+++ /dev/null
@@ -1,178 +0,0 @@
-# cdat.pyx
-# Defines "data arrays" that can be directly written to/read from disk in binary format
-# In particular, the array itself is written/read directly as a glob of binary data
-# Adam Lopez <alopez@cs.umd.edu>
-
-import sys
-import gzip
-import log
-import cintlist
-
-from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
-from libc.stdlib cimport malloc, realloc, free
-from libc.string cimport memset, strcpy, strlen
-
-cdef class DataArray:
-
-  def __init__(self, filename=None, from_binary=False, use_sent_id=False):
-    self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1}
-    self.id2word = ["END_OF_FILE", "END_OF_LINE"]
-    self.data = cintlist.CIntList(1000,1000)
-    self.sent_id = cintlist.CIntList(1000,1000)
-    self.sent_index = cintlist.CIntList(1000,1000)
-    self.use_sent_id = use_sent_id
-
-    if filename is not None:
-      if from_binary:
-        self.read_binary(filename)
-      else:
-        self.read_text(filename)
-
-
-  def __len__(self):
-    return len(self.data)
-
-
-  def getSentId(self, i):
-    return self.sent_id.arr[i]
-
-
-  def getSent(self, i):
-    cdef int j, start, stop
-    sent = []
-    start = self.sent_index.arr[i]
-    stop = self.sent_index.arr[i+1]
-    for i from start <= i < stop:
-      sent.append(self.id2word[self.data.arr[i]])
-    return sent
-
-
-  def getSentPos(self, loc):
-    return loc - self.sent_index.arr[self.sent_id.arr[loc]]
-
-
-  def get_id(self, word):
-    if not word in self.word2id:
-      self.word2id[word] = len(self.id2word)
-      self.id2word.append(word)
-    return self.word2id[word]
-
-
-  def get_word(self, id):
-    return self.id2word[id]
-
-
-  def write_text(self, filename):
-    f = open(filename, "w")
-    for w_id in self.data:
-      if w_id > 1:
-        f.write("%s " % self.get_word(w_id))
-      if w_id == 1:
-        f.write("\n")
-    f.close()
-
-
-  def read_text(self, filename):
-    cdef int word_count
-
-    if filename[-2:] == "gz":
-      file = gzip.GzipFile(filename)
-    else:
-      file = open(filename)
-    word_count = 0
-    for line_num, line in enumerate(file):
-      self.sent_index.append(word_count)
-      for word in line.split():
-        self.data.append(self.get_id(word))
-        if self.use_sent_id:
-          self.sent_id.append(line_num)
-        word_count = word_count + 1
-      self.data.append(1)
-      if self.use_sent_id:
-        self.sent_id.append(line_num)
-      word_count = word_count + 1
-    self.data.append(0)
-    self.sent_index.append(word_count)
-
-  def read_binary(self, filename):
-    cdef FILE* f
-    cdef bytes bfilename = filename
-    cdef char* cfilename = bfilename
-    f = fopen(cfilename, "r")
-    self.read_handle(f)
-    fclose(f)
-
-
-  cdef void read_handle(self, FILE* f):
-    cdef int num_words
-    cdef int word_len
-    cdef char* c_word
-    cdef bytes py_word
-    self.data.read_handle(f)
-    self.sent_index.read_handle(f)
-    self.sent_id.read_handle(f)
-    fread(&(num_words), sizeof(int), 1, f)
-    for i in xrange(num_words):
-      fread(&(word_len), sizeof(int), 1, f)
-      c_word = <char*> malloc (word_len * sizeof(char))
-      fread(c_word, sizeof(char), word_len, f)
-      py_word = c_word
-      free(c_word)
-      self.word2id[py_word] = len(self.id2word)
-      self.id2word.append(py_word)
-    if len(self.sent_id) == 0:
-      self.use_sent_id = False
-    else:
-      self.use_sent_id = True
-
-
-  cdef void write_handle(self, FILE* f):
-    cdef int word_len
-    cdef int num_words
-    cdef char* c_word
-
-    self.data.write_handle(f)
-    self.sent_index.write_handle(f)
-    self.sent_id.write_handle(f)
-    num_words = len(self.id2word) - 2
-    fwrite(&(num_words), sizeof(int), 1, f)
-    for word in self.id2word[2:]:
-      c_word = word
-      word_len = strlen(c_word) + 1
-      fwrite(&(word_len), sizeof(int), 1, f)
-      fwrite(c_word, sizeof(char), word_len, f)
-
-
-  def write_binary(self, filename):
-    cdef FILE* f
-    cdef bytes bfilename = filename
-    cdef char* cfilename = bfilename
-    f = fopen(cfilename, "w")
-    self.write_handle(f)
-    fclose(f)
-
-
-  def write_enhanced_handle(self, f):
-    for i in self.data:
-      f.write("%d " %i)
-    f.write("\n")
-    for i in self.sent_index:
-      f.write("%d " %i)
-    f.write("\n")
-    for i in self.sent_id:
-      f.write("%d " %i)
-    f.write("\n")
-    for word in self.id2word:
-      f.write("%s %d " % (word, self.word2id[word]))
-    f.write("\n")
-
-
-
-  def write_enhanced(self, filename):
-    f = open(filename, "w")
-    self.write_enhanced_handle(self, f)
-    f.close()
-
-
-
-
diff --git a/sa-extract/cfloatlist.pxd b/sa-extract/cfloatlist.pxd
deleted file mode 100644
index 026f2739..00000000
--- a/sa-extract/cfloatlist.pxd
+++ /dev/null
@@ -1,10 +0,0 @@
-from libc.stdio cimport FILE
-
-cdef class CFloatList:
-	cdef int size
-	cdef int increment
-	cdef int len
-	cdef float* arr
-	cdef void write_handle(self, FILE* f)
-	cdef void read_handle(self, FILE* f)
-	cdef void set(self, int i, float v)
diff --git a/sa-extract/cfloatlist.pyx b/sa-extract/cfloatlist.pyx
deleted file mode 100644
index 18a0ef2a..00000000
--- a/sa-extract/cfloatlist.pyx
+++ /dev/null
@@ -1,93 +0,0 @@
-# conveniencelist.pyx
-# defines int arrays in C, with some convenience methods
-# for reading arrays directly as globs directly from disk.
-# Adam Lopez <alopez@cs.umd.edu>
-
-from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
-from libc.stdlib cimport malloc, realloc, free
-from libc.string cimport memset, strcpy, strlen
-
-cdef class CFloatList:
-
-	def __cinit__(self, size=0, increment=1, initial_len=0):
-		if initial_len > size:
-			size = initial_len
-		self.arr = <float*> malloc(size*sizeof(float))
-		memset(self.arr, 0, initial_len*sizeof(float))
-
-
-	def __init__(self, size=0, increment=1, initial_len=0):
-		self.size = size
-		if initial_len > size:
-			self.size = initial_len
-		self.increment = increment
-		self.len = initial_len
-
-
-	def __dealloc__(self):
-		free(self.arr)
-
-
-	def __getitem__(self, i):
-		j = i
-		if i<0: 
-			j = self.len + i
-		if j<0 or j>=self.len:
-			raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len))
-		return self.arr[j]
-
-
-	cdef void set(self, int i, float v):
-		j = i
-		if i<0: 
-			j = self.len + i
-		if j<0 or j>=self.len:
-			raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len))
-		self.arr[j] = v
-	
-	def __setitem__(self, i, val):
-		self.set(i, val)
-
-	def __len__(self):
-		return self.len
-
-
-	def append(self, float val):
-		if self.len == self.size:
-			self.size = self.size + self.increment
-			self.arr = <float*> realloc(self.arr, self.size*sizeof(float))
-		self.arr[self.len] = val
-		self.len = self.len + 1
-
-
-	cdef void write_handle(self, FILE* f):
-		fwrite(&(self.len), sizeof(float), 1, f)
-		fwrite(self.arr, sizeof(float), self.len, f)
-
-
-	def write(self, filename):
-		cdef FILE* f
-		cdef bytes bfilename = filename
-		cdef char* cfilename = bfilename
-		f = fopen(cfilename, "w")
-		self.write_handle(f)
-		fclose(f)
-
-
-	cdef void read_handle(self, FILE* f):
-		free(self.arr)
-		fread(&(self.len), sizeof(float), 1, f)
-		self.arr = <float*> malloc(self.len * sizeof(float))
-		self.size = self.len
-		fread(self.arr, sizeof(float), self.len, f)
-
-
-	def read(self, filename):
-		cdef FILE* f
-		cdef bytes bfilename = filename
-		cdef char* cfilename = bfilename
-		f = fopen(cfilename, "r")
-		self.read_handle(f)
-		fclose(f)
-
-
diff --git a/sa-extract/cintlist.pxd b/sa-extract/cintlist.pxd
deleted file mode 100644
index 8a3a655c..00000000
--- a/sa-extract/cintlist.pxd
+++ /dev/null
@@ -1,15 +0,0 @@
-from libc.stdio cimport FILE
-
-cdef class CIntList:
-	cdef int size
-	cdef int increment
-	cdef int len
-	cdef int* arr
-	cdef void write_handle(self, FILE* f)
-	cdef void read_handle(self, FILE* f)
-	cdef void _append(self, int val)
-	cdef void _extend(self, CIntList other)
-	cdef void _extend_arr(self, int* other, int other_len)
-	cdef void _clear(self)
-	cdef void set(self, int i, int val)
-
diff --git a/sa-extract/cintlist.pyx b/sa-extract/cintlist.pyx
deleted file mode 100644
index 9d0a058e..00000000
--- a/sa-extract/cintlist.pyx
+++ /dev/null
@@ -1,196 +0,0 @@
-# cintlist.pyx
-# defines int arrays in C, with some convenience methods
-# for reading arrays as globs directly from disk.
-# Adam Lopez <alopez@cs.umd.edu>
-
-from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
-from libc.stdlib cimport malloc, realloc, free
-from libc.string cimport memset, memcpy
-
-cdef class CIntList:
-
-	def __cinit__(self, size=0, increment=1, initial_len=0):
-		if initial_len > size:
-			size = initial_len
-		self.arr = <int*> malloc(size*sizeof(int))
-		memset(self.arr, 0, initial_len*sizeof(int))
-
-	def __str__(self):
-		ret = "CIntList["
-		for idx in xrange(self.size):
-			if idx>0:
-				ret += ","
-			ret += str(self.arr[idx])
-		ret += "]"
-		ret += "len="
-		ret += self.len
-		return ret
-
-	def index(self, val):
-		for i in xrange(self.len):
-			if self.arr[i] == val:
-				return i
-
-		return IndexError
-
-	def partition(self,start,end):
-		pivot = self.arr[end]
-		bottom = start-1
-		top = end
-		done = 0
-		while not done:
-			while not done:
-				bottom += 1
-				if bottom == top:
-					done = 1
-					break
-				if self.arr[bottom] > pivot:
-					self.arr[top] = self.arr[bottom]
-					break
-			while not done:
-				top -= 1
-				if top == bottom:
-					done = 1
-					break
-				if self.arr[top] < pivot:
-					self.arr[bottom] = self.arr[top]
-					break
-		self.arr[top] = pivot
-		return top
-
-	def _doquicksort(self,start,end):
-		if start < end:
-			split = self.partition(start,end)
-			self._doquicksort(start,split-1)
-			self._doquicksort(split+1,end)
-		else:
-			return
-
-	def sort(self):
-		self._doquicksort(0,self.len-1)
-
-	def reset(self):
-		self.len = 0
-
-	def __init__(self, size=0, increment=1, initial_len=0):
-		self.size = size
-		if initial_len > size:
-			self.size = initial_len
-		self.increment = increment
-		self.len = initial_len
-
-
-	def __dealloc__(self):
-		free(self.arr)
-
-
-	def __getitem__(self, index):
-		cdef int i, j, k
-
-		if type(index) == int:
-			j = index
-			if j < 0: 
-				j = self.len + j
-			if j<0 or j>=self.len:
-				raise IndexError("Requested index %d of %d-length CIntList" % (index, self.len))
-			return self.arr[j]
-		elif type(index) == slice:
-			i = index.start
-			j = index.stop
-			if i < 0:
-				i = self.len + i
-			if j < 0:
-				j = self.len + j
-			if i < 0 or i >= self.len or j < 0 or j > self.len:
-				raise IndexError("Requested index %d:%d of %d-length CIntList" % (index.start, index.stop, self.len))
-			result = ()
-			for k from i <= k < j:
-				result = result + (self.arr[k],)
-			return result
-		else:
-			raise IndexError("Illegal key type %s for CIntList" % (type(index)))
-
-	cdef void set(self, int i, int val):
-		j = i
-		if i<0:
-			j = self.len + i
-		if j<0 or j>=self.len:
-			raise IndexError("Requested index %d of %d-length IntList" % (i, self.len))
-		if type(val) != int:
-			raise TypeError
-		self.arr[j] = val
-		
-
-	def __setitem__(self, i, val):
-		self.set(i, val)
-
-	def __len__(self):
-		return self.len
-
-	def getSize(self):
-		return self.size
-
-	def append(self, int val):
-		self._append(val)
-
-	cdef void _append(self, int val):
-		if self.len == self.size:
-			self.size = self.size + self.increment
-			self.arr = <int*> realloc(self.arr, self.size*sizeof(int))
-		self.arr[self.len] = val
-		self.len = self.len + 1
-
-	def extend(self, other):
-		self._extend(other)
-
-
-	cdef void _extend(self, CIntList other):
-		self._extend_arr(other.arr, other.len)
-
-
-	cdef void _extend_arr(self, int* other, int other_len):
-		if self.size < self.len + other_len:
-			self.size = self.len + other_len
-			self.arr = <int*> realloc(self.arr, self.size*sizeof(int))
-		memcpy(self.arr+self.len, other, other_len*sizeof(int))
-		self.len = self.len + other_len
-
-
-	cdef void _clear(self):
-		free(self.arr)
-		self.len = 0
-		self.size = 0
-		self.arr = <int*> malloc(0)
-
-
-	cdef void write_handle(self, FILE* f):
-		fwrite(&(self.len), sizeof(int), 1, f)
-		fwrite(self.arr, sizeof(int), self.len, f)
-
-
-	def write(self, filename):
-		cdef FILE* f
-		cdef bytes bfilename = filename
-		cdef char* cfilename = bfilename
-		f = fopen(cfilename, "w")
-		self.write_handle(f)
-		fclose(f)
-
-
-	cdef void read_handle(self, FILE* f):
-		(self.arr)
-		fread(&(self.len), sizeof(int), 1, f)
-		self.arr = <int*> malloc(self.len * sizeof(int))
-		self.size = self.len
-		fread(self.arr, sizeof(int), self.len, f)
-
-
-	def read(self, filename):
-		cdef FILE* f
-		cdef bytes bfilename = filename
-		cdef char* cfilename = bfilename
-		f = fopen(cfilename, "r")
-		self.read_handle(f)
-		fclose(f)
-
-
diff --git a/sa-extract/clex.pyx b/sa-extract/clex.pyx
deleted file mode 100644
index fa30caad..00000000
--- a/sa-extract/clex.pyx
+++ /dev/null
@@ -1,460 +0,0 @@
-# clex.pyx
-# defines bilexical dictionaries in C, with some convenience methods
-# for reading arrays directly as globs directly from disk.
-# Adam Lopez <alopez@cs.umd.edu>
-
-import gzip
-import sys
-import context_model
-
-cimport cintlist
-cimport cfloatlist
-cimport calignment
-cimport csuf
-cimport cdat
-
-from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
-from libc.stdlib cimport malloc, realloc, free
-from libc.string cimport memset, strcpy, strlen
-
-cdef struct _node:
-  _node* smaller
-  _node* bigger
-  int key
-  int val
-
-cdef _node* new_node(int key):
-  cdef _node* n
-  n = <_node*> malloc(sizeof(_node))
-  n.smaller = NULL
-  n.bigger = NULL
-  n.key = key
-  n.val = 0
-  return n
-
-
-cdef del_node(_node* n):
-  if n.smaller != NULL:
-    del_node(n.smaller)
-  if n.bigger != NULL:
-    del_node(n.bigger)
-  free(n)
-
-
-cdef int* get_val(_node* n, int key):
-  if key == n.key:
-    return &n.val
-  elif key < n.key:
-    if n.smaller == NULL:
-      n.smaller = new_node(key)
-      return &(n.smaller.val)
-    return get_val(n.smaller, key)
-  else:
-    if n.bigger == NULL:
-      n.bigger = new_node(key)
-      return &(n.bigger.val)
-    return get_val(n.bigger, key)
-
-
-cdef class CLex:
-
-  cdef cfloatlist.CFloatList col1
-  cdef cfloatlist.CFloatList col2
-  cdef cintlist.CIntList f_index
-  cdef cintlist.CIntList e_index
-  cdef id2eword, id2fword, eword2id, fword2id
-
-  def __init__(self, filename, from_binary=False, 
-        from_data=False, earray=None, fsarray=None):
-    self.id2eword = []
-    self.id2fword = []
-    self.eword2id = {}
-    self.fword2id = {}
-    self.e_index = cintlist.CIntList()
-    self.f_index = cintlist.CIntList()
-    self.col1 = cfloatlist.CFloatList()
-    self.col2 = cfloatlist.CFloatList()
-    if from_binary:
-      self.read_binary(filename)
-    else:
-      if from_data:
-        self.compute_from_data(fsarray, earray, filename)
-      else:
-        self.read_text(filename)
-    '''print "self.eword2id"
-    print "============="
-    for x in self.eword2id:
-      print x
-    print "self.fword2id"
-    print "============="
-    for x in self.fword2id:
-      print x
-    print "-------------"'''
-
-
-  cdef compute_from_data(self, csuf.SuffixArray fsa, cdat.DataArray eda, calignment.Alignment aa):
-    cdef int sent_id, num_links, l, i, j, f_i, e_j, I, J, V_E, V_F, num_pairs
-    cdef int *fsent, *esent, *alignment, *links, *ealigned, *faligned
-    cdef _node** dict
-    cdef int *fmargin, *emargin, *count
-    cdef bytes word
-    cdef int null_word
-
-    null_word = 0
-    for word in fsa.darray.id2word: # I miss list comprehensions
-      self.id2fword.append(word)
-    self.id2fword[null_word] = "NULL"
-    for id, word in enumerate(self.id2fword):
-      self.fword2id[word] = id
-
-    for word in eda.id2word:
-      self.id2eword.append(word)
-    self.id2eword[null_word] = "NULL"
-    for id, word in enumerate(self.id2eword):
-      self.eword2id[word] = id
-
-    num_pairs = 0
-
-    V_E = len(eda.id2word)
-    V_F = len(fsa.darray.id2word)
-    fmargin = <int*> malloc(V_F*sizeof(int))
-    emargin = <int*> malloc(V_E*sizeof(int))
-    memset(fmargin, 0, V_F*sizeof(int))
-    memset(emargin, 0, V_E*sizeof(int))
-
-    dict = <_node**> malloc(V_F*sizeof(_node*))
-    memset(dict, 0, V_F*sizeof(_node*))
-
-    num_sents = len(fsa.darray.sent_index)
-    for sent_id from 0 <= sent_id < num_sents-1:
-
-      fsent = fsa.darray.data.arr + fsa.darray.sent_index.arr[sent_id]
-      I = fsa.darray.sent_index.arr[sent_id+1] - fsa.darray.sent_index.arr[sent_id] - 1
-      faligned = <int*> malloc(I*sizeof(int))
-      memset(faligned, 0, I*sizeof(int))
-
-      esent = eda.data.arr + eda.sent_index.arr[sent_id]
-      J = eda.sent_index.arr[sent_id+1] - eda.sent_index.arr[sent_id] - 1
-      ealigned = <int*> malloc(J*sizeof(int))
-      memset(ealigned, 0, J*sizeof(int))
-
-      links = aa._get_sent_links(sent_id, &num_links)
-
-      for l from 0 <= l < num_links:
-        i = links[l*2]
-        j = links[l*2+1]
-        if i >= I or j >= J:
-          sys.stderr.write("  %d-%d out of bounds (I=%d,J=%d) in line %d\n" % (i,j,I,J,sent_id+1))
-          assert i < I
-          assert j < J
-        f_i = fsent[i]
-        e_j = esent[j]
-        fmargin[f_i] = fmargin[f_i]+1
-        emargin[e_j] = emargin[e_j]+1
-        if dict[f_i] == NULL:
-          dict[f_i] = new_node(e_j)
-          dict[f_i].val = 1
-          num_pairs = num_pairs + 1
-        else:
-          count = get_val(dict[f_i], e_j)
-          if count[0] == 0:
-            num_pairs = num_pairs + 1
-          count[0] = count[0] + 1
-        # add count
-        faligned[i] = 1
-        ealigned[j] = 1
-      for i from 0 <= i < I:
-        if faligned[i] == 0:
-          f_i = fsent[i]
-          fmargin[f_i] = fmargin[f_i] + 1
-          emargin[null_word] = emargin[null_word] + 1
-          if dict[f_i] == NULL:
-            dict[f_i] = new_node(null_word)
-            dict[f_i].val = 1
-            num_pairs = num_pairs + 1
-          else:
-            count = get_val(dict[f_i], null_word)
-            if count[0] == 0:
-              num_pairs = num_pairs + 1
-            count[0] = count[0] + 1
-      for j from 0 <= j < J:
-        if ealigned[j] == 0:
-          e_j = esent[j]
-          fmargin[null_word] = fmargin[null_word] + 1
-          emargin[e_j] = emargin[e_j] + 1
-          if dict[null_word] == NULL:
-            dict[null_word] = new_node(e_j)
-            dict[null_word].val = 1
-            num_pairs = num_pairs + 1
-          else:
-            count = get_val(dict[null_word], e_j)
-            if count[0] == 0:
-              num_pairs = num_pairs + 1
-            count[0] = count[0] + 1
-      free(links)
-      free(faligned)
-      free(ealigned)
-    self.f_index = cintlist.CIntList(initial_len=V_F)
-    self.e_index = cintlist.CIntList(initial_len=num_pairs)
-    self.col1 = cfloatlist.CFloatList(initial_len=num_pairs)
-    self.col2 = cfloatlist.CFloatList(initial_len=num_pairs)
-
-    num_pairs = 0
-    for i from 0 <= i < V_F:
-      #self.f_index[i] = num_pairs
-      self.f_index.set(i, num_pairs)
-      if dict[i] != NULL:
-        self._add_node(dict[i], &num_pairs, float(fmargin[i]), emargin)
-        del_node(dict[i])
-    free(fmargin)
-    free(emargin)
-    free(dict)
-    return
-
-
-  cdef _add_node(self, _node* n, int* num_pairs, float fmargin, int* emargin):
-    cdef int loc
-    if n.smaller != NULL:
-      self._add_node(n.smaller, num_pairs, fmargin, emargin)
-    loc = num_pairs[0]
-    self.e_index.set(loc, n.key)
-    self.col1.set(loc, float(n.val)/fmargin)
-    self.col2.set(loc, float(n.val)/float(emargin[n.key]))
-    num_pairs[0] = loc + 1
-    if n.bigger != NULL:
-      self._add_node(n.bigger, num_pairs, fmargin, emargin)
-
-
-  def write_binary(self, filename):
-    cdef FILE* f
-    cdef bytes bfilename = filename
-    cdef char* cfilename = bfilename
-    f = fopen(cfilename, "w")
-    self.f_index.write_handle(f)
-    self.e_index.write_handle(f)
-    self.col1.write_handle(f)
-    self.col2.write_handle(f)
-    self.write_wordlist(self.id2fword, f)
-    self.write_wordlist(self.id2eword, f)
-    fclose(f)
-
-
-  cdef write_wordlist(self, wordlist, FILE* f):
-    cdef int word_len
-    cdef int num_words
-    cdef char* c_word
-
-    num_words = len(wordlist)
-    fwrite(&(num_words), sizeof(int), 1, f)
-    for word in wordlist:
-      c_word = word
-      word_len = strlen(c_word) + 1
-      fwrite(&(word_len), sizeof(int), 1, f)
-      fwrite(c_word, sizeof(char), word_len, f)
-
-
-  cdef read_wordlist(self, word2id, id2word, FILE* f):
-    cdef int num_words
-    cdef int word_len
-    cdef char* c_word
-    cdef bytes py_word
-
-    fread(&(num_words), sizeof(int), 1, f)
-    for i from 0 <= i < num_words:
-      fread(&(word_len), sizeof(int), 1, f)
-      c_word = <char*> malloc (word_len * sizeof(char))
-      fread(c_word, sizeof(char), word_len, f)
-      py_word = c_word
-      free(c_word)
-      word2id[py_word] = len(id2word)
-      id2word.append(py_word)
-
-  def read_binary(self, filename):
-    cdef FILE* f
-    cdef bytes bfilename = filename
-    cdef char* cfilename = bfilename
-    f = fopen(cfilename, "r")
-    self.f_index.read_handle(f)
-    self.e_index.read_handle(f)
-    self.col1.read_handle(f)
-    self.col2.read_handle(f)
-    self.read_wordlist(self.fword2id, self.id2fword, f)
-    self.read_wordlist(self.eword2id, self.id2eword, f)
-    fclose(f)
-
-
-  def get_e_id(self, eword):
-    if eword not in self.eword2id:
-      e_id = len(self.id2eword)
-      self.id2eword.append(eword)
-      self.eword2id[eword] = e_id
-    return self.eword2id[eword]
-
-
-  def get_f_id(self, fword):
-    if fword not in self.fword2id:
-      f_id = len(self.id2fword)
-      self.id2fword.append(fword)
-      self.fword2id[fword] = f_id
-    return self.fword2id[fword]
-
-
-  def read_text(self, filename):
-    cdef i, j, w, e_id, f_id, n_f, n_e, N
-    cdef cintlist.CIntList fcount 
-
-    fcount = cintlist.CIntList()
-    if filename[-2:] == "gz":
-      f = gzip.GzipFile(filename)
-    else:
-      f = open(filename)
-
-    # first loop merely establishes size of array objects
-    sys.stderr.write("Initial read...\n")
-    for line in f:
-      (fword, eword, score1, score2) = line.split()
-      f_id = self.get_f_id(fword)
-      e_id = self.get_e_id(eword)
-      while f_id >= len(fcount):
-        fcount.append(0)
-      fcount.arr[f_id] = fcount.arr[f_id] + 1
-
-    # Allocate space for dictionary in arrays
-    N = 0
-    n_f = len(fcount)
-    self.f_index = cintlist.CIntList(initial_len=n_f+1)
-    for i from 0 <= i < n_f:
-      self.f_index.arr[i] = N
-      N = N + fcount.arr[i]
-      fcount.arr[i] = 0
-    self.f_index.arr[n_f] = N
-    self.e_index = cintlist.CIntList(initial_len=N)
-    self.col1 = cfloatlist.CFloatList(initial_len=N)
-    self.col2 = cfloatlist.CFloatList(initial_len=N)
-
-    # Re-read file, placing words into buckets
-    sys.stderr.write("Bucket sort...\n")
-    f.seek(0)
-    for line in f:
-      (fword, eword, score1, score2) = line.split()
-      f_id = self.get_f_id(fword)
-      e_id = self.get_e_id(eword)
-      index = self.f_index.arr[f_id] + fcount.arr[f_id]
-      fcount.arr[f_id] = fcount.arr[f_id] + 1
-      self.e_index.arr[index] = int(e_id)
-      self.col1[index] = float(score1)
-      self.col2[index] = float(score2)
-    f.close()
-
-    sys.stderr.write("Final sort...\n")
-    # Sort buckets by eword
-    for b from 0 <= b < n_f:
-      i = self.f_index.arr[b]
-      j = self.f_index.arr[b+1]
-      self.qsort(i,j, "")
-
-
-  cdef swap(self, int i, int j):
-    cdef int itmp
-    cdef float ftmp
-
-    if i == j:
-      return
-
-    itmp = self.e_index.arr[i]
-    self.e_index.arr[i] = self.e_index.arr[j]
-    self.e_index.arr[j] = itmp
-
-    ftmp = self.col1.arr[i]
-    self.col1.arr[i] = self.col1.arr[j]
-    self.col1.arr[j] = ftmp
-
-    ftmp = self.col2.arr[i]
-    self.col2.arr[i] = self.col2.arr[j]
-    self.col2.arr[j] = ftmp
-
-
-  cdef qsort(self, int i, int j, pad):
-    cdef int pval, p
-
-    if i > j:
-      raise Exception("Sort error in CLex")
-    if i == j: #empty interval
-      return
-    if i == j-1: # singleton interval
-      return
-
-    p = (i+j)/2
-    pval = self.e_index.arr[p]
-    self.swap(i, p)
-    p = i
-    for k from i+1 <= k < j:
-      if pval >= self.e_index.arr[k]:
-        self.swap(p+1, k)
-        self.swap(p, p+1)
-        p = p + 1
-    self.qsort(i,p, pad+"  ")
-    self.qsort(p+1,j, pad+"  ")
-
-
-  def write_enhanced(self, filename):
-    f = open(filename, "w")
-    for i in self.f_index:
-      f.write("%d " % i)
-    f.write("\n")
-    for i, s1, s2 in zip(self.e_index, self.col1, self.col2):
-      f.write("%d %f %f " % (i, s1, s2))
-    f.write("\n")
-    for i, w in enumerate(self.id2fword):
-      f.write("%d %s " % (i, w))
-    f.write("\n")
-    for i, w in enumerate(self.id2eword):
-      f.write("%d %s " % (i, w))
-    f.write("\n")
-    f.close()
-
-
-  def get_score(self, fword, eword, col):
-    cdef e_id, f_id, low, high, midpoint, val
-    #print "get_score fword=",fword,", eword=",eword,", col=",col
-
-    if eword not in self.eword2id:
-      return None
-    if fword not in self.fword2id:
-      return None
-    f_id = self.fword2id[fword]
-    e_id = self.eword2id[eword]
-    low = self.f_index.arr[f_id]
-    high = self.f_index.arr[f_id+1]
-    while high - low > 0:
-      midpoint = (low+high)/2
-      val = self.e_index.arr[midpoint]
-      if val == e_id:
-        if col == 0:
-          return self.col1.arr[midpoint]
-        if col == 1:
-          return self.col2.arr[midpoint]
-      if val > e_id:
-        high = midpoint
-      if val < e_id:
-        low = midpoint + 1
-    return None
-
-
-  def write_text(self, filename):
-    """Note: does not guarantee writing the dictionary in the original order"""
-    cdef i, N, e_id, f_id
-    
-    f = open(filename, "w")
-    N = len(self.e_index)
-    f_id = 0
-    for i from 0 <= i < N:
-      while self.f_index.arr[f_id+1] == i:
-        f_id = f_id + 1
-      e_id = self.e_index.arr[i]
-      score1 = self.col1.arr[i]
-      score2 = self.col2.arr[i]
-      f.write("%s %s %.6f %.6f\n" % (self.id2fword[f_id], self.id2eword[e_id], score1, score2))
-    f.close()
-
-
diff --git a/sa-extract/cmath.pxd b/sa-extract/cmath.pxd
deleted file mode 100644
index 3aaaa2a3..00000000
--- a/sa-extract/cmath.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-cdef extern from "math.h":
-	double log(double)
diff --git a/sa-extract/cn.py b/sa-extract/cn.py
deleted file mode 100644
index 6e45bcf9..00000000
--- a/sa-extract/cn.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# cn.py
-# Chris Dyer <redpony@umd.edu>
-# Copyright (c) 2006 University of Maryland.
-
-# vim:tabstop=4:autoindent:expandtab
-
-import sym
-import log
-
-epsilon = sym.fromstring('*EPS*');
-
-class CNStats(object):
-    def __init__(self):
-      self.read = 0
-      self.colls = 0
-      self.words = 0
-
-    def collect(self, cn):
-      self.read += 1
-      self.colls += cn.get_length()
-      for col in cn.columns:
-        self.words += len(col)
-
-    def __str__(self):
-      return "confusion net statistics:\n succ. read: %d\n columns:    %d\n words:      %d\n avg. words/column:\t%f\n avg. cols/sent:\t%f\n\n" % (self.read, self.colls, self.words, float(self.words)/float(self.colls), float(self.colls)/float(self.read))
-
-class ConfusionNet(object):
-    def __init__(self, sent):
-        object.__init__(self)
-        if (len(sent.words) == 0):
-            self.columns = ()
-            return # empty line, it happens
-        line = sent.words[0]
-        if (line.startswith("(((")):
-            if (len(sent.words) > 1):
-                log.write("Bad sentence: %s\n" % (line))
-            assert(len(sent.words) == 1) # make sure there are no spaces in your confusion nets!
-            line =  "((('<s>',1.0,1),),"+line[1:len(line)-1]+"(('</s>',1.0,1),))"
-            cols = eval(line)
-            res = []
-            for col in cols:
-               x = []
-               for alt in col:
-                   costs = alt[1]
-                   if (type(costs) != type((1,2))):
-                       costs=(float(costs),)
-                   j=[]
-                   for c in costs:
-                       j.append(float(c))
-                   cost = tuple(j)
-                   spanlen = 1
-                   if (len(alt) == 3):
-                       spanlen = alt[2]
-                   x.append((sym.fromstring(alt[0],terminal=True), None, spanlen))
-               res.append(tuple(x))
-            self.columns = tuple(res)
-        else:  # convert a string of input into a CN
-            res = [];
-            res.append(((sym.fromstring('<s>',terminal=True), None, 1), ))
-            for word in sent.words:
-               res.append(((sym.fromstring(word,terminal=True), None, 1), ));  # (alt=word, cost=0.0)
-            res.append(((sym.fromstring('</s>',terminal=True), None, 1), ))
-            self.columns = tuple(res)
-
-    def is_epsilon(self, position):
-        x = self.columns[position[0]][position[1]][0]
-        return x == epsilon
-
-    def compute_epsilon_run_length(self, cn_path):
-        if (len(cn_path) == 0):
-            return 0
-        x = len(cn_path) - 1
-        res = 0
-        ''' -1 denotes a non-terminal '''
-        while (x >= 0 and cn_path[x][0] >= 0 and self.is_epsilon(cn_path[x])):
-            res += 1
-            x -= 1
-        return res
-
-    def compute_cn_cost(self, cn_path):
-       c = None
-       for (col, row) in cn_path:
-           if (col >= 0):
-               if c is None:
-                   c = self.columns[col][row][1].clone()
-               else:
-                   c += self.columns[col][row][1]
-       return c                                        
-
-    def get_column(self, col):
-        return self.columns[col]
-
-    def get_length(self):
-        return len(self.columns)
-
-    def __str__(self):
-        r = "conf net: %d\n" % (len(self.columns),)
-        i = 0
-        for col in self.columns:
-            r += "%d -- " % i
-            i += 1
-            for alternative in col:
-                r += "(%s, %s, %s) " % (sym.tostring(alternative[0]), alternative[1], alternative[2])
-            r += "\n"
-        return r
-    
-    def listdown(_columns, col = 0):
-        # output all the possible sentences out of the self lattice
-        # will be used by the "dumb" adaptation of lattice decoding with suffix array
-        result = []
-        for entry in _columns[col]:
-            if col+entry[2]+1<=len(_columns) :
-                for suffix in self.listdown(_columns,col+entry[2]):
-                    result.append(entry[0]+' '+suffix)
-                    #result.append(entry[0]+' '+suffix)
-            else:
-                result.append(entry[0])
-                #result.append(entry[0])
-        return result
-    
-    def next(self,_columns,curr_idx, min_dist=1):
-        # can be used only when prev_id is defined
-        result = []
-        #print "curr_idx=%i\n" % curr_idx
-        if curr_idx+min_dist >= len(_columns): 
-            return result
-        for alt_idx in xrange(len(_columns[curr_idx])):
-            alt = _columns[curr_idx][alt_idx]
-            #print "checking %i alternative : " % alt_idx
-            #print "%s %f %i\n" % (alt[0],alt[1],alt[2])
-            #print alt
-            if alt[2]<min_dist:
-                #print "recursive next(%i, %i, %i)\n" % (curr_idx,alt_idx,min_dist-alt[2])
-                result.extend(self.next(_columns,curr_idx+alt[2],min_dist-alt[2]))
-            elif curr_idx+alt[2]<len(_columns):
-                #print "adding because the skip %i doesn't go beyong the length\n" % alt[2]
-                result.append(curr_idx+alt[2])
-        return set(result)
-                
-    
-
-"""
-import sys
-import sgml
-file = open(sys.argv[1], "rb")
-sent = sgml.process_sgml_line(file.read())
-print sent
-cn = ConfusionNet(sent)
-print cn
-results = cn.listdown()
-for result in results:
-    print sym.tostring(result)
-print cn.next(0);
-print cn.next(1);
-print cn.next(2);
-print cn.next(3);
-print cn
-cn = ConfusionNet()
-k = 0
-while (cn.read(file)):
-  print cn
- 
-print cn.stats
-"""
diff --git a/sa-extract/compile_bin.py b/sa-extract/compile_bin.py
deleted file mode 100755
index 0196e552..00000000
--- a/sa-extract/compile_bin.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/usr/bin/env python
-
-'''This program compiles/decompiles binary data objects used
-by the decoder'''
-
-import sys
-import cdat
-import calignment
-import csuf
-import clex
-import precomputation
-#import parse
-import monitor
-import optparse
-
-def main(argv=None):
-	'''Call this from the command-line to create a 
-	pre-computed binary data array for later use'''
-	if argv is None:
-		argv = sys.argv
-
-	parser = optparse.OptionParser(usage="Usage: %prog [-s|-d|-a|-p] <input file> <output file>"+
-								"\n\nNote: -d,-s,-a, and -p are mutually exclusive")
-	parser.add_option("-d", "--data-array", 
-					action="store_true", default=False,
-					dest="da", help="Compile file into data array (default)")
-	parser.add_option("-s", "--suffix-array", 
-					action="store_true", default=False,
-					dest="sa", help="Compile file into suffix array")
-	parser.add_option("-a", "--alignment", 
-					action="store_true", default=False,
-					dest="a", help="Compile file into alignment")
-	parser.add_option("-l", "--lexical", 
-					action="store_true", default=False,
-					dest="l", help="Compile file into lex file")
-	parser.add_option("-x", "--compute_lexical", action="store", nargs=2,
-					dest="lex_args", help="Compute lex file from data",
-					metavar="<f file> <e file>")
-	parser.add_option("-p", "--parse", 
-					action="store_true", default=False,
-					dest="p", help="Compile file into parse")
-	parser.add_option("-b", "--binary-infile", 
-					action="store_true", default=False,
-					dest="bin", help="Input file is binary (default: text)")
-	parser.add_option("-t", "--text-outfile", 
-					action="store_true", default=False,
-					dest="text", help="Output file is text (default: binary)")
-	parser.add_option("-e", "--enhanced-outfile", 
-					action="store_true", default=False,
-					dest="enhanced", help="Output file is enhanced text (default: binary)")
-	parser.add_option("-r", action="store", nargs=7,
-					dest="precomp_args", help="Precompute collocations (Hiero only)", 
-					metavar="max-len=<INT> max-nt=<INT> max-size=<INT> min-gap=<INT> rank1=<INT> rank2=<INT> sa=<FILE>")
-	(options, args) = parser.parse_args()
-
-	filetype_opts =  [options.da, options.sa, options.a, options.p]
-
-	if (len(filter(lambda x: x, filetype_opts))) > 1 or len(args) != 2:
-		parser.print_help()
-		sys.exit(1)
-
-	(infilename, outfilename) = args
-	if options.bin:
-		bin = " binary"
-	else:
-		bin = ""
-
-	start_time = monitor.cpu()
-	if options.precomp_args:
-		if options.bin:
-			obj = precomputation.Precomputation(infilename, from_binary=True)
-		else:
-			keys = set(["max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa"])
-			precomp_opts = {} 
-			sys.stderr.write("Precomputing statistics for list %s\n" % infilename)
-			for pair in options.precomp_args:
-				(key, val) = pair.split("=")
-				if key in keys:
-					keys.remove(key)
-					if key != "sa":
-						val = int(val)
-					precomp_opts[key] = val
-				else:
-					sys.stderr.write("Unknown keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2)\n" % key)
-					return 1
-			sa = csuf.SuffixArray(precomp_opts["sa"], True)
-			obj = precomputation.Precomputation(infilename, sa, 
-				precompute_rank=precomp_opts["rank1"], 
-				precompute_secondary_rank=precomp_opts["rank2"], 
-				max_length=precomp_opts["max-len"], 
-				max_nonterminals=precomp_opts["max-nt"], 
-				train_max_initial_size=precomp_opts["max-size"], 
-				train_min_gap_size=precomp_opts["min-gap"])
-	elif options.sa:
-		sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin))
-		obj = csuf.SuffixArray(infilename, options.bin)
-	elif options.a:
-		sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, bin))
-		obj = calignment.Alignment(infilename, options.bin)
-	elif options.p:
-		sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin))
-		obj = parse.ParseArray(infilename, options.bin)
-	elif options.l:
-		sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin))
-		obj = clex.CLex(infilename, options.bin)
-	elif options.lex_args:
-		ffile = options.lex_args[0]
-		efile = options.lex_args[1]
-		sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile))
-		fsarray = csuf.SuffixArray(ffile, True)
-		earray = cdat.DataArray(efile, True)
-		aarray = calignment.Alignment(infilename, True)
-		obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray)
-	else:
-		sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin))
-		obj = cdat.DataArray(infilename, options.bin)
-
-	sys.stderr.write("  Total time for read: %f\n" % (monitor.cpu() - start_time))
-	start_time = monitor.cpu()
-	if options.text:
-		sys.stderr.write("Writing text file %s...\n" % outfilename)
-		obj.write_text(outfilename)
-	elif options.enhanced:
-		sys.stderr.write("Writing enhanced text file %s...\n" % outfilename)
-		obj.write_enhanced(outfilename)
-	else:
-		sys.stderr.write("Writing binary file %s...\n" % outfilename)
-		obj.write_binary(outfilename)
-	sys.stderr.write("Finished.\n")
-	sys.stderr.write("  Total time for write: %f\n" % (monitor.cpu() - start_time))
-
-	mem_use = float(monitor.memory())
-	metric = "B"
-	if mem_use / 1000 > 1:
-		mem_use /= 1000
-		metric = "KB"
-	if mem_use / 1000 > 1:
-		mem_use /= 1000
-		metric = "MB"
-	if mem_use / 1000 > 1:
-		mem_use /= 1000
-		metric = "GB"
-	sys.stderr.write("  Memory usage: %.1f%s\n" % (mem_use, metric))
-
-
-
-if __name__ == "__main__":
-	sys.exit(main(sys.argv))
diff --git a/sa-extract/context_model.py b/sa-extract/context_model.py
deleted file mode 100644
index 8cb6c174..00000000
--- a/sa-extract/context_model.py
+++ /dev/null
@@ -1,234 +0,0 @@
-#!/usr/bin/env python
-import sys
-import model
-import sym
-import log
-import math
-
-class ContextModel(model.Model):
-  '''A ContextModel is one that is computed using information
-  from the Context object'''
-
-  def __init__(self, context_manager, default=0.0):
-    model.Model.__init__(self)
-    self.wordless = 0
-    self.initial = None
-    self.default = default
-    self.context_manager = context_manager
-    self.id = self.context_manager.add_model(self)
-
-    '''The next feature is true if the model depends in 
-    some way on the entire input sentence; that is, if 
-    it cannot be scored when created, but must be scored
-    no earlier than during the input method (note that
-    this is less strict than stateful)'''
-    self.contextual = True 
-    ''' It may seem somewhat counterintuitive that a
-    ContextModel can be non-contextual, but a good
-    example is the rule probabilites; although these
-    are computed using the Context object, they don't
-    really depend in any way on context'''
-
-
-  '''inherited from model.Model, called once for each input sentence'''
-  def input(self, fwords, meta):
-    # all ContextModels must make this call
-    self.context_manager.input(self, fwords, meta)
-
-
-  '''This function will be called via the input method
-  only for contextual models'''
-  def compute_contextual_score(self, r):
-    return 0.0
-
-  '''This function is only called on rule creation for
-  contextless models'''
-  def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-    return 0.0
-
-  '''Stateless models should not need to 
-  override this function, unless they define
-  something for model.TO_GOAL'''
-  def transition (self,  r, antstates, i, j, j1=None):
-    return (None, 0.0)
-
-  def estimate(self, r):
-    return r.getscore("context", self.id)
-
-  def transition(self, r, antstates, i, j, j1=None):
-    return (None, r.getscore("context", self.id))
-
-  def finaltransition(self, state):
-    return 0.0
-
-  def rescore(self, ewords, score):
-    return score
-
-
-
-'''p(e|f)'''
-class EgivenF(ContextModel):
-  
-  def __init__(self, context_manager, default=0.0):
-    ContextModel.__init__(self, context_manager)
-    self.contextual = False
-
-  def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-    prob = float(paircount)/float(fcount)
-    return -math.log10(prob)
-
-class CountEF(ContextModel):
-
-        def __init__(self, context_manager, default=0.0):
-                ContextModel.__init__(self, context_manager)
-                self.contextual = False
-
-        def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-                return math.log10(1.0 + float(paircount))
-
-class SampleCountF(ContextModel):
-
-        def __init__(self, context_manager, default=0.0):
-                ContextModel.__init__(self, context_manager)
-                self.contextual = False
-
-        def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-                return math.log10(1.0 + float(fsample_count))
-
-
-
-class EgivenFCoherent(ContextModel):
-
-  def __init__(self, context_manager, default=0.0):
-    ContextModel.__init__(self, context_manager)
-    self.contextual = False
-
-  def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-    prob = float(paircount)/float(fsample_count)
-    #print "paircount=",paircount," , fsample_count=",fsample_count,", prob=",prob
-    if (prob == 0.0): return 99.0
-    return -math.log10(prob)
-
-
-
-class CoherenceProb(ContextModel):
-
-  def __init__(self, context_manager, default=0.0):
-    ContextModel.__init__(self, context_manager)
-    self.contextual = False
-
-  def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-    prob = float(fcount)/float(fsample_count)
-    return -math.log10(prob)
-
-
-
-class MaxLexEgivenF(ContextModel):
-
-  def __init__(self, context_manager, ttable, col=0):
-    ContextModel.__init__(self, context_manager)
-    self.ttable = ttable
-    self.col = col
-    self.wordless = 0
-    self.initial = None
-    self.contextual = False
-
-
-  def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-    totalscore = 1.0
-    fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase))
-    fwords.append("NULL")
-    ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase))
-    for e in ewords:
-      maxScore = 0.0
-      for f in fwords:
-        score = self.ttable.get_score(f, e, self.col)
-        #print "score(MaxLexEgivenF) = ",score
-        if score > maxScore:
-          maxScore = score
-      totalscore *= maxScore
-    if totalscore == 0.0:
-      return 999
-    else:
-      return -math.log10(totalscore)
-
-
-class MaxLexFgivenE(ContextModel):
-
-  def __init__(self, context_manager, ttable, col=1):
-    ContextModel.__init__(self, context_manager)
-    self.ttable = ttable
-    self.col = col
-    self.wordless = 0
-    self.initial = None
-    self.contextual = False
-
-
-  def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-    totalscore = 1.0
-    fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase))
-    ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase))
-    ewords.append("NULL")
-    for f in fwords:
-      maxScore = 0.0
-      for e in ewords:
-        score = self.ttable.get_score(f, e, self.col)
-        #print "score(MaxLexFgivenE) = ",score
-        if score > maxScore:
-          maxScore = score
-      totalscore *= maxScore
-    if totalscore == 0.0:
-      return 999
-    else:
-      return -math.log10(totalscore)
-
-
-class IsSingletonF(ContextModel):
-
-        def __init__(self, context_manager, default=0.0):
-                ContextModel.__init__(self, context_manager)
-                self.contextual = False
-
-        def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-                return (fcount==1)
-
-
-class IsSingletonFE(ContextModel):
-
-        def __init__(self, context_manager, default=0.0):
-                ContextModel.__init__(self, context_manager)
-                self.contextual = False
-
-        def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-                return (paircount==1)
-
-class IsNotSingletonF(ContextModel):
-
-        def __init__(self, context_manager, default=0.0):
-                ContextModel.__init__(self, context_manager)
-                self.contextual = False
-
-        def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-                return (fcount>1)
-
-
-class IsNotSingletonFE(ContextModel):
-
-        def __init__(self, context_manager, default=0.0):
-                ContextModel.__init__(self, context_manager)
-                self.contextual = False
-
-        def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-                return (paircount>1)
-
-
-class IsFEGreaterThanZero(ContextModel):
-
-        def __init__(self, context_manager, default=0.0):
-                ContextModel.__init__(self, context_manager)
-                self.contextual = False
-
-        def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
-                return (paircount > 0.01)
-
-
diff --git a/sa-extract/cstrmap.pxd b/sa-extract/cstrmap.pxd
deleted file mode 100644
index 53becbc6..00000000
--- a/sa-extract/cstrmap.pxd
+++ /dev/null
@@ -1,12 +0,0 @@
-cdef extern from "strmap.h":
-    ctypedef struct StrMap
-    StrMap* stringmap_new()
-    void stringmap_delete(StrMap *vocab)
-    int stringmap_index(StrMap *vocab, char *s)
-    char* stringmap_word(StrMap *vocab, int i)
-
-cdef class StringMap:
-    cdef StrMap *vocab
-    cdef char *word(self, int i)
-    cdef int index(self, char *s)
-
diff --git a/sa-extract/cstrmap.pyx b/sa-extract/cstrmap.pyx
deleted file mode 100644
index d3883ea5..00000000
--- a/sa-extract/cstrmap.pyx
+++ /dev/null
@@ -1,14 +0,0 @@
-
-cdef class StringMap:
-    def __cinit__(self):
-        self.vocab = stringmap_new()
-
-    def __dealloc__(self):
-        stringmap_delete(self.vocab)
-        
-    cdef char *word(self, int i):
-        return stringmap_word(self.vocab, i)
-
-    cdef int index(self, char *s):
-        return stringmap_index(self.vocab, s)
-
diff --git a/sa-extract/csuf.pxd b/sa-extract/csuf.pxd
deleted file mode 100644
index 972f0178..00000000
--- a/sa-extract/csuf.pxd
+++ /dev/null
@@ -1,12 +0,0 @@
-cimport cdat
-cimport cintlist
-
-cdef class SuffixArray:
-	cdef cdat.DataArray darray
-	cdef cintlist.CIntList sa
-	cdef cintlist.CIntList ha
-	# cdef lookup(self, word, int offset, int low, int high)
-	cdef __lookup_helper(self, int word_id, int offset, int low, int high)
-	cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint)
-	cdef int __search_low(self, int word_id, int offset, int low, int high)
-	cdef int __search_high(self, int word_id, int offset, int low, int high)
diff --git a/sa-extract/csuf.pyx b/sa-extract/csuf.pyx
deleted file mode 100644
index 2f0b1383..00000000
--- a/sa-extract/csuf.pyx
+++ /dev/null
@@ -1,322 +0,0 @@
-# csuf.pyx
-# Defines suffix arrays that can be directly written to/read from disk in binary format
-# Adam Lopez <alopez@cs.umd.edu>
-
-import sys
-import log
-import cdat
-import cintlist
-import monitor
-
-from libc.stdio cimport FILE, fclose, fopen
-
-cdef class SuffixArray:
-
-  def __init__(self, filename, from_binary=False):
-    self.darray = cdat.DataArray()
-    self.sa = cintlist.CIntList()
-    self.ha = cintlist.CIntList()
-    if from_binary:
-      self.read_binary(filename)
-    else:
-      self.read_text(filename)
-
-
-  def __getitem__(self, i):
-    return self.sa.arr[i]
-
-
-  def getSentId(self, i):
-    return self.darray.getSentId(i)
-
-
-  def getSent(self, i):
-    return self.darray.getSent(i)
-
-
-  def getSentPos(self, loc):
-    return self.darray.getSentPos(loc)
-
-  def read_text(self, filename):
-    '''Constructs suffix array using the algorithm
-    of Larsson & Sadahkane (1999)'''
-    cdef int V, N, i, j, h, a_i, n, current_run, skip
-    cdef cintlist.CIntList isa, word_count
-
-    self.darray = cdat.DataArray(filename, from_binary=False, use_sent_id=True)
-    N = len(self.darray)
-    V = len(self.darray.id2word)
-
-    self.sa = cintlist.CIntList(initial_len=N)
-    self.ha = cintlist.CIntList(initial_len=V+1)
-
-    isa = cintlist.CIntList(initial_len=N)
-    word_count = cintlist.CIntList(initial_len=V+1)
-
-    '''Step 1: bucket sort data'''
-    sort_start_time = monitor.cpu()
-    start_time = sort_start_time
-    for i from 0 <= i < N:
-      a_i = self.darray.data.arr[i]
-      word_count.arr[a_i] = word_count.arr[a_i] + 1
-
-    n = 0
-    for i from 0 <= i < V+1:
-      self.ha.arr[i] = n
-      n = n + word_count.arr[i]
-      word_count.arr[i] = 0
-
-    for i from 0 <= i < N:
-      a_i = self.darray.data.arr[i]
-      self.sa.arr[self.ha.arr[a_i] + word_count.arr[a_i]] = i
-      isa.arr[i] = self.ha.arr[a_i + 1] - 1 # bucket pointer is last index in bucket
-      word_count.arr[a_i] = word_count.arr[a_i] + 1
-
-    '''Determine size of initial runs'''
-    current_run = 0
-    for i from 0 <= i < V+1:
-      if i < V and self.ha.arr[i+1] - self.ha.arr[i] == 1:
-        current_run = current_run + 1
-      else:
-        if current_run > 0:
-          self.sa.arr[self.ha.arr[i] - current_run] = -current_run
-          current_run = 0
-
-    sys.stderr.write("  Bucket sort took %f seconds\n" % (monitor.cpu() - sort_start_time))
-
-    '''Step 2: prefix-doubling sort'''
-    h = 1
-    while self.sa.arr[0] != -N:
-      sort_start_time = monitor.cpu()
-      sys.stderr.write("  Refining, sort depth = %d\n" % (h,))
-      i = 0
-      skip = 0
-      while i < N:
-        if self.sa.arr[i] < 0:
-          #sys.stderr.write("Skip from %d to %d\n" % (i, i-self.sa.arr[i]-1))
-          skip = skip + self.sa.arr[i]
-          i = i - self.sa.arr[i]
-        else:
-          if skip < 0:
-            self.sa.arr[i+skip] = skip
-            skip = 0
-          j = isa.arr[self.sa.arr[i]]
-          #sys.stderr.write("Process from %d to %d (%d, %d, %d)\n" % (i, j, self.sa.arr[i], self.darray.data.arr[self.sa.arr[i]], isa.arr[self.sa.arr[i]]))
-          self.q3sort(i, j, h, isa)
-          i = j+1
-      if skip < 0:
-        self.sa.arr[i+skip] = skip
-      h = h * 2
-      sys.stderr.write("  Refinement took %f seconds\n" % (monitor.cpu() - sort_start_time))
-
-    '''Step 3: read off suffix array from inverse suffix array''' 
-    sys.stderr.write("  Finalizing sort...\n")
-    for i from 0 <= i < N:
-      j = isa.arr[i]
-      self.sa.arr[j] = i
-    sys.stderr.write("Suffix array construction took %f seconds\n" % (monitor.cpu() - start_time))
-
-  def q3sort(self, int i, int j, int h, cintlist.CIntList isa, pad=""):
-    '''This is a ternary quicksort. It divides the array into
-    three partitions: items less than the pivot, items equal
-    to pivot, and items greater than pivot.  The first and last
-    of these partitions are then recursively sorted'''
-    cdef int k, midpoint, pval, phead, ptail, tmp
-
-    if j-i < -1:
-      raise Exception("Unexpected condition found in q3sort: sort from %d to %d" % (i,j))
-    if j-i == -1:  # recursive base case -- empty interval
-      return
-    if (j-i == 0):  # recursive base case -- singleton interval
-      isa.arr[self.sa.arr[i]] = i
-      self.sa.arr[i] = -1
-      return 
-
-    # NOTE: choosing the first item as a pivot value resulted in 
-    # stack overflow for some very large buckets.  I think there
-    # is a natural bias towards order due the way the word ids are
-    # assigned; thus this resulted in the range to the left of the 
-    # pivot being nearly empty.   Therefore, choose the middle item.
-    # If the method of assigning word_id's is changed, this method
-    # may need to be reconsidered as well.
-    midpoint = (i+j)/2
-    pval = isa.arr[self.sa.arr[midpoint] + h]
-    if i != midpoint:
-      tmp = self.sa.arr[midpoint]
-      self.sa.arr[midpoint] = self.sa.arr[i]
-      self.sa.arr[i] = tmp
-    phead = i
-    ptail = i
-
-    # find the three partitions.  phead marks the first element
-    # of the middle partition, and ptail marks the last element
-    for k from i+1 <= k < j+1:
-      if isa.arr[self.sa.arr[k] + h] < pval:
-        if k > ptail+1:
-          tmp = self.sa.arr[phead]
-          self.sa.arr[phead] = self.sa.arr[k]
-          self.sa.arr[k] = self.sa.arr[ptail+1]
-          self.sa.arr[ptail+1] = tmp
-        else: # k == ptail+1
-          tmp = self.sa.arr[phead]
-          self.sa.arr[phead] = self.sa.arr[k]
-          self.sa.arr[k] = tmp
-        phead = phead + 1
-        ptail = ptail + 1
-      else:
-        if isa.arr[self.sa.arr[k] + h] == pval:
-          if k > ptail+1:
-            tmp = self.sa.arr[ptail+1]
-            self.sa.arr[ptail+1] = self.sa.arr[k]
-            self.sa.arr[k] = tmp
-          ptail = ptail + 1
-
-    # recursively sort smaller suffixes
-    self.q3sort(i, phead-1, h, isa, pad+"  ")
-
-    # update suffixes with pivot value
-    # corresponds to update_group function in Larsson & Sadakane
-    for k from phead <= k < ptail+1:
-      isa.arr[self.sa.arr[k]] = ptail
-    if phead == ptail:
-      self.sa.arr[phead] = -1
-
-    # recursively sort larger suffixes
-    self.q3sort(ptail+1, j, h, isa, pad+"  ")
-
-
-  def write_text(self, filename):
-    self.darray.write_text(filename)
-
-
-  def read_binary(self, filename):
-    cdef FILE *f
-    cdef bytes bfilename = filename
-    cdef char* cfilename = bfilename
-    f = fopen(cfilename, "r")
-    self.darray.read_handle(f)
-    self.sa.read_handle(f)
-    self.ha.read_handle(f)
-    fclose(f)
-
-
-  def write_binary(self, filename):
-    cdef FILE* f
-    cdef bytes bfilename = filename
-    cdef char* cfilename = bfilename
-    f = fopen(cfilename, "w")
-    self.darray.write_handle(f)
-    self.sa.write_handle(f)
-    self.ha.write_handle(f)
-    fclose(f)
-
-
-  def write_enhanced(self, filename):
-    f = open(filename, "w")
-    self.darray.write_enhanced_handle(f)
-    for a_i in self.sa:
-      f.write("%d " % a_i)
-    f.write("\n")
-    for w_i in self.ha:
-      f.write("%d " % w_i)
-    f.write("\n")
-    f.close()
-
-
-  cdef int __search_high(self, int word_id, int offset, int low, int high):
-    cdef int midpoint
-
-    if low >= high:
-      return high
-    midpoint = (high + low) / 2
-    if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id:
-      return self.__search_high(word_id, offset, midpoint+1, high)
-    else:
-      return self.__search_high(word_id, offset, low, midpoint)
-
-
-  cdef int __search_low(self, int word_id, int offset, int low, int high):
-    cdef int midpoint
-
-    if low >= high:
-      return high
-    midpoint = (high + low) / 2
-    if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id:
-      return self.__search_low(word_id, offset, low, midpoint)
-    else:
-      return self.__search_low(word_id, offset, midpoint+1, high)
-
-
-  cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint):
-    return (self.__search_low(word_id, offset, low, midpoint),
-        self.__search_high(word_id, offset, midpoint, high))
-
-
-  cdef __lookup_helper(self, int word_id, int offset, int low, int high):
-    cdef int midpoint
-
-    if offset == 0:
-      return (self.ha.arr[word_id], self.ha.arr[word_id+1])
-    if low >= high:
-      return None
-
-    midpoint = (high + low) / 2
-    if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id:
-      return self.__get_range(word_id, offset, low, high, midpoint)
-    if self.darray.data.arr[self.sa.arr[midpoint] + offset] > word_id:
-      return self.__lookup_helper(word_id, offset, low, midpoint)
-    else:
-      return self.__lookup_helper(word_id, offset, midpoint+1, high)
-
-
-  def lookup(self, word, int offset, int low, int high):
-    cdef int wordid
-    if low == -1: 
-      low = 0
-    if high == -1:
-      high = len(self.sa)
-    if word in self.darray.word2id:
-      word_id = self.darray.word2id[word]
-      return self.__lookup_helper(word_id, offset, low, high)
-    else:
-      return None
-
-
-
-  def print_sa(self, isa):
-    '''Slow; Use only in case of emergency'''
-    cdef int i, j, k, N
-    cdef cintlist.CIntList tmp_sa
-
-    N = len(self.sa)
-    for i from 0 <= i < N:
-      sys.stderr.write("%2d " % i)
-    sys.stderr.write("\n")
-    for i from 0 <= i < N:
-      sys.stderr.write("%2d " % self.darray.data.arr[i])
-    sys.stderr.write("\n")
-    for i from 0 <= i < N:
-      sys.stderr.write("%2d " % isa.arr[i])
-    sys.stderr.write("\n\n\n")
-
-    # Recover partially sorted array
-    tmp_sa = cintlist.CIntList(initial_len=N)
-    for i from 0 <= i < N:
-      j = isa.arr[i]
-      tmp_sa.arr[j] = i
-    for i from 0 <= i < N:
-      if self.sa.arr[i] > 0:
-        tmp_sa.arr[i] = self.sa.arr[i]
-
-    for i from 0 <= i < N:
-      j = tmp_sa.arr[i]
-      sys.stderr.write("%2d %2d | " % (i, self.sa.arr[i]))
-      for k from j <= k < N:
-        sys.stderr.write("%2d " % self.darray.data.arr[k])
-      sys.stderr.write("\n")
-    sys.stderr.write("\n")
-
-
-
-
-
diff --git a/sa-extract/cveb.pxd b/sa-extract/cveb.pxd
deleted file mode 100644
index 8967f8e3..00000000
--- a/sa-extract/cveb.pxd
+++ /dev/null
@@ -1,15 +0,0 @@
-cdef struct _VEB:
-	int top_universe_size
-	int num_bottom_bits
-	int max_val
-	int min_val
-	int size
-	void* top
-	void** bottom
-
-
-cdef class VEB:
-	cdef _VEB* veb
-	cdef int _findsucc(self, int i)
-	cdef int _insert(self, int i)
-	cdef int _first(self)
diff --git a/sa-extract/cveb.pyx b/sa-extract/cveb.pyx
deleted file mode 100644
index ca87becc..00000000
--- a/sa-extract/cveb.pyx
+++ /dev/null
@@ -1,390 +0,0 @@
-#!/usr/bin/env python2.4
-'''This module implements a partial stratified tree (van Emde Boas, 1977).
-Only insert findsucc, __iter__, and __contains__ are implemented.
-Delete is currently not supported.
-There is very little error-checking in this code -- it is designed
-to be used in the limited situation described in Lopez (EMNLP-CoNLL 2007),
-which doesn't cover all of the possible ways that you could misuse it
-(e.g. trying to insert a key larger than the universe size)
-Other notes -- this code is really rather ugly C code masquerading as
-Pyrex/Python.  Virtual function calls are bypassed by hand in several
-places for the sake of efficiency, and other Python niceties are 
-removed for the same reason.'''
-
-from libc.stdlib cimport malloc, free
-from libc.math cimport log, ceil
-from libc.string cimport memset
-
-cdef int MIN_BOTTOM_SIZE
-cdef int MIN_BOTTOM_BITS
-
-MIN_BOTTOM_SIZE = 32
-MIN_BOTTOM_BITS = 5
-
-cdef int lower_mask[32]
-cdef int i, mask
-
-for i from 0 <= i < MIN_BOTTOM_SIZE:
-	mask = (mask << 1) + 1
-	lower_mask[i] = mask
-
-
-cdef struct _BitSet:
-	long bitset
-	int min_val
-	int max_val
-	int size
-
-
-cdef _BitSet* new_BitSet():
-	cdef _BitSet* b
-
-	b = <_BitSet*> malloc(sizeof(_BitSet))
-	b.bitset = 0
-	b.min_val = -1
-	b.max_val = -1
-	b.size = 0
-	return b
-
-
-cdef int bitset_findsucc(_BitSet* b, int i):
-	cdef int bitset, mask
-	cdef int low, high, mid
-
-	if b.max_val == -1 or i >= b.max_val:
-		return -1
-	if i < b.min_val:
-		return b.min_val
-
-	bitset = b.bitset & ~lower_mask[i]
-	low = i+1
-	high = b.max_val+1
-	while low < high-1:
-		mid = (high + low)/2
-		mask = ~(lower_mask[high-1] ^ lower_mask[mid-1])
-		if bitset & mask == 0:
-			low = mid
-		else: 
-			bitset = bitset & mask
-			high = mid
-	return low
-
-
-cdef int bitset_insert(_BitSet* b, int i):
-	cdef int val
-
-	val = 1 << i
-	if b.bitset & val == 0:
-		b.bitset = b.bitset | val
-		if b.size == 0:
-			b.min_val = i
-			b.max_val = i
-		else:
-			if i < b.min_val:
-				b.min_val = i
-			if i > b.max_val:
-				b.max_val = i
-		b.size = b.size + 1
-		return 1
-	return 0
-
-
-cdef int bitset_contains(_BitSet* b, int i):
-	cdef int val
-
-	val = 1 << i
-	if b.bitset & val == 0:
-		return 0
-	else:
-		return 1
-
-
-cdef class BitSetIterator:
-	cdef _BitSet* b
-	cdef int next_val
-
-	def __next__(self):
-		cdef int ret_val
-
-		if self.next_val == -1:
-			raise StopIteration()
-		ret_val = self.next_val
-		self.next_val = bitset_findsucc(self.b, ret_val)
-		return ret_val
-
-
-
-# This is a Python wrapper class to give access to the
-# (entirely C-implemented) _BitSet struct. 
-# Very slow; use only for debugging
-cdef class BitSet:
-
-	cdef _BitSet* b
-
-	def __cinit__(self):
-		self.b = new_BitSet()
-
-	def __dealloc__(self):
-		free(self.b)
-
-	def __iter__(self):
-		cdef BitSetIterator it
-		it = BitSetIterator()
-		it.b = self.b
-		it.next_val = self.b.min_val
-		return it
-
-	def insert(self, i):
-		return bitset_insert(self.b, i)
-
-	def findsucc(self, i):
-		return bitset_findsucc(self.b, i)
-
-	def __str__(self):
-		return dec2bin(self.b.bitset)+"  ("+str(self.b.size)+","+str(self.b.min_val)+","+str(self.b.max_val)+")"
-
-	def min(self):
-		return self.b.min_val
-
-	def max(self):
-		return self.b.max_val
-
-	def __len__(self):
-		return self.b.size
-
-	def __contains__(self, i):
-		return bool(bitset_contains(self.b, i))
-
-
-def dec2bin(i):
-	cdef d
-	result = ""
-	for d from 0 <= d < MIN_BOTTOM_SIZE:
-		if i & lower_mask[0] == 0:
-			result = "0"+result
-		else:
-			result = "1"+result
-		i = i >> 1
-	return result
-
-
-cdef _VEB* new_VEB(int n):
-	cdef _VEB* veb
-	cdef int num_bits, num_top_bits, i
-
-	veb = <_VEB*> malloc(sizeof(_VEB))
-
-	num_bits = int(ceil(log(n) / log(2)))
-	veb.num_bottom_bits = num_bits/2
-	if veb.num_bottom_bits < MIN_BOTTOM_BITS:
-		veb.num_bottom_bits = MIN_BOTTOM_BITS
-	veb.top_universe_size = (n >> veb.num_bottom_bits) + 1
-
-	veb.bottom = <void**> malloc(veb.top_universe_size * sizeof(void*))
-	memset(veb.bottom, 0, veb.top_universe_size * sizeof(void*))
-
-	if veb.top_universe_size > MIN_BOTTOM_SIZE:
-		veb.top = new_VEB(veb.top_universe_size)
-	else:
-		veb.top = new_BitSet()
-
-	veb.max_val = -1
-	veb.min_val = -1
-	veb.size = 0
-	return veb
-
-
-cdef int VEB_insert(_VEB* veb, int i):
-	cdef _VEB* subv
-	cdef _BitSet* subb
-	cdef int a, b, tmp
-
-	if veb.size == 0:
-		veb.min_val = i
-		veb.max_val = i
-	elif i == veb.min_val or i == veb.max_val:
-		return 0
-	else:
-		if i < veb.min_val:
-			tmp = i
-			i = veb.min_val
-			veb.min_val = tmp
-		a = i >> veb.num_bottom_bits
-		b = i & lower_mask[veb.num_bottom_bits-1]
-		if veb.bottom[a] == NULL:
-			if veb.top_universe_size > MIN_BOTTOM_SIZE:
-				subv = <_VEB*> veb.top
-				VEB_insert(subv, a)
-			else:
-				subb = <_BitSet*> veb.top
-				bitset_insert(subb, a)
-			if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-				veb.bottom[a] = new_VEB(1 << veb.num_bottom_bits)
-			else:
-				veb.bottom[a] = new_BitSet()
-		if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-			subv = <_VEB*> veb.bottom[a]
-			if VEB_insert(subv, b) == 0:
-				return 0
-		else:
-			subb = <_BitSet*> veb.bottom[a]
-			if bitset_insert(subb, b) == 0:
-				return 0
-
-		if i > veb.max_val:
-			veb.max_val = i
-	veb.size = veb.size + 1
-	return 1
-
-
-cdef del_VEB(_VEB* veb):
-	cdef int i
-
-	if veb.top_universe_size > MIN_BOTTOM_SIZE:
-		i = (<_VEB*> veb.top).min_val
-	else:
-		i = (<_BitSet*> veb.top).min_val
-
-	while i != -1:
-		if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-			del_VEB(<_VEB*> veb.bottom[i])
-		else:
-			free(<_BitSet*> veb.bottom[i])
-
-		if veb.top_universe_size > MIN_BOTTOM_SIZE:
-			i = VEB_findsucc(<_VEB*> veb.top, i)
-		else:
-			i = bitset_findsucc(<_BitSet*> veb.top, i)
-
-	if veb.top_universe_size > MIN_BOTTOM_SIZE:
-		del_VEB(<_VEB*> veb.top)
-	else:
-		free(<_BitSet*> veb.top)
-	free(veb.bottom)
-	free(veb)
-
-
-cdef int VEB_findsucc(_VEB* veb, int i):
-	cdef _VEB* subv
-	cdef _BitSet* subb
-	cdef int a, b, j, c, found
-
-	if veb.max_val == -1 or i>=veb.max_val:
-		return -1
-	if i < veb.min_val:
-		return veb.min_val
-
-	a = i >> veb.num_bottom_bits
-	b = i & lower_mask[veb.num_bottom_bits-1]
-	found = 0
-	if veb.bottom[a] != NULL:
-		if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-			subv = <_VEB*> veb.bottom[a]
-			if subv.max_val > b:
-				j = (a << veb.num_bottom_bits) + VEB_findsucc(subv, b)
-				found = 1
-		else:
-			subb = <_BitSet*> veb.bottom[a]
-			if subb.max_val > b:
-				j = (a << veb.num_bottom_bits) + bitset_findsucc(subb, b)
-				found = 1
-	if found==0:
-		if veb.top_universe_size > MIN_BOTTOM_SIZE:
-			subv = <_VEB*> veb.top
-			c = VEB_findsucc(subv, a)
-		else:
-			subb = <_BitSet*> veb.top
-			c = bitset_findsucc(subb, a)
-		if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-			subv = <_VEB*> veb.bottom[c]
-			j = (c << veb.num_bottom_bits) + subv.min_val
-		else:
-			subb = <_BitSet*> veb.bottom[c]
-			j = (c << veb.num_bottom_bits) + subb.min_val
-	return j
-
-
-cdef int VEB_contains(_VEB* veb, int i):
-	cdef _VEB* subv
-	cdef _BitSet* subb
-	cdef int a, b
-
-	if veb.size == 0 or i < veb.min_val or i > veb.max_val:
-		return 0
-
-	if veb.min_val == i:
-		return 1
-	else:
-		if veb.size == 1:
-			return 0
-
-	a = i >> veb.num_bottom_bits
-	b = i & lower_mask[veb.num_bottom_bits-1]
-	if veb.bottom[a] == NULL:
-		return 0
-	else:
-		if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-			subv = <_VEB*> veb.bottom[a]
-			return VEB_contains(subv, b)
-		else:
-			subb = <_BitSet*> veb.bottom[a]
-			return bitset_contains(subb, b)
-
-
-cdef class VEBIterator:
-	cdef _VEB* v
-	cdef int next_val
-
-	def __next__(self):
-		cdef int ret_val
-
-		if self.next_val == -1:
-			raise StopIteration()
-		ret_val = self.next_val
-		self.next_val = VEB_findsucc(self.v, ret_val)
-		return ret_val
-
-
-cdef class VEB:
-
-	def __init__(self, size):
-		pass
-
-	def __cinit__(self, int size):
-		self.veb = new_VEB(size)
-
-	def __dealloc__(self):
-		del_VEB(self.veb)
-
-	def __iter__(self):
-		cdef VEBIterator it
-		it = VEBIterator()
-		it.v = self.veb
-		it.next_val = self.veb.min_val
-		return it
-
-	def insert(self, i):
-		return VEB_insert(self.veb, i)
-
-	cdef int _insert(self, int i):
-		return VEB_insert(self.veb, i)
-
-	def findsucc(self, i):
-		return VEB_findsucc(self.veb, i)
-
-	cdef int _first(self):
-		return self.veb.min_val
-
-	cdef int _findsucc(self, int i):
-		return VEB_findsucc(self.veb, i)
-
-	def __len__(self):
-		return self.veb.size
-
-	def __contains__(self, i):
-		return VEB_contains(self.veb, i)
-
-
-
-
-
diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl
deleted file mode 100755
index 02fd7445..00000000
--- a/sa-extract/escape-testset.pl
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/perl -w
-
-use utf8;
-use strict;
-
-binmode(STDIN,":utf8");
-binmode(STDOUT,":utf8");
-
-my @fh = ();
-if (scalar @ARGV == 0) {
-  push @fh, \*STDIN;
-} else {
-  for my $file (@ARGV) {
-    my $f;
-    open $f, "<$file" or die "Can't read $file: $!\n";
-    binmode $f, ":utf8";
-    push @fh, $f;
-  }
-}
-
-my $id = -1;
-for my $f (@fh) {
-  while(<$f>) {
-    chomp;
-    die "Empty line in test set" if /^\s*$/;
-    die "Please remove <seg> tags from input:\n$_" if /^\s*<seg/i;
-    $id++;
-    s/&/\&amp;/g;
-    s/</\&lt;/g;
-    s/>/\&gt;/g;
-    print "<seg id=\"$id\"> $_ </seg>\n";
-  }
-}
-
-
diff --git a/sa-extract/example/README b/sa-extract/example/README
deleted file mode 100644
index f6eac52b..00000000
--- a/sa-extract/example/README
+++ /dev/null
@@ -1,8 +0,0 @@
-Commands to compile a corpus and extract some grammars
-======================================================
-
-# compile
-../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini
-# extract
-cat test.de | ../escape-testset.pl | ../extractor.py -c extract.ini
-
diff --git a/sa-extract/example/corpus.align.gz b/sa-extract/example/corpus.align.gz
deleted file mode 100644
index 741de7e4..00000000
Binary files a/sa-extract/example/corpus.align.gz and /dev/null differ
diff --git a/sa-extract/example/corpus.de.gz b/sa-extract/example/corpus.de.gz
deleted file mode 100644
index 0d66470a..00000000
Binary files a/sa-extract/example/corpus.de.gz and /dev/null differ
diff --git a/sa-extract/example/corpus.en.gz b/sa-extract/example/corpus.en.gz
deleted file mode 100644
index 28cb5c58..00000000
Binary files a/sa-extract/example/corpus.en.gz and /dev/null differ
diff --git a/sa-extract/example/test.de b/sa-extract/example/test.de
deleted file mode 100644
index 8923329f..00000000
--- a/sa-extract/example/test.de
+++ /dev/null
@@ -1,10 +0,0 @@
-dies ist der richtige ansatz für diejenigen in chinas politischer führung , die aus der who den maximalen nutzen für die unterstützung der inneren reform ziehen wollen .
-taiwan hat sich auch vorgenommen , konstruktiv zu sein - wenn china mitspielt .
-die stadt staaten hongkong und singapur verfolgen eine klarsichtige who - politik und konzentrieren sich auf markt zugänge und starke regeln .
-malaysia und thailand sind auch recht aktiv innerhalb der who , mit verschiedenen positionen , die vom frei handel bis zum protektionismus reichen .
-indonesien und die philippinen sind schwächer , überwältigt von politischer zusammen hanglosigkeit und ganz in anspruch genommen von den anstrengungen , das schlimmste zu hause zu verhüten , so dass nur geringe kräfte übrig bleiben , mit der stets anschwellenden und immer komplizierteren agenda der who fertig zu werden .
-die who steht vor einer wichtigen entscheidung .
-sie muss dringend den handel progressiv liberalisieren .
-eine starke führung seitens der usa ist erforderlich , damit die who in diese richtung gebracht werden kann und man gleichzeitig vermeidet , die zukunft nach dem muster der eu zu gestalten ( regel wucherung ) oder nach dem muster der uno ( macht lose gespräch runde ) .
-dies geschieht sicher besser unter bush , mit einem klaren bekenntnis zum offenen markt und einer aktiveren außen politik , als es unter irgendeiner demokratischen alternative geschehen könnte .
-robert zoellick , präsident bushs handel beauftragter , braucht aber verbündete .
diff --git a/sa-extract/example/test.ref.en b/sa-extract/example/test.ref.en
deleted file mode 100644
index e50edcac..00000000
--- a/sa-extract/example/test.ref.en
+++ /dev/null
@@ -1,10 +0,0 @@
-this is the right approach for those in china 's leadership who wish to extract maximum benefits from the wto to bolster domestic reform .
-taiwan is also set to play a constructive role -- if mainland china plays along .
-the city states , hong kong and singapore , have clear - sighted wto policies , focusing on market access and strong rules .
-malaysia and thailand are also fairly active in the wto , with a mix of free - market and protectionist positions .
-indonesia and the philippines are weaker , overwhelmed by policy incoherence and fire - fighting at home , and with insufficient capacity to deal with the wto 's burgeoning and increasingly complicated agenda .
-the wto is at a crossroads .
-it sorely needs to liberalize trade progressively .
-strong us leadership is required to push the wto in this direction while avoiding an eu - style future ( regulatory overload ) or a un - style future ( an irrelevant talking shop ) .
-this is more likely under a bush administration with better open - market credentials and a more assertive foreign policy than any democratic alternative .
-however , robert zoellick , president bush 's trade representative , needs allies .
diff --git a/sa-extract/extract.ini b/sa-extract/extract.ini
deleted file mode 100644
index 56913245..00000000
--- a/sa-extract/extract.ini
+++ /dev/null
@@ -1,116 +0,0 @@
-# This .ini file extracts grammars to a file using
-# the pattern matching infrastructure.
-#
-# Does not do any decoding.
-#
-# Variables can be set using sa-system.pl
-#
-# Usage: decoder.py -c <this ini file> [-x <grammar file>]
-#
-# If the -x option is used, grammar will be written to the
-# specified file, otherwise it is written to $PWD/grammar.out
-# 
-# NOTE: all information about rules is cached, so use generous
-# memory limits (rules themselves are not cached.)
-
-import os
-import manager
-import clex
-import context_model
-import rulefactory
-import calignment
-import sys
-
-out_grammar_file = "grammar.out"
-if opts.extra:
-	out_grammar_file = opts.extra
-
-# *** these variables written by sa-system.pl.  Do not modify ***
-lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz"
-f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin"
-e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin"
-a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin"
-lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin"
-max_len = 5
-max_nt = 2
-max_size=10
-min_gap=1
-rank1 = 100
-rank2 = 10
-precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin"
-
-# check for path errors
-if not os.path.exists(f_sa_file):
-	raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file)
-if not os.path.exists(e_file):
-	raise Exception("Cannot find compiled target language array file %s" % e_file)
-if not os.path.exists(a_file):
-	raise Exception("Cannot find compiled alignment file %s" % a_file)
-if not os.path.exists(lex_file):
-	raise Exception("Cannot find compiled lexical weights file %s" % lex_file)
-if not os.path.exists(precompute_file):
-	log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file)
-	precompute_file = None
-
-### Output options
-mark_phrases = False      # show derivation as SGML markup in output
-mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!)
-
-# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose
-log.level = 1
-log.file = sys.stderr
-
-# pattern-matching stuff
-class PhonyGrammar:  # saves us the cost of keeping the rules around
-	def add(self, thing):
-		pass
-
-local_grammar = PhonyGrammar()
-xcat="X"
-
-cm = manager.ContextManager(
-	f_sa_file,
-	e_file,
-	sampler=rulefactory.Sampler(300),  # lower=faster, higher=better; improvements level off above 200-300 range, -1 = don't sample, use all data (VERY SLOW!)
-	rulefactory=rulefactory.HieroCachingRuleFactory(
-		alignment=calignment.Alignment(  # compiled alignment object (REQUIRED)
-			a_file, 
-			from_binary=True
-			),
-		category="["+xcat+"]",           # name of generic nonterminal used by Hiero
-		grammar=local_grammar,           # do not change for extraction
-		max_chunks=None,                 # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1
-		max_initial_size=15,             # maximum span of a grammar rule in TEST DATA
-		max_length=max_len,              # maximum number of symbols (both T and NT) allowed in a rule
-		max_nonterminals=max_nt,         # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
-		max_target_chunks=None,          # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1
-		max_target_length=None,          # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size
-		min_gap_size=1,                  # minimum span of a nonterminal in the RHS of a rule in TEST DATA
-		precompute_file=precompute_file, # filename of file containing precomputed collocations
-		precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20).
-		precompute_rank=rank1,           # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300)
-		require_aligned_terminal=True,   # require extracted rules to have at least one aligned word
-		require_aligned_chunks=False,    # require each contiguous chunk of extracted rules to have at least one aligned word
-		per_sentence_grammar=True,       # generate a complete grammar for each input sentence
-		rule_file=out_grammar_file,      # grammar is written to this file (sentence id is added to file name for per sentence grammars)
-		train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA
-		train_min_gap_size=min_gap,      # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
-		tight_phrases=True,              # True if phrases should be tight, False otherwise (False seems to give better results but is slower)
-		),
-	from_binary=True
-	)
-
-# lexical weighting tables
-tt = clex.CLex(lex_file, from_binary=True)
-
-# Only include features that depend on rule identity here
-add_model(context_model.EgivenFCoherent(cm), 0.125) 
-add_model(context_model.SampleCountF(cm), 0.125)
-add_model(context_model.CountEF(cm), 0.125)
-add_model(context_model.MaxLexFgivenE(cm, tt), 0.125) 
-add_model(context_model.MaxLexEgivenF(cm, tt), 0.125) 
-add_model(context_model.IsSingletonF(cm), 0.125)
-add_model(context_model.IsSingletonFE(cm), 0.125)
-
-# grammars, search parameters and all that other stuff are irrelevant
-
diff --git a/sa-extract/extractor.py b/sa-extract/extractor.py
deleted file mode 100755
index 9d66ebf0..00000000
--- a/sa-extract/extractor.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env python
-
-# vim:expandtab:shiftwidth=4
-
-import sys, gc, monitor, sgml
-import optparse
-import model
-import log
-import cn
-
-models = []
-
-def add_model(m,w=0.0):
-    models.append(m)
-
-def extract_grammar(input):
-    confnet = cn.ConfusionNet(input)
-    meta = input.meta
-    for m in models:
-        m.input(confnet.columns, meta)
-
-if __name__ == "__main__":
-    optparser = optparse.OptionParser()
-    optparser.add_option("-c", "--config", dest="config", help="configuration module")
-    optparser.add_option("-x", "--extra", dest="extra", help="output grammar name override")
-    (opts,args) = optparser.parse_args()
-
-    if opts.config is None:
-        raise ValueError, "You must specify a configuration file."
-    else:
-        if log.level >= 1:
-            log.write("Reading configuration from %s\n" % opts.config)
-        execfile(opts.config)
-
-    if len(args) >= 1 and args[0] != "-":
-        input_file = file(args[0], "r")
-    else:
-        input_file = sys.stdin
-
-    if len(args) >= 2 and args[1] != "-":
-        output_file = file(args[1], "w")
-    else:
-        output_file = sys.stdout
-
-    gc.collect()
-    if log.level >= 1:
-        log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu()))
-        log.write("models: %s\n" % (" ".join(str(x.name) for x in models)))
-
-    sents = sgml.read_raw(input_file)
-    for sent in sents:
-        mark = sent.getmark()
-        if mark is not None:
-            (tag, attrs) = mark
-            if tag == "seg":
-                sent.unmark()
-                dattrs = sgml.attrs_to_dict(attrs)
-                sent.meta = attrs
-        extract_grammar(sent)
-
diff --git a/sa-extract/lcp.pyx b/sa-extract/lcp.pyx
deleted file mode 100644
index a992d3ee..00000000
--- a/sa-extract/lcp.pyx
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/usr/bin/env python2.4
-
-'''Compute LCP array for a suffix array using the Kasai et al. algorithm'''
-'''Can also be used to compute statistics such
-as k most frequent n-grams'''
-
-import sys
-
-cimport cintlist
-cimport csuf
-cimport cdat
-cimport cveb
-
-cdef class LCP:
-
-	cdef csuf.SuffixArray sa
-	cdef cintlist.CIntList lcp
-
-	def __init__(self, sa):
-		self._construct(sa)
-
-	cdef _construct(self, csuf.SuffixArray sa):
-		cdef int i, k, j, h, n
-		cdef cintlist.CIntList rank
-
-		sys.stderr.write("Constructing LCP array\n")
-		self.sa = sa
-		n = self.sa.sa.len
-		self.lcp = cintlist.CIntList(initial_len=n)
-
-		rank = cintlist.CIntList(initial_len=n)
-		for i from 0 <= i < n:
-			rank.arr[sa.sa.arr[i]] = i
-
-		h = 0
-		for i from 0 <= i < n:
-			k = rank.arr[i]
-			if k == 0:
-				self.lcp.arr[k] = -1
-			else:
-				j = sa.sa.arr[k-1]
-				while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]:
-					h = h+1
-				self.lcp.arr[k] = h
-			if h > 0:
-				h = h-1
-		sys.stderr.write("LCP array completed\n")
-
-
-	def compute_stats(self, max_n):
-		self._compute_stats(max_n)
-
-	cdef _compute_stats(self, int max_n):
-		'''Note: the output of this function is not exact.  In
-		particular, the frequency associated with each word is 
-		not guaranteed to be correct.  This is due to a bit of
-		laxness in the design; the function is intended only to
-		obtain a list of the most frequent words; for this 
-		purpose it is perfectly fine'''
-		cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid
-		cdef cintlist.CIntList run_start
-		cdef cintlist.CIntList ngram_start
-		cdef cveb.VEB veb
-		
-		N = self.sa.sa.len
-
-		ngram_starts = []
-		for n from 0 <= n < max_n:
-			ngram_starts.append(cintlist.CIntList(initial_len=N))
-
-		run_start = cintlist.CIntList(initial_len=max_n)
-		veb = cveb.VEB(N)
-
-		for i from 0 <= i < N:
-			h = self.lcp.arr[i]
-			if h < 0:
-				h = 0
-			for n from h <= n < max_n:
-				rs = run_start.arr[n]
-				run_start.arr[n] = i
-				freq = i - rs
-				if freq > 1000: # arbitrary, but see note below
-					veb._insert(freq)
-					ngram_start = ngram_starts[n]
-					while ngram_start.arr[freq] > 0:
-						freq = freq + 1 # cheating a bit, should be ok for sparse histogram
-					ngram_start.arr[freq] = rs
-		i = veb.veb.min_val
-		while i != -1:
-			ii = veb._findsucc(i)
-			for n from 0 <= n < max_n:
-				ngram_start = ngram_starts[n]
-				iii = i
-				rs = ngram_start.arr[iii]
-				while (ii==-1 or iii < ii) and rs != 0:
-					j = self.sa.sa.arr[rs]
-					valid = 1
-					for k from 0 <= k < n+1:
-						if self.sa.darray.data.arr[j+k] < 2:
-							valid = 0
-					if valid:
-						ngram = ""
-						for k from 0 <= k < n+1:
-							ngram= ngram+ self.sa.darray.id2word[self.sa.darray.data.arr[j+k]] + " "
-						print i, n+1, ngram
-					iii = iii + 1
-					rs = ngram_start.arr[iii]
-			i = ii
-
-
-
-
-
diff --git a/sa-extract/lcp_ops.py b/sa-extract/lcp_ops.py
deleted file mode 100755
index 9df6e82a..00000000
--- a/sa-extract/lcp_ops.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env python
-
-import lcp
-import sys
-import optparse
-import csuf
-
-'''Run various computations using the LCP array'''
-def main():
-
-  optparser = optparse.OptionParser()
-#    optparser.add_option("-c", "--config", dest="config", help="configuration module")
-  optparser.add_option("-s", "--sa-check", dest="sa_check", default=False, action="store_true")
-  optparser.add_option("-l", "--lcp-check", dest="lcp_check", default=False, action="store_true")
-  optparser.add_option("-t", "--stats", dest="stats", default=0, type="int", action="store")
-  optparser.add_option("-u", "--unigram", dest="uni_check", default=False, action="store_true")
-  optparser.add_option("-r", "--report-long-lcps", dest="long_lcp", type="int", default=0, action="store")
-  (opts,args) = optparser.parse_args()
-
-  if len(args) < 1:
-    print >>sys.stderr, "Usage: lcp.py [opts] <sa file>"
-    sys.exit(1)
-
-  safile = args[0]
-  sa = csuf.SuffixArray(safile, from_binary=True)
-
-#  if opts.sa_check:
-#    check_sufarray(sa)
-
-  l = lcp.LCP(sa)
-
-  if opts.lcp_check:
-    print >>sys.stderr, "Checking LCP Array..."
-    l.check()
-    print >>sys.stderr, "Check finished"
-
-  if opts.stats > 0:
-    l.compute_stats(opts.stats)
-
-#  if opts.uni_check:
-#    if lcp is None:
-#      lcp = LCP(sa)
-#    unigram_stats(sa, lcp)
-#
-#  if opts.long_lcp:
-#    if lcp is None:
-#      lcp = LCP(sa, opts.long_lcp)
-
-if __name__ == "__main__":
-  sys.exit(main())
-  
-
diff --git a/sa-extract/log.py b/sa-extract/log.py
deleted file mode 100644
index d4f96cb4..00000000
--- a/sa-extract/log.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import sys
-
-level = 1
-file = sys.stderr
-
-def writeln(s="", l=0):
-    if level >= l:
-	file.write("%s\n" % s)
-	file.flush()
-
-def write(s, l=0):
-    if level >= l:
-	file.write(s)
-	file.flush()
-
-
-    
-    
diff --git a/sa-extract/manager.py b/sa-extract/manager.py
deleted file mode 100644
index 3a079c2a..00000000
--- a/sa-extract/manager.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import csuf
-import cdat
-import cintlist
-
-class Sampler(object):
-	'''A Sampler implements a logic for choosing
-	samples from a population range'''
-
-	def __init__(self):
-		pass
-
-	def registerContext(self, context_manager):
-		self.context_manager = context_manager
-
-	def sample(self, phrase_location):
-		return cintlist.CIntList()
-
-
-class Extractor(object):
-	'''Extractor is responsible for extracting rules
-	from a given context; once a sentence id/location
-	is found for the source fwords, extractor is 
-	responsible for producing any matching rule(s).
-	Optionally, extractor may return an empty list'''
-
-	def __init__(self):
-		pass
-
-	def registerContext(self, context_manager):
-		self.context_manager = context_manager
-
-	def extract(self, fwords, loc):
-		return []
-	
-
-
-class RuleFactory(object):
-	'''RuleFactory is a class that manages the
-	generation of translation rules, using the Context
-	and (optionally) any of its contained classes or
-	data.  The RuleFactory is responsible for handling
-	any caching (i.e. when presented with an input
-	sentence, it may lookup a rule from its cache
-	rather than extracting a new rule)'''
-
-	def __init__(self):
-		self.num_lookups = 0
-		self.num_extractions = 0
-		self.num_rules = 0
-		self.time = 0.0
-
-
-	def registerContext(self, context_manager):
-		self.context_manager = context_manager
-
-
-	def input(self, fwords):
-		'''Manages the process of enumerating
-		rules for a given input sentence, and
-		looking them with calls to Context,
-		Sampler, and Extractor'''
-		return []
-
-
-class ContextManager(object):
-
-	def __init__(self, ffile, efile, extractor=None, sampler=None, rulefactory=None, from_binary=False):
-		# NOTE: Extractor does not have a default value because
-		# the only nontrivial extractor right now depends on an
-		# alignment file
-
-		self.fsarray = csuf.SuffixArray(ffile, from_binary)
-		self.edarray = cdat.DataArray(efile, from_binary)
-
-		self.factory = rulefactory
-		self.factory.registerContext(self)
-
-		self.sampler = sampler
-		self.sampler.registerContext(self)
-
-		self.models = []
-		self.owner = None
-
-
-	def add_model(self, model):
-		if self.owner is None:
-			self.owner = model
-		model_id = len(self.models)
-		self.models.append(model)
-		return model_id
-
-
-	def input(self, model, fwords, meta):
-		if model != self.owner:
-			return
-		self.fwords = fwords
-		self.factory.input(self.fwords, meta)
-
-
-
diff --git a/sa-extract/model.py b/sa-extract/model.py
deleted file mode 100644
index bcdf129a..00000000
--- a/sa-extract/model.py
+++ /dev/null
@@ -1,7 +0,0 @@
-class Model(object):
-    def __init__(self, name=None):
-        object.__init__(self)
-        if name is None:
-            self.name = self.__class__.__name__
-        else:
-            self.name = name
diff --git a/sa-extract/monitor.py b/sa-extract/monitor.py
deleted file mode 100644
index eb0bed57..00000000
--- a/sa-extract/monitor.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import os, resource
-
-def cpu():
-    return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+
-            resource.getrusage(resource.RUSAGE_SELF).ru_stime)
-
-# from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286222
-
-
-_proc_status = '/proc/%d/status' % os.getpid()
-
-_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
-          'KB': 1024.0, 'MB': 1024.0*1024.0}
-
-def _VmB(VmKey):
-    '''Private.
-    '''
-    global _proc_status, _scale
-     # get pseudo file  /proc/<pid>/status
-    try:
-        t = open(_proc_status)
-        v = t.read()
-        t.close()
-    except:
-        return 0.0  # non-Linux?
-     # get VmKey line e.g. 'VmRSS:  9999  kB\n ...'
-    i = v.index(VmKey)
-    v = v[i:].split(None, 3)  # whitespace
-    if len(v) < 3:
-        return 0.0  # invalid format?
-     # convert Vm value to bytes
-    return float(v[1]) * _scale[v[2]]
-
-def memory(since=0.0):
-    '''Return memory usage in bytes.
-    '''
-    return _VmB('VmSize:') - since
-
-def resident(since=0.0):
-    '''Return resident memory usage in bytes.
-    '''
-    return _VmB('VmRSS:') - since
-
-
-def stacksize(since=0.0):
-    '''Return stack size in bytes.
-    '''
-    return _VmB('VmStk:') - since
diff --git a/sa-extract/precomputation.pxd b/sa-extract/precomputation.pxd
deleted file mode 100644
index c75d5eef..00000000
--- a/sa-extract/precomputation.pxd
+++ /dev/null
@@ -1,13 +0,0 @@
-from libc.stdio cimport FILE
-
-cdef class Precomputation:
-	cdef int precompute_rank
-	cdef int precompute_secondary_rank
-	cdef int max_length
-	cdef int max_nonterminals
-	cdef int train_max_initial_size
-	cdef int train_min_gap_size
-	cdef precomputed_index
-	cdef precomputed_collocations
-	cdef read_map(self, FILE* f)
-	cdef write_map(self, m, FILE* f)
diff --git a/sa-extract/precomputation.pyx b/sa-extract/precomputation.pyx
deleted file mode 100644
index ce4c21aa..00000000
--- a/sa-extract/precomputation.pyx
+++ /dev/null
@@ -1,478 +0,0 @@
-# precomputes a set of collocations by advancing over the text.
-# warning: nasty C code
-
-import log
-import monitor
-
-cimport csuf
-cimport cdat
-cimport cintlist
-
-from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
-from libc.stdlib cimport malloc, realloc, free
-from libc.string cimport memset, memcpy
-
-cdef struct _Trie_Node  # forward decl
-
-cdef struct _Trie_Edge:
-  int val
-  _Trie_Node* node
-  _Trie_Edge* bigger
-  _Trie_Edge* smaller
-
-cdef struct _Trie_Node:
-  _Trie_Edge* root
-  int* arr
-  int arr_len
-
-cdef _Trie_Node* new_trie_node():
-  cdef _Trie_Node* node
-  node = <_Trie_Node*> malloc(sizeof(_Trie_Node))
-  node.root = NULL
-  node.arr_len = 0
-  node.arr = <int*> malloc(sizeof(0*sizeof(int)))
-  return node
-
-cdef _Trie_Edge* new_trie_edge(int val):
-  cdef _Trie_Edge* edge
-  edge = <_Trie_Edge*> malloc(sizeof(_Trie_Edge))
-  edge.node = new_trie_node()
-  edge.bigger = NULL
-  edge.smaller = NULL
-  edge.val = val
-  return edge
-
-cdef free_trie_node(_Trie_Node* node):
-  if node != NULL:
-    free_trie_edge(node.root)
-    free(node.arr)
-
-cdef free_trie_edge(_Trie_Edge* edge):
-  if edge != NULL:
-    free_trie_node(edge.node)
-    free_trie_edge(edge.bigger)
-    free_trie_edge(edge.smaller)
-
-cdef _Trie_Node* trie_find(_Trie_Node* node, int val):
-  cdef _Trie_Edge* cur
-  cur = node.root
-  while cur != NULL and cur.val != val:
-    if val > cur.val:
-      cur = cur.bigger
-    elif val < cur.val:
-      cur = cur.smaller
-  if cur == NULL:
-    return NULL
-  else:
-    return cur.node
-
-cdef trie_node_data_append(_Trie_Node* node, int val):
-  cdef int new_len
-  new_len = node.arr_len + 1
-  node.arr = <int*> realloc(node.arr, new_len*sizeof(int))
-  node.arr[node.arr_len] = val
-  node.arr_len = new_len
-
-cdef trie_node_data_extend(_Trie_Node* node, int* vals, int num_vals):
-  cdef int new_len
-  new_len = node.arr_len + num_vals
-  node.arr = <int*> realloc(node.arr, new_len*sizeof(int))
-  memcpy(node.arr + node.arr_len, vals, num_vals*sizeof(int))
-  node.arr_len = new_len
-
-
-cdef _Trie_Node* trie_insert(_Trie_Node* node, int val):
-  cdef _Trie_Edge** cur
-  cur = &node.root
-  while cur[0] != NULL and cur[0].val != val:
-    if val > cur[0].val:
-      cur = &cur[0].bigger
-    elif val < cur[0].val:
-      cur = &cur[0].smaller
-  if cur[0] == NULL:
-    cur[0] = new_trie_edge(val)
-  return cur[0].node
-
-cdef trie_node_to_map(_Trie_Node* node, result, prefix, int include_zeros):
-  cdef cintlist.CIntList arr
-
-  if include_zeros or node.arr_len > 0:
-    arr = cintlist.CIntList()
-    free(arr.arr)
-    arr.arr = <int*> malloc(node.arr_len * sizeof(int))
-    memcpy(arr.arr, node.arr, node.arr_len * sizeof(int))
-    arr.len = node.arr_len
-    arr.size = node.arr_len
-    result[prefix] = arr
-  trie_edge_to_map(node.root, result, prefix, include_zeros)
-
-cdef trie_edge_to_map(_Trie_Edge* edge, result, prefix, int include_zeros):
-  if edge != NULL:
-    trie_edge_to_map(edge.smaller, result, prefix, include_zeros)
-    trie_edge_to_map(edge.bigger, result, prefix, include_zeros)
-    prefix = prefix + (edge.val,)
-    trie_node_to_map(edge.node, result, prefix, include_zeros)
-
-cdef class TrieMap:
-
-  cdef _Trie_Node** root
-  cdef int V
-
-  def __init__(self, alphabet_size):
-    self.V = alphabet_size
-    self.root = <_Trie_Node**> malloc(self.V * sizeof(_Trie_Node*))
-    memset(self.root, 0, self.V * sizeof(_Trie_Node*))
-
-
-  def __dealloc__(self):
-    cdef int i
-    for i from 0 <= i < self.V:
-      if self.root[i] != NULL:
-        free_trie_node(self.root[i])
-    free(self.root)
-
-
-  def insert(self, pattern):
-    cdef int* p
-    cdef int i, l
-    l = len(pattern)
-    p = <int*> malloc(l*sizeof(int))
-    for i from 0 <= i < l:
-      p[i] = pattern[i]
-    self._insert(p,l)
-    free(p)
-
-
-  cdef _Trie_Node* _insert(self, int* pattern, int pattern_len):
-    cdef int i
-    cdef _Trie_Node* node
-    if self.root[pattern[0]] == NULL:
-      self.root[pattern[0]] = new_trie_node()
-    node = self.root[pattern[0]]
-    for i from 1 <= i < pattern_len:
-      node = trie_insert(node, pattern[i])
-    return node
-
-  def contains(self, pattern):
-    cdef int* p
-    cdef int i, l
-    cdef _Trie_Node* node
-    l = len(pattern)
-    p = <int*> malloc(l*sizeof(int))
-    for i from 0 <= i < l:
-      p[i] = pattern[i]
-    node = self._contains(p,l)
-    free(p)
-    if node == NULL:
-      return False
-    else:
-      return True
-
-  cdef _Trie_Node* _contains(self, int* pattern, int pattern_len):
-    cdef int i
-    cdef _Trie_Node* node
-    node = self.root[pattern[0]]
-    i = 1
-    while node != NULL and i < pattern_len:
-      node = trie_find(node, pattern[i])
-      i = i+1
-    return node
-
-  def toMap(self, flag):
-    cdef int i, include_zeros
-
-    if flag:
-      include_zeros=1
-    else:
-      include_zeros=0
-    result = {}
-    for i from 0 <= i < self.V:
-      if self.root[i] != NULL:
-        trie_node_to_map(self.root[i], result, (i,), include_zeros)
-    return result
-
-
-cdef class Precomputation:
-
-#  Defined in .pxd file, here for reference:
-#  cdef int precompute_rank
-#  cdef int precompute_secondary_rank
-#  cdef int max_length
-#  cdef int max_nonterminals
-#  cdef int train_max_initial_size
-#  cdef int train_min_gap_size
-#  cdef precomputed_index
-#  cdef precomputed_collocations
-
-  def __init__(self, filename, sa=None, precompute_rank=1000, precompute_secondary_rank=20, max_length=5, 
-        max_nonterminals=2, train_max_initial_size=10, train_min_gap_size=2, from_binary=False):
-    self.precompute_rank = precompute_rank
-    self.precompute_secondary_rank = precompute_secondary_rank
-    self.max_length = max_length
-    self.max_nonterminals = max_nonterminals
-    self.train_max_initial_size = train_max_initial_size
-    self.train_min_gap_size = train_min_gap_size
-    if from_binary:
-      self.read_binary(filename)
-    else:
-      self.precompute(filename, sa)
-
-
-  def read_binary(self, filename):
-    cdef FILE* f
-    cdef bytes bfilename = filename
-    cdef char* cfilename = bfilename
-    f = fopen(cfilename, "r")
-    fread(&(self.precompute_rank), sizeof(int), 1, f)
-    fread(&(self.precompute_secondary_rank), sizeof(int), 1, f)
-    fread(&(self.max_length), sizeof(int), 1, f)
-    fread(&(self.max_nonterminals), sizeof(int), 1, f)
-    fread(&(self.train_max_initial_size), sizeof(int), 1, f)
-    fread(&(self.train_min_gap_size), sizeof(int), 1, f)
-    self.precomputed_index = self.read_map(f)
-    self.precomputed_collocations = self.read_map(f)
-    fclose(f)
-
-
-  def write_binary(self, filename):
-    cdef FILE* f
-    cdef bytes bfilename = filename
-    cdef char* cfilename = bfilename
-
-    f = fopen(cfilename, "w")
-    fwrite(&(self.precompute_rank), sizeof(int), 1, f)
-    fwrite(&(self.precompute_secondary_rank), sizeof(int), 1, f)
-    fwrite(&(self.max_length), sizeof(int), 1, f)
-    fwrite(&(self.max_nonterminals), sizeof(int), 1, f)
-    fwrite(&(self.train_max_initial_size), sizeof(int), 1, f)
-    fwrite(&(self.train_min_gap_size), sizeof(int), 1, f)
-    self.write_map(self.precomputed_index, f)
-    self.write_map(self.precomputed_collocations, f)
-    fclose(f)
-
-
-  cdef write_map(self, m, FILE* f):
-    cdef int i, N
-    cdef cintlist.CIntList arr
-
-    N = len(m)
-    fwrite(&(N), sizeof(int), 1, f)
-    for pattern, val in m.iteritems():
-      N = len(pattern)
-      fwrite(&(N), sizeof(int), 1, f)
-      for word_id in pattern:
-        i = word_id
-        fwrite(&(i), sizeof(int), 1, f)
-      arr = val
-      arr.write_handle(f)
-
-
-  cdef read_map(self, FILE* f):
-    cdef int i, j, k, word_id, N
-    cdef cintlist.CIntList arr
-
-    m = {}
-    fread(&(N), sizeof(int), 1, f)
-    for j from 0 <= j < N:
-      fread(&(i), sizeof(int), 1, f)
-      key = ()
-      for k from 0 <= k < i:
-        fread(&(word_id), sizeof(int), 1, f)
-        key = key + (word_id,)
-      arr = cintlist.CIntList()
-      arr.read_handle(f)
-      m[key] = arr
-    return m
-
-
-  def precompute(self, filename, sa):
-    cdef int i, l, N, max_pattern_len, i1, l1, i2, l2, i3, l3, ptr1, ptr2, ptr3, is_super, sent_count, max_rank
-    cdef csuf.SuffixArray sarray
-    cdef cdat.DataArray darray
-    cdef cintlist.CIntList data, queue, cost_by_rank, count_by_rank
-    cdef TrieMap frequent_patterns, super_frequent_patterns, collocations
-    cdef _Trie_Node* node
-
-    sarray = sa
-    darray = sarray.darray
-    data = darray.data
-
-    frequent_patterns = TrieMap(len(darray.id2word))
-    super_frequent_patterns = TrieMap(len(darray.id2word))
-    collocations = TrieMap(len(darray.id2word))
-
-    I_set = set()
-    J_set = set()
-    J2_set = set()
-    IJ_set = set()
-    pattern_rank = {}
-
-    log.writeln("Precomputing frequent intersections\n", 1)
-    start_time = monitor.cpu()
-
-    max_pattern_len = 0
-    if filename is not None:
-      precompute_file = open(filename)
-      for rank, line in enumerate(precompute_file):
-        if rank >= self.precompute_rank:
-          break
-        phrase_words = line.split()[2:]
-        phrase = ()
-        for word in phrase_words:
-          phrase = phrase + (darray.word2id[word],)
-        max_pattern_len = max(max_pattern_len, len(phrase))
-        frequent_patterns.insert(phrase)
-        I_set.add(phrase)
-        pattern_rank[phrase] = rank
-        if rank < self.precompute_secondary_rank:
-          super_frequent_patterns.insert(phrase)
-          J_set.add(phrase)
-      precompute_file.close()
-
-    queue = cintlist.CIntList(increment=1000)
-
-    log.writeln("  Computing inverted indexes...", 1)
-    N = len(data)
-    for i from 0 <= i < N:
-      sa_word_id = data.arr[i]
-      if sa_word_id == 1:
-        queue._append(-1)
-      else:
-        for l from 1 <= l <= max_pattern_len:
-          node = frequent_patterns._contains(data.arr+i, l)
-          if node == NULL:
-            break
-          queue._append(i)
-          queue._append(l)
-          trie_node_data_append(node, i)
-
-    log.writeln("  Computing collocations...", 1)
-    N = len(queue)
-    ptr1 = 0
-    sent_count = 0
-    while ptr1 < N:  # main loop
-      i1 = queue.arr[ptr1]
-      if i1 > -1:
-        l1 = queue.arr[ptr1+1]
-        ptr2 = ptr1 + 2
-        while ptr2 < N:
-          i2 = queue.arr[ptr2]
-          if i2 == -1 or i2 - i1 >= self.train_max_initial_size:
-            break
-          l2 = queue.arr[ptr2+1]
-          if i2 - i1 - l1 >= self.train_min_gap_size and i2 + l2 - i1 <= self.train_max_initial_size and l1+l2+1 <= self.max_length:
-            node = collocations._insert(data.arr+i1, l1)
-            node = trie_insert(node, -1)
-            for i from i2 <= i < i2+l2:
-              node = trie_insert(node, data.arr[i])
-            trie_node_data_append(node, i1)
-            trie_node_data_append(node, i2)
-            if super_frequent_patterns._contains(data.arr+i2, l2) != NULL:
-              if super_frequent_patterns._contains(data.arr+i1, l1) != NULL:
-                is_super = 1
-              else:
-                is_super = 0
-              ptr3 = ptr2 + 2
-              while ptr3 < N:
-                i3 = queue.arr[ptr3]
-                if i3 == -1 or i3 - i1 >= self.train_max_initial_size:
-                  break
-                l3 = queue.arr[ptr3+1]
-                if i3 - i2 - l2 >= self.train_min_gap_size and i3 + l3 - i1 <= self.train_max_initial_size and l1+l2+l3+2 <= self.max_length:
-                  if is_super or super_frequent_patterns._contains(data.arr+i3, l3) != NULL:
-                    node = collocations._insert(data.arr+i1, l1)
-                    node = trie_insert(node, -1)
-                    for i from i2 <= i < i2+l2:
-                      node = trie_insert(node, data.arr[i])
-                    node = trie_insert(node, -1)
-                    for i from i3 <= i < i3+l3:
-                      node = trie_insert(node, data.arr[i])
-                    trie_node_data_append(node, i1)
-                    trie_node_data_append(node, i2)
-                    trie_node_data_append(node, i3)
-                ptr3 = ptr3 + 2
-          ptr2 = ptr2 + 2
-        ptr1 = ptr1 + 2
-      else:
-        sent_count = sent_count + 1
-        if sent_count % 10000 == 0:
-          log.writeln("    %d sentences" % sent_count)
-        ptr1 = ptr1 + 1
-
-    self.precomputed_collocations = collocations.toMap(False)
-    self.precomputed_index = frequent_patterns.toMap(True)
-
-    x = 0
-    for pattern1 in J_set:
-      for pattern2 in J_set:
-        if len(pattern1) + len(pattern2) + 1 < self.max_length:
-          combined_pattern = pattern1 + (-1,) + pattern2
-          J2_set.add(combined_pattern)
-
-    for pattern1 in I_set:
-      for pattern2 in I_set:
-        x = x+1
-        if len(pattern1) + len(pattern2) + 1 <= self.max_length:
-          combined_pattern = pattern1 + (-1,) + pattern2
-          IJ_set.add(combined_pattern)
-
-    for pattern1 in I_set:
-      for pattern2 in J2_set:
-        x = x+2
-        if len(pattern1) + len(pattern2) + 1<= self.max_length:
-          combined_pattern = pattern1 + (-1,) + pattern2
-          IJ_set.add(combined_pattern)
-          combined_pattern = pattern2 + (-1,) + pattern1
-          IJ_set.add(combined_pattern)
-
-    N = len(pattern_rank)
-    cost_by_rank = cintlist.CIntList(initial_len=N)
-    count_by_rank = cintlist.CIntList(initial_len=N)
-    for pattern, arr in self.precomputed_collocations.iteritems():
-      if pattern not in IJ_set:
-        s = ""
-        for word_id in pattern:
-          if word_id == -1:
-            s = s + "X "
-          else:
-            s = s + darray.id2word[word_id] + " "
-        log.writeln("ERROR: unexpected pattern %s in set of precomputed collocations" % (s), 1)
-      else:
-        chunk = ()
-        max_rank = 0
-        arity = 0
-        for word_id in pattern:
-          if word_id == -1:
-            max_rank = max(max_rank, pattern_rank[chunk])
-            arity = arity + 1
-            chunk = ()
-          else:
-            chunk = chunk + (word_id,)
-        max_rank = max(max_rank, pattern_rank[chunk])
-        cost_by_rank.arr[max_rank] = cost_by_rank.arr[max_rank] + (4*len(arr))
-        count_by_rank.arr[max_rank] = count_by_rank.arr[max_rank] + (len(arr)/(arity+1))
-
-    cumul_cost = 0
-    cumul_count = 0
-    for i from 0 <= i < N:
-      cumul_cost = cumul_cost + cost_by_rank.arr[i]
-      cumul_count = cumul_count + count_by_rank.arr[i]
-      log.writeln("RANK %d\tCOUNT, COST: %d  %d\tCUMUL: %d, %d" % (i, count_by_rank.arr[i], cost_by_rank.arr[i], cumul_count, cumul_cost))
-
-    num_found_patterns = len(self.precomputed_collocations)
-    for pattern in IJ_set:
-      if pattern not in self.precomputed_collocations:
-        self.precomputed_collocations[pattern] = cintlist.CIntList()
-
-    stop_time = monitor.cpu()
-    log.writeln("Precomputed collocations for %d patterns out of %d possible (upper bound %d)" % (num_found_patterns,len(self.precomputed_collocations),x))
-    log.writeln("Precomputed inverted index for %d patterns " % len(self.precomputed_index))
-    log.writeln("Precomputation took %f seconds" % (stop_time - start_time))
-    log.writeln("Detailed statistics:")
-
-
-
-
-
-
-
diff --git a/sa-extract/rule.pxd b/sa-extract/rule.pxd
deleted file mode 100644
index c9c84e5c..00000000
--- a/sa-extract/rule.pxd
+++ /dev/null
@@ -1,13 +0,0 @@
-cdef class Phrase:
-    cdef int *syms
-    cdef int n, *varpos, n_vars
-    cdef public int chunkpos(self, int k)
-    cdef public int chunklen(self, int k)
-
-cdef class Rule:
-    cdef public int lhs
-    cdef readonly Phrase f, e
-    cdef float *cscores
-    cdef int n_scores
-    cdef public owner, word_alignments
-
diff --git a/sa-extract/rule.pyx b/sa-extract/rule.pyx
deleted file mode 100644
index 7cd3efda..00000000
--- a/sa-extract/rule.pyx
+++ /dev/null
@@ -1,286 +0,0 @@
-from libc.stdlib cimport malloc, calloc, realloc, free, strtof, strtol
-from libc.string cimport strsep, strcpy, strlen
-
-cdef extern from "strutil.h":
-   char *strstrsep(char **stringp, char *delim)
-   char *strip(char *s)
-   char **split(char *s, char *delim, int *pn)
-
-import sys
-
-import sym
-cimport sym
-cdef sym.Alphabet alphabet
-alphabet = sym.alphabet
-
-global span_limit
-span_limit = None
-
-cdef int bufsize
-cdef char *buf
-bufsize = 100
-buf = <char *>malloc(bufsize)
-cdef ensurebufsize(int size):
-   global buf, bufsize
-   if size > bufsize:
-      buf = <char *>realloc(buf, size*sizeof(char))
-      bufsize = size
-      
-cdef class Phrase:
-   def __cinit__(self, words):
-      cdef int i, j, n, n_vars
-      cdef char **toks
-      cdef bytes bwords
-      cdef char* cwords
-
-      n_vars = 0
-      if type(words) is str:
-         ensurebufsize(len(words)+1)
-         bwords = words
-         cwords = bwords
-         strcpy(buf, cwords)
-         toks = split(buf, NULL, &n)
-         self.syms = <int *>malloc(n*sizeof(int))
-         for i from 0 <= i < n:
-            self.syms[i] = alphabet.fromstring(toks[i], 0)
-            if alphabet.isvar(self.syms[i]):
-               n_vars = n_vars + 1
-         
-      else:
-         n = len(words)
-         self.syms = <int *>malloc(n*sizeof(int))
-         for i from 0 <= i < n:
-            self.syms[i] = words[i]
-            if alphabet.isvar(self.syms[i]):
-               n_vars = n_vars + 1
-      self.n = n
-      self.n_vars = n_vars
-      self.varpos = <int *>malloc(n_vars*sizeof(int))
-      j = 0
-      for i from 0 <= i < n:
-         if alphabet.isvar(self.syms[i]):
-            self.varpos[j] = i
-            j = j + 1
-
-   def __dealloc__(self):
-      free(self.syms)
-      free(self.varpos)
-
-   def __str__(self):
-      strs = []
-      cdef int i, s
-      for i from 0 <= i < self.n:
-         s = self.syms[i]
-         strs.append(alphabet.tostring(s))
-      return " ".join(strs)
-
-   def instantiable(self, i, j, n):
-      return span_limit is None or (j-i) <= span_limit
-
-   def handle(self):
-      """return a hashable representation that normalizes the ordering
-      of the nonterminal indices"""
-      norm = []
-      cdef int i, j, s
-      i = 1
-      j = 0
-      for j from 0 <= j < self.n:
-         s = self.syms[j]
-         if alphabet.isvar(s):
-            s = alphabet.setindex(s,i)
-            i = i + 1
-         norm.append(s)
-      return tuple(norm)
-
-   def strhandle(self):
-      strs = []
-      norm = []
-      cdef int i, j, s
-      i = 1
-      j = 0
-      for j from 0 <= j < self.n:
-         s = self.syms[j]
-         if alphabet.isvar(s):
-            s = alphabet.setindex(s,i)
-            i = i + 1
-         norm.append(alphabet.tostring(s))
-      return " ".join(norm)
-
-   def arity(self):
-      return self.n_vars
-
-   def getvarpos(self, i):
-      if 0 <= i < self.n_vars:
-         return self.varpos[i]
-      else:
-         raise IndexError
-
-   def getvar(self, i):
-      if 0 <= i < self.n_vars:
-         return self.syms[self.varpos[i]]
-      else:
-         raise IndexError
-
-   cdef int chunkpos(self, int k):
-      if k == 0:
-         return 0
-      else:
-         return self.varpos[k-1]+1
-
-   cdef int chunklen(self, int k):
-      if self.n_vars == 0:
-         return self.n
-      elif k == 0:
-         return self.varpos[0]
-      elif k == self.n_vars:
-         return self.n-self.varpos[k-1]-1
-      else:
-         return self.varpos[k]-self.varpos[k-1]-1
-
-   def clen(self, k):
-       return self.chunklen(k)
-
-   def getchunk(self, ci):
-      cdef int start, stop
-      start = self.chunkpos(ci)
-      stop = start+self.chunklen(ci)
-      chunk = []
-      for i from start <= i < stop:
-         chunk.append(self.syms[i])
-      return chunk
-
-   def __cmp__(self, other):
-      cdef Phrase otherp
-      cdef int i
-      otherp = other
-      for i from 0 <= i < min(self.n, otherp.n):
-         if self.syms[i] < otherp.syms[i]:
-            return -1
-         elif self.syms[i] > otherp.syms[i]:
-            return 1
-      if self.n < otherp.n:
-         return -1
-      elif self.n > otherp.n:
-         return 1
-      else:
-         return 0
-
-   def __hash__(self):
-      cdef int i
-      cdef unsigned h
-      h = 0
-      for i from 0 <= i < self.n:
-         if self.syms[i] > 0:
-            h = (h << 1) + self.syms[i]
-         else:
-            h = (h << 1) + -self.syms[i]
-      return h
-
-   def __len__(self):
-      return self.n
-
-   def __getitem__(self, i):
-      return self.syms[i]
-
-   def __iter__(self):
-      cdef int i
-      l = []
-      for i from 0 <= i < self.n:
-         l.append(self.syms[i])
-      return iter(l)
-
-   def subst(self, start, children):
-      cdef int i
-      for i from 0 <= i < self.n:
-         if alphabet.isvar(self.syms[i]):
-            start = start + children[alphabet.getindex(self.syms[i])-1]
-         else:
-            start = start + (self.syms[i],)
-      return start
-
-cdef class Rule:
-   def __cinit__(self, lhs, f, e, owner=None, scores=None, word_alignments=None):
-      cdef int i, n
-      cdef char *rest
-
-      self.word_alignments = word_alignments
-      if scores is None:
-         self.cscores = NULL
-         self.n_scores = 0
-      else:
-         n = len(scores)
-         self.cscores = <float *>malloc(n*sizeof(float))
-         self.n_scores = n
-         for i from 0 <= i < n:
-            self.cscores[i] = scores[i]
-
-   def __init__(self, lhs, f, e, owner=None, scores=None, word_alignments=None):
-      if not sym.isvar(lhs):
-         sys.stderr.write("error: lhs=%d\n" % lhs)
-      self.lhs = lhs
-      self.f = f
-      self.e = e
-      self.word_alignments = word_alignments
-
-   def __dealloc__(self):
-      if self.cscores != NULL:
-         free(self.cscores)
-
-   def __str__(self):
-      return self.to_line()
-
-   def __hash__(self):
-      return hash((self.lhs, self.f, self.e))
-
-   def __cmp__(self, Rule other):
-      return cmp((self.lhs, self.f, self.e, self.word_alignments), (other.lhs, other.f, other.e, self.word_alignments))
-
-   def __iadd__(self, Rule other):
-      if self.n_scores != other.n_scores:
-         raise ValueError
-      for i from 0 <= i < self.n_scores:
-         self.cscores[i] = self.cscores[i] + other.cscores[i]
-      return self
-
-   def fmerge(self, Phrase f):
-      if self.f == f:
-         self.f = f
-      
-   def arity(self):
-      return self.f.arity()
-
-   def to_line(self):
-      scorestrs = []
-      for i from 0 <= i < self.n_scores:
-         scorestrs.append(str(self.cscores[i]))
-      fields = [alphabet.tostring(self.lhs), str(self.f), str(self.e), " ".join(scorestrs)]
-      if self.word_alignments is not None:
-         alignstr = []
-         for i from 0 <= i < len(self.word_alignments):
-            alignstr.append("%d-%d" % (self.word_alignments[i]/65536, self.word_alignments[i]%65536))
-         #for s,t in self.word_alignments:
-             #alignstr.append("%d-%d" % (s,t)) 
-         fields.append(" ".join(alignstr))
-      
-      return " ||| ".join(fields)
-
-   property scores:
-      def __get__(self):
-         s = [None]*self.n_scores
-         for i from 0 <= i < self.n_scores:
-            s[i] = self.cscores[i]
-         return s
-
-      def __set__(self, s):
-         if self.cscores != NULL:
-            free(self.cscores)
-         self.cscores = <float *>malloc(len(s)*sizeof(float))
-         self.n_scores = len(s)
-         for i from 0 <= i < self.n_scores:
-            self.cscores[i] = s[i]
-
-def rule_copy(r):
-   r1 = Rule(r.lhs, r.f, r.e, r.owner, r.scores)
-   r1.word_alignments = r.word_alignments
-   return r1
-
diff --git a/sa-extract/rulefactory.pyx b/sa-extract/rulefactory.pyx
deleted file mode 100644
index 792489c4..00000000
--- a/sa-extract/rulefactory.pyx
+++ /dev/null
@@ -1,2361 +0,0 @@
-# Pyrex implementation of the algorithms described in 
-# Lopez, EMNLP-CoNLL 2007
-# Much faster than the Python numbers reported there.
-# Note to reader: this code is closer to C than Python
-import sys
-import sym
-import log
-import rule
-import monitor
-import cintlist
-import csuf
-import cdat
-import cveb
-import precomputation
-import gc
-import cn
-import sgml
-
-cimport cmath
-cimport csuf
-cimport cdat
-cimport cintlist
-cimport rule
-cimport cveb
-cimport precomputation
-cimport calignment
-
-from libc.stdlib cimport malloc, realloc, free
-from libc.string cimport memset, memcpy
-from libc.math cimport fmod, ceil, floor
-
-cdef int PRECOMPUTE
-cdef int MERGE
-cdef int BAEZA_YATES
-
-PRECOMPUTE = 0
-MERGE = 1
-BAEZA_YATES = 2
-
-#cdef int node_count
-#node_count = 0
-
-cdef class TrieNode:
-  cdef public children
-  #cdef int id
-
-  def __init__(self):
-    self.children = {}
-    #self.id = node_count
-    #node_count += 1
-
-
-cdef class ExtendedTrieNode(TrieNode):
-  cdef public phrase
-  cdef public phrase_location
-  cdef public suffix_link
-
-  def __init__(self, phrase=None, phrase_location=None, suffix_link=None):
-    TrieNode.__init__(self)
-    self.phrase = phrase
-    self.phrase_location = phrase_location
-    self.suffix_link = suffix_link
-    
-
-cdef class TrieTable:
-  cdef public int extended
-  cdef public int count
-  cdef public root
-  def __cinit__(self, extended=False):
-    self.count = 0
-    self.extended = extended
-    if extended:
-      self.root = ExtendedTrieNode()
-    else:
-      self.root = TrieNode()
-
-# linked list structure for storing matches in BaselineRuleFactory
-cdef struct match_node:
-  int* match
-  match_node* next
-
-cdef class BaselineRuleFactory:
-
-  cdef grammar, context_manager
-  cdef int max_terminals, max_nonterminals
-  cdef int max_initial_size, train_max_initial_size
-  cdef int min_gap_size, train_min_gap_size
-  cdef int category
-  cdef int visit
-  cdef float intersect_time, extract_time
-  cdef ruleFile, timingFile
-  cdef int* last_visit1
-  cdef int* last_visit2
-  cdef match_node** intersector1
-  cdef match_node** intersector2
-  cdef csuf.SuffixArray sa
-  cdef cintlist.CIntList sent_id
-
-  def input(self, fwords):
-    flen = len(fwords)
-    start_time = monitor.cpu()
-    self.intersect_time = 0.0
-    self.extract_time = 0.0
-
-    pyro_phrase_count = 0
-    hiero_phrase_count = 0
-
-    frontier = []
-    for i in xrange(len(fwords)):
-      frontier.append((i, (), False))
-
-    while len(frontier) > 0:
-      this_iter_intersect_time = self.intersect_time
-      new_frontier = []
-      for i, prefix, is_shadow_path in frontier:
-
-        word_id = fwords[i][0][0]
-        #print "word_id = %i" % word_id
-        phrase = prefix + (word_id,)
-        str_phrase = map(sym.tostring, phrase)
-        hiero_phrase = rule.Phrase(phrase)
-
-        #log.writeln("pos %2d, '%s'" % (i, hiero_phrase))
-        self.lookup(hiero_phrase)
-        if hiero_phrase.arity() == 0:
-          pyro_phrase_count = pyro_phrase_count + 1
-        else:
-          hiero_phrase_count = hiero_phrase_count + 1
-
-        if len(phrase) - hiero_phrase.arity() < self.max_terminals and i+1 < len(fwords):
-          new_frontier.append((i+1, phrase, is_shadow_path))
-          if hiero_phrase.arity() < self.max_nonterminals:
-            xcat = sym.setindex(self.category, hiero_phrase.arity()+1)
-            for j in xrange(i+1+self.min_gap_size, min(i+self.max_initial_size, len(fwords))):
-              new_frontier.append((j, phrase+(xcat,), is_shadow_path))
-      log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time))
-      frontier = new_frontier
-    stop_time = monitor.cpu()
-    log.writeln("COUNT %d %d" % (pyro_phrase_count, hiero_phrase_count))
-
-
-  def lookup(self, phrase):
-    cdef int j, g, start, stop, sent_id, num_ranges, arity
-    cdef match_node** cur_intersector
-    cdef match_node** next_intersector
-    cdef match_node** tmp_intersector
-    cdef match_node* node
-    cdef match_node* cur_node
-    cdef match_node* prev_node
-    cdef match_node** node_ptr
-    cdef int* cur_visit
-    cdef int* next_visit
-    cdef int* tmp_visit
-    cdef int* chunklen
-    
-    #print "\n\nLOOKUP\n\n"
-    ranges = []
-    sizes = []
-    arity = phrase.arity()
-    chunklen = <int *> malloc(arity*sizeof(int))
-    for i from 0 <= i < arity+1:
-      chunk = phrase.getchunk(i)
-      chunklen[i] = len(chunk)
-      sa_range = None
-      phr = ()
-      for offset, word_id in enumerate(chunk):
-        word = sym.tostring(word_id)
-        sa_range = self.context_manager.fsarray.lookup(word, offset, sa_range[0], sa_range[1])
-      if sa_range is None:
-        #log.writeln("Returned for phrase %s" % rule.Phrase(phr))
-        return
-      #log.writeln("Found range %s for phrase %s" % (sa_range, rule.Phrase(phr)))
-      ranges.append(sa_range)
-      sizes.append(sa_range[1]-sa_range[0])
-    if phrase.arity() == 0:
-      return
-
-    cur_intersector = self.intersector1
-    next_intersector = self.intersector2
-    cur_visit = self.last_visit1
-    next_visit = self.last_visit2
-
-    num_ranges = len(ranges)
-    for i from 0 <= i < num_ranges:
-      sa_range = ranges[i]
-      start_time = monitor.cpu()
-      self.visit = self.visit + 1
-      intersect_count = 0
-
-      start = sa_range[0]
-      stop = sa_range[1]
-      for j from start <= j < stop:
-        g = self.sa.sa.arr[j]
-        sent_id = self.sent_id.arr[g]
-        if i==0:
-          if next_visit[sent_id] != self.visit:
-            # clear intersector
-            node = next_intersector[sent_id]
-            next_intersector[sent_id] = NULL
-            while node != NULL:
-              prev_node = node
-              node = node.next
-              free(prev_node.match)
-              free(prev_node)
-            next_visit[sent_id] = self.visit
-          node_ptr = &(next_intersector[sent_id])
-          while node_ptr[0] != NULL:
-            node_ptr = &(node_ptr[0].next)
-          node_ptr[0] = <match_node*> malloc(sizeof(match_node))
-          node_ptr[0].match = <int *> malloc(sizeof(int))
-          node_ptr[0].match[0] = g
-          node_ptr[0].next = NULL
-          intersect_count = intersect_count + 1
-        else:
-          if cur_visit[sent_id] == self.visit-1:
-            cur_node = cur_intersector[sent_id]
-            while cur_node != NULL:
-              if g - cur_node.match[0] + chunklen[i] <= self.train_max_initial_size and g - cur_node.match[i-1] - chunklen[i-1] >= self.train_min_gap_size:
-                if next_visit[sent_id] != self.visit:
-                  # clear intersector -- note that we only do this if we've got something to put there
-                  node = next_intersector[sent_id]
-                  next_intersector[sent_id] = NULL
-                  while node != NULL:
-                    prev_node = node
-                    node = node.next
-                    free(prev_node.match)
-                    free(prev_node)
-                  next_visit[sent_id] = self.visit
-                node_ptr = &(next_intersector[sent_id])
-                while node_ptr[0] != NULL:
-                  node_ptr = &(node_ptr[0].next)
-                node_ptr[0] = <match_node*> malloc(sizeof(match_node))
-                node_ptr[0].match = <int *> malloc((i+1) * sizeof(int))
-                memcpy(node_ptr[0].match, cur_node.match, i*sizeof(int))
-                node_ptr[0].match[i] = g
-                node_ptr[0].next = NULL
-                intersect_count = intersect_count + 1
-              cur_node = cur_node.next
-      tmp_intersector = cur_intersector
-      cur_intersector = next_intersector
-      next_intersector = tmp_intersector
-      
-      tmp_visit = cur_visit
-      cur_visit = next_visit
-      next_visit = tmp_visit
-
-      intersect_time = monitor.cpu() - start_time
-      if i > 0:
-        log.writeln("INT %d %d %d %d %f baseline" % 
-          (arity, prev_intersect_count, sa_range[1]-sa_range[0], 
-          intersect_count, intersect_time))
-      if intersect_count == 0:
-        return None
-      prev_intersect_count = intersect_count
-    free(chunklen)
-
-
-
-  def __init__(self, max_terminals=5, max_nonterminals=2, 
-        max_initial_size=10, train_max_initial_size=10,
-        min_gap_size=1, train_min_gap_size=2, 
-        category='[PHRASE]', grammar=None,
-        ruleFile=None, timingFile=None):
-    self.grammar = grammar
-    self.max_terminals = max_terminals
-    self.max_nonterminals = max_nonterminals 
-    self.max_initial_size = max_initial_size
-    self.train_max_initial_size = train_max_initial_size
-    self.min_gap_size = min_gap_size
-    self.train_min_gap_size = train_min_gap_size
-    self.category = sym.fromstring(category)
-    self.ruleFile = ruleFile
-    self.timingFile = timingFile
-    self.visit = 0
-
-
-  def registerContext(self, context_manager):
-    cdef int num_sents
-    self.context_manager = context_manager
-    self.sa = context_manager.fsarray
-    self.sent_id = self.sa.darray.sent_id
-    
-    num_sents = len(self.sa.darray.sent_index)
-    self.last_visit1 = <int *> malloc(num_sents * sizeof(int))
-    memset(self.last_visit1, 0, num_sents * sizeof(int))
-    
-    self.last_visit2 = <int *> malloc(num_sents * sizeof(int))
-    memset(self.last_visit2, 0, num_sents * sizeof(int))
-
-    self.intersector1 = <match_node **> malloc(num_sents * sizeof(match_node*))
-    memset(self.intersector1, 0, num_sents * sizeof(match_node*))
-
-    self.intersector2 = <match_node **> malloc(num_sents * sizeof(match_node*))
-    memset(self.intersector2, 0, num_sents * sizeof(match_node*))
-
-
-# encodes information needed to find a (hierarchical) phrase
-# in the text.  If phrase is contiguous, that's just a range
-# in the suffix array; if discontiguous, it is the set of 
-# actual locations (packed into an array)
-cdef class PhraseLocation:
-  cdef int sa_low
-  cdef int sa_high
-  cdef int arr_low
-  cdef int arr_high
-  cdef cintlist.CIntList arr
-  cdef int num_subpatterns
-
-  # returns true if sent_id is contained
-  cdef int contains(self, int sent_id):
-    return 1
-
-  def __init__(self, sa_low=-1, sa_high=-1, arr_low=-1, arr_high=-1, arr=None, num_subpatterns=1):
-    self.sa_low = sa_low
-    self.sa_high = sa_high
-    self.arr_low = arr_low
-    self.arr_high = arr_high
-    self.arr = arr
-    self.num_subpatterns = num_subpatterns
-
-
-
-cdef class Sampler:
-  '''A Sampler implements a logic for choosing
-  samples from a population range'''
-
-  cdef int sampleSize
-  cdef context_manager
-  cdef cintlist.CIntList sa
-
-  def __init__(self, sampleSize=0):
-    self.sampleSize = sampleSize
-    if sampleSize > 0:
-      log.writeln("Sampling strategy: uniform, max sample size = %d" % sampleSize, 1)
-    else:
-      log.writeln("Sampling strategy: no sampling", 1)
-
-  def registerContext(self, context_manager):
-    self.context_manager = context_manager
-    self.sa = (<csuf.SuffixArray> context_manager.fsarray).sa
-
-
-  def sample(self, PhraseLocation phrase_location):
-    '''Returns a sample of the locations for
-    the phrase.  If there are less than self.sampleSize
-    locations, return all of them; otherwise, return
-    up to self.sampleSize locations.  In the latter case,
-    we choose to sample UNIFORMLY -- that is, the locations
-    are chosen at uniform intervals over the entire set, rather
-    than randomly.  This makes the algorithm deterministic, which
-    is good for things like MERT'''
-    cdef cintlist.CIntList sample
-    cdef double i, stepsize
-    cdef int num_locations, val, j
-
-    sample = cintlist.CIntList()
-    if phrase_location.arr is None:
-      num_locations = phrase_location.sa_high - phrase_location.sa_low
-      if self.sampleSize == -1 or num_locations <= self.sampleSize:
-        sample._extend_arr(self.sa.arr + phrase_location.sa_low, num_locations)
-      else:
-        stepsize = float(num_locations)/float(self.sampleSize)
-        i = phrase_location.sa_low
-        while i < phrase_location.sa_high and sample.len < self.sampleSize:
-          '''Note: int(i) not guaranteed to have the desired
-          effect, according to the python documentation'''
-          if fmod(i,1.0) > 0.5:
-            val = int(ceil(i))
-          else:
-            val = int(floor(i))
-          sample._append(self.sa.arr[val])
-          i = i + stepsize
-    else:
-      num_locations = (phrase_location.arr_high - phrase_location.arr_low) / phrase_location.num_subpatterns
-      if self.sampleSize == -1 or num_locations <= self.sampleSize:
-        sample = phrase_location.arr
-      else:
-        stepsize = float(num_locations)/float(self.sampleSize)
-        i = phrase_location.arr_low
-        while i < num_locations and sample.len < self.sampleSize * phrase_location.num_subpatterns:
-          '''Note: int(i) not guaranteed to have the desired
-          effect, according to the python documentation'''
-          if fmod(i,1.0) > 0.5:
-            val = int(ceil(i))
-          else:
-            val = int(floor(i))
-          j = phrase_location.arr_low + (val*phrase_location.num_subpatterns)
-          sample._extend_arr(phrase_location.arr.arr + j, phrase_location.num_subpatterns)
-          i = i + stepsize
-    return sample
-
-
-cdef long nGramCount(PhraseLocation loc):
-  return (loc.arr_high - loc.arr_low)/ loc.num_subpatterns
-
-
-# struct used to encapsulate a single matching
-cdef struct Matching:
-  int* arr
-  int start
-  int end
-  int sent_id
-  int size
-
-
-cdef void assign_matching(Matching* m, int* arr, int start, int step, int* sent_id_arr):
-  m.arr = arr
-  m.start = start
-  m.end = start + step
-  m.sent_id = sent_id_arr[arr[start]]
-  m.size = step
-
-
-cdef int* append_combined_matching(int* arr, Matching* loc1, Matching* loc2, 
-                int offset_by_one, int num_subpatterns, int* result_len):
-  cdef int i, new_len
-
-  new_len = result_len[0] + num_subpatterns
-  arr = <int*> realloc(arr, new_len*sizeof(int))
-
-  for i from 0 <= i < loc1.size:
-    arr[result_len[0]+i] = loc1.arr[loc1.start+i]
-  if num_subpatterns > loc1.size:
-    arr[new_len-1] = loc2.arr[loc2.end-1]
-  result_len[0] = new_len
-  return arr
-
-
-cdef int* extend_arr(int* arr, int* arr_len, int* appendix, int appendix_len):
-  cdef int new_len
-  
-  new_len = arr_len[0] + appendix_len
-  arr = <int*> realloc(arr, new_len*sizeof(int))
-  memcpy(arr+arr_len[0], appendix, appendix_len*sizeof(int))
-  arr_len[0] = new_len
-  return arr
-
-
-#cdef matching2str(Matching* m):
-#  cdef int i
-#  cdef result
-
-#  result = "("
-#  for i from m.start <= i < m.end:
-#    result = result + str(m.arr[i]) + " "
-#  result = result + ")"
-#  return result
-
-
-cdef int median(int low, int high, int step):
-  return low + (((high - low)/step)/2)*step
-
-
-cdef void findComparableMatchings(int low, int high, int* arr, int step, int loc, int* loc_minus, int* loc_plus):
-  # Returns (minus, plus) indices for the portion of the array
-  # in which all matchings have the same first index as the one
-  # starting at loc
-  loc_plus[0] = loc + step
-  while loc_plus[0] < high and arr[loc_plus[0]] == arr[loc]:
-    loc_plus[0] = loc_plus[0] + step
-  loc_minus[0] = loc
-  while loc_minus[0]-step >= low and arr[loc_minus[0]-step] == arr[loc]:
-    loc_minus[0] = loc_minus[0] - step
-
-
-cdef class HieroCachingRuleFactory:
-  '''This RuleFactory implements a caching 
-  method using TrieTable, which makes phrase
-  generation somewhat speedier -- phrases only
-  need to be extracted once (however, it is
-  quite possible they need to be scored 
-  for each input sentence, for contextual models)'''
-
-  cdef rules, grammar, context_manager
-
-  cdef int max_chunks
-  cdef int max_target_chunks
-  cdef int max_length
-  cdef int max_target_length
-  cdef int max_nonterminals
-  cdef int max_initial_size
-  cdef int train_max_initial_size
-  cdef int min_gap_size
-  cdef int train_min_gap_size
-  cdef int category
-
-  cdef cacheBetweenSents
-  cdef precomputed_index
-  cdef precomputed_collocations
-  cdef precompute_file
-  cdef max_rank
-  cdef int precompute_rank, precompute_secondary_rank
-  cdef useBaezaYates
-  cdef use_index
-  cdef use_collocations
-  cdef float by_slack_factor
-
-  cdef per_sentence_grammar
-  cdef rule_filehandler
-  cdef rule_file
-  cdef pruned_rule_file
-  cdef extract_file
-  cdef sample_file
-  cdef search_file
-  cdef timingFile
-  cdef log_int_stats
-  cdef prev_norm_prefix
-  cdef float intersect_time, extract_time
-  cdef csuf.SuffixArray fsa
-  cdef cdat.DataArray fda
-  cdef cdat.DataArray eda
-  
-  cdef calignment.Alignment alignment
-  cdef cintlist.CIntList eid2symid
-  cdef cintlist.CIntList fid2symid
-  cdef int tight_phrases
-  cdef int require_aligned_terminal
-  cdef int require_aligned_chunks
-
-  cdef cintlist.CIntList findexes
-  cdef cintlist.CIntList findexes1
-
-  cdef int excluded_sent_id       # exclude a sentence id
-
-  def __init__(self, 
-        alignment=None,           # compiled alignment object (REQUIRED)
-        by_slack_factor=1.0,      # parameter for double-binary search; doesn't seem to matter much
-        category="[PHRASE]",      # name of generic nonterminal used by Hiero
-        cacheBetweenSents=False,  # prevent flushing of tree between sents; use carefully or you'll run out of memory
-        extract_file=None,        # print raw extracted rules to this file
-        grammar=None,             # empty grammar object -- must be supplied from outside (REQUIRED)
-        log_int_stats=False,      # prints timing data on intersections to stderr
-        max_chunks=None,          # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1
-        max_initial_size=10,      # maximum span of a grammar rule in TEST DATA
-        max_length=5,             # maximum number of symbols (both T and NT) allowed in a rule
-        max_nonterminals=2,       # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
-        max_target_chunks=None,   # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1
-        max_target_length=None,   # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size
-        min_gap_size=2,           # minimum span of a nonterminal in the RHS of a rule in TEST DATA
-        precompute_file=None,     # filename of file containing precomputed collocations
-        precompute_secondary_rank=20, # maximum frequency rank of patterns used to compute triples (don't set higher than 20).
-        precompute_rank=100,      # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300)
-        pruned_rule_file=None,    # if specified, pruned grammars will be written to this filename
-        require_aligned_terminal=True, # require extracted rules to have at least one aligned word
-        require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word
-        per_sentence_grammar=True,  # generate grammar files for each input segment
-        rule_file=None,           # UNpruned grammars will be written to this filename
-        sample_file=None,         # Sampling statistics will be written to this filename
-        search_file=None,         # lookup statistics will be written to this filename
-        train_max_initial_size=10, # maximum span of a grammar rule extracted from TRAINING DATA
-        train_min_gap_size=2,     # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
-        tight_phrases=False,      # True if phrases should be tight, False otherwise (False == slower but better results)
-        timingFile=None,          # timing statistics will be written to this filename
-        useBaezaYates=True,       # True to require use of double-binary alg, false otherwise
-        use_collocations=True,    # True to enable used of precomputed collocations
-        use_index=True            # True to enable use of precomputed inverted indices
-        ):
-    '''Note: we make a distinction between the min_gap_size
-    and max_initial_size used in test and train.  The latter
-    are represented by train_min_gap_size and train_max_initial_size,
-    respectively.  This is because Chiang's model does not require
-    them to be the same, therefore we don't either.'''
-    self.rules = TrieTable(True) # cache
-    self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) 
-    self.grammar = grammar
-    if alignment is None:
-      raise Exception("Must specify an alignment object")
-    self.alignment = alignment
-
-    self.excluded_sent_id = -1
-
-    # grammar parameters and settings
-    # NOTE: setting max_nonterminals > 2 is not currently supported in Hiero
-    self.max_length = max_length
-    self.max_nonterminals = max_nonterminals 
-    self.max_initial_size = max_initial_size
-    self.train_max_initial_size = train_max_initial_size
-    self.min_gap_size = min_gap_size
-    self.train_min_gap_size = train_min_gap_size
-    self.category = sym.fromstring(category)
-
-    if max_chunks is None:
-      self.max_chunks = self.max_nonterminals + 1
-    else:
-      self.max_chunks = max_chunks
-    
-    if max_target_chunks is None:
-      self.max_target_chunks = self.max_nonterminals + 1
-    else:
-      self.max_target_chunks = max_target_chunks
-
-    if max_target_length is None:
-      self.max_target_length = max_initial_size
-    else:
-      self.max_target_length = max_target_length
-
-    # algorithmic parameters and settings
-    self.cacheBetweenSents = not per_sentence_grammar
-    self.precomputed_collocations = {}
-    self.precomputed_index = {}
-    self.use_index = use_index
-    self.use_collocations = use_collocations
-    self.max_rank = {}
-    self.precompute_file = precompute_file
-    self.precompute_rank = precompute_rank
-    self.precompute_secondary_rank = precompute_secondary_rank
-    self.useBaezaYates = useBaezaYates
-    self.by_slack_factor = by_slack_factor
-    if tight_phrases:
-      self.tight_phrases = 1
-    else:
-      self.tight_phrases = 0
-
-    if require_aligned_chunks: 
-      # one condition is a stronger version of the other.
-      self.require_aligned_chunks = 1
-      self.require_aligned_terminal = 1
-    elif require_aligned_terminal:
-      self.require_aligned_chunks = 0
-      self.require_aligned_terminal = 1
-    else:
-      self.require_aligned_chunks = 0
-      self.require_aligned_terminal = 0
-    
-
-    self.per_sentence_grammar = per_sentence_grammar
-    if not self.per_sentence_grammar:
-      self.rule_filehandler = open(rule_file, "w")
-    # diagnostics
-    #if rule_file is None:
-    #  self.rule_file = None
-    self.rule_file = rule_file
-    if extract_file is None:
-      self.extract_file = None
-    else:
-      self.extract_file = open(extract_file, "w")
-    if sample_file is None:
-      self.sample_file = None
-    else:
-      self.sample_file = open(sample_file, "w")
-    if search_file is None:
-      self.search_file = None
-    else:
-      self.search_file = open(search_file, "w")
-    self.pruned_rule_file = pruned_rule_file
-    self.timingFile = timingFile
-    self.log_int_stats = log_int_stats
-    self.prev_norm_prefix = ()
-
-    self.findexes = cintlist.CIntList(initial_len=10)
-    self.findexes1 = cintlist.CIntList(initial_len=10)
-
-  def registerContext(self, context_manager):
-    '''This gives the RuleFactory access to the Context object.
-    Here we also use it to precompute the most expensive intersections
-    in the corpus quickly.'''
-    self.context_manager = context_manager
-    self.fsa = context_manager.fsarray
-    self.fda = self.fsa.darray
-    self.eda = context_manager.edarray
-    self.fid2symid = self.set_idmap(self.fda)
-    self.eid2symid = self.set_idmap(self.eda)
-    self.precompute()
-
-  cdef set_idmap(self, cdat.DataArray darray):
-    cdef int word_id, new_word_id, N
-    cdef cintlist.CIntList idmap
-    
-    N = len(darray.id2word)
-    idmap = cintlist.CIntList(initial_len=N)
-    for word_id from 0 <= word_id < N:
-      new_word_id = sym.fromstring(darray.id2word[word_id], terminal=True)
-      idmap.arr[word_id] = new_word_id
-    return idmap
-
-
-  def pattern2phrase(self, pattern):
-    # pattern is a tuple, which we must convert to a hiero rule.Phrase
-    result = ()
-    arity = 0
-    for word_id in pattern:
-      if word_id == -1:
-        arity = arity + 1
-        new_id = sym.setindex(self.category, arity)
-      else:
-        new_id = sym.fromstring(self.fda.id2word[word_id])
-      result = result + (new_id,)
-    return rule.Phrase(result)
-
-  def pattern2phrase_plus(self, pattern):
-    # returns a list containing both the pattern, and pattern
-    # suffixed/prefixed with the NT category.
-    patterns = []
-    result = ()
-    arity = 0
-    for word_id in pattern:
-      if word_id == -1:
-        arity = arity + 1
-        new_id = sym.setindex(self.category, arity)
-      else:
-        new_id = sym.fromstring(self.fda.id2word[word_id])
-      result = result + (new_id,)
-    patterns.append(rule.Phrase(result))
-    patterns.append(rule.Phrase(result + (sym.setindex(self.category, 1),)))
-    patterns.append(rule.Phrase((sym.setindex(self.category, 1),) + result))
-    return patterns
-
-  def precompute(self):
-    cdef precomputation.Precomputation pre
-    
-    if self.precompute_file is not None:
-      start_time = monitor.cpu()
-      log.write("Reading precomputed data from file %s... " % self.precompute_file, 1)
-      pre = precomputation.Precomputation(self.precompute_file, from_binary=True)
-      # check parameters of precomputation -- some are critical and some are not
-      if pre.max_nonterminals != self.max_nonterminals:
-        log.writeln("\nWARNING: Precomputation done with max nonterminals %d, decoder uses %d" % (pre.max_nonterminals, self.max_nonterminals))
-      if pre.max_length != self.max_length:
-        log.writeln("\nWARNING: Precomputation done with max terminals %d, decoder uses %d" % (pre.max_length, self.max_length))
-      if pre.train_max_initial_size != self.train_max_initial_size:
-        log.writeln("\nERROR: Precomputation done with max initial size %d, decoder uses %d" % (pre.train_max_initial_size, self.train_max_initial_size))
-        raise Exception("Parameter mismatch with precomputed data")
-      if pre.train_min_gap_size != self.train_min_gap_size:
-        log.writeln("\nERROR: Precomputation done with min gap size %d, decoder uses %d" % (pre.train_min_gap_size, self.train_min_gap_size))
-        raise Exception("Parameter mismatch with precomputed data")
-      log.writeln("done.", 1)
-      if self.use_index:
-        log.write("Converting %d hash keys on precomputed inverted index... " % (len(pre.precomputed_index)), 1)
-        for pattern, arr in pre.precomputed_index.iteritems():
-          phrases = self.pattern2phrase_plus(pattern)
-          for phrase in phrases:
-            self.precomputed_index[phrase] = arr
-        log.writeln("done.", 1)
-      if self.use_collocations:
-        log.write("Converting %d hash keys on precomputed collocations... " % (len(pre.precomputed_collocations)), 1)
-        for pattern, arr in pre.precomputed_collocations.iteritems():
-          phrase = self.pattern2phrase(pattern)
-          self.precomputed_collocations[phrase] = arr
-        log.writeln("done.", 1)
-      stop_time = monitor.cpu()
-      log.writeln("Processing precomputations took %f seconds" % (stop_time - start_time), 1)
-
-
-  def getPrecomputedCollocation(self, phrase):
-    if phrase in self.precomputed_collocations:
-      arr = self.precomputed_collocations[phrase]
-      return PhraseLocation(arr=arr, arr_low=0, arr_high=len(arr), num_subpatterns=phrase.arity()+1)
-    return None
-
-
-  cdef int* baezaYatesHelper(self, int low1, int high1, int* arr1, int step1, 
-            int low2, int high2, int* arr2, int step2, 
-            int offset_by_one, int len_last, int num_subpatterns, int* result_len):
-    cdef int i1, i2, j1, j2, med1, med2, med1_plus, med1_minus, med2_minus, med2_plus
-    cdef int d_first, qsetsize, dsetsize, tmp, search_low, search_high
-    cdef int med_result_len, low_result_len, high_result_len
-    cdef long comparison
-    cdef int* result
-    cdef int* low_result
-    cdef int* med_result
-    cdef int* high_result
-    cdef Matching loc1, loc2
-
-    result = <int*> malloc(0*sizeof(int*))
-#    log.writeln("%sBY: [%d, %d, %d] [%d, %d, %d]" % (pad, low1, high1, step1, low2, high2, step2,), 5)
-
-    d_first = 0
-    if high1 - low1 > high2 - low2:
-#      log.writeln("%sD first" % (pad), 5)
-      d_first = 1
-#    else:
-#      log.writeln("%sQ first" % (pad), 5)
-
-#    '''First, check to see if we are at any of the 
-#    recursive base cases'''
-#
-#    '''Case 1: one of the sets is empty'''
-    if low1 >= high1 or low2 >= high2:
-#      log.writeln("%sRETURN: set is empty" % (pad), 5)
-      return result
-
-#    '''Case 2: sets are non-overlapping'''
-    assign_matching(&loc1, arr1, high1-step1, step1, self.fda.sent_id.arr)
-    assign_matching(&loc2, arr2, low2, step2, self.fda.sent_id.arr)
-    if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == -1:
-#      log.writeln("%s %s < %s" % (pad, tuple(arr1[high1-step1:high1]), tuple(arr2[low2:low2+step2])),5)
-#      log.writeln("%sRETURN: non-overlapping sets" % (pad), 5)
-      return result
-
-    assign_matching(&loc1, arr1, low1, step1, self.fda.sent_id.arr)
-    assign_matching(&loc2, arr2, high2-step2, step2, self.fda.sent_id.arr)
-    if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1:
-#      log.writeln("%s %s > %s" % (pad, tuple(arr1[low1:low1+step1]), tuple(arr2[high2-step2:high2])),5)
-#      log.writeln("%sRETURN: non-overlapping sets" % (pad), 5)
-      return result
-
-    # Case 3: query set and data set do not meet size mismatch constraints;
-    # We use mergesort instead in this case
-    qsetsize = (high1-low1) / step1
-    dsetsize = (high2-low2) / step2
-    if d_first:
-      tmp = qsetsize
-      qsetsize = dsetsize
-      dsetsize = tmp
-
-    if self.by_slack_factor * qsetsize * cmath.log(dsetsize) / cmath.log(2) > dsetsize:
-      free(result)
-      return self.mergeHelper(low1, high1, arr1, step1, low2, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, result_len)
-
-    # binary search.  There are two flavors, depending on 
-    # whether the queryset or dataset is first
-    if d_first:
-      med2 = median(low2, high2, step2)
-      assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr)
-
-      search_low = low1
-      search_high = high1
-      while search_low < search_high:
-        med1 = median(search_low, search_high, step1)
-        findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus)
-        comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last)
-        if comparison == -1:
-          search_low = med1_plus
-        elif comparison == 1:
-          search_high = med1_minus
-        else:
-          break
-    else:
-      med1 = median(low1, high1, step1)
-      findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus)
-
-      search_low = low2
-      search_high = high2
-      while search_low < search_high:
-        med2 = median(search_low, search_high, step2)
-        assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr)
-        comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last)
-        if comparison == -1:
-          search_high = med2
-        elif comparison == 1:
-          search_low = med2 + step2
-        else:
-          break
-
-    med_result_len = 0
-    med_result = <int*> malloc(0*sizeof(int*))
-    if search_high > search_low:
-#      '''Then there is a match for the median element of Q'''
-#      
-#      '''What we want to find is the group of all bindings in the first set
-#      s.t. their first element == the first element of med1.  Then we
-#      want to store the bindings for all of those elements.  We can
-#      subsequently throw all of them away.'''
-      med2_minus = med2
-      med2_plus = med2 + step2
-      i1 = med1_minus
-      while i1 < med1_plus:
-        assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr)
-        while med2_minus-step2 >= low2:
-          assign_matching(&loc2, arr2, med2_minus-step2, step2, self.fda.sent_id.arr)
-          if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) < 1:
-            med2_minus = med2_minus - step2
-          else:
-            break
-        i2 = med2_minus
-        while i2 < high2:
-          assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr)
-          comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last)
-          if comparison == 0:
-            pass
-            med_result = append_combined_matching(med_result, &loc1, &loc2, offset_by_one, num_subpatterns, &med_result_len)
-          if comparison == -1:
-            break
-          i2 = i2 + step2
-        if i2 > med2_plus:
-          med2_plus = i2
-        i1 = i1 + step1
-
-      tmp = med1_minus
-      med1_minus = med1_plus
-      med1_plus = tmp
-    else:
-      # No match; need to figure out the point of division in D and Q
-      med2_minus = med2
-      med2_plus = med2
-      if d_first:
-        med2_minus = med2_minus + step2
-        if comparison == -1:
-          med1_minus = med1_plus
-        if comparison == 1:
-          med1_plus = med1_minus
-      else:
-        tmp = med1_minus
-        med1_minus = med1_plus
-        med1_plus = tmp
-        if comparison == 1:
-          med2_minus = med2_minus + step2
-          med2_plus = med2_plus + step2
-
-    low_result_len = 0
-    low_result = self.baezaYatesHelper(low1, med1_plus, arr1, step1, low2, med2_plus, arr2, step2, offset_by_one, len_last, num_subpatterns, &low_result_len)
-    high_result_len = 0
-    high_result = self.baezaYatesHelper(med1_minus, high1, arr1, step1, med2_minus, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, &high_result_len)
-
-    result = extend_arr(result, result_len, low_result, low_result_len)
-    result = extend_arr(result, result_len, med_result, med_result_len)
-    result = extend_arr(result, result_len, high_result, high_result_len)
-    free(low_result)
-    free(med_result)
-    free(high_result)
-
-    return result
-
-
-
-  cdef long compareMatchingsSet(self, int i1_minus, int i1_plus, int* arr1, int step1, 
-              Matching* loc2, int offset_by_one, int len_last):
-#    '''Compares a *set* of bindings, all with the same first element,
-#    to a single binding.  Returns -1 if all comparisons == -1, 1 if all
-#    comparisons == 1, and 0 otherwise.'''
-    cdef int i1, comparison, prev_comparison
-    cdef Matching l1_stack
-    cdef Matching* loc1
-
-    loc1 = &l1_stack
-
-    i1 = i1_minus
-    while i1 < i1_plus:
-      assign_matching(loc1, arr1, i1, step1, self.fda.sent_id.arr)
-      comparison = self.compare_matchings(loc1, loc2, offset_by_one, len_last)
-      if comparison == 0:
-        prev_comparison = 0
-        break
-      elif i1 == i1_minus:
-        prev_comparison = comparison
-      else:
-        if comparison != prev_comparison:
-          prev_comparison = 0
-          break
-      i1 = i1 + step1
-    return prev_comparison
-
-
-  cdef long compare_matchings(self, Matching* loc1, Matching* loc2, int offset_by_one, int len_last):
-    cdef int i
-
-    if loc1.sent_id > loc2.sent_id:
-      return 1
-    if loc2.sent_id > loc1.sent_id:
-      return -1
-
-    if loc1.size == 1 and loc2.size == 1:
-      if loc2.arr[loc2.start] - loc1.arr[loc1.start] <= self.train_min_gap_size:
-        return 1
-
-    elif offset_by_one:
-      for i from 1 <= i < loc1.size:
-        if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i-1]:
-          return 1
-        if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i-1]:
-          return -1
-
-    else:
-      if loc1.arr[loc1.start]+1 > loc2.arr[loc2.start]:
-        return 1
-      if loc1.arr[loc1.start]+1 < loc2.arr[loc2.start]:
-        return -1
-
-      for i from 1 <= i < loc1.size:
-        if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i]:
-          return 1
-        if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i]:
-          return -1
-
-    if loc2.arr[loc2.end-1] + len_last - loc1.arr[loc1.start] > self.train_max_initial_size:
-      return -1
-    return 0
-
-
-  cdef int* mergeHelper(self, int low1, int high1, int* arr1, int step1, 
-          int low2, int high2, int* arr2, int step2, 
-          int offset_by_one, int len_last, int num_subpatterns, int* result_len):
-    cdef int i1, i2, j1, j2
-    cdef long comparison
-    cdef int* result
-    cdef Matching loc1, loc2
-#    cdef int i
-
-#    pad = "  "
-#    log.writeln("->mergeHelper", 5)
-
-    result_len[0] = 0
-    result = <int*> malloc(0*sizeof(int))
-
-    i1 = low1
-    i2 = low2
-#    if log.level==5:
-#      log.writeln("%sMERGE lists [%d,%d,%d] and [%d,%d,%d]" % (pad,low1,high1,step1,low2,high2,step2), 5)
-#      log.writeln("%soffset_by_one: %d, len_last: %d" % (pad, offset_by_one, len_last), 5)
-#      log.write("[")
-#      for i from low1 <= i < high1:
-#        log.write("%d, " % arr1.arr[i],5)
-#      log.writeln("]")
-#      log.write("[")
-#      for i from low2 <= i < high2:
-#        log.write("%d, " % arr2.arr[i],5)
-#      log.writeln("]")
-    while i1 < high1 and i2 < high2:
-      
-#      '''First, pop all unneeded loc2's off the stack'''
-      assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr)
-#      if log.level==5:
-#        log.writeln("%s TOP1 %s" % (pad,matching2str(loc1)),5)
-      while i2 < high2:
-        assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr)
-        if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1:
-#          if log.level==5:
-#            log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5)
-#            log.writeln("%s POP2 %s" % (pad,matching2str(loc2)),5)
-          i2 = i2 + step2
-        else:
-          break
-
-#      '''Next: process all loc1's with the same starting val'''
-      j1 = i1
-      while i1 < high1 and arr1[j1] == arr1[i1]:
-        assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr)
-        j2 = i2
-        while j2 < high2:
-          assign_matching(&loc2, arr2, j2, step2, self.fda.sent_id.arr)
-          comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last)
-          if comparison == 0:
-#            if log.level==5:
-#              log.writeln("%s %s == %s" % (pad,matching2str(loc1),matching2str(loc2)),5)
-            result = append_combined_matching(result, &loc1, &loc2, offset_by_one, num_subpatterns, result_len)
-          if comparison == 1:
-#            if log.level==5:
-#              log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5)
-            pass
-          if comparison == -1:
-#            if log.level==5:
-#              log.writeln("%s %s < %s" % (pad,matching2str(loc1),matching2str(loc2)),5)
-            break
-          else:
-            j2 = j2 + step2
-#        if log.level==5:
-#          log.writeln("%s POP1 %s" % (pad,matching2str(loc1)),5)
-        i1 = i1 + step1
-
-#    log.writeln("<-mergeHelper", 5)
-    return result
-
-
-  cdef void sortPhraseLoc(self, cintlist.CIntList arr, PhraseLocation loc, rule.Phrase phrase):
-    cdef int i, j
-    cdef cveb.VEB veb
-    cdef cintlist.CIntList result
-
-    if phrase in self.precomputed_index:
-      loc.arr = self.precomputed_index[phrase]
-    else:
-      loc.arr = cintlist.CIntList(initial_len=loc.sa_high-loc.sa_low)
-      veb = cveb.VEB(arr.len)
-      for i from loc.sa_low <= i < loc.sa_high:
-        veb._insert(arr.arr[i])
-      i = veb.veb.min_val
-      for j from 0 <= j < loc.sa_high-loc.sa_low:
-        loc.arr.arr[j] = i
-        i = veb._findsucc(i)
-    loc.arr_low = 0
-    loc.arr_high = loc.arr.len
-
-
-  cdef intersectHelper(self, rule.Phrase prefix, rule.Phrase suffix,
-        PhraseLocation prefix_loc, PhraseLocation suffix_loc, int algorithm):
-
-    cdef cintlist.CIntList arr1, arr2, result
-    cdef int low1, high1, step1, low2, high2, step2, offset_by_one, len_last, num_subpatterns, result_len
-    cdef int* result_ptr
-    cdef csuf.SuffixArray suf
-
-    result_len = 0
-
-    if sym.isvar(suffix[0]):
-      offset_by_one = 1
-    else:
-      offset_by_one = 0
-
-    len_last = len(suffix.getchunk(suffix.arity()))
-
-    if prefix_loc.arr is None:
-      suf = self.context_manager.fsarray
-      self.sortPhraseLoc(suf.sa, prefix_loc, prefix)
-    arr1 = prefix_loc.arr
-    low1 = prefix_loc.arr_low
-    high1 = prefix_loc.arr_high
-    step1 = prefix_loc.num_subpatterns
-
-    if suffix_loc.arr is None:
-      suf = self.context_manager.fsarray
-      self.sortPhraseLoc(suf.sa, suffix_loc, suffix)
-    arr2 = suffix_loc.arr
-    low2 = suffix_loc.arr_low
-    high2 = suffix_loc.arr_high
-    step2 = suffix_loc.num_subpatterns
-
-    num_subpatterns = prefix.arity()+1
-
-    if algorithm == MERGE:
-      result_ptr = self.mergeHelper(low1, high1, arr1.arr, step1, 
-                  low2, high2, arr2.arr, step2, 
-                  offset_by_one, len_last, num_subpatterns, &result_len)
-    else:
-      result_ptr = self.baezaYatesHelper(low1, high1, arr1.arr, step1, 
-                  low2, high2, arr2.arr, step2, 
-                  offset_by_one, len_last, num_subpatterns, &result_len)
-
-    if result_len == 0:
-      free(result_ptr)
-      return None
-    else:
-      result = cintlist.CIntList()
-      free(result.arr)
-      result.arr = result_ptr
-      result.len = result_len
-      result.size = result_len
-      return PhraseLocation(arr_low=0, arr_high=result_len, arr=result, num_subpatterns=num_subpatterns)
-
-  cdef loc2str(self, PhraseLocation loc):
-    cdef int i, j
-    result = "{"
-    i = 0
-    while i < loc.arr_high:
-      result = result + "("
-      for j from i <= j < i + loc.num_subpatterns:
-        result = result + ("%d " %loc.arr[j])
-      result = result + ")"
-      i = i + loc.num_subpatterns
-    result = result + "}"
-    return result
-
-#  cdef compareResults(self, PhraseLocation loc1, PhraseLocation loc2, phrase, type1, type2):
-#    cdef i
-#    if loc1 is None and type1=="pre":
-#      return
-#    if loc1 is None:
-#      if loc2 is None or loc2.arr_high == 0:
-#        return
-#    if loc2 is None:
-#      if loc1.arr_high == 0:
-#        return
-#    if loc1.arr_high != loc2.arr_high:
-#      log.writeln("ERROR: %d vs %d (%s vs %s)" % (loc1.arr_high, loc2.arr_high, type1, type2))
-#      #log.writeln("  %s" % self.loc2str(loc2))
-#    if loc1.arr_high == 0:
-#      return
-#    elif loc1.num_subpatterns != loc2.num_subpatterns:
-#      log.writeln("ERROR 2: %d vs %d (%d v %d) %s" % (loc1.num_subpatterns, loc2.num_subpatterns, loc1.arr_high, loc2.arr_high, phrase))
-#    for i from 0 <= i < loc1.arr_high:
-#      if loc1.arr[i] != loc2.arr[i]:
-#        log.writeln("ERROR 3")
-# 
-  cdef PhraseLocation intersect(self, prefix_node, suffix_node, rule.Phrase phrase):
-    cdef rule.Phrase prefix, suffix
-    cdef PhraseLocation prefix_loc, suffix_loc, result
-
-    start_time = monitor.cpu()
-    prefix = prefix_node.phrase
-    suffix = suffix_node.phrase
-    prefix_loc = prefix_node.phrase_location
-    suffix_loc = suffix_node.phrase_location
-
-    result = self.getPrecomputedCollocation(phrase)
-    if result is not None:
-      intersect_method = "precomputed"
-
-    if result is None:
-      if self.useBaezaYates:
-        result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, BAEZA_YATES)
-        intersect_method="double binary"
-      else:
-        result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, MERGE)
-        intersect_method="merge"
-    stop_time = monitor.cpu()
-    intersect_time = stop_time - start_time
-    if self.log_int_stats:
-      if intersect_method == "precomputed":
-        sort1 = "none"
-        sort2 = "none"
-      else:
-        if prefix in self.precomputed_index:
-          sort1 = "index"
-        else:
-          sort1 = "veb"
-        if suffix in self.precomputed_index:
-          sort2 = "index"
-        else:
-          sort2 = "veb"
-      result_len=0
-      if result is not None:
-        result_len = len(result.arr)/result.num_subpatterns
-      rank = 0
-#      if phrase in self.max_rank:
-#        rank = self.max_rank[phrase]
-#      else:
-#        rank = self.precompute_rank + 10
-      log.writeln("INT %d %d %d %d %d %f %d %s %s %s" % 
-        (len(prefix)+1 - prefix.arity(), prefix.arity(), 
-        nGramCount(prefix_node.phrase_location), 
-        nGramCount(suffix_node.phrase_location), 
-        result_len, intersect_time, rank, intersect_method, sort1, sort2))
-    return result
-
-  def advance(self, frontier, res, fwords):
-    nf = []
-    for (toskip, (i, alt, pathlen)) in frontier:
-      spanlen = fwords[i][alt][2]
-      if (toskip == 0):
-        #log.writeln("RES: (%d %d %d)" % (i, alt, pathlen), 3)
-        res.append((i, alt, pathlen))
-      ni = i + spanlen
-      #log.writeln("proc: %d (%d %d %d) sl=%d ni=%d len(fwords)=%d" % (toskip, i, alt, pathlen, spanlen, ni, len(fwords)), 3)
-      if (ni < len(fwords) and (pathlen + 1) < self.max_initial_size):
-        for na in xrange(len(fwords[ni])):
-          nf.append((toskip - 1, (ni, na, pathlen + 1)))
-    if (len(nf) > 0):
-      return self.advance(nf, res, fwords)
-    else:
-      return res
-    
-  def get_all_nodes_isteps_away(self, skip, i, spanlen, pathlen, fwords, next_states, reachable_buffer):
-    frontier = []
-    if (i+spanlen+skip >= len(next_states)):
-      return frontier
-    #print "get_all_nodes_isteps_away from %i" % (i)
-    key = tuple([i,spanlen])
-    reachable = []
-    if (key in reachable_buffer):
-      reachable = reachable_buffer[key]
-    else:
-      reachable = self.reachable(fwords, i, spanlen)
-      reachable_buffer[key] = reachable
-    #print "reachable(from=%i,dist=%i) = " % (i,spanlen)
-    #print reachable
-    for nextreachable in reachable:
-      for next_id in next_states[nextreachable]:
-        jump = self.shortest(fwords,i,next_id)
-        #print "checking next_id = %i, pathlen[sofar] = %i, jump = %i" % (next_id,pathlen,jump)
-        #if (next_id - (i+spanlen)) < skip:
-        if jump < skip:
-          continue
-        #if next_id-(i-pathlen) < self.max_initial_size:
-        if pathlen+jump <= self.max_initial_size:
-          for alt_id in xrange(len(fwords[next_id])):
-            if (fwords[next_id][alt_id][0] != cn.epsilon):
-              #frontier.append((next_id,alt_id,next_id-(i-pathlen)));
-              #print "finding the shortest from %i to %i" % (i, next_id)
-              newel = (next_id,alt_id,pathlen+jump)
-              if newel not in frontier:
-                frontier.append((next_id,alt_id,pathlen+jump))
-                #print "appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump)
-              #else:
-                #print "NOT appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump)
-        #else:
-          #print "next_id = %s is aborted\n" % next_id
-    #print "returning frontier"
-    #print frontier
-    return frontier
-
-  def reachable(self, fwords, ifrom, dist):
-    #print "inside reachable(%i,%i)" % (ifrom,dist)
-    ret = []
-    if (ifrom >= len(fwords)):
-      return ret
-    for alt_id in xrange(len(fwords[ifrom])):
-      if (fwords[ifrom][alt_id][0] == cn.epsilon):
-        ret.extend(self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist))
-      else:
-        if (dist==0):
-          if (ifrom not in ret):
-            ret.append(ifrom)
-        else:
-          for ifromchild in self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist-1):
-            if (ifromchild not in ret):
-              ret.append(ifromchild)
-          
-    return ret
-
-  def shortest(self, fwords, ifrom, ito):
-    min = 1000
-    #print "shortest ifrom=%i, ito=%i" % (ifrom,ito)
-    if (ifrom > ito):
-      return min
-    if (ifrom == ito):
-      return 0
-    for alt_id in xrange(len(fwords[ifrom])):
-      currmin = self.shortest(fwords,ifrom+fwords[ifrom][alt_id][2],ito)
-      if (fwords[ifrom][alt_id][0] != cn.epsilon):
-        currmin += 1
-      if (currmin<min):
-        min = currmin
-    return min
-
-  def get_next_states(self, _columns, curr_idx, min_dist=2):
-    result = []
-    candidate = [[curr_idx,0]]
-
-    while len(candidate) > 0:
-      curr = candidate.pop()
-      if curr[0] >= len(_columns):
-        continue
-      if curr[0] not in result and min_dist <= curr[1] <= self.max_initial_size:
-        result.append(curr[0]);
-      curr_col = _columns[curr[0]]
-      for alt in curr_col:
-        next_id = curr[0]+alt[2]
-        jump = 1
-        if (alt[0] == cn.epsilon):
-          jump = 0
-        if next_id not in result and min_dist <= curr[1]+jump <= self.max_initial_size+1:
-          candidate.append([next_id,curr[1]+jump])
-    return sorted(result);
-
-  def input(self, fwords, meta=None, output=None):
-    '''When this function is called on the RuleFactory,
-    it looks up all of the rules that can be used to translate
-    the input sentence'''
-    cdef int i, j, k, flen, arity, num_subpatterns, num_samples
-    cdef float start_time
-    cdef PhraseLocation phrase_location
-    cdef cintlist.CIntList sample, chunklen
-    cdef Matching matching
-    cdef rule.Phrase hiero_phrase
-    
-    #fwords = [ ((1,0.0,1),), fwords1 ] #word id for <s> = 1, cost = 0.0, next = 1
-    #print fwords
-    flen = len(fwords)
-    #print "length = %i" % flen
-    start_time = monitor.cpu()
-    self.intersect_time = 0.0
-    self.extract_time = 0.0
-    nodes_isteps_away_buffer = {}
-    hit = 0
-    reachable_buffer = {}
-    if meta:
-        dattrs = sgml.attrs_to_dict(meta)
-        id = dattrs.get('id', 'NOID')
-        self.excluded_sent_id = int(dattrs.get('exclude', '-1'))
-    if output:
-      self.rule_filehandler = output
-    elif self.per_sentence_grammar:
-      self.rule_filehandler = open(self.rule_file+'.'+id, 'w')
-
-    #print "max_initial_size = %i" % self.max_initial_size
-
-    if not self.cacheBetweenSents:
-      self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) 
-      self.grammar.root = [None, {}]
-
-    frontier = []
-    for i in xrange(len(fwords)):
-      for alt in xrange(0, len(fwords[i])):
-        if fwords[i][alt][0] != cn.epsilon:
-          frontier.append((i, i, alt, 0, self.rules.root, (), False))
-
-    xroot = None
-    x1 = sym.setindex(self.category, 1)
-    if x1 in self.rules.root.children:
-      xroot = self.rules.root.children[x1]
-    else:
-      xroot = ExtendedTrieNode(suffix_link=self.rules.root, phrase_location=PhraseLocation()) 
-      self.rules.root.children[x1] = xroot
-
-    for i in xrange(self.min_gap_size, len(fwords)):
-      for alt in xrange(0, len(fwords[i])):
-        if fwords[i][alt][0] != cn.epsilon:
-          frontier.append((i-self.min_gap_size, i, alt, self.min_gap_size, xroot, (x1,), True))
-    '''for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier:
-      if len(prefix)>0:
-        print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path
-      else:
-        print k, i, alt, pathlen, node, prefix, is_shadow_path'''
-
-    #for wid in xrange(1000):
-    #  print "%i = %s" % (wid, sym.tostring(wid))
-    next_states = []
-    for i in xrange(len(fwords)):
-      next_states.append(self.get_next_states(fwords,i,self.min_gap_size))
-      #print "next state of %i" % i
-      #print next_states[i]
-
-    while len(frontier) > 0:
-      #print "frontier = %i" % len(frontier)
-      this_iter_intersect_time = self.intersect_time
-      new_frontier = []
-      for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier:
-        #print "looking at: "
-        #if len(prefix)>0:
-        #  print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path
-        #else:
-        #  print k, i, alt, pathlen, node, prefix, is_shadow_path
-        word_id = fwords[i][alt][0]
-        spanlen = fwords[i][alt][2]
-        #print "word_id = %i, %s" % (word_id, sym.tostring(word_id))
-        # to prevent .. [X] </S>
-        #print "prefix = ",prefix
-        #if word_id == 2 and len(prefix)>=2:
-          #print "at the end: %s" % (prefix[len(prefix)-1])
-          #if prefix[len(prefix)-1]<0:
-            #print "break"
-            #continue
-        #print "continuing"
-        #if pathlen + spanlen > self.max_initial_size:
-          #continue
-        # TODO get rid of k -- pathlen is replacing it
-        if word_id == cn.epsilon:
-          #print "skipping because word_id is epsilon"
-          if i+spanlen >= len(fwords): 
-            continue
-          for nualt in xrange(0,len(fwords[i+spanlen])):
-            frontier.append((k, i+spanlen, nualt, pathlen, node, prefix, is_shadow_path))
-          continue
-        
-        phrase = prefix + (word_id,)
-        str_phrase = map(sym.tostring, phrase)
-        hiero_phrase = rule.Phrase(phrase)
-        arity = hiero_phrase.arity()
-
-        #print "pos %2d, node %5d, '%s'" % (i, node.id, hiero_phrase)
-        if self.search_file is not None:
-          self.search_file.write("%s\n" % hiero_phrase)
-
-        lookup_required = False
-        if word_id in node.children:
-          if node.children[word_id] is None:
-            #print "Path dead-ends at this node\n"
-            continue
-          else:
-            #print "Path continues at this node\n"
-            node = node.children[word_id]
-        else:
-          if node.suffix_link is None:
-            #print "Current node is root; lookup required\n"
-            lookup_required = True
-          else:
-            if word_id in node.suffix_link.children:
-              if node.suffix_link.children[word_id] is None:
-                #print "Suffix link reports path is dead end\n"
-                node.children[word_id] = None
-                continue
-              else:
-                #print "Suffix link indicates lookup is reqired\n"
-                lookup_required = True
-            else:
-              #print "ERROR: We never get here\n"
-              raise Exception("Keyword trie error")
-              #new_frontier.append((k, i, alt, pathlen, node, prefix, is_shadow_path))
-        #print "checking whether lookup_required\n"
-        if lookup_required:
-          new_node = None
-          if is_shadow_path:
-            #print "Extending shadow path for %s \n"
-            # on the shadow path we don't do any search, we just use info from suffix link
-            new_node = ExtendedTrieNode(phrase_location=node.suffix_link.children[word_id].phrase_location, 
-                              suffix_link=node.suffix_link.children[word_id],
-                              phrase=hiero_phrase)
-          else:
-            if arity > 0:
-              #print "Intersecting for %s because of arity > 0\n" % hiero_phrase
-              phrase_location = self.intersect(node, node.suffix_link.children[word_id], hiero_phrase)
-            else:
-              #print "Suffix array search for %s" % hiero_phrase
-              phrase_location = node.phrase_location
-              sa_range = self.context_manager.fsarray.lookup(str_phrase[-1], len(str_phrase)-1, phrase_location.sa_low, phrase_location.sa_high)
-              if sa_range is not None:
-                phrase_location = PhraseLocation(sa_low=sa_range[0], sa_high=sa_range[1])
-              else:
-                phrase_location = None
-
-            if phrase_location is None:
-              node.children[word_id] = None
-              #print "Search failed\n"
-              continue
-            #print "Search succeeded\n"
-            suffix_link = self.rules.root
-            if node.suffix_link is not None:
-              suffix_link = node.suffix_link.children[word_id]
-            new_node = ExtendedTrieNode(phrase_location=phrase_location, 
-                              suffix_link=suffix_link, 
-                              phrase=hiero_phrase)
-          node.children[word_id] = new_node
-          node = new_node
-          #print "Added node %d with suffix link %d\n" % (node.id, node.suffix_link.id)
-
-          '''Automatically add a trailing X node, if allowed --
-          This should happen before we get to extraction (so that
-          the node will exist if needed)'''
-          if arity < self.max_nonterminals:
-            xcat_index = arity+1
-            xcat = sym.setindex(self.category, xcat_index)
-            suffix_link_xcat_index = xcat_index
-            if is_shadow_path:
-              suffix_link_xcat_index = xcat_index-1
-            suffix_link_xcat = sym.setindex(self.category, suffix_link_xcat_index)
-            node.children[xcat] = ExtendedTrieNode(phrase_location=node.phrase_location, 
-                                    suffix_link=node.suffix_link.children[suffix_link_xcat],
-                                    phrase= rule.Phrase(phrase + (xcat,)))
-            #log.writeln("Added node %d with suffix link %d (for X)" % (node.children[xcat].id, node.children[xcat].suffix_link.id), 4)
-
-          # sample from range
-          if not is_shadow_path:
-            #print "is_not_shadow_path"
-            sample = self.context_manager.sampler.sample(node.phrase_location)
-            #print "node.phrase_location %s" % str(node.phrase_location)
-            #print "sample.len = %i" % len(sample)
-            num_subpatterns = (<PhraseLocation> node.phrase_location).num_subpatterns
-            chunklen = cintlist.CIntList(initial_len=num_subpatterns)
-            for j from 0 <= j < num_subpatterns:
-              chunklen.arr[j] = hiero_phrase.chunklen(j)
-            extracts = []
-            j = 0
-            extract_start = monitor.cpu()
-            '''orig_tight_phrases = self.tight_phrases
-            orig_require_aligned_terminal = self.require_aligned_terminal
-            orig_require_aligned_chunks = self.require_aligned_chunks
-            if k==0 or i==len(fwords)-1:
-              self.tight_phrases = 0
-              self.require_aligned_terminal = 0
-              self.require_aligned_chunks = 0'''
-            while j < sample.len:
-              extract = []
-
-              assign_matching(&matching, sample.arr, j, num_subpatterns, self.fda.sent_id.arr)
-              '''print "tight_phrase = "
-              print self.tight_phrases
-              print "require_aligned_terminal = "
-              print self.require_aligned_terminal
-              print "require_aligned_chunks = "
-              print self.require_aligned_chunks'''
-              
-              extract = self.extract(hiero_phrase, &matching, chunklen.arr, num_subpatterns)
-              extracts.extend(extract)
-              j = j + num_subpatterns
-            '''self.tight_phrases = orig_tight_phrases
-            sttice+sa.nw.normelf.require_aligned_terminal = orig_require_aligned_terminal
-            self.require_aligned_chunks = orig_require_aligned_chunks'''
-            num_samples = sample.len/num_subpatterns
-            extract_stop = monitor.cpu()
-            self.extract_time = self.extract_time + extract_stop - extract_start
-            #print "extract.size = %i" % len(extracts)
-            if len(extracts) > 0:
-              fphrases = {}
-              fals = {}
-              fcount = {}
-              for f, e, count, als in extracts:
-                fcount.setdefault(f, 0.0)
-                fcount[f] = fcount[f] + count
-                fphrases.setdefault(f, {})
-                fphrases[f].setdefault(e, {})
-                #fphrases[f][e] = fphrases[f][e] + count
-                fphrases[f][e].setdefault(als,0.0)
-                fphrases[f][e][als] = fphrases[f][e][als] + count
-                #print "f,e,als ",f," : ",e," : ",als," count = ",fphrases[f][e][als]
-                #fals[str(f)+" ||| "+str(e)] = als
-              for f, elist in fphrases.iteritems():
-                #print "f = '%s'" % f
-                #if (str(f) in ['<s>','</s>','<s> [X,1]','[X,1] </s>']):
-                #  print "rejected"
-                #  continue
-                f_margin = fcount[f]
-                for e, alslist in elist.iteritems():
-                  alignment = None
-                  count = 0
-                  for als, currcount in alslist.iteritems():
-                    #print "als = ",als,", count = ",currcount
-                    if currcount > count:
-                      alignment = als
-                      count = currcount 
-                  #alignment = fals[str(f)+" ||| "+str(e)]
-                  #print "selected = ",alignment," with count = ",count
-                  scores = []
-                  for m in self.context_manager.models:
-                    scores.append(m.compute_contextless_score(f, e, count, fcount[f], num_samples))
-                  r = rule.Rule(self.category, f, e, scores=scores, owner="context", word_alignments = alignment)
-                  self.grammar.add(r)
-                  if self.rule_filehandler is not None:
-                    self.rule_filehandler.write("%s\n" % r.to_line())
-                  #print "adding a rule = %s" % r
-
-        #if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+spanlen < self.max_initial_size:
-        if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+1 <= self.max_initial_size:
-          #to prevent [X] </s>
-          #print "lexicalized"
-          for alt_id in xrange(len(fwords[i+spanlen])):
-            #if (fwords[i+spanlen][alt_id][2]+pathlen+spanlen <= self.max_initial_size):
-            #new_frontier.append((k, i+spanlen, alt_id, pathlen + spanlen, node, phrase, is_shadow_path))
-            #print "alt_id = %d\n" % alt_id
-            new_frontier.append((k, i+spanlen, alt_id, pathlen + 1, node, phrase, is_shadow_path))
-            #print (k, i+spanlen, alt_id, pathlen + spanlen, node, map(sym.tostring,phrase), is_shadow_path)
-          #print "end lexicalized"
-          num_subpatterns = arity
-          if not is_shadow_path:
-            num_subpatterns = num_subpatterns + 1
-          #to avoid <s> X ... we want <s> next to a lexicalized item
-          #if k>0 and i<len(fwords)-1 and len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks:
-          if len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks:
-            #print "masuk kondisi"
-            xcat = sym.setindex(self.category, arity+1)
-            xnode = node.children[xcat]
-            #frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, spanlen, pathlen, fwords, next_states)
-            # I put spanlen=1 below
-            key = tuple([self.min_gap_size, i, 1, pathlen])
-            frontier_nodes = []
-            if (key in nodes_isteps_away_buffer):
-              frontier_nodes = nodes_isteps_away_buffer[key]
-            else:
-              frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, 1, pathlen, fwords, next_states, reachable_buffer)
-              nodes_isteps_away_buffer[key] = frontier_nodes
-            
-            #print "new frontier:\n"
-            for (i, alt, pathlen) in frontier_nodes:
-              #if (pathlen+fwords[i][alt][2] <= self.max_initial_size):
-              new_frontier.append((k, i, alt, pathlen, xnode, phrase +(xcat,), is_shadow_path))
-              #print k, i, alt, pathlen, node, map(sym.tostring,phrase +(xcat,)), is_shadow_path
-          #print "all end\n";
-          #else:
-            #print "no new frontier1\n";
-        #else :
-          #print "no new frontier2\n"
-      if self.log_int_stats:
-        log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time))
-      frontier = new_frontier
-        
-    stop_time = monitor.cpu()
-    log.writeln("Total time for rule lookup, extraction, and scoring = %f seconds" % (stop_time - start_time))
-    #log.writeln("  Intersect time = %f seconds" % self.intersect_time)
-    gc.collect()
-    log.writeln("  Extract time = %f seconds" % self.extract_time)
-    if self.pruned_rule_file:
-      self.grammar.dump(self.pruned_rule_file)
-    if self.per_sentence_grammar:
-      self.rule_filehandler.close()
-
-#    else:
-#      self.rule_filehandler.write("###EOS_"+ id +"\n")
-
-
-  cdef int find_fixpoint(self, 
-            int f_low, f_high, 
-            int* f_links_low, int* f_links_high, 
-            int* e_links_low, int* e_links_high,
-            int e_in_low, int e_in_high, 
-            int* e_low, int* e_high,
-            int* f_back_low, int* f_back_high, 
-            int f_sent_len, int e_sent_len,
-            int max_f_len, int max_e_len, 
-            int min_fx_size, int min_ex_size,
-            int max_new_x,
-            int allow_low_x, int allow_high_x, 
-            int allow_arbitrary_x, int write_log):
-    cdef int e_low_prev, e_high_prev, f_low_prev, f_high_prev, new_x, new_low_x, new_high_x
-
-    e_low[0] = e_in_low
-    e_high[0] = e_in_high
-    self.find_projection(f_low, f_high, f_links_low, f_links_high, e_low, e_high)
-    if e_low[0] == -1:
-      # low-priority corner case: if phrase w is unaligned,
-      # but we don't require aligned terminals, then returning
-      # an error here might prevent extraction of allowed
-      # rule X -> X_1 w X_2 / X_1 X_2.  This is probably
-      # not worth the bother, though.
-      #print "find_fixpoint0"
-      return 0
-    elif e_in_low != -1 and e_low[0] != e_in_low:
-      if e_in_low - e_low[0] < min_ex_size:
-        e_low[0] = e_in_low - min_ex_size
-        if e_low[0] < 0:
-          #print "find_fixpoint1"
-          return 0
-
-    if e_high[0] - e_low[0] > max_e_len:
-      #print "find_fixpoint2"
-      return 0
-    elif e_in_high != -1 and e_high[0] != e_in_high:
-      if e_high[0] - e_in_high < min_ex_size:
-        e_high[0] = e_in_high + min_ex_size
-        if e_high[0] > e_sent_len:
-          #print "find_fixpoint3"
-          return 0
-
-    f_back_low[0] = -1
-    f_back_high[0] = -1
-    f_low_prev = f_low
-    f_high_prev = f_high
-    new_x = 0
-    new_low_x = 0
-    new_high_x = 0
-
-    while True:
-
-      if f_back_low[0] == -1:
-        self.find_projection(e_low[0], e_high[0], e_links_low, e_links_high, f_back_low, f_back_high)
-      else:
-        self.find_projection(e_low[0], e_low_prev, e_links_low, e_links_high, f_back_low, f_back_high)
-        self.find_projection(e_high_prev, e_high[0], e_links_low, e_links_high, f_back_low, f_back_high)
-
-      if f_back_low[0] > f_low:
-        f_back_low[0] = f_low
-
-      if f_back_high[0] < f_high:
-        f_back_high[0] = f_high
-
-      if f_back_low[0] == f_low_prev and f_back_high[0] == f_high_prev:
-        return 1
-
-      if allow_low_x == 0 and f_back_low[0] < f_low:
-#        log.writeln("   FAIL: f phrase is not tight")
-        #print "   FAIL: f phrase is not tight"
-        return 0
-
-      if f_back_high[0] - f_back_low[0] > max_f_len:
-#        log.writeln("   FAIL: f back projection is too wide")
-        #print "   FAIL: f back projection is too wide"
-        return 0
-
-      if allow_high_x == 0 and f_back_high[0] > f_high:
-#        log.writeln("   FAIL: extension on high side not allowed")
-        #print "   FAIL: extension on high side not allowed"
-        return 0
-
-      if f_low != f_back_low[0]:
-        if new_low_x == 0:
-          if new_x >= max_new_x:
-#            log.writeln("   FAIL: extension required on low side violates max # of gaps")
-            #print "   FAIL: extension required on low side violates max # of gaps"
-            return 0
-          else:
-            new_x = new_x + 1
-            new_low_x = 1
-        if f_low - f_back_low[0] < min_fx_size:
-          f_back_low[0] = f_low - min_fx_size
-          if f_back_high[0] - f_back_low[0] > max_f_len:
-#            log.writeln("   FAIL: extension required on low side violates max initial length")
-            #print "   FAIL: extension required on low side violates max initial length"
-            return 0
-          if f_back_low[0] < 0:
-#            log.writeln("   FAIL: extension required on low side violates sentence boundary")
-            #print "   FAIL: extension required on low side violates sentence boundary"
-            return 0
-
-      if f_high != f_back_high[0]:
-        if new_high_x == 0:
-          if new_x >= max_new_x:
-#            log.writeln("   FAIL: extension required on high side violates max # of gaps")
-            #print "   FAIL: extension required on high side violates max # of gaps"
-            return 0
-          else:
-            new_x = new_x + 1
-            new_high_x = 1
-        if f_back_high[0] - f_high < min_fx_size:
-          f_back_high[0] = f_high + min_fx_size
-          if f_back_high[0] - f_back_low[0] > max_f_len:
-#            log.writeln("   FAIL: extension required on high side violates max initial length")
-            #print "   FAIL: extension required on high side violates max initial length"
-            return 0
-          if f_back_high[0] > f_sent_len:
-#            log.writeln("   FAIL: extension required on high side violates sentence boundary")
-            #print "   FAIL: extension required on high side violates sentence boundary"
-            return 0
-
-      e_low_prev = e_low[0]
-      e_high_prev = e_high[0]
-
-      self.find_projection(f_back_low[0], f_low_prev, f_links_low, f_links_high, e_low, e_high)
-      self.find_projection(f_high_prev, f_back_high[0], f_links_low, f_links_high, e_low, e_high)
-      if e_low[0] == e_low_prev and e_high[0] == e_high_prev:
-        return 1
-      if allow_arbitrary_x == 0:
-#        log.writeln("   FAIL: arbitrary expansion not permitted")
-        #print "   FAIL: arbitrary expansion not permitted"
-        return 0
-      if e_high[0] - e_low[0] > max_e_len:
-#        log.writeln("   FAIL: re-projection violates sentence max phrase length")
-        #print "   FAIL: re-projection violates sentence max phrase length"
-        return 0
-      f_low_prev = f_back_low[0]
-      f_high_prev = f_back_high[0]
-
-
-  cdef find_projection(self, int in_low, int in_high, int* in_links_low, int* in_links_high, 
-            int* out_low, int* out_high):
-    cdef int i
-    for i from in_low <= i < in_high:
-      if in_links_low[i] != -1:
-        if out_low[0] == -1 or in_links_low[i] < out_low[0]:
-          out_low[0] = in_links_low[i]
-        if out_high[0] == -1 or in_links_high[i] > out_high[0]:
-          out_high[0] = in_links_high[i]
-
-
-  cdef int* int_arr_extend(self, int* arr, int* arr_len, int* data, int data_len):
-    cdef int new_len
-    new_len = arr_len[0] + data_len
-    arr = <int*> realloc(arr, new_len*sizeof(int))
-    memcpy(arr+arr_len[0], data, data_len*sizeof(int))
-    arr_len[0] = new_len
-    return arr
-
-
-  cdef extract_phrases(self, int e_low, int e_high, int* e_gap_low, int* e_gap_high, int* e_links_low, int num_gaps,
-            int f_low, int f_high, int* f_gap_low, int* f_gap_high, int* f_links_low, 
-            int sent_id, int e_sent_len, int e_sent_start):
-    cdef int i, j, k, m, n, *e_gap_order, e_x_low, e_x_high, e_x_gap_low, e_x_gap_high
-    cdef int *e_gaps1, *e_gaps2, len1, len2, step, num_chunks
-    cdef cintlist.CIntList ephr_arr
-    cdef result
-
-    #print "inside extract_phrases"
-    #print "f_low=%d, f_high=%d" % (f_low,f_high)
-    result = []
-    len1 = 0
-    e_gaps1 = <int*> malloc(0)
-    ephr_arr = cintlist.CIntList()
-
-    e_gap_order = <int*> malloc(num_gaps*sizeof(int))
-    if num_gaps > 0:
-      e_gap_order[0] = 0
-      for i from 1 <= i < num_gaps:
-        for j from 0 <= j < i:
-          if e_gap_low[i] < e_gap_low[j]:
-            for k from j <= k < i:
-              e_gap_order[k+1] = e_gap_order[k]
-            e_gap_order[j] = i
-            break
-        else:
-          e_gap_order[i] = i
-
-    e_x_low = e_low
-    e_x_high = e_high
-    if self.tight_phrases == 0:
-      while e_x_low > 0 and e_high - e_x_low < self.train_max_initial_size and e_links_low[e_x_low-1] == -1:
-        e_x_low = e_x_low - 1
-      while e_x_high < e_sent_len and e_x_high - e_low < self.train_max_initial_size and e_links_low[e_x_high] == -1:
-        e_x_high = e_x_high + 1
-
-    for i from e_x_low <= i <= e_low:
-      e_gaps1 = self.int_arr_extend(e_gaps1, &len1, &i, 1)
-
-    for i from 0 <= i < num_gaps:
-      e_gaps2 = <int*> malloc(0)
-      len2 = 0
-
-      j = e_gap_order[i]
-      e_x_gap_low = e_gap_low[j]
-      e_x_gap_high = e_gap_high[j]
-      if self.tight_phrases == 0:
-        while e_x_gap_low > e_x_low and e_links_low[e_x_gap_low-1] == -1:
-          e_x_gap_low = e_x_gap_low - 1
-        while e_x_gap_high < e_x_high and e_links_low[e_x_gap_high] == -1:
-          e_x_gap_high = e_x_gap_high + 1
-
-      k = 0
-      step = 1+(i*2)
-      while k < len1:
-        for m from e_x_gap_low <= m <= e_gap_low[j]:
-          if m >= e_gaps1[k+step-1]:
-            for n from e_gap_high[j] <= n <= e_x_gap_high:
-              if n-m >= 1:  # extractor.py doesn't restrict target-side gap length 
-                e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+k, step)
-                e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &m, 1)
-                e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &n, 1)
-        k = k + step
-      free(e_gaps1)
-      e_gaps1 = e_gaps2
-      len1 = len2
-
-    step = 1+(num_gaps*2)
-    e_gaps2 = <int*> malloc(0)
-    len2 = 0
-    for i from e_high <= i <= e_x_high:
-      j = 0
-      while j < len1:
-        if i - e_gaps1[j] <= self.train_max_initial_size and i >= e_gaps1[j+step-1]:
-          e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+j, step)
-          e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &i, 1)
-        j = j + step
-    free(e_gaps1)
-    e_gaps1 = e_gaps2
-    len1 = len2
-
-    step = (num_gaps+1)*2
-    i = 0
-    
-    while i < len1:
-      ephr_arr._clear()
-      num_chunks = 0
-      indexes = []
-      for j from 0 <= j < num_gaps+1:
-        if e_gaps1[i+2*j] < e_gaps1[i+(2*j)+1]:
-          num_chunks = num_chunks + 1
-        for k from e_gaps1[i+2*j] <= k < e_gaps1[i+(2*j)+1]:
-          indexes.append(k)
-          ephr_arr._append(self.eid2symid[self.eda.data.arr[e_sent_start+k]])
-        if j < num_gaps:
-          indexes.append(sym.setindex(self.category, e_gap_order[j]+1))
-          ephr_arr._append(sym.setindex(self.category, e_gap_order[j]+1))
-      i = i + step
-      if ephr_arr.len <= self.max_target_length and num_chunks <= self.max_target_chunks:
-        result.append((rule.Phrase(ephr_arr),indexes))
-
-    free(e_gaps1)
-    free(e_gap_order)
-    return result
-
-  cdef create_alignments(self, int* sent_links, int num_links, findexes, eindexes):
-    #print "create_alignments"
-    #s = "sent_links = "
-    #i = 0
-    #while (i < num_links*2):
-    #  s = s+"%d-%d " % (sent_links[i],sent_links[i+1])
-    #  i += 2
-    #print s
-    #print findexes
-    #print eindexes
-    
-    ret = cintlist.CIntList()
-    for i in xrange(len(findexes)):
-      s = findexes[i]
-      if (s<0):
-        continue
-      idx = 0
-      while (idx < num_links*2):
-        if (sent_links[idx] == s):
-          j = eindexes.index(sent_links[idx+1])
-          ret.append(i*65536+j)
-        idx += 2
-    return ret
-        
-  cdef extract(self, rule.Phrase phrase, Matching* matching, int* chunklen, int num_chunks):
-    cdef int* sent_links, *e_links_low, *e_links_high, *f_links_low, *f_links_high
-    cdef int *f_gap_low, *f_gap_high, *e_gap_low, *e_gap_high, num_gaps, gap_start
-    cdef int i, j, k, e_i, f_i, num_links, num_aligned_chunks, met_constraints
-    cdef int f_low, f_high, e_low, e_high, f_back_low, f_back_high
-    cdef int e_sent_start, e_sent_end, f_sent_start, f_sent_end, e_sent_len, f_sent_len
-    cdef int e_word_count, f_x_low, f_x_high, e_x_low, e_x_high, phrase_len
-    cdef float pair_count
-    cdef float available_mass
-    cdef extracts, phrase_list
-    cdef cintlist.CIntList fphr_arr
-    cdef rule.Phrase fphr
-    cdef reason_for_failure
-
-    fphr_arr = cintlist.CIntList()
-    phrase_len = phrase.n
-    extracts = []
-    sent_links = self.alignment._get_sent_links(matching.sent_id, &num_links)
-
-    e_sent_start = self.eda.sent_index.arr[matching.sent_id]
-    e_sent_end = self.eda.sent_index.arr[matching.sent_id+1]
-    e_sent_len = e_sent_end - e_sent_start - 1
-    f_sent_start = self.fda.sent_index.arr[matching.sent_id]
-    f_sent_end = self.fda.sent_index.arr[matching.sent_id+1]
-    f_sent_len = f_sent_end - f_sent_start - 1
-    available_mass = 1.0
-    if matching.sent_id == self.excluded_sent_id:
-      available_mass = 0.0
-
-    self.findexes1.reset()
-    sofar = 0
-    for i in xrange(num_chunks):
-      for j in xrange(chunklen[i]):
-        self.findexes1.append(matching.arr[matching.start+i]+j-f_sent_start);
-        sofar += 1
-      if (i+1<num_chunks):
-        self.findexes1.append(phrase[sofar])
-        sofar += 1
-      
-
-    e_links_low = <int*> malloc(e_sent_len*sizeof(int))
-    e_links_high = <int*> malloc(e_sent_len*sizeof(int))
-    f_links_low = <int*> malloc(f_sent_len*sizeof(int))
-    f_links_high = <int*> malloc(f_sent_len*sizeof(int))
-    f_gap_low = <int*> malloc((num_chunks+1)*sizeof(int))
-    f_gap_high = <int*> malloc((num_chunks+1)*sizeof(int))
-    e_gap_low = <int*> malloc((num_chunks+1)*sizeof(int))
-    e_gap_high = <int*> malloc((num_chunks+1)*sizeof(int))
-    memset(f_gap_low, 0, (num_chunks+1)*sizeof(int))
-    memset(f_gap_high, 0, (num_chunks+1)*sizeof(int))
-    memset(e_gap_low, 0, (num_chunks+1)*sizeof(int))
-    memset(e_gap_high, 0, (num_chunks+1)*sizeof(int))
-
-    reason_for_failure = ""
-
-    for i from 0 <= i < e_sent_len:
-      e_links_low[i] = -1
-      e_links_high[i] = -1
-    for i from 0 <= i < f_sent_len:
-      f_links_low[i] = -1
-      f_links_high[i] = -1
-
-    # this is really inefficient -- might be good to 
-    # somehow replace with binary search just for the f
-    # links that we care about (but then how to look up 
-    # when we want to check something on the e side?)
-    i = 0
-    while i < num_links*2:
-      f_i = sent_links[i]
-      e_i = sent_links[i+1]
-      if f_links_low[f_i] == -1 or f_links_low[f_i] > e_i:
-        f_links_low[f_i] = e_i
-      if f_links_high[f_i] == -1 or f_links_high[f_i] < e_i + 1:
-        f_links_high[f_i] = e_i + 1
-      if e_links_low[e_i] == -1 or e_links_low[e_i] > f_i:
-        e_links_low[e_i] = f_i
-      if e_links_high[e_i] == -1 or e_links_high[e_i] < f_i + 1:
-        e_links_high[e_i] = f_i + 1
-      i = i + 2
-    
-    als = []
-    for x in xrange(matching.start,matching.end):
-      al = (matching.arr[x]-f_sent_start,f_links_low[matching.arr[x]-f_sent_start])
-      als.append(al)
-    # check all source-side alignment constraints
-    met_constraints = 1
-    if self.require_aligned_terminal:
-      num_aligned_chunks = 0
-      for i from 0 <= i < num_chunks:
-        for j from 0 <= j < chunklen[i]:
-          if f_links_low[matching.arr[matching.start+i]+j-f_sent_start] > -1:
-            num_aligned_chunks = num_aligned_chunks + 1
-            break
-      if num_aligned_chunks == 0:
-        reason_for_failure = "No aligned terminals"
-        met_constraints = 0
-      if self.require_aligned_chunks and num_aligned_chunks < num_chunks:
-        reason_for_failure = "Unaligned chunk"
-        met_constraints = 0
-
-    if met_constraints and self.tight_phrases:
-      # outside edge constraints are checked later
-      for i from 0 <= i < num_chunks-1:
-        if f_links_low[matching.arr[matching.start+i]+chunklen[i]-f_sent_start] == -1:
-          reason_for_failure = "Gaps are not tight phrases"
-          met_constraints = 0
-          break
-        if f_links_low[matching.arr[matching.start+i+1]-1-f_sent_start] == -1:
-          reason_for_failure = "Gaps are not tight phrases"
-          met_constraints = 0
-          break
-
-    f_low = matching.arr[matching.start] - f_sent_start
-    f_high = matching.arr[matching.start + matching.size - 1] + chunklen[num_chunks-1] - f_sent_start
-    if met_constraints:
-
-      if self.find_fixpoint(f_low, f_high, f_links_low, f_links_high, e_links_low, e_links_high, 
-                -1, -1, &e_low, &e_high, &f_back_low, &f_back_high, f_sent_len, e_sent_len,
-                self.train_max_initial_size, self.train_max_initial_size, 
-                self.train_min_gap_size, 0,
-                self.max_nonterminals - num_chunks + 1, 1, 1, 0, 0):
-        gap_error = 0
-        num_gaps = 0
-
-        if f_back_low < f_low:
-          f_gap_low[0] = f_back_low
-          f_gap_high[0] = f_low
-          num_gaps = 1
-          gap_start = 0
-          phrase_len = phrase_len+1
-          if phrase_len > self.max_length:
-            gap_error = 1
-          if self.tight_phrases:
-            if f_links_low[f_back_low] == -1 or f_links_low[f_low-1] == -1:
-              gap_error = 1
-              reason_for_failure = "Inside edges of preceding subphrase are not tight"
-        else:
-          gap_start = 1
-          if self.tight_phrases and f_links_low[f_low] == -1:
-            # this is not a hard error.  we can't extract this phrase
-            # but we still might be able to extract a superphrase
-            met_constraints = 0
-
-        for i from 0 <= i < matching.size - 1:
-          f_gap_low[1+i] = matching.arr[matching.start+i] + chunklen[i] - f_sent_start
-          f_gap_high[1+i] = matching.arr[matching.start+i+1] - f_sent_start
-          num_gaps = num_gaps + 1
-
-        if f_high < f_back_high:
-          f_gap_low[gap_start+num_gaps] = f_high
-          f_gap_high[gap_start+num_gaps] = f_back_high
-          num_gaps = num_gaps + 1
-          phrase_len = phrase_len+1
-          if phrase_len > self.max_length:
-            gap_error = 1
-          if self.tight_phrases:
-            if f_links_low[f_back_high-1] == -1 or f_links_low[f_high] == -1:
-              gap_error = 1
-              reason_for_failure = "Inside edges of following subphrase are not tight"
-        else:
-          if self.tight_phrases and f_links_low[f_high-1] == -1:
-            met_constraints = 0
-
-        if gap_error == 0:
-          e_word_count = e_high - e_low
-          for i from 0 <= i < num_gaps: # check integrity of subphrases
-            if self.find_fixpoint(f_gap_low[gap_start+i], f_gap_high[gap_start+i], 
-                      f_links_low, f_links_high, e_links_low, e_links_high,
-                      -1, -1, e_gap_low+gap_start+i, e_gap_high+gap_start+i, 
-                      f_gap_low+gap_start+i, f_gap_high+gap_start+i,
-                      f_sent_len, e_sent_len, 
-                      self.train_max_initial_size, self.train_max_initial_size, 
-                      0, 0, 0, 0, 0, 0, 0) == 0:
-              gap_error = 1
-              reason_for_failure = "Subphrase [%d, %d] failed integrity check" % (f_gap_low[gap_start+i], f_gap_high[gap_start+i])
-              break
-
-        if gap_error == 0:
-          i = 1
-          self.findexes.reset()
-          if f_back_low < f_low:
-            fphr_arr._append(sym.setindex(self.category, i))
-            i = i+1
-            self.findexes.append(sym.setindex(self.category, i))
-          self.findexes.extend(self.findexes1)
-          for j from 0 <= j < phrase.n:
-            if sym.isvar(phrase.syms[j]):
-              fphr_arr._append(sym.setindex(self.category, i))
-              i = i + 1
-            else:
-              fphr_arr._append(phrase.syms[j])
-          if f_back_high > f_high:
-            fphr_arr._append(sym.setindex(self.category, i))
-            self.findexes.append(sym.setindex(self.category, i))
-
-          fphr = rule.Phrase(fphr_arr)
-          if met_constraints:
-            phrase_list = self.extract_phrases(e_low, e_high, e_gap_low + gap_start, e_gap_high + gap_start, e_links_low, num_gaps,
-                      f_back_low, f_back_high, f_gap_low + gap_start, f_gap_high + gap_start, f_links_low,
-                      matching.sent_id, e_sent_len, e_sent_start)
-            #print "e_low=%d, e_high=%d, gap_start=%d, num_gaps=%d, f_back_low=%d, f_back_high=%d" & (e_low, e_high, gap_start, num_gaps, f_back_low, f_back_high)
-            if len(phrase_list) > 0:
-              pair_count = available_mass / len(phrase_list)
-            else:
-              pair_count = 0
-              reason_for_failure = "Didn't extract anything from [%d, %d] -> [%d, %d]" % (f_back_low, f_back_high, e_low, e_high)
-            for (phrase2,eindexes) in phrase_list:
-              als1 = self.create_alignments(sent_links,num_links,self.findexes,eindexes)    
-              extracts.append((fphr, phrase2, pair_count, tuple(als1)))
-              if self.extract_file:
-                self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high))
-              #print "extract_phrases1: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high)
-
-          if (num_gaps < self.max_nonterminals and
-            phrase_len < self.max_length and
-            f_back_high - f_back_low + self.train_min_gap_size <= self.train_max_initial_size):
-            if (f_back_low == f_low and 
-                f_low >= self.train_min_gap_size and
-                ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_back_high-1] != -1))):
-              f_x_low = f_low-self.train_min_gap_size
-              met_constraints = 1
-              if self.tight_phrases:
-                while f_x_low >= 0 and f_links_low[f_x_low] == -1:
-                  f_x_low = f_x_low - 1
-              if f_x_low < 0 or f_back_high - f_x_low > self.train_max_initial_size:
-                met_constraints = 0
-
-              if (met_constraints and
-                self.find_fixpoint(f_x_low, f_back_high,
-                      f_links_low, f_links_high, e_links_low, e_links_high, 
-                      e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, 
-                      f_sent_len, e_sent_len, 
-                      self.train_max_initial_size, self.train_max_initial_size, 
-                      1, 1, 1, 1, 0, 1, 0) and
-                ((not self.tight_phrases) or f_links_low[f_x_low] != -1) and
-                self.find_fixpoint(f_x_low, f_low,  # check integrity of new subphrase
-                      f_links_low, f_links_high, e_links_low, e_links_high,
-                      -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, 
-                      f_sent_len, e_sent_len,
-                      self.train_max_initial_size, self.train_max_initial_size,
-                      0, 0, 0, 0, 0, 0, 0)):
-                fphr_arr._clear()
-                i = 1
-                self.findexes.reset()
-                self.findexes.append(sym.setindex(self.category, i))
-                fphr_arr._append(sym.setindex(self.category, i))
-                i = i+1
-                self.findexes.extend(self.findexes1)
-                for j from 0 <= j < phrase.n:
-                  if sym.isvar(phrase.syms[j]):
-                    fphr_arr._append(sym.setindex(self.category, i))
-                    i = i + 1
-                  else:
-                    fphr_arr._append(phrase.syms[j])
-                if f_back_high > f_high:
-                  fphr_arr._append(sym.setindex(self.category, i))
-                  self.findexes.append(sym.setindex(self.category, i))
-                fphr = rule.Phrase(fphr_arr)
-                phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+1,
-                          f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, matching.sent_id, 
-                          e_sent_len, e_sent_start)
-                if len(phrase_list) > 0:
-                  pair_count = available_mass / len(phrase_list)
-                else:
-                  pair_count = 0
-                for phrase2,eindexes in phrase_list:
-                  als2 = self.create_alignments(sent_links,num_links,self.findexes,eindexes)    
-                  extracts.append((fphr, phrase2, pair_count, tuple(als2)))
-                  if self.extract_file:
-                    self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high))
-                  #print "extract_phrases2: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)
-
-            if (f_back_high == f_high and 
-              f_sent_len - f_high >= self.train_min_gap_size and
-              ((not self.tight_phrases) or (f_links_low[f_high] != -1 and f_links_low[f_back_low] != -1))):
-              f_x_high = f_high+self.train_min_gap_size
-              met_constraints = 1
-              if self.tight_phrases:
-                while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1:
-                  f_x_high = f_x_high + 1
-              if f_x_high > f_sent_len or f_x_high - f_back_low > self.train_max_initial_size:
-                met_constraints = 0
-              
-              if (met_constraints and 
-                self.find_fixpoint(f_back_low, f_x_high, 
-                      f_links_low, f_links_high, e_links_low, e_links_high,
-                      e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, 
-                      f_sent_len, e_sent_len, 
-                      self.train_max_initial_size, self.train_max_initial_size, 
-                      1, 1, 1, 0, 1, 1, 0) and
-                ((not self.tight_phrases) or f_links_low[f_x_high-1] != -1) and
-                self.find_fixpoint(f_high, f_x_high,
-                      f_links_low, f_links_high, e_links_low, e_links_high,
-                      -1, -1, e_gap_low+gap_start+num_gaps, e_gap_high+gap_start+num_gaps, 
-                      f_gap_low+gap_start+num_gaps, f_gap_high+gap_start+num_gaps, 
-                      f_sent_len, e_sent_len,
-                      self.train_max_initial_size, self.train_max_initial_size,
-                      0, 0, 0, 0, 0, 0, 0)):
-                fphr_arr._clear()
-                i = 1
-                self.findexes.reset()
-                if f_back_low < f_low:
-                  fphr_arr._append(sym.setindex(self.category, i))
-                  i = i+1
-                  self.findexes.append(sym.setindex(self.category, i))
-                self.findexes.extend(self.findexes1)
-                for j from 0 <= j < phrase.n:
-                  if sym.isvar(phrase.syms[j]):
-                    fphr_arr._append(sym.setindex(self.category, i))
-                    i = i + 1
-                  else:
-                    fphr_arr._append(phrase.syms[j])
-                fphr_arr._append(sym.setindex(self.category, i))
-                self.findexes.append(sym.setindex(self.category, i))
-                fphr = rule.Phrase(fphr_arr)
-                phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low+gap_start, e_gap_high+gap_start, e_links_low, num_gaps+1,
-                          f_x_low, f_x_high, f_gap_low+gap_start, f_gap_high+gap_start, f_links_low, 
-                          matching.sent_id, e_sent_len, e_sent_start)
-                if len(phrase_list) > 0:
-                  pair_count = available_mass / len(phrase_list)
-                else:
-                  pair_count = 0
-                for phrase2, eindexes in phrase_list:
-                  als3 = self.create_alignments(sent_links,num_links,self.findexes,eindexes)    
-                  extracts.append((fphr, phrase2, pair_count, tuple(als3)))
-                  if self.extract_file:
-                    self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high))
-                  #print "extract_phrases3: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)
-            if (num_gaps < self.max_nonterminals - 1 and 
-              phrase_len+1 < self.max_length and
-              f_back_high == f_high and 
-              f_back_low == f_low and 
-              f_back_high - f_back_low + (2*self.train_min_gap_size) <= self.train_max_initial_size and
-              f_low >= self.train_min_gap_size and
-              f_high <= f_sent_len - self.train_min_gap_size and
-              ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_high] != -1))):
-
-              met_constraints = 1
-              f_x_low = f_low-self.train_min_gap_size
-              if self.tight_phrases:
-                while f_x_low >= 0 and f_links_low[f_x_low] == -1:
-                  f_x_low = f_x_low - 1
-              if f_x_low < 0:
-                met_constraints = 0
-
-              f_x_high = f_high+self.train_min_gap_size
-              if self.tight_phrases:
-                while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1:
-                  f_x_high = f_x_high + 1
-              if f_x_high > f_sent_len or f_x_high - f_x_low > self.train_max_initial_size:
-                met_constraints = 0
-
-              if (met_constraints and
-                self.find_fixpoint(f_x_low, f_x_high,
-                        f_links_low, f_links_high, e_links_low, e_links_high,
-                        e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, 
-                        f_sent_len, e_sent_len,
-                        self.train_max_initial_size, self.train_max_initial_size, 
-                        1, 1, 2, 1, 1, 1, 1) and
-                ((not self.tight_phrases) or (f_links_low[f_x_low] != -1 and f_links_low[f_x_high-1] != -1)) and
-                self.find_fixpoint(f_x_low, f_low,
-                        f_links_low, f_links_high, e_links_low, e_links_high,
-                        -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, 
-                        f_sent_len, e_sent_len,
-                        self.train_max_initial_size, self.train_max_initial_size,
-                        0, 0, 0, 0, 0, 0, 0) and
-                self.find_fixpoint(f_high, f_x_high,
-                        f_links_low, f_links_high, e_links_low, e_links_high,
-                        -1, -1, e_gap_low+1+num_gaps, e_gap_high+1+num_gaps, 
-                        f_gap_low+1+num_gaps, f_gap_high+1+num_gaps, 
-                        f_sent_len, e_sent_len,
-                        self.train_max_initial_size, self.train_max_initial_size,
-                        0, 0, 0, 0, 0, 0, 0)):
-                fphr_arr._clear()
-                i = 1
-                self.findexes.reset()
-                self.findexes.append(sym.setindex(self.category, i))
-                fphr_arr._append(sym.setindex(self.category, i))
-                i = i+1
-                self.findexes.extend(self.findexes1)
-                for j from 0 <= j < phrase.n:
-                  if sym.isvar(phrase.syms[j]):
-                    fphr_arr._append(sym.setindex(self.category, i))
-                    i = i + 1
-                  else:
-                    fphr_arr._append(phrase.syms[j])
-                fphr_arr._append(sym.setindex(self.category, i))
-                self.findexes.append(sym.setindex(self.category, i))
-                fphr = rule.Phrase(fphr_arr)
-                phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+2,
-                          f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, 
-                          matching.sent_id, e_sent_len, e_sent_start)
-                if len(phrase_list) > 0:
-                  pair_count = available_mass / len(phrase_list)
-                else:
-                  pair_count = 0
-                for phrase2, eindexes in phrase_list:
-                  als4 = self.create_alignments(sent_links,num_links,self.findexes,eindexes)    
-                  extracts.append((fphr, phrase2, pair_count, tuple(als4)))
-                  if self.extract_file:
-                    self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high))
-                  #print "extract_phrases4 %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)
-      else:
-        reason_for_failure = "Unable to extract basic phrase"
-
-    free(sent_links)
-    free(f_links_low)
-    free(f_links_high)
-    free(e_links_low)
-    free(e_links_high)
-    free(f_gap_low)
-    free(f_gap_high)
-    free(e_gap_low)
-    free(e_gap_high)
-
-    if self.sample_file is not None:
-      self.sample_file.write("%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure))
-
-    #print "%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure)
-
-    
-    return extracts
-
diff --git a/sa-extract/sa-compile.pl b/sa-extract/sa-compile.pl
deleted file mode 100755
index 1cae83a7..00000000
--- a/sa-extract/sa-compile.pl
+++ /dev/null
@@ -1,322 +0,0 @@
-#!/usr/bin/env perl
-
-use strict;
-use Getopt::Long;
-
-my $cwd; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $cwd = cwd(); }
-
-my $rootdir = `dirname $0`; chomp $rootdir;
-my $compile = "$rootdir/compile_bin.py";
-my $lcp = "$rootdir/lcp_ops.py";
-die "Can't find $compile" unless -f $compile;
-die "Can't execute $compile" unless -x $compile;
-
-sub print_help;
-sub cleanup;
-
-my $alignment;
-my $bitext;
-my $catalog;
-my $dryrun = 0;
-my $group;
-my $help = 0;
-my $ini = "$rootdir/extract.ini";
-my $lm;
-my $precomp;
-my $no_ini = 0;
-my $remove;
-my $type;
-my $local_only = 1;
-my $output;
-
-# Process command-line options
-if (GetOptions(
-  "alignment=s" => \$alignment,
-  "bitext=s" => \$bitext,
-  "help" => \$help,
-  "ini=s" => \$ini,
-  "output=s" => \$output,
-  "precomp-options=s" => \$precomp,
-  "no-ini" => \$no_ini,
-) == 0 || $help == 1 || @ARGV > 0){
-  print_help;
-  die "\n";
-}
-
-open(INI, $ini) or die "Can't read $ini: $!";
-
-$bitext || die "You must specify a bitext with -b\n";
-$alignment || die "You must specify an alignment with -a\n";
-
-my $top_dir;
-if (defined $output) {
-  $top_dir = $output;
-} else {
-  $top_dir = "$cwd/sa-compiled";
-}
-
-my $type_dir = "$top_dir";
-
-my $bitext_name;
-my $bitext_f_file;
-my $bitext_e_file;
-my $bitext_dir;
-if ($bitext){
-  if ($bitext =~ /(.*)=(.*),(.*)/){
-    $bitext_name = $1;
-    $bitext_f_file = $2;
-    $bitext_e_file = $3;
-    -e $bitext_f_file || die "Could not find file $bitext_f_file\n";
-    -e $bitext_e_file || die "Could not find file $bitext_e_file\n";    
-  } else {
-    $bitext_name = $bitext;
-  }
-
-  $bitext_dir = "$type_dir/bitext/$bitext_name";
-  if ($bitext_f_file){
-    if (-e $bitext_dir) {
-      die "Bitext $bitext_name already exists\n";
-    }
-  } else {
-    unless (-e $bitext_dir){
-      die "No bitext $bitext_name. You must specify bitext files with -b\n";
-    }
-  }
-}
-
-my $max_nt = 2;
-my $max_len = 5;
-my $max_size = 15;
-my $min_gap = 1;
-my $rank1 = 100;
-my $rank2 = 10;
-my $precomp_file;
-if ($precomp){
-  unless ($bitext_name){
-    die "You must specify a bitext with -b if using -p\n";
-  }
-  my @precomp_args = split(/,/, $precomp);
-  my $precomp_arg;
-  for $precomp_arg (@precomp_args){
-    if ($precomp_arg =~ /(.*)=(.*)/){
-      my $key = $1;
-      my $value = $2;
-      unless ($value =~ /^\d+$/){
-        die "Value for -p option must be a positive integer, found $value\n";
-      }
-      if ($key eq "max-len"){ $max_len = $value; }
-      elsif ($key eq "max-nt"){ $max_nt = $value; }
-      elsif ($key eq "max-size"){ $max_size = $value; }
-      elsif ($key eq "min-gap"){ $min_gap = $value; }
-      elsif ($key eq "rank1"){ $rank1 = $value; }
-      elsif ($key eq "rank2"){ $rank2 = $value; }
-      else{
-        die "Unknown option $key given for -p\n";
-      }
-    } else {
-      die "When using -p, you must specify key-value pairs using syntax: <key1>=<value1>,...,<keyN>=<valueN>\n";
-    }
-  }
-}
-my $precomp_compile_needed = 0;
-if ($bitext_name){
-  $precomp_file = "$bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin";
-  unless (-e $precomp_file){
-    $precomp_compile_needed = 1;
-  }
-}
-
-my $alignment_name;
-my $alignment_file;
-my $alignment_dir;
-if ($alignment){
-  $bitext || die "Specified alignment $alignment without specifying bitext using -b\n";
-  if ($alignment =~ /(.*)=(.*)/){
-    $alignment_name = $1;
-    $alignment_file = $2;
-    -e $alignment_file || die "Could not find file $alignment_file\n";
-  } else {
-    $alignment_name = $alignment;
-  }
-
-  $alignment_dir = "$bitext_dir/a/$alignment_name";
-  if ($alignment_file){
-    if (-e $alignment_dir){
-      die "Alignment $alignment_name already exists for bitext $bitext_name\n";
-    }
-  } else {
-    require_top_dirs();
-    unless (-e $alignment_dir){
-      die "No alignment $alignment_name for bitext $bitext_name\n";
-    }
-  }
-}
-
-if ($bitext_name){
-  print STDERR " from files $bitext_f_file and $bitext_e_file\n";
-} else {
-  print " No bitext\n";
-}
-if ($precomp_compile_needed){
-  print STDERR "   Precompilation needed: max-len=$max_len, max-nt=$max_nt, max-size=$max_size, min-gap=$min_gap, rank1=$rank1, rank2=$rank2\n";
-}
-if ($alignment_name){
-  print STDERR " Alignment = $alignment_name";
-  if ($alignment_file){
-    print STDERR " from file $alignment_file\n";
-  }
-} else {
-  print STDERR " No alignment\n";
-}
-
-my $script;
-my $compile_dir;
-$SIG{INT} = "cleanup";
-$SIG{TERM} = "cleanup"; 
-$SIG{HUP} = "cleanup";
-
-  if ($bitext_e_file || $precomp_compile_needed || $alignment_file){
-    my $compiled_e_file;
-    my $compiled_f_file;
-
-    $compile_dir = $top_dir;
-    my $compile_top_dir = "$compile_dir";
-
-    my $compile_bitext_dir = "$compile_top_dir/bitext/$bitext_name";
-    if ($bitext_e_file){
-      `mkdir -p $compile_bitext_dir`;
-      print STDERR "\nCompiling bitext (f side)...\n";
-      `$compile -s $bitext_f_file $compile_bitext_dir/f.sa.bin`;
-      die "Command failed: $!" unless $? == 0;
-      print STDERR "\nCompiling bitext (e side)...\n";
-      `$compile -d $bitext_e_file $compile_bitext_dir/e.bin`;
-      die "Command failed: $!" unless $? == 0;
-
-      $compiled_f_file = "$compile_bitext_dir/f.sa.bin";
-      $compiled_e_file = "$compile_bitext_dir/e.bin";
-    } else { # bitext already compiled
-      $compiled_f_file = "$bitext_dir/f.sa.bin";
-      $compiled_e_file = "$bitext_dir/e.bin";
-    }
-
-    if ($precomp_compile_needed){
-      `mkdir -p $compile_bitext_dir`;
-      my $top_stats_file = "$compile_bitext_dir/f.top.$rank1";
-      my $compiled_precomp_file = "$compile_bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin";
-      my $cmd = "$lcp -t 4 $compiled_f_file | sort -nr | head -$rank1 > $top_stats_file";
-      print STDERR "$cmd\n";
-      `$cmd`;
-      die "Command failed: $cmd" unless $? == 0;
-      `$compile -r max-len=$max_len max-nt=$max_nt max-size=$max_size min-gap=$min_gap rank1=$rank1 rank2=$rank2 sa=$compiled_f_file $top_stats_file $compiled_precomp_file`;
-      die "Command failed: $!" unless $? == 0;
-    }
-
-    if ($alignment_file){
-      my $compile_alignment_dir = "$compile_top_dir/bitext/$bitext_name/a/$alignment_name";
-      `mkdir -p $compile_alignment_dir`;
-      print STDERR "\nCompiling alignment...\n";
-      my $cmd= "$compile -a $alignment_file $compile_alignment_dir/a.bin";
-      print STDERR "  $cmd\n";
-      `$cmd`;
-      die "Command failed: $!" unless $? == 0;
-
-      print STDERR "\nCompiling lexical weights file...\n";
-      $cmd="$compile -x $compiled_f_file $compiled_e_file $compile_alignment_dir/a.bin $compile_alignment_dir/lex.bin";
-      print STDERR "  $cmd\n";
-      `$cmd`;
-      die "Command failed: $!" unless $? == 0;
-    }
-
-    chdir $compile_dir;
-    print STDERR "Compiling done: $compile_dir\n";
-  }
-  
-  unless ($no_ini){
-    my $line;
-    while($line=<INI>){
-      $line =~ s/^([^#]*a_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/a.bin$3/;
-      $line =~ s/^([^#]*lex_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/lex.bin$3/;
-      $line =~ s/^([^#]*f_sa_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/f.sa.bin$3/;
-      $line =~ s/^([^#]*e_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/e.bin$3/;
-      $line =~ s/^([^#]*precompute_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin$3/;
-
-      $line =~ s/^([^#]*max_len\s*=\s*)(.*)(\s*)$/$1$max_len$3/;
-      $line =~ s/^([^#]*max_nt\s*=\s*)(.*)(\s*)$/$1$max_nt$3/;
-      $line =~ s/^([^#]*max_size\s*=\s*)(.*)(\s*)$/$1$max_size$3/;
-      $line =~ s/^([^#]*min_gap\s*=\s*)(.*)(\s*)$/$1$min_gap$3/;
-      $line =~ s/^([^#]*rank1\s*=\s*)(.*)(\s*)$/$1$rank1$3/;
-      $line =~ s/^([^#]*rank2\s*=\s*)(.*)(\s*)$/$1$rank2$3/;
-
-      print $line;
-    }
-  }
-
-exit(0);
-
-sub cleanup {
-  die "Cleanup.\n";
-}
-
-sub print_help
-{
-  my $name = `basename $0`; chomp $name;
-  print << "Help";
-
-usage: $name [options]
-
-  Manage compilation of SA-Hiero files and creation of ini files.
-  In the default usage, the command deploys a set of files needed
-  to create a system, and writes an ini for the system on stdout.
-
-options:
-
-  -a, --alignment <name>[=<filename>]
-    Name of an alignment of a bitext (which must be specified 
-    with -b unless using the -c flag).  If used with -r, the 
-    alignment is removed from the deployment.  If used with -c, 
-    only alignments with this name are listed.  If a filename is 
-    given, then the file will be deployed using the name.
-
-  -b, --bitext <name>[=<f file>,<e file>]
-    Name of a bitext for a particular system type (which must be
-    specified with -t unless using the -c flag).  If used with -r,
-    the bitext is removed from the deployment.  If used with -c,
-    only bitexts with the name are listed.  If a filename is given,
-    then the file will be deployed using the name.
-
-  -h, --help
-    Prints this message.
-
-  -i, --ini <filename>
-    Use a specific ini file as the template for a system, rather than
-    the default ini file.
-
-  -p, --precomp-options <key1>=<value1>[,<key2>=<value2>,...,<keyN>=<valueN>]
-    Set parameters of the grammar.  This must be set by $name because
-    many parameters involve precomputation.  There are six keys that can
-    be set: 
-      max-len: maximum number of symbols (T and NT) in a grammar rule
-      max-nt: maximum number of nonterminals in a grammar rule
-      max-size: maximum span of a grammar rule extracted from training
-      min-gap: minimum gap spanned by a nonterminal in training
-      rank1: number of frequent words to precompute collocations for.
-      rank2: number of super-frequent words to precompute triple 
-        collocations for.
-    All values must be positive integers.  If not specified, defaults are:
-      max-len = 5
-      max-nt = 2  (>2 not supported)
-      max-size = 10
-      min-gap = 2
-      rank1 = 100 (>300 not recommended)
-      rank2 = 10  (>10 not recommended)
-
-  -n, --no-ini
-    Do not generate an ini file on stdout.  If this option is used, then
-    the requirement to specify a full system is relaxed.  Therefore, this
-    option can be used when the sole objective is deployment of files.
-
-  -o, --output-dir
-    Write the compiled model to this directory.
-
-Help
-}
diff --git a/sa-extract/sa2cdec.py b/sa-extract/sa2cdec.py
deleted file mode 100755
index 55fb19f3..00000000
--- a/sa-extract/sa2cdec.py
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env python
-import sys
-
-featNames = [ line.strip() for line in open(sys.argv[1]) if not line.startswith('#') ]
-
-for line in sys.stdin:
-  try:
-    (lhs, src, tgt, feats, align) = line.strip("\n").split(' ||| ')
-  except:
-    print >>sys.stderr, 'WARNING: No alignments:', line
-    try:
-      (lhs, src, tgt, feats) = line.strip().split(' ||| ')
-      align = ''
-    except:
-      print >>sys.stderr, "ERROR: Malformed line:", line
-      raise
-  featValues = feats.split(' ')
-  namedFeats = ' '.join( name+"="+value for (name, value) in zip(featNames, featValues) )
-  print " ||| ".join( (lhs, src, tgt, namedFeats, align) )
diff --git a/sa-extract/sa_feat_names.txt b/sa-extract/sa_feat_names.txt
deleted file mode 100644
index 02c137d7..00000000
--- a/sa-extract/sa_feat_names.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-EGivenFCoherent
-SampleCountF
-CountEF
-MaxLexFGivenE
-MaxLexEGivenF
-IsSingletonF
-IsSingletonFE
diff --git a/sa-extract/setup.cfg b/sa-extract/setup.cfg
deleted file mode 100644
index 8f696136..00000000
--- a/sa-extract/setup.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-[build_ext]
-inplace=1
diff --git a/sa-extract/setup.py b/sa-extract/setup.py
deleted file mode 100644
index cdcbfb54..00000000
--- a/sa-extract/setup.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from distutils.core import setup, Extension
-from distutils.util import get_platform
-import os.path
-
-cstrmap_module = Extension('cstrmap', sources = ['cstrmap.c', 'strmap.cc'])
-setup (name = 'CStringMap', version = '1.0', description = 'C string->int map', ext_modules = [cstrmap_module])
-
-rule_module = Extension('rule',
-                         sources = ['rule.c', 'strutil.c'])
-setup (name = 'Rule', version = '1.0', description = 'rule class', ext_modules = [rule_module])
-
-sym_module = Extension('sym',
-                         sources = ['sym.c'])
-setup (name = 'Sym', version = '1.0', description = 'symbol class', ext_modules = [sym_module])
-
-cdat_module = Extension('cdat', sources = ['cdat.c'])
-setup(name = "CDat", version = '1.0', description = 'C Data class', ext_modules = [cdat_module])
-
-cintlist_module = Extension('cintlist', sources = ['cintlist.c'])
-setup(name = "CIntList", version = '1.0', description = 'C int array/list class', ext_modules = [cintlist_module])
-
-cfloatlist_module = Extension('cfloatlist', sources = ['cfloatlist.c'])
-setup(name = "CFloatList", version = '1.0', description = 'C float array/list class', ext_modules = [cfloatlist_module])
-
-calignment_module = Extension('calignment', sources = ['calignment.c'])
-setup(name = "CAlignment", version = '1.0', description = 'C alignment class', ext_modules = [calignment_module])
-
-csuf_module = Extension('csuf', sources = ['csuf.c'])
-setup(name = "CSuffixArray", version = '1.0', description = 'C suffix array class', ext_modules = [csuf_module])
-
-clex_module = Extension('clex', sources = ['clex.c'])
-setup(name = "CLex", version = '1.0', description = 'C lexical class', ext_modules = [clex_module])
-
-factory_module = Extension('rulefactory', sources = ['rulefactory.c'])
-setup(name = "RuleFactory", version = '1.0', description = 'C rule factory classes', ext_modules = [factory_module])
-
-cveb_module = Extension('cveb', sources = ['cveb.c'])
-setup(name = "CVEB", version = '1.0', description = 'C impl. of van Emde Boas tree', ext_modules = [cveb_module])
-
-lcp_module = Extension('lcp', sources = ['lcp.c'])
-setup(name = "LCP", version = '1.0', description = 'C impl. of LCP', ext_modules = [lcp_module])
-
-precomp_module = Extension('precomputation', sources = ['precomputation.c'])
-setup(name = "Precomputation", version = '1.0', description = 'Precomputation Algorithm', ext_modules = [precomp_module])
-
diff --git a/sa-extract/sgml.py b/sa-extract/sgml.py
deleted file mode 100644
index 2db8b5dc..00000000
--- a/sa-extract/sgml.py
+++ /dev/null
@@ -1,194 +0,0 @@
-import sys, sgmllib, xml.sax.saxutils, sym
-
-def attrs_to_str(d):
-    if len(d) == 0:
-        return ""
-    l = [""]+["%s=%s" % (name, xml.sax.saxutils.quoteattr(value)) for (name, value) in d]
-    return " ".join(l)
-
-def attrs_to_dict(a):
-    d = {}
-    for (name, value) in a:
-	if d.has_key(name.lower()):
-	    raise ValueError, "duplicate attribute names"
-	d[name.lower()] = value
-    return d
-
-class Sentence(object):
-    def __init__(self, words=None, meta=None):
-        if words is not None:
-            self.words = list(words)
-        else:
-            self.words = []
-        if meta is not None:
-            self.meta = meta
-        else:
-            self.meta = []
-
-    def copy(self):
-        return Sentence(self.words, list(self.meta))
-
-    def mark(self, tag, attrs):
-        self.meta.append((tag, attrs, 0, len(self.words)))
-
-    def getmark(self):
-        if len(self.meta) > 0:
-            (tag, attrs, i, j) = self.meta[-1]
-            if i == 0 and j == len(self.words):
-                return (tag, attrs)
-            else:
-                return None
-        else:
-            return None
-
-    def unmark(self):
-        mark = self.getmark()
-        if mark is not None:
-            self.meta = self.meta[:-1]
-        return mark
-
-    def __cmp__(self, other):
-        return cmp((self.words, self.meta), (other.words, other.meta))
-
-    def __str__(self):
-        def cmp_spans((tag1,attr1,i1,j1),(tag2,attr2,i2,j2)):
-            if i1==i2<=j1==j2:
-                return 0
-            elif i2<=i1<=j1<=j2:
-                return -1
-            elif i1<=i2<=j2<=j1:
-                return 1
-            else:
-                return cmp((i1,j1),(i2,j2)) # don't care
-        # this guarantees that equal spans will come out nested
-        # we want the later spans to be outer
-        # this relies on stable sort
-        open = [[] for i in xrange(len(self.words)+1)]
-        # there seems to be a bug still with empty spans
-        empty = [[] for i in xrange(len(self.words)+1)]
-        close = [[] for j in xrange(len(self.words)+1)]
-        for (tag,attrs,i,j) in sorted(self.meta, cmp=cmp_spans):
-            if i == j:
-                # do we want these to nest?
-                empty[i].append("<%s%s></%s>\n" % (tag, attrs_to_str(attrs), tag))
-            else:
-                open[i].append("<%s%s>\n" % (tag, attrs_to_str(attrs)))
-                close[j].append("</%s>\n" % tag)
-
-        result = []
-        if len(empty[0]) > 0:
-            result.extend(empty[0])
-        for i in xrange(len(self.words)):
-            if i > 0:
-                result.append(" ")
-            result.extend(reversed(open[i]))
-            result.append(xml.sax.saxutils.escape(sym.tostring(self.words[i])))
-            result.extend(close[i+1])
-            if len(empty[i+1]) > 0:
-                result.extend(empty[i+1])
-
-        return "".join(result)
-
-    def __add__(self, other):
-        if type(other) in (list, tuple):
-            return Sentence(self.words + list(other), self.meta)
-        else:
-            othermeta = [(tag, attrs, i+len(self.words), j+len(self.words)) for (tag, attrs, i, j) in other.meta]
-            return Sentence(self.words + other.words, self.meta+othermeta)
-
-def read_raw(f):
-    """Read a raw file into a list of Sentences."""
-    if type(f) is str:
-        f = file(f, "r")
-    i = 0
-    line = f.readline()
-    while line != "":
-        sent = process_sgml_line(line, i)
-        mark = sent.getmark()
-        if mark is None:
-            sent.mark('seg', [('id',str(i))])
-        else:
-            (tag, attrs) = mark
-            if tag == "seg" and not attrs_to_dict(attrs).has_key('id'):
-                x = ('id',str(i))
-                attrs.append(x)
-                sent.mark('seg', attrs)
-            if tag != "seg":
-                sent.mark('seg', [('id',str(i))])
-        yield sent
-        i += 1
-        line = f.readline()
-
-def process_sgml_line(line, id=None):
-    p = DatasetParser(None)
-    p.pos = 0
-    p.words = []
-    p.meta = []
-    p.feed(line)
-    p.close()
-    sent = Sentence(p.words, p.meta)
-    return sent
-
-class DatasetParser(sgmllib.SGMLParser):
-    def __init__(self, set):
-        sgmllib.SGMLParser.__init__(self)
-	self.words = None
-        self.mystack = []
-	self.string = None
-	self.convref = d = {"amp":"&", "lt":"<", "gt":">", "quot":'"', "squot":"'"}
-    def close(self):
-        self.convert()
-        sgmllib.SGMLParser.close(self)
-
-    def handle_starttag(self, tag, method, attrs):
-        thing = method(attrs)
-        self.mystack.append(thing)
-
-    def handle_endtag(self, tag, method):
-        thing = self.mystack.pop()
-        method(thing)
-
-    def unknown_starttag(self, tag, attrs):
-        thing = self.start(tag, attrs)
-        self.mystack.append(thing)
-
-    def unknown_endtag(self, tag):
-        thing = self.mystack.pop()
-        self.end(tag, thing)
-
-    def start(self, tag, attrs):
-        self.convert()
-        if self.words is not None:
-            return (tag, attrs, self.pos, None)
-        else:
-            return None
-
-    def convert(self):
-        if self.words is not None and self.string is not None:
-            words = self.string.split()
-            self.pos += len(words)
-	    self.words.extend(words)
-	    self.string = None
-	
-    def end(self, tag, thing):
-        self.convert()
-        if self.words is not None:
-            (tag, attrs, i, j) = thing
-            self.meta.append((tag, attrs, i, self.pos))
-
-    def handle_data(self, s):
-        if self.words is not None:
-	    if (self.string is None):
-	       self.string = s
-	    else:
-	       self.string += s
-
-    def handle_entityref(self, ref):
-	# s=self.convert_entityref(ref)  # if python 2.5
-	s=self.convref[ref]
-        if self.words is not None:
-	    if (self.string is None):
-	       self.string = s
-	    else:
-	       self.string += s
-
diff --git a/sa-extract/strmap.cc b/sa-extract/strmap.cc
deleted file mode 100644
index 5040477e..00000000
--- a/sa-extract/strmap.cc
+++ /dev/null
@@ -1,232 +0,0 @@
-#include "strmap.h"
-
-#include <vector>
-#include <string>
-#include <tr1/unordered_map>
-#include <stdint.h>
-
-using namespace std;
-using namespace std::tr1;
-
-#undef HAVE_64_BITS
-
-#if INTPTR_MAX == INT32_MAX
-# define HAVE_64_BITS 0
-#elif INTPTR_MAX >= INT64_MAX
-# define HAVE_64_BITS 1
-#else
-# error "couldn't tell if HAVE_64_BITS from INTPTR_MAX INT32_MAX INT64_MAX"
-#endif
-
-typedef uintptr_t MurmurInt;
-
-// MurmurHash2, by Austin Appleby
-
-static const uint32_t DEFAULT_SEED=2654435769U;
-
-#if HAVE_64_BITS
-//MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED);
-
-inline uint64_t MurmurHash64( const void * key, int len, unsigned int seed=DEFAULT_SEED )
-{
-  const uint64_t m = 0xc6a4a7935bd1e995ULL;
-  const int r = 47;
-
-  uint64_t h = seed ^ (len * m);
-
-  const uint64_t * data = (const uint64_t *)key;
-  const uint64_t * end = data + (len/8);
-
-  while(data != end)
-  {
-    uint64_t k = *data++;
-
-    k *= m;
-    k ^= k >> r;
-    k *= m;
-
-    h ^= k;
-    h *= m;
-  }
-
-  const unsigned char * data2 = (const unsigned char*)data;
-
-  switch(len & 7)
-  {
-  case 7: h ^= uint64_t(data2[6]) << 48;
-  case 6: h ^= uint64_t(data2[5]) << 40;
-  case 5: h ^= uint64_t(data2[4]) << 32;
-  case 4: h ^= uint64_t(data2[3]) << 24;
-  case 3: h ^= uint64_t(data2[2]) << 16;
-  case 2: h ^= uint64_t(data2[1]) << 8;
-  case 1: h ^= uint64_t(data2[0]);
-    h *= m;
-  };
-
-  h ^= h >> r;
-  h *= m;
-  h ^= h >> r;
-
-  return h;
-}
-
-inline uint32_t MurmurHash32(void const *key, int len, uint32_t seed=DEFAULT_SEED)
-{
-  return (uint32_t) MurmurHash64(key,len,seed);
-}
-
-inline MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED)
-{
-  return MurmurHash64(key,len,seed);
-}
-
-#else
-// 32-bit
-
-// Note - This code makes a few assumptions about how your machine behaves -
-// 1. We can read a 4-byte value from any address without crashing
-// 2. sizeof(int) == 4
-inline uint32_t MurmurHash32 ( const void * key, int len, uint32_t seed=DEFAULT_SEED)
-{
-  // 'm' and 'r' are mixing constants generated offline.
-  // They're not really 'magic', they just happen to work well.
-
-  const uint32_t m = 0x5bd1e995;
-  const int r = 24;
-
-  // Initialize the hash to a 'random' value
-
-  uint32_t h = seed ^ len;
-
-  // Mix 4 bytes at a time into the hash
-
-  const unsigned char * data = (const unsigned char *)key;
-
-  while(len >= 4)
-  {
-    uint32_t k = *(uint32_t *)data;
-
-    k *= m;
-    k ^= k >> r;
-    k *= m;
-
-    h *= m;
-    h ^= k;
-
-    data += 4;
-    len -= 4;
-  }
-
-  // Handle the last few bytes of the input array
-
-  switch(len)
-  {
-  case 3: h ^= data[2] << 16;
-  case 2: h ^= data[1] << 8;
-  case 1: h ^= data[0];
-    h *= m;
-  };
-
-  // Do a few final mixes of the hash to ensure the last few
-  // bytes are well-incorporated.
-
-  h ^= h >> 13;
-  h *= m;
-  h ^= h >> 15;
-
-  return h;
-}
-
-inline MurmurInt MurmurHash ( const void * key, int len, uint32_t seed=DEFAULT_SEED) {
-  return MurmurHash32(key,len,seed);
-}
-
-// 64-bit hash for 32-bit platforms
-
-inline uint64_t MurmurHash64 ( const void * key, int len, uint32_t seed=DEFAULT_SEED)
-{
-  const uint32_t m = 0x5bd1e995;
-  const int r = 24;
-
-  uint32_t h1 = seed ^ len;
-  uint32_t h2 = 0;
-
-  const uint32_t * data = (const uint32_t *)key;
-
-  while(len >= 8)
-  {
-    uint32_t k1 = *data++;
-    k1 *= m; k1 ^= k1 >> r; k1 *= m;
-    h1 *= m; h1 ^= k1;
-    len -= 4;
-
-    uint32_t k2 = *data++;
-    k2 *= m; k2 ^= k2 >> r; k2 *= m;
-    h2 *= m; h2 ^= k2;
-    len -= 4;
-  }
-
-  if(len >= 4)
-  {
-    uint32_t k1 = *data++;
-    k1 *= m; k1 ^= k1 >> r; k1 *= m;
-    h1 *= m; h1 ^= k1;
-    len -= 4;
-  }
-
-  switch(len)
-  {
-  case 3: h2 ^= ((unsigned char*)data)[2] << 16;
-  case 2: h2 ^= ((unsigned char*)data)[1] << 8;
-  case 1: h2 ^= ((unsigned char*)data)[0];
-    h2 *= m;
-  };
-
-  h1 ^= h2 >> 18; h1 *= m;
-  h2 ^= h1 >> 22; h2 *= m;
-  h1 ^= h2 >> 17; h1 *= m;
-  h2 ^= h1 >> 19; h2 *= m;
-
-  uint64_t h = h1;
-
-  h = (h << 32) | h2;
-
-  return h;
-}
-
-#endif
-//32bit
-
-struct MurmurHasher {
-  size_t operator()(const string& s) const {
-    return MurmurHash(s.c_str(), s.size());
-  }
-};
-
-struct StrMap {
-  StrMap() { keys_.reserve(10000); keys_.push_back("<bad0>"); map_[keys_[0]] = 0; }
-  unordered_map<string, int, MurmurHasher> map_;
-  vector<string> keys_;
-};
-
-StrMap* stringmap_new() {
-  return new StrMap;
-}
-
-void stringmap_delete(StrMap *vocab) {
-  delete vocab;
-}
-
-int stringmap_index(StrMap *vocab, char *s) {
-  int& cell = vocab->map_[s];
-  if (!cell) {
-    cell = vocab->keys_.size();
-    vocab->keys_.push_back(s);
-  }
-  return cell;
-}
-
-char* stringmap_word(StrMap *vocab, int i) {
-  return const_cast<char *>(vocab->keys_[i].c_str());
-}
-
diff --git a/sa-extract/strmap.h b/sa-extract/strmap.h
deleted file mode 100644
index a218a4c0..00000000
--- a/sa-extract/strmap.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _STRMAP_H_
-#define _STRMAP_H_
-
-#ifdef __cplusplus
-  extern "C" {
-#else
-    typedef struct StrMap StrMap; /* dummy type to stand in for class */
-#endif
-
-struct StrMap;
-
-StrMap* stringmap_new();
-void stringmap_delete(StrMap *vocab);
-int stringmap_index(StrMap *vocab, char *s);
-char* stringmap_word(StrMap *vocab, int i);
-
-#ifdef __cplusplus
-  }
-#endif
-
-
-#endif
diff --git a/sa-extract/strutil.c b/sa-extract/strutil.c
deleted file mode 100644
index 456de87a..00000000
--- a/sa-extract/strutil.c
+++ /dev/null
@@ -1,63 +0,0 @@
-#include <string.h>
-#include <stdlib.h>
-
-/* Like strsep(3) except that the delimiter is a string, not a set of characters. 
-*/
-char *strstrsep(char **stringp, const char *delim) {
-  char *match, *save;
-  save = *stringp;
-  if (*stringp == NULL)
-    return NULL;
-  match = strstr(*stringp, delim);
-  if (match == NULL) {
-    *stringp = NULL;
-    return save;
-  }
-  *match = '\0';
-  *stringp = match + strlen(delim);
-  return save;
-}
-
-static char **words = NULL;
-static int max_words;
-char **split(char *s, const char *delim, int *pn) {
-  int i;
-  char *tok, *rest;
-
-  if (words == NULL) {
-    max_words = 10;
-    words = malloc(max_words*sizeof(char *));
-  }
-  i = 0;
-  rest = s;
-  while ((tok = (delim ? strstrsep(&rest, delim) : strsep(&rest, " \t\n"))) != NULL) {
-    if (!delim && !*tok) // empty token
-      continue;
-    while (i+1 >= max_words) {
-      max_words *= 2;
-      words = realloc(words, max_words*sizeof(char *));
-    }
-    words[i] = tok;
-    i++;
-  }
-  words[i] = NULL;
-  if (pn != NULL)
-    *pn = i;
-  return words;
-}
-
-inline int isspace(char c) {
-  return (c == ' ' || c == '\t' || c == '\n');
-}
-
-char *strip(char *s) {
-  int n;
-  while (isspace(*s) && *s != '\0')
-    s++;
-  n = strlen(s);
-  while (n > 0 && isspace(s[n-1])) {
-    s[n-1] = '\0';
-    n--;
-  }
-  return s;
-}
diff --git a/sa-extract/strutil.h b/sa-extract/strutil.h
deleted file mode 100644
index 94a77033..00000000
--- a/sa-extract/strutil.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef STRUTIL_H
-#define STRUTIL_H
-
-char *strstrsep(char **stringp, const char *delim);
-char *strip(char *s);
-char **split(char *s, const char *delim, int *pn);
-
-#endif
diff --git a/sa-extract/sym.pxd b/sa-extract/sym.pxd
deleted file mode 100644
index d0650f46..00000000
--- a/sa-extract/sym.pxd
+++ /dev/null
@@ -1,17 +0,0 @@
-cimport cstrmap
-
-cdef class Alphabet:
-    cdef readonly cstrmap.StringMap terminals, nonterminals
-    cdef int first_nonterminal, last_nonterminal
-    cdef int isvar(self, int sym)
-    cdef int isword(self, int sym)
-    cdef int getindex(self, int sym)
-    cdef int setindex(self, int sym, int ind)
-    cdef int clearindex(self, int sym)
-    cdef int match(self, int sym1, int sym2)
-    cdef char* tocat(self, int sym)
-    cdef int fromcat(self, char *s)
-    cdef char* tostring(self, int sym)
-    cdef int fromstring(self, char *s, int terminal)
-
-    
diff --git a/sa-extract/sym.pyx b/sa-extract/sym.pyx
deleted file mode 100644
index 264cfcd9..00000000
--- a/sa-extract/sym.pyx
+++ /dev/null
@@ -1,155 +0,0 @@
-from libc.string cimport strrchr, strstr, strcpy, strlen
-from libc.stdlib cimport malloc, realloc, strtol
-
-cdef int index_shift, index_mask, n_index
-index_shift = 3
-n_index = 1<<index_shift
-index_mask = (1<<index_shift)-1
-cdef id2sym
-id2sym = {}
-
-cdef class Alphabet:
-    def __cinit__(self):
-        self.terminals = cstrmap.StringMap()
-        self.nonterminals = cstrmap.StringMap()
-
-    def __init__(self):
-        self.first_nonterminal = -1
-
-    def __dealloc__(self):
-        pass
-
-    cdef int isvar(self, int sym):
-        return sym < 0
-
-    cdef int isword(self, int sym):
-        return sym >= 0
-
-    cdef int getindex(self, int sym):
-        return -sym & index_mask
-
-    cdef int setindex(self, int sym, int ind):
-        return -(-sym & ~index_mask | ind)
-
-    cdef int clearindex(self, int sym):
-        return -(-sym& ~index_mask)
-
-    cdef int match(self, int sym1, int sym2):
-        return self.clearindex(sym1) == self.clearindex(sym2);
-
-    cdef char* tocat(self, int sym):
-        return self.nonterminals.word((-sym >> index_shift)-1)
-
-    cdef int fromcat(self, char *s):
-        cdef int i
-        i = self.nonterminals.index(s)
-        if self.first_nonterminal == -1:
-            self.first_nonterminal = i
-        if i > self.last_nonterminal:
-            self.last_nonterminal = i
-        return -(i+1 << index_shift)
-
-    cdef char* tostring(self, int sym):
-        cdef int ind
-        if self.isvar(sym):
-            if sym in id2sym:
-                return id2sym[sym]
-
-            ind = self.getindex(sym)
-            if ind > 0:
-                id2sym[sym] = "[%s,%d]" % (self.tocat(sym), ind)
-            else:
-                id2sym[sym] = "[%s]" % self.tocat(sym)
-            return id2sym[sym]
-                
-        else:
-            return self.terminals.word(sym)
-
-    cdef int fromstring(self, char *s, int terminal):
-        """Warning: this method is allowed to alter s."""
-        cdef char *comma
-        cdef int n
-        n = strlen(s)
-        cdef char *sep
-        sep = strstr(s,"_SEP_")
-        if n >= 3 and s[0] == c'[' and s[n-1] == c']' and sep == NULL:
-            if terminal:
-                s1 = "\\"+s
-                return self.terminals.index(s1)
-            s[n-1] = c'\0'
-            s = s + 1
-            comma = strrchr(s, c',')
-            if comma != NULL:
-                comma[0] = c'\0'
-                return self.setindex(self.fromcat(s), strtol(comma+1, NULL, 10))
-            else:
-                return self.fromcat(s)
-        else:
-            return self.terminals.index(s)
-
-# Expose Python functions as top-level functions for backward compatibility
-
-alphabet = Alphabet()
-
-cdef Alphabet calphabet
-calphabet = alphabet
-
-def isvar(int sym):
-    return calphabet.isvar(sym)
-
-def isword(int sym):
-    return calphabet.isword(sym)
-
-def getindex(int sym):
-    return calphabet.getindex(sym)
-
-def setindex(int sym, int ind):
-    return calphabet.setindex(sym, ind)
-
-def clearindex(int sym):
-    return calphabet.clearindex(sym)
-
-def match(int sym1, int sym2):
-    return calphabet.match(sym1, sym2) != 0
-
-def totag(int sym):
-    return calphabet.tocat(sym)
-
-def fromtag(s):
-    s = s.upper()
-    return calphabet.fromcat(s) 
-
-def tostring(sym):
-    if type(sym) is str:
-        return sym
-    else:
-        return calphabet.tostring(sym)
-
-cdef int bufsize
-cdef char *buf
-bufsize = 100
-buf = <char *>malloc(bufsize)
-cdef ensurebufsize(int size):
-   global buf, bufsize
-   if size > bufsize:
-      buf = <char *>realloc(buf, size*sizeof(char))
-      bufsize = size
-
-def fromstring(s, terminal=False):
-    cdef bytes bs
-    cdef char* cs
-    if terminal:
-        return calphabet.fromstring(s, 1)
-    else:
-        bs = s
-        cs = bs
-        ensurebufsize(len(s)+1)
-        strcpy(buf, cs)
-        return calphabet.fromstring(buf, 0)
-
-def nonterminals():
-    cdef i
-    l = []
-    for i from calphabet.first_nonterminal <= i <= calphabet.last_nonterminal:
-        l.append(-(i+1 << index_shift))
-    return l
diff --git a/sa-extract/wrap_input.py b/sa-extract/wrap_input.py
deleted file mode 100755
index e859a4fd..00000000
--- a/sa-extract/wrap_input.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/env python
-import sys
-import codecs
-import os
-import os.path
-from xml.sax.saxutils import escape
-
-graPrefix = sys.argv[1]
-
-# Second argument can be a file with observable sentence-level features,
-# one set of features per line (parallel with source sentences). Features are space-delimited indicator features.
-obsFeatsFile = None
-if len(sys.argv) == 3:
-  obsFeatsFilename = sys.argv[2]
-  obsFeatsFile = open(obsFeatsFilename)
-
-sys.stdin = codecs.getreader("utf-8")(sys.stdin)
-sys.stdout = codecs.getwriter("utf-8")(sys.stdout)
-
-i = 0
-for line in sys.stdin:
-  filename = "%s%d"%(graPrefix,i)
-  if not os.path.exists(filename):
-    filenameGz = filename + ".gz"
-    if not os.path.exists(filenameGz):
-      print >>sys.stderr, "Grammar file not found: ", filename, filenameGz
-      sys.exit(1)
-    else:
-      filename = filenameGz
-    
-  if obsFeatsFile:
-    obsFeats = obsFeatsFile.next().strip()
-    print '<seg id="%d" features="%s" grammar="%s"> '%(i,obsFeats,filename) + escape(line.strip()) + " </seg>"
-  else:
-    print '<seg id="%d" grammar="%s"> '%(i,filename) + escape(line.strip()) + " </seg>"
-  i+=1
-
-- 
cgit v1.2.3