author      Patrick Simianer <p@simianer.de>        2012-03-13 09:24:47 +0100
committer   Patrick Simianer <p@simianer.de>        2012-03-13 09:24:47 +0100
commit      ef6085e558e26c8819f1735425761103021b6470 (patch)
tree        5cf70e4c48c64d838e1326b5a505c8c4061bff4a /sa-extract
parent      10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff)
parent      dfbc278c1057555fda9312291c8024049e00b7d8 (diff)
merge with upstream
Diffstat (limited to 'sa-extract')
-rw-r--r--  sa-extract/Makefile                  |   18
-rw-r--r--  sa-extract/README                    |   62
-rw-r--r--  sa-extract/calignment.pxd            |   10
-rw-r--r--  sa-extract/calignment.pyx            |  128
-rw-r--r--  sa-extract/cdat.pxd                  |   12
-rw-r--r--  sa-extract/cdat.pyx                  |  178
-rw-r--r--  sa-extract/cfloatlist.pxd            |   10
-rw-r--r--  sa-extract/cfloatlist.pyx            |   93
-rw-r--r--  sa-extract/cintlist.pxd              |   15
-rw-r--r--  sa-extract/cintlist.pyx              |  196
-rw-r--r--  sa-extract/clex.pyx                  |  460
-rw-r--r--  sa-extract/cmath.pxd                 |    2
-rw-r--r--  sa-extract/cn.py                     |  164
-rwxr-xr-x  sa-extract/compile_bin.py            |  148
-rw-r--r--  sa-extract/context_model.py          |  234
-rw-r--r--  sa-extract/cstrmap.pxd               |   12
-rw-r--r--  sa-extract/cstrmap.pyx               |   14
-rw-r--r--  sa-extract/csuf.pxd                  |   11
-rw-r--r--  sa-extract/csuf.pyx                  |  321
-rw-r--r--  sa-extract/cveb.pxd                  |   15
-rw-r--r--  sa-extract/cveb.pyx                  |  390
-rwxr-xr-x  sa-extract/escape-testset.pl         |   35
-rw-r--r--  sa-extract/example/README            |    8
-rw-r--r--  sa-extract/example/corpus.align.gz   |  bin 0 -> 829334 bytes
-rw-r--r--  sa-extract/example/corpus.de.gz      |  bin 0 -> 1724393 bytes
-rw-r--r--  sa-extract/example/corpus.en.gz      |  bin 0 -> 1457711 bytes
-rw-r--r--  sa-extract/example/test.de           |   10
-rw-r--r--  sa-extract/example/test.ref.en       |   10
-rw-r--r--  sa-extract/extract.ini               |  116
-rwxr-xr-x  sa-extract/extractor.py              |   60
-rw-r--r--  sa-extract/lcp.pyx                   |  113
-rwxr-xr-x  sa-extract/lcp_ops.py                |   52
-rw-r--r--  sa-extract/log.py                    |   18
-rw-r--r--  sa-extract/manager.py                |  100
-rw-r--r--  sa-extract/model.py                  |   12
-rw-r--r--  sa-extract/monitor.py                |   48
-rw-r--r--  sa-extract/precomputation.pxd        |   13
-rw-r--r--  sa-extract/precomputation.pyx        |  478
-rw-r--r--  sa-extract/rule.pxd                  |   13
-rw-r--r--  sa-extract/rule.pyx                  |  286
-rw-r--r--  sa-extract/rulefactory.pyx           | 2360
-rwxr-xr-x  sa-extract/sa-compile.pl             |  322
-rw-r--r--  sa-extract/setup.cfg                 |    2
-rw-r--r--  sa-extract/setup.py                  |   45
-rw-r--r--  sa-extract/sgml.py                   |  194
-rw-r--r--  sa-extract/strmap.cc                 |  232
-rw-r--r--  sa-extract/strmap.h                  |   22
-rw-r--r--  sa-extract/strutil.c                 |   63
-rw-r--r--  sa-extract/strutil.h                 |    8
-rw-r--r--  sa-extract/sym.pxd                   |   17
-rw-r--r--  sa-extract/sym.pyx                   |  155
51 files changed, 7285 insertions, 0 deletions
diff --git a/sa-extract/Makefile b/sa-extract/Makefile
new file mode 100644
index 00000000..7b39ae4d
--- /dev/null
+++ b/sa-extract/Makefile
@@ -0,0 +1,18 @@
+PYVER=python2.7
+PYDIR=/usr/local/Cellar/python/2.7.2
+PYINCLUDE=$(PYDIR)/include/$(PYVER)
+CYTHON=/usr/local/share/python/cython
+PYTHON=$(PYDIR)/bin/python
+
+%.c: %.pyx
+ $(CYTHON) $< -o $@
+
+%.o: %.cc
+ g++ -O6 -g -fPIC -c $<
+
+all: cstrmap.c strmap.cc rule.c sym.c cdat.c cintlist.c cfloatlist.c calignment.c csuf.c clex.c rulefactory.c cveb.c lcp.c precomputation.c
+ $(PYTHON) setup.py build
+
+clean:
+ rm -f cdat.c cstrmap.c sym.c rule.c cintlist.c cfloatlist.c calignment.c csuf.c clex.c rulefactory.c cveb.c lcp.c precomputation.c *.so *.o *.cxx *~ *.pyc
+ rm -rf build
diff --git a/sa-extract/README b/sa-extract/README
new file mode 100644
index 00000000..e4022c7e
--- /dev/null
+++ b/sa-extract/README
@@ -0,0 +1,62 @@
+SUFFIX-ARRAY-EXTRACT README
+ Feb 1, 2012
+
+Written by Adam Lopez, repackaged by Chris Dyer.
+
+Originally based on parts of Hiero, by David Chiang, but these dependencies
+have been removed or rewritten.
+
+
+BUILD INSTRUCTIONS
+==============================================================================
+
+Requirements:
+ Python 2.7 or later (http://www.python.org)
+ Cython 0.14.1 or later (http://cython.org/)
+
+- Edit Makefile to set the location of Python/Cython then do:
+
+ make
+
+
+COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT
+==============================================================================
+- Run sa-compile.pl to compile the training data and generate an extract.ini
+ file (which is written to STDOUT):
+
+ sa-compile.pl -b bitext_name=source.fr,target.en \
+ -a alignment_name=alignment.txt > extract.ini
+
+
+ The training data should be in two parallel text files (source.fr, target.en),
+ and the alignments are expected in the "0-0 1-2 2-1 ..." format produced by
+ most alignment toolkits. The text files should NOT have XML special characters
+ escaped.
+
+
+EXTRACTION OF PER-SENTENCE GRAMMARS
+==============================================================================
+The most common use case we support is extraction of "per-sentence" grammars
+for each segment in a test set. You may run the extractor on a raw test set,
+but it will try to interpret tags as SGML markup, so we provide a script that
+escapes them: ./escape-testset.pl.
+
+- Example:
+
+ cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini
+
+
+EXTRACTION OF COMPLETE TEST-SET GRAMMARS
+==============================================================================
+Edit the generated extract.ini file and change per_sentence_grammar
+to False. Then run extraction as normal.
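+
+- Example (the command is the same as for per-sentence extraction; only the
+  extract.ini setting differs):
+
+  cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini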
+
+Note: extracting a single grammar for an entire test set will consume more
+memory during extraction and (probably) during decoding.
+
+
+EXAMPLE
+==============================================================================
+- See example/ and the README therein.
+
+
diff --git a/sa-extract/calignment.pxd b/sa-extract/calignment.pxd
new file mode 100644
index 00000000..a7d3001f
--- /dev/null
+++ b/sa-extract/calignment.pxd
@@ -0,0 +1,10 @@
+cimport cintlist
+from libc.stdio cimport FILE
+
+cdef class Alignment:
+
+ cdef cintlist.CIntList links
+ cdef cintlist.CIntList sent_index
+ cdef int link(self, int i, int j)
+ cdef _unlink(self, int link, int* f, int* e)
+ cdef int* _get_sent_links(self, int sent_id, int* num_links)
diff --git a/sa-extract/calignment.pyx b/sa-extract/calignment.pyx
new file mode 100644
index 00000000..976fcd66
--- /dev/null
+++ b/sa-extract/calignment.pyx
@@ -0,0 +1,128 @@
+import log
+import gzip
+import cintlist
+
+from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
+from libc.stdlib cimport malloc, realloc, free
+
+# Note: Callison-Burch uses short instead of int.
+# We have the space for our corpus, so this is not a problem;
+# May need to revisit if things get really tight, though.
+cdef class Alignment:
+
+
+ cdef int link(self, int i, int j):
+ '''Integerizes an alignment link pair'''
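+ # e.g. link(3, 5) == 3*65536 + 5 == 196613; unlink(196613) == (3, 5)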
+ return i*65536 + j
+
+
+ def unlink(self, link):
+ '''De-integerizes an alignment link pair'''
+ return (link/65536, link%65536)
+
+
+ cdef _unlink(self, int link, int* f, int* e):
+ f[0] = link/65536
+ e[0] = link%65536
+
+
+ def get_sent_links(self, int sent_id):
+ cdef cintlist.CIntList sent_links
+ cdef int* arr
+ cdef int arr_len
+
+ sent_links = cintlist.CIntList()
+ arr = self._get_sent_links(sent_id, &arr_len)
+ sent_links._extend_arr(arr, arr_len*2)
+ free(arr)
+ return sent_links
+
+
+ cdef int* _get_sent_links(self, int sent_id, int* num_links):
+ cdef int* sent_links
+ cdef int i, start, end
+
+ start = self.sent_index.arr[sent_id]
+ end = self.sent_index.arr[sent_id+1]
+ num_links[0] = end - start
+ sent_links = <int*> malloc(2*num_links[0]*sizeof(int))
+ for i from 0 <= i < num_links[0]:
+ self._unlink(self.links.arr[start + i], sent_links + (2*i), sent_links + (2*i) + 1)
+ return sent_links
+
+
+ def __cinit__(self, filename, from_binary=False):
+ self.links = cintlist.CIntList(1000,1000)
+ self.sent_index = cintlist.CIntList(1000,1000)
+ log.writeln("Reading alignment from file %s" % filename)
+ if from_binary:
+ self.read_binary(filename)
+ else:
+ self.read_text(filename)
+
+
+ def read_text(self, filename):
+ if filename[-2:] == "gz":
+ f = gzip.GzipFile(filename)
+ else:
+ f = open(filename)
+ for line in f:
+ self.sent_index.append(len(self.links))
+ pairs = line.split()
+ for pair in pairs:
+ (i, j) = map(int, pair.split('-'))
+ self.links.append(self.link(i, j))
+ self.sent_index.append(len(self.links))
+
+
+ def read_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "r")
+ self.links.read_handle(f)
+ self.sent_index.read_handle(f)
+ fclose(f)
+
+
+ def write_text(self, filename):
+ f = open(filename, "w")
+ sent_num = 0
+ for i, link in enumerate(self.links):
+ while i >= self.sent_index[sent_num]:
+ f.write("\n")
+ sent_num = sent_num + 1
+ f.write("%d-%d " % self.unlink(link))
+ f.write("\n")
+
+
+ def write_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "w")
+ self.links.write_handle(f)
+ self.sent_index.write_handle(f)
+ fclose(f)
+
+
+ def write_enhanced(self, filename):
+ f = open(filename, "w")
+ sent_num = 1
+ for link in self.links:
+ f.write("%d " % link)
+ f.write("\n")
+ for i in self.sent_index:
+ f.write("%d " % i)
+ f.write("\n")
+
+
+ def alignment(self, i):
+ '''Return all (e,f) pairs for sentence i'''
+ cdef int j, start, end
+ result = []
+ start = self.sent_index.arr[i]
+ end = self.sent_index.arr[i+1]
+ for j from start <= j < end:
+ result.append(self.unlink(self.links.arr[j]))
+ return result
diff --git a/sa-extract/cdat.pxd b/sa-extract/cdat.pxd
new file mode 100644
index 00000000..b686f611
--- /dev/null
+++ b/sa-extract/cdat.pxd
@@ -0,0 +1,12 @@
+cimport cintlist
+from libc.stdio cimport FILE
+
+cdef class DataArray:
+ cdef word2id
+ cdef id2word
+ cdef cintlist.CIntList data
+ cdef cintlist.CIntList sent_id
+ cdef cintlist.CIntList sent_index
+ cdef use_sent_id
+ cdef void write_handle(self, FILE* f)
+ cdef void read_handle(self, FILE* f)
diff --git a/sa-extract/cdat.pyx b/sa-extract/cdat.pyx
new file mode 100644
index 00000000..57d3ad63
--- /dev/null
+++ b/sa-extract/cdat.pyx
@@ -0,0 +1,178 @@
+# cdat.pyx
+# Defines "data arrays" that can be directly written to/read from disk in binary format
+# In particular, the array itself is written/read directly as a glob of binary data
+# Adam Lopez <alopez@cs.umd.edu>
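+#
+# Illustrative usage (file names are hypothetical):
+#   da = DataArray("corpus.en")                        # build from plain text
+#   da.write_binary("corpus.en.bin")                   # dump the binary glob
+#   da = DataArray("corpus.en.bin", from_binary=True)  # reload it later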
+
+import sys
+import gzip
+import log
+import cintlist
+
+from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
+from libc.stdlib cimport malloc, realloc, free
+from libc.string cimport memset, strcpy, strlen
+
+cdef class DataArray:
+
+ def __init__(self, filename=None, from_binary=False, use_sent_id=False):
+ self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1}
+ self.id2word = ["END_OF_FILE", "END_OF_LINE"]
+ self.data = cintlist.CIntList(1000,1000)
+ self.sent_id = cintlist.CIntList(1000,1000)
+ self.sent_index = cintlist.CIntList(1000,1000)
+ self.use_sent_id = use_sent_id
+
+ if filename is not None:
+ if from_binary:
+ self.read_binary(filename)
+ else:
+ self.read_text(filename)
+
+
+ def __len__(self):
+ return len(self.data)
+
+
+ def getSentId(self, i):
+ return self.sent_id.arr[i]
+
+
+ def getSent(self, i):
+ cdef int j, start, stop
+ sent = []
+ start = self.sent_index.arr[i]
+ stop = self.sent_index.arr[i+1]
+ for i from start <= i < stop:
+ sent.append(self.id2word[self.data.arr[i]])
+ return sent
+
+
+ def getSentPos(self, loc):
+ return loc - self.sent_index.arr[self.sent_id.arr[loc]]
+
+
+ def get_id(self, word):
+ if not word in self.word2id:
+ self.word2id[word] = len(self.id2word)
+ self.id2word.append(word)
+ return self.word2id[word]
+
+
+ def get_word(self, id):
+ return self.id2word[id]
+
+
+ def write_text(self, filename):
+ f = open(filename, "w")
+ for w_id in self.data:
+ if w_id > 1:
+ f.write("%s " % self.get_word(w_id))
+ if w_id == 1:
+ f.write("\n")
+ f.close()
+
+
+ def read_text(self, filename):
+ cdef int word_count
+
+ if filename[-2:] == "gz":
+ file = gzip.GzipFile(filename)
+ else:
+ file = open(filename)
+ word_count = 0
+ for line_num, line in enumerate(file):
+ self.sent_index.append(word_count)
+ for word in line.split():
+ self.data.append(self.get_id(word))
+ if self.use_sent_id:
+ self.sent_id.append(line_num)
+ word_count = word_count + 1
+ self.data.append(1)
+ if self.use_sent_id:
+ self.sent_id.append(line_num)
+ word_count = word_count + 1
+ self.data.append(0)
+ self.sent_index.append(word_count)
+
+ def read_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "r")
+ self.read_handle(f)
+ fclose(f)
+
+
+ cdef void read_handle(self, FILE* f):
+ cdef int num_words
+ cdef int word_len
+ cdef char* c_word
+ cdef bytes py_word
+ self.data.read_handle(f)
+ self.sent_index.read_handle(f)
+ self.sent_id.read_handle(f)
+ fread(&(num_words), sizeof(int), 1, f)
+ for i in xrange(num_words):
+ fread(&(word_len), sizeof(int), 1, f)
+ c_word = <char*> malloc (word_len * sizeof(char))
+ fread(c_word, sizeof(char), word_len, f)
+ py_word = c_word
+ free(c_word)
+ self.word2id[py_word] = len(self.id2word)
+ self.id2word.append(py_word)
+ if len(self.sent_id) == 0:
+ self.use_sent_id = False
+ else:
+ self.use_sent_id = True
+
+
+ cdef void write_handle(self, FILE* f):
+ cdef int word_len
+ cdef int num_words
+ cdef char* c_word
+
+ self.data.write_handle(f)
+ self.sent_index.write_handle(f)
+ self.sent_id.write_handle(f)
+ num_words = len(self.id2word) - 2
+ fwrite(&(num_words), sizeof(int), 1, f)
+ for word in self.id2word[2:]:
+ c_word = word
+ word_len = strlen(c_word) + 1
+ fwrite(&(word_len), sizeof(int), 1, f)
+ fwrite(c_word, sizeof(char), word_len, f)
+
+
+ def write_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "w")
+ self.write_handle(f)
+ fclose(f)
+
+
+ def write_enhanced_handle(self, f):
+ for i in self.data:
+ f.write("%d " %i)
+ f.write("\n")
+ for i in self.sent_index:
+ f.write("%d " %i)
+ f.write("\n")
+ for i in self.sent_id:
+ f.write("%d " %i)
+ f.write("\n")
+ for word in self.id2word:
+ f.write("%s %d " % (word, self.word2id[word]))
+ f.write("\n")
+
+
+
+ def write_enhanced(self, filename):
+ f = open(filename, "w")
+ self.write_enhanced_handle(f)
+ f.close()
+
+
+
+
diff --git a/sa-extract/cfloatlist.pxd b/sa-extract/cfloatlist.pxd
new file mode 100644
index 00000000..026f2739
--- /dev/null
+++ b/sa-extract/cfloatlist.pxd
@@ -0,0 +1,10 @@
+from libc.stdio cimport FILE
+
+cdef class CFloatList:
+ cdef int size
+ cdef int increment
+ cdef int len
+ cdef float* arr
+ cdef void write_handle(self, FILE* f)
+ cdef void read_handle(self, FILE* f)
+ cdef void set(self, int i, float v)
diff --git a/sa-extract/cfloatlist.pyx b/sa-extract/cfloatlist.pyx
new file mode 100644
index 00000000..18a0ef2a
--- /dev/null
+++ b/sa-extract/cfloatlist.pyx
@@ -0,0 +1,93 @@
+# cfloatlist.pyx
+# defines float arrays in C, with some convenience methods
+# for reading arrays as globs directly from disk.
+# Adam Lopez <alopez@cs.umd.edu>
+
+from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
+from libc.stdlib cimport malloc, realloc, free
+from libc.string cimport memset, strcpy, strlen
+
+cdef class CFloatList:
+
+ def __cinit__(self, size=0, increment=1, initial_len=0):
+ if initial_len > size:
+ size = initial_len
+ self.arr = <float*> malloc(size*sizeof(float))
+ memset(self.arr, 0, initial_len*sizeof(float))
+
+
+ def __init__(self, size=0, increment=1, initial_len=0):
+ self.size = size
+ if initial_len > size:
+ self.size = initial_len
+ self.increment = increment
+ self.len = initial_len
+
+
+ def __dealloc__(self):
+ free(self.arr)
+
+
+ def __getitem__(self, i):
+ j = i
+ if i<0:
+ j = self.len + i
+ if j<0 or j>=self.len:
+ raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len))
+ return self.arr[j]
+
+
+ cdef void set(self, int i, float v):
+ j = i
+ if i<0:
+ j = self.len + i
+ if j<0 or j>=self.len:
+ raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len))
+ self.arr[j] = v
+
+ def __setitem__(self, i, val):
+ self.set(i, val)
+
+ def __len__(self):
+ return self.len
+
+
+ def append(self, float val):
+ if self.len == self.size:
+ self.size = self.size + self.increment
+ self.arr = <float*> realloc(self.arr, self.size*sizeof(float))
+ self.arr[self.len] = val
+ self.len = self.len + 1
+
+
+ cdef void write_handle(self, FILE* f):
+ fwrite(&(self.len), sizeof(int), 1, f)
+ fwrite(self.arr, sizeof(float), self.len, f)
+
+
+ def write(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "w")
+ self.write_handle(f)
+ fclose(f)
+
+
+ cdef void read_handle(self, FILE* f):
+ free(self.arr)
+ fread(&(self.len), sizeof(int), 1, f)
+ self.arr = <float*> malloc(self.len * sizeof(float))
+ self.size = self.len
+ fread(self.arr, sizeof(float), self.len, f)
+
+
+ def read(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "r")
+ self.read_handle(f)
+ fclose(f)
+
+
diff --git a/sa-extract/cintlist.pxd b/sa-extract/cintlist.pxd
new file mode 100644
index 00000000..8a3a655c
--- /dev/null
+++ b/sa-extract/cintlist.pxd
@@ -0,0 +1,15 @@
+from libc.stdio cimport FILE
+
+cdef class CIntList:
+ cdef int size
+ cdef int increment
+ cdef int len
+ cdef int* arr
+ cdef void write_handle(self, FILE* f)
+ cdef void read_handle(self, FILE* f)
+ cdef void _append(self, int val)
+ cdef void _extend(self, CIntList other)
+ cdef void _extend_arr(self, int* other, int other_len)
+ cdef void _clear(self)
+ cdef void set(self, int i, int val)
+
diff --git a/sa-extract/cintlist.pyx b/sa-extract/cintlist.pyx
new file mode 100644
index 00000000..9d0a058e
--- /dev/null
+++ b/sa-extract/cintlist.pyx
@@ -0,0 +1,196 @@
+# cintlist.pyx
+# defines int arrays in C, with some convenience methods
+# for reading arrays as globs directly from disk.
+# Adam Lopez <alopez@cs.umd.edu>
+
+from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
+from libc.stdlib cimport malloc, realloc, free
+from libc.string cimport memset, memcpy
+
+cdef class CIntList:
+
+ def __cinit__(self, size=0, increment=1, initial_len=0):
+ if initial_len > size:
+ size = initial_len
+ self.arr = <int*> malloc(size*sizeof(int))
+ memset(self.arr, 0, initial_len*sizeof(int))
+
+ def __str__(self):
+ ret = "CIntList["
+ for idx in xrange(self.len):
+ if idx>0:
+ ret += ","
+ ret += str(self.arr[idx])
+ ret += "]"
+ ret += "len="
+ ret += str(self.len)
+ return ret
+
+ def index(self, val):
+ for i in xrange(self.len):
+ if self.arr[i] == val:
+ return i
+
+ raise IndexError("%d not found in CIntList" % val)
+
+ def partition(self,start,end):
+ pivot = self.arr[end]
+ bottom = start-1
+ top = end
+ done = 0
+ while not done:
+ while not done:
+ bottom += 1
+ if bottom == top:
+ done = 1
+ break
+ if self.arr[bottom] > pivot:
+ self.arr[top] = self.arr[bottom]
+ break
+ while not done:
+ top -= 1
+ if top == bottom:
+ done = 1
+ break
+ if self.arr[top] < pivot:
+ self.arr[bottom] = self.arr[top]
+ break
+ self.arr[top] = pivot
+ return top
+
+ def _doquicksort(self,start,end):
+ if start < end:
+ split = self.partition(start,end)
+ self._doquicksort(start,split-1)
+ self._doquicksort(split+1,end)
+ else:
+ return
+
+ def sort(self):
+ self._doquicksort(0,self.len-1)
+
+ def reset(self):
+ self.len = 0
+
+ def __init__(self, size=0, increment=1, initial_len=0):
+ self.size = size
+ if initial_len > size:
+ self.size = initial_len
+ self.increment = increment
+ self.len = initial_len
+
+
+ def __dealloc__(self):
+ free(self.arr)
+
+
+ def __getitem__(self, index):
+ cdef int i, j, k
+
+ if type(index) == int:
+ j = index
+ if j < 0:
+ j = self.len + j
+ if j<0 or j>=self.len:
+ raise IndexError("Requested index %d of %d-length CIntList" % (index, self.len))
+ return self.arr[j]
+ elif type(index) == slice:
+ i = index.start
+ j = index.stop
+ if i < 0:
+ i = self.len + i
+ if j < 0:
+ j = self.len + j
+ if i < 0 or i >= self.len or j < 0 or j > self.len:
+ raise IndexError("Requested index %d:%d of %d-length CIntList" % (index.start, index.stop, self.len))
+ result = ()
+ for k from i <= k < j:
+ result = result + (self.arr[k],)
+ return result
+ else:
+ raise IndexError("Illegal key type %s for CIntList" % (type(index)))
+
+ cdef void set(self, int i, int val):
+ j = i
+ if i<0:
+ j = self.len + i
+ if j<0 or j>=self.len:
+ raise IndexError("Requested index %d of %d-length IntList" % (i, self.len))
+ if type(val) != int:
+ raise TypeError
+ self.arr[j] = val
+
+
+ def __setitem__(self, i, val):
+ self.set(i, val)
+
+ def __len__(self):
+ return self.len
+
+ def getSize(self):
+ return self.size
+
+ def append(self, int val):
+ self._append(val)
+
+ cdef void _append(self, int val):
+ if self.len == self.size:
+ self.size = self.size + self.increment
+ self.arr = <int*> realloc(self.arr, self.size*sizeof(int))
+ self.arr[self.len] = val
+ self.len = self.len + 1
+
+ def extend(self, other):
+ self._extend(other)
+
+
+ cdef void _extend(self, CIntList other):
+ self._extend_arr(other.arr, other.len)
+
+
+ cdef void _extend_arr(self, int* other, int other_len):
+ if self.size < self.len + other_len:
+ self.size = self.len + other_len
+ self.arr = <int*> realloc(self.arr, self.size*sizeof(int))
+ memcpy(self.arr+self.len, other, other_len*sizeof(int))
+ self.len = self.len + other_len
+
+
+ cdef void _clear(self):
+ free(self.arr)
+ self.len = 0
+ self.size = 0
+ self.arr = <int*> malloc(0)
+
+
+ cdef void write_handle(self, FILE* f):
+ fwrite(&(self.len), sizeof(int), 1, f)
+ fwrite(self.arr, sizeof(int), self.len, f)
+
+
+ def write(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "w")
+ self.write_handle(f)
+ fclose(f)
+
+
+ cdef void read_handle(self, FILE* f):
+ free(self.arr)
+ fread(&(self.len), sizeof(int), 1, f)
+ self.arr = <int*> malloc(self.len * sizeof(int))
+ self.size = self.len
+ fread(self.arr, sizeof(int), self.len, f)
+
+
+ def read(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "r")
+ self.read_handle(f)
+ fclose(f)
+
+
diff --git a/sa-extract/clex.pyx b/sa-extract/clex.pyx
new file mode 100644
index 00000000..fa30caad
--- /dev/null
+++ b/sa-extract/clex.pyx
@@ -0,0 +1,460 @@
+# clex.pyx
+# defines bilexical dictionaries in C, with some convenience methods
+# for reading arrays as globs directly from disk.
+# Adam Lopez <alopez@cs.umd.edu>
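+#
+# Illustrative usage (the file name and word pair are hypothetical):
+#   lex = CLex("lex.bin", from_binary=True)  # load a precomputed lexical table
+#   lex.get_score("chat", "cat", 0)          # col 0/1 pick the two score columns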
+
+import gzip
+import sys
+import context_model
+
+cimport cintlist
+cimport cfloatlist
+cimport calignment
+cimport csuf
+cimport cdat
+
+from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
+from libc.stdlib cimport malloc, realloc, free
+from libc.string cimport memset, strcpy, strlen
+
+cdef struct _node:
+ _node* smaller
+ _node* bigger
+ int key
+ int val
+
+cdef _node* new_node(int key):
+ cdef _node* n
+ n = <_node*> malloc(sizeof(_node))
+ n.smaller = NULL
+ n.bigger = NULL
+ n.key = key
+ n.val = 0
+ return n
+
+
+cdef del_node(_node* n):
+ if n.smaller != NULL:
+ del_node(n.smaller)
+ if n.bigger != NULL:
+ del_node(n.bigger)
+ free(n)
+
+
+cdef int* get_val(_node* n, int key):
+ if key == n.key:
+ return &n.val
+ elif key < n.key:
+ if n.smaller == NULL:
+ n.smaller = new_node(key)
+ return &(n.smaller.val)
+ return get_val(n.smaller, key)
+ else:
+ if n.bigger == NULL:
+ n.bigger = new_node(key)
+ return &(n.bigger.val)
+ return get_val(n.bigger, key)
+
+
+cdef class CLex:
+
+ cdef cfloatlist.CFloatList col1
+ cdef cfloatlist.CFloatList col2
+ cdef cintlist.CIntList f_index
+ cdef cintlist.CIntList e_index
+ cdef id2eword, id2fword, eword2id, fword2id
+
+ def __init__(self, filename, from_binary=False,
+ from_data=False, earray=None, fsarray=None):
+ self.id2eword = []
+ self.id2fword = []
+ self.eword2id = {}
+ self.fword2id = {}
+ self.e_index = cintlist.CIntList()
+ self.f_index = cintlist.CIntList()
+ self.col1 = cfloatlist.CFloatList()
+ self.col2 = cfloatlist.CFloatList()
+ if from_binary:
+ self.read_binary(filename)
+ else:
+ if from_data:
+ self.compute_from_data(fsarray, earray, filename)
+ else:
+ self.read_text(filename)
+ '''print "self.eword2id"
+ print "============="
+ for x in self.eword2id:
+ print x
+ print "self.fword2id"
+ print "============="
+ for x in self.fword2id:
+ print x
+ print "-------------"'''
+
+
+ cdef compute_from_data(self, csuf.SuffixArray fsa, cdat.DataArray eda, calignment.Alignment aa):
+ cdef int sent_id, num_links, l, i, j, f_i, e_j, I, J, V_E, V_F, num_pairs
+ cdef int *fsent, *esent, *alignment, *links, *ealigned, *faligned
+ cdef _node** dict
+ cdef int *fmargin, *emargin, *count
+ cdef bytes word
+ cdef int null_word
+
+ null_word = 0
+ for word in fsa.darray.id2word: # I miss list comprehensions
+ self.id2fword.append(word)
+ self.id2fword[null_word] = "NULL"
+ for id, word in enumerate(self.id2fword):
+ self.fword2id[word] = id
+
+ for word in eda.id2word:
+ self.id2eword.append(word)
+ self.id2eword[null_word] = "NULL"
+ for id, word in enumerate(self.id2eword):
+ self.eword2id[word] = id
+
+ num_pairs = 0
+
+ V_E = len(eda.id2word)
+ V_F = len(fsa.darray.id2word)
+ fmargin = <int*> malloc(V_F*sizeof(int))
+ emargin = <int*> malloc(V_E*sizeof(int))
+ memset(fmargin, 0, V_F*sizeof(int))
+ memset(emargin, 0, V_E*sizeof(int))
+
+ dict = <_node**> malloc(V_F*sizeof(_node*))
+ memset(dict, 0, V_F*sizeof(_node*))
+
+ num_sents = len(fsa.darray.sent_index)
+ for sent_id from 0 <= sent_id < num_sents-1:
+
+ fsent = fsa.darray.data.arr + fsa.darray.sent_index.arr[sent_id]
+ I = fsa.darray.sent_index.arr[sent_id+1] - fsa.darray.sent_index.arr[sent_id] - 1
+ faligned = <int*> malloc(I*sizeof(int))
+ memset(faligned, 0, I*sizeof(int))
+
+ esent = eda.data.arr + eda.sent_index.arr[sent_id]
+ J = eda.sent_index.arr[sent_id+1] - eda.sent_index.arr[sent_id] - 1
+ ealigned = <int*> malloc(J*sizeof(int))
+ memset(ealigned, 0, J*sizeof(int))
+
+ links = aa._get_sent_links(sent_id, &num_links)
+
+ for l from 0 <= l < num_links:
+ i = links[l*2]
+ j = links[l*2+1]
+ if i >= I or j >= J:
+ sys.stderr.write(" %d-%d out of bounds (I=%d,J=%d) in line %d\n" % (i,j,I,J,sent_id+1))
+ assert i < I
+ assert j < J
+ f_i = fsent[i]
+ e_j = esent[j]
+ fmargin[f_i] = fmargin[f_i]+1
+ emargin[e_j] = emargin[e_j]+1
+ if dict[f_i] == NULL:
+ dict[f_i] = new_node(e_j)
+ dict[f_i].val = 1
+ num_pairs = num_pairs + 1
+ else:
+ count = get_val(dict[f_i], e_j)
+ if count[0] == 0:
+ num_pairs = num_pairs + 1
+ count[0] = count[0] + 1
+ # add count
+ faligned[i] = 1
+ ealigned[j] = 1
+ for i from 0 <= i < I:
+ if faligned[i] == 0:
+ f_i = fsent[i]
+ fmargin[f_i] = fmargin[f_i] + 1
+ emargin[null_word] = emargin[null_word] + 1
+ if dict[f_i] == NULL:
+ dict[f_i] = new_node(null_word)
+ dict[f_i].val = 1
+ num_pairs = num_pairs + 1
+ else:
+ count = get_val(dict[f_i], null_word)
+ if count[0] == 0:
+ num_pairs = num_pairs + 1
+ count[0] = count[0] + 1
+ for j from 0 <= j < J:
+ if ealigned[j] == 0:
+ e_j = esent[j]
+ fmargin[null_word] = fmargin[null_word] + 1
+ emargin[e_j] = emargin[e_j] + 1
+ if dict[null_word] == NULL:
+ dict[null_word] = new_node(e_j)
+ dict[null_word].val = 1
+ num_pairs = num_pairs + 1
+ else:
+ count = get_val(dict[null_word], e_j)
+ if count[0] == 0:
+ num_pairs = num_pairs + 1
+ count[0] = count[0] + 1
+ free(links)
+ free(faligned)
+ free(ealigned)
+ self.f_index = cintlist.CIntList(initial_len=V_F)
+ self.e_index = cintlist.CIntList(initial_len=num_pairs)
+ self.col1 = cfloatlist.CFloatList(initial_len=num_pairs)
+ self.col2 = cfloatlist.CFloatList(initial_len=num_pairs)
+
+ num_pairs = 0
+ for i from 0 <= i < V_F:
+ #self.f_index[i] = num_pairs
+ self.f_index.set(i, num_pairs)
+ if dict[i] != NULL:
+ self._add_node(dict[i], &num_pairs, float(fmargin[i]), emargin)
+ del_node(dict[i])
+ free(fmargin)
+ free(emargin)
+ free(dict)
+ return
+
+
+ cdef _add_node(self, _node* n, int* num_pairs, float fmargin, int* emargin):
+ cdef int loc
+ if n.smaller != NULL:
+ self._add_node(n.smaller, num_pairs, fmargin, emargin)
+ loc = num_pairs[0]
+ self.e_index.set(loc, n.key)
+ self.col1.set(loc, float(n.val)/fmargin)
+ self.col2.set(loc, float(n.val)/float(emargin[n.key]))
+ num_pairs[0] = loc + 1
+ if n.bigger != NULL:
+ self._add_node(n.bigger, num_pairs, fmargin, emargin)
+
+
+ def write_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "w")
+ self.f_index.write_handle(f)
+ self.e_index.write_handle(f)
+ self.col1.write_handle(f)
+ self.col2.write_handle(f)
+ self.write_wordlist(self.id2fword, f)
+ self.write_wordlist(self.id2eword, f)
+ fclose(f)
+
+
+ cdef write_wordlist(self, wordlist, FILE* f):
+ cdef int word_len
+ cdef int num_words
+ cdef char* c_word
+
+ num_words = len(wordlist)
+ fwrite(&(num_words), sizeof(int), 1, f)
+ for word in wordlist:
+ c_word = word
+ word_len = strlen(c_word) + 1
+ fwrite(&(word_len), sizeof(int), 1, f)
+ fwrite(c_word, sizeof(char), word_len, f)
+
+
+ cdef read_wordlist(self, word2id, id2word, FILE* f):
+ cdef int num_words
+ cdef int word_len
+ cdef char* c_word
+ cdef bytes py_word
+
+ fread(&(num_words), sizeof(int), 1, f)
+ for i from 0 <= i < num_words:
+ fread(&(word_len), sizeof(int), 1, f)
+ c_word = <char*> malloc (word_len * sizeof(char))
+ fread(c_word, sizeof(char), word_len, f)
+ py_word = c_word
+ free(c_word)
+ word2id[py_word] = len(id2word)
+ id2word.append(py_word)
+
+ def read_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "r")
+ self.f_index.read_handle(f)
+ self.e_index.read_handle(f)
+ self.col1.read_handle(f)
+ self.col2.read_handle(f)
+ self.read_wordlist(self.fword2id, self.id2fword, f)
+ self.read_wordlist(self.eword2id, self.id2eword, f)
+ fclose(f)
+
+
+ def get_e_id(self, eword):
+ if eword not in self.eword2id:
+ e_id = len(self.id2eword)
+ self.id2eword.append(eword)
+ self.eword2id[eword] = e_id
+ return self.eword2id[eword]
+
+
+ def get_f_id(self, fword):
+ if fword not in self.fword2id:
+ f_id = len(self.id2fword)
+ self.id2fword.append(fword)
+ self.fword2id[fword] = f_id
+ return self.fword2id[fword]
+
+
+ def read_text(self, filename):
+ cdef i, j, w, e_id, f_id, n_f, n_e, N
+ cdef cintlist.CIntList fcount
+
+ fcount = cintlist.CIntList()
+ if filename[-2:] == "gz":
+ f = gzip.GzipFile(filename)
+ else:
+ f = open(filename)
+
+ # first loop merely establishes size of array objects
+ sys.stderr.write("Initial read...\n")
+ for line in f:
+ (fword, eword, score1, score2) = line.split()
+ f_id = self.get_f_id(fword)
+ e_id = self.get_e_id(eword)
+ while f_id >= len(fcount):
+ fcount.append(0)
+ fcount.arr[f_id] = fcount.arr[f_id] + 1
+
+ # Allocate space for dictionary in arrays
+ N = 0
+ n_f = len(fcount)
+ self.f_index = cintlist.CIntList(initial_len=n_f+1)
+ for i from 0 <= i < n_f:
+ self.f_index.arr[i] = N
+ N = N + fcount.arr[i]
+ fcount.arr[i] = 0
+ self.f_index.arr[n_f] = N
+ self.e_index = cintlist.CIntList(initial_len=N)
+ self.col1 = cfloatlist.CFloatList(initial_len=N)
+ self.col2 = cfloatlist.CFloatList(initial_len=N)
+
+ # Re-read file, placing words into buckets
+ sys.stderr.write("Bucket sort...\n")
+ f.seek(0)
+ for line in f:
+ (fword, eword, score1, score2) = line.split()
+ f_id = self.get_f_id(fword)
+ e_id = self.get_e_id(eword)
+ index = self.f_index.arr[f_id] + fcount.arr[f_id]
+ fcount.arr[f_id] = fcount.arr[f_id] + 1
+ self.e_index.arr[index] = int(e_id)
+ self.col1[index] = float(score1)
+ self.col2[index] = float(score2)
+ f.close()
+
+ sys.stderr.write("Final sort...\n")
+ # Sort buckets by eword
+ for b from 0 <= b < n_f:
+ i = self.f_index.arr[b]
+ j = self.f_index.arr[b+1]
+ self.qsort(i,j, "")
+
+
+ cdef swap(self, int i, int j):
+ cdef int itmp
+ cdef float ftmp
+
+ if i == j:
+ return
+
+ itmp = self.e_index.arr[i]
+ self.e_index.arr[i] = self.e_index.arr[j]
+ self.e_index.arr[j] = itmp
+
+ ftmp = self.col1.arr[i]
+ self.col1.arr[i] = self.col1.arr[j]
+ self.col1.arr[j] = ftmp
+
+ ftmp = self.col2.arr[i]
+ self.col2.arr[i] = self.col2.arr[j]
+ self.col2.arr[j] = ftmp
+
+
+ cdef qsort(self, int i, int j, pad):
+ cdef int pval, p
+
+ if i > j:
+ raise Exception("Sort error in CLex")
+ if i == j: #empty interval
+ return
+ if i == j-1: # singleton interval
+ return
+
+ p = (i+j)/2
+ pval = self.e_index.arr[p]
+ self.swap(i, p)
+ p = i
+ for k from i+1 <= k < j:
+ if pval >= self.e_index.arr[k]:
+ self.swap(p+1, k)
+ self.swap(p, p+1)
+ p = p + 1
+ self.qsort(i,p, pad+" ")
+ self.qsort(p+1,j, pad+" ")
+
+
+ def write_enhanced(self, filename):
+ f = open(filename, "w")
+ for i in self.f_index:
+ f.write("%d " % i)
+ f.write("\n")
+ for i, s1, s2 in zip(self.e_index, self.col1, self.col2):
+ f.write("%d %f %f " % (i, s1, s2))
+ f.write("\n")
+ for i, w in enumerate(self.id2fword):
+ f.write("%d %s " % (i, w))
+ f.write("\n")
+ for i, w in enumerate(self.id2eword):
+ f.write("%d %s " % (i, w))
+ f.write("\n")
+ f.close()
+
+
+ def get_score(self, fword, eword, col):
+ cdef e_id, f_id, low, high, midpoint, val
+ #print "get_score fword=",fword,", eword=",eword,", col=",col
+
+ if eword not in self.eword2id:
+ return None
+ if fword not in self.fword2id:
+ return None
+ f_id = self.fword2id[fword]
+ e_id = self.eword2id[eword]
+ low = self.f_index.arr[f_id]
+ high = self.f_index.arr[f_id+1]
+ while high - low > 0:
+ midpoint = (low+high)/2
+ val = self.e_index.arr[midpoint]
+ if val == e_id:
+ if col == 0:
+ return self.col1.arr[midpoint]
+ if col == 1:
+ return self.col2.arr[midpoint]
+ if val > e_id:
+ high = midpoint
+ if val < e_id:
+ low = midpoint + 1
+ return None
+
+
+ def write_text(self, filename):
+ """Note: does not guarantee writing the dictionary in the original order"""
+ cdef i, N, e_id, f_id
+
+ f = open(filename, "w")
+ N = len(self.e_index)
+ f_id = 0
+ for i from 0 <= i < N:
+ while self.f_index.arr[f_id+1] == i:
+ f_id = f_id + 1
+ e_id = self.e_index.arr[i]
+ score1 = self.col1.arr[i]
+ score2 = self.col2.arr[i]
+ f.write("%s %s %.6f %.6f\n" % (self.id2fword[f_id], self.id2eword[e_id], score1, score2))
+ f.close()
+
+
diff --git a/sa-extract/cmath.pxd b/sa-extract/cmath.pxd
new file mode 100644
index 00000000..3aaaa2a3
--- /dev/null
+++ b/sa-extract/cmath.pxd
@@ -0,0 +1,2 @@
+cdef extern from "math.h":
+ double log(double)
diff --git a/sa-extract/cn.py b/sa-extract/cn.py
new file mode 100644
index 00000000..e534783f
--- /dev/null
+++ b/sa-extract/cn.py
@@ -0,0 +1,164 @@
+# cn.py
+# Chris Dyer <redpony@umd.edu>
+# Copyright (c) 2006 University of Maryland.
+
+# vim:tabstop=4:autoindent:expandtab
+
+import sys
+import math
+import sym
+import log
+import sgml
+
+epsilon = sym.fromstring('*EPS*');
+
+class CNStats(object):
+ def __init__(self):
+ self.read = 0
+ self.colls = 0
+ self.words = 0
+
+ def collect(self, cn):
+ self.read += 1
+ self.colls += cn.get_length()
+ for col in cn.columns:
+ self.words += len(col)
+
+ def __str__(self):
+ return "confusion net statistics:\n succ. read: %d\n columns: %d\n words: %d\n avg. words/column:\t%f\n avg. cols/sent:\t%f\n\n" % (self.read, self.colls, self.words, float(self.words)/float(self.colls), float(self.colls)/float(self.read))
+
+class ConfusionNet(object):
+ def __init__(self, sent):
+ object.__init__(self)
+ if (len(sent.words) == 0):
+ self.columns = ()
+ return # empty line, it happens
+ line = sent.words[0]
+ if (line.startswith("(((")):
+ if (len(sent.words) > 1):
+ log.write("Bad sentence: %s\n" % (line))
+ assert(len(sent.words) == 1) # make sure there are no spaces in your confusion nets!
+ line = "((('<s>',1.0,1),),"+line[1:len(line)-1]+"(('</s>',1.0,1),))"
+ cols = eval(line)
+ res = []
+ for col in cols:
+ x = []
+ for alt in col:
+ costs = alt[1]
+ if (type(costs) != type((1,2))):
+ costs=(float(costs),)
+ j=[]
+ for c in costs:
+ j.append(float(c))
+ cost = tuple(j)
+ spanlen = 1
+ if (len(alt) == 3):
+ spanlen = alt[2]
+ x.append((sym.fromstring(alt[0],terminal=True), None, spanlen))
+ res.append(tuple(x))
+ self.columns = tuple(res)
+ else: # convert a string of input into a CN
+ res = [];
+ res.append(((sym.fromstring('<s>',terminal=True), None, 1), ))
+ for word in sent.words:
+ res.append(((sym.fromstring(word,terminal=True), None, 1), )); # (alt=word, cost=0.0)
+ res.append(((sym.fromstring('</s>',terminal=True), None, 1), ))
+ self.columns = tuple(res)
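+
+ # Accepted input (sketch; the words and costs below are only illustrative):
+ # a confusion-net line is a Python-style tuple of columns, each column a
+ # tuple of (word, cost(s), span-length) alternatives, e.g.
+ #   ((('ein',1.0,1),('eine',1.0,1)),(('haus',1.0,1),))
+ # A plain token line is instead turned into one single-alternative column
+ # per word; in both cases <s> and </s> columns are added at the edges.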
+
+ def is_epsilon(self, position):
+ x = self.columns[position[0]][position[1]][0]
+ return x == epsilon
+
+ def compute_epsilon_run_length(self, cn_path):
+ if (len(cn_path) == 0):
+ return 0
+ x = len(cn_path) - 1
+ res = 0
+ ''' -1 denotes a non-terminal '''
+ while (x >= 0 and cn_path[x][0] >= 0 and self.is_epsilon(cn_path[x])):
+ res += 1
+ x -= 1
+ return res
+
+ def compute_cn_cost(self, cn_path):
+ c = None
+ for (col, row) in cn_path:
+ if (col >= 0):
+ if c is None:
+ c = self.columns[col][row][1].clone()
+ else:
+ c += self.columns[col][row][1]
+ return c
+
+ def get_column(self, col):
+ return self.columns[col]
+
+ def get_length(self):
+ return len(self.columns)
+
+ def __str__(self):
+ r = "conf net: %d\n" % (len(self.columns),)
+ i = 0
+ for col in self.columns:
+ r += "%d -- " % i
+ i += 1
+ for alternative in col:
+ r += "(%s, %s, %s) " % (sym.tostring(alternative[0]), alternative[1], alternative[2])
+ r += "\n"
+ return r
+
+ def listdown(self, _columns, col = 0):
+ # output all the possible sentences out of the self lattice
+ # will be used by the "dumb" adaptation of lattice decoding with suffix array
+ result = []
+ for entry in _columns[col]:
+ if col+entry[2]+1<=len(_columns) :
+ for suffix in self.listdown(_columns,col+entry[2]):
+ result.append(entry[0]+' '+suffix)
+ #result.append(entry[0]+' '+suffix)
+ else:
+ result.append(entry[0])
+ #result.append(entry[0])
+ return result
+
+ def next(self,_columns,curr_idx, min_dist=1):
+ # can be used only when prev_id is defined
+ result = []
+ #print "curr_idx=%i\n" % curr_idx
+ if curr_idx+min_dist >= len(_columns):
+ return result
+ for alt_idx in xrange(len(_columns[curr_idx])):
+ alt = _columns[curr_idx][alt_idx]
+ #print "checking %i alternative : " % alt_idx
+ #print "%s %f %i\n" % (alt[0],alt[1],alt[2])
+ #print alt
+ if alt[2]<min_dist:
+ #print "recursive next(%i, %i, %i)\n" % (curr_idx,alt_idx,min_dist-alt[2])
+ result.extend(self.next(_columns,curr_idx+alt[2],min_dist-alt[2]))
+ elif curr_idx+alt[2]<len(_columns):
+ #print "adding because the skip %i doesn't go beyong the length\n" % alt[2]
+ result.append(curr_idx+alt[2])
+ return set(result)
+
+
+
+
+#file = open(sys.argv[1], "rb")
+#sent = sgml.process_sgml_line(file.read())
+#print sent
+#cn = ConfusionNet(sent)
+#print cn
+#results = cn.listdown()
+#for result in results:
+# print sym.tostring(result)
+#print cn.next(0);
+#print cn.next(1);
+#print cn.next(2);
+#print cn.next(3);
+#print cn
+#cn = ConfusionNet()
+#k = 0
+#while (cn.read(file)):
+# print cn
+
+#print cn.stats
diff --git a/sa-extract/compile_bin.py b/sa-extract/compile_bin.py
new file mode 100755
index 00000000..0196e552
--- /dev/null
+++ b/sa-extract/compile_bin.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+
+'''This program compiles/decompiles binary data objects used
+by the decoder'''
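+
+# Illustrative invocations (file names are hypothetical):
+#   compile_bin.py -s corpus.fr corpus.fr.sa.bin    # text corpus -> suffix array
+#   compile_bin.py -a alignment.txt alignment.bin   # text alignment -> binary form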
+
+import sys
+import cdat
+import calignment
+import csuf
+import clex
+import precomputation
+#import parse
+import monitor
+import optparse
+
+def main(argv=None):
+ '''Call this from the command-line to create a
+ pre-computed binary data array for later use'''
+ if argv is None:
+ argv = sys.argv
+
+ parser = optparse.OptionParser(usage="Usage: %prog [-s|-d|-a|-p] <input file> <output file>"+
+ "\n\nNote: -d,-s,-a, and -p are mutually exclusive")
+ parser.add_option("-d", "--data-array",
+ action="store_true", default=False,
+ dest="da", help="Compile file into data array (default)")
+ parser.add_option("-s", "--suffix-array",
+ action="store_true", default=False,
+ dest="sa", help="Compile file into suffix array")
+ parser.add_option("-a", "--alignment",
+ action="store_true", default=False,
+ dest="a", help="Compile file into alignment")
+ parser.add_option("-l", "--lexical",
+ action="store_true", default=False,
+ dest="l", help="Compile file into lex file")
+ parser.add_option("-x", "--compute_lexical", action="store", nargs=2,
+ dest="lex_args", help="Compute lex file from data",
+ metavar="<f file> <e file>")
+ parser.add_option("-p", "--parse",
+ action="store_true", default=False,
+ dest="p", help="Compile file into parse")
+ parser.add_option("-b", "--binary-infile",
+ action="store_true", default=False,
+ dest="bin", help="Input file is binary (default: text)")
+ parser.add_option("-t", "--text-outfile",
+ action="store_true", default=False,
+ dest="text", help="Output file is text (default: binary)")
+ parser.add_option("-e", "--enhanced-outfile",
+ action="store_true", default=False,
+ dest="enhanced", help="Output file is enhanced text (default: binary)")
+ parser.add_option("-r", action="store", nargs=7,
+ dest="precomp_args", help="Precompute collocations (Hiero only)",
+ metavar="max-len=<INT> max-nt=<INT> max-size=<INT> min-gap=<INT> rank1=<INT> rank2=<INT> sa=<FILE>")
+ (options, args) = parser.parse_args()
+
+ filetype_opts = [options.da, options.sa, options.a, options.p]
+
+ if (len(filter(lambda x: x, filetype_opts))) > 1 or len(args) != 2:
+ parser.print_help()
+ sys.exit(1)
+
+ (infilename, outfilename) = args
+ if options.bin:
+ bin = " binary"
+ else:
+ bin = ""
+
+ start_time = monitor.cpu()
+ if options.precomp_args:
+ if options.bin:
+ obj = precomputation.Precomputation(infilename, from_binary=True)
+ else:
+ keys = set(["max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa"])
+ precomp_opts = {}
+ sys.stderr.write("Precomputing statistics for list %s\n" % infilename)
+ for pair in options.precomp_args:
+ (key, val) = pair.split("=")
+ if key in keys:
+ keys.remove(key)
+ if key != "sa":
+ val = int(val)
+ precomp_opts[key] = val
+ else:
+ sys.stderr.write("Unknown keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2)\n" % key)
+ return 1
+ sa = csuf.SuffixArray(precomp_opts["sa"], True)
+ obj = precomputation.Precomputation(infilename, sa,
+ precompute_rank=precomp_opts["rank1"],
+ precompute_secondary_rank=precomp_opts["rank2"],
+ max_length=precomp_opts["max-len"],
+ max_nonterminals=precomp_opts["max-nt"],
+ train_max_initial_size=precomp_opts["max-size"],
+ train_min_gap_size=precomp_opts["min-gap"])
+ elif options.sa:
+ sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin))
+ obj = csuf.SuffixArray(infilename, options.bin)
+ elif options.a:
+ sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, bin))
+ obj = calignment.Alignment(infilename, options.bin)
+ elif options.p:
+ sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin))
+ obj = parse.ParseArray(infilename, options.bin)
+ elif options.l:
+ sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin))
+ obj = clex.CLex(infilename, options.bin)
+ elif options.lex_args:
+ ffile = options.lex_args[0]
+ efile = options.lex_args[1]
+ sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile))
+ fsarray = csuf.SuffixArray(ffile, True)
+ earray = cdat.DataArray(efile, True)
+ aarray = calignment.Alignment(infilename, True)
+ obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray)
+ else:
+ sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin))
+ obj = cdat.DataArray(infilename, options.bin)
+
+ sys.stderr.write(" Total time for read: %f\n" % (monitor.cpu() - start_time))
+ start_time = monitor.cpu()
+ if options.text:
+ sys.stderr.write("Writing text file %s...\n" % outfilename)
+ obj.write_text(outfilename)
+ elif options.enhanced:
+ sys.stderr.write("Writing enhanced text file %s...\n" % outfilename)
+ obj.write_enhanced(outfilename)
+ else:
+ sys.stderr.write("Writing binary file %s...\n" % outfilename)
+ obj.write_binary(outfilename)
+ sys.stderr.write("Finished.\n")
+ sys.stderr.write(" Total time for write: %f\n" % (monitor.cpu() - start_time))
+
+ mem_use = float(monitor.memory())
+ metric = "B"
+ if mem_use / 1000 > 1:
+ mem_use /= 1000
+ metric = "KB"
+ if mem_use / 1000 > 1:
+ mem_use /= 1000
+ metric = "MB"
+ if mem_use / 1000 > 1:
+ mem_use /= 1000
+ metric = "GB"
+ sys.stderr.write(" Memory usage: %.1f%s\n" % (mem_use, metric))
+
+
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv))
diff --git a/sa-extract/context_model.py b/sa-extract/context_model.py
new file mode 100644
index 00000000..8cb6c174
--- /dev/null
+++ b/sa-extract/context_model.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python
+import sys
+import model
+import sym
+import log
+import math
+
+class ContextModel(model.Model):
+ '''A ContextModel is one that is computed using information
+ from the Context object'''
+
+ def __init__(self, context_manager, default=0.0):
+ model.Model.__init__(self)
+ self.wordless = 0
+ self.initial = None
+ self.default = default
+ self.context_manager = context_manager
+ self.id = self.context_manager.add_model(self)
+
+ '''The next feature is true if the model depends in
+ some way on the entire input sentence; that is, if
+ it cannot be scored when created, but must be scored
+ no earlier than during the input method (note that
+ this is less strict than stateful)'''
+ self.contextual = True
+ ''' It may seem somewhat counterintuitive that a
+ ContextModel can be non-contextual, but a good
+ example is the rule probabilites; although these
+ are computed using the Context object, they don't
+ really depend in any way on context'''
+
+
+ '''inherited from model.Model, called once for each input sentence'''
+ def input(self, fwords, meta):
+ # all ContextModels must make this call
+ self.context_manager.input(self, fwords, meta)
+
+
+ '''This function will be called via the input method
+ only for contextual models'''
+ def compute_contextual_score(self, r):
+ return 0.0
+
+ '''This function is only called on rule creation for
+ contextless models'''
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ return 0.0
+
+ '''Stateless models should not need to
+ override this function, unless they define
+ something for model.TO_GOAL'''
+ def transition (self, r, antstates, i, j, j1=None):
+ return (None, 0.0)
+
+ def estimate(self, r):
+ return r.getscore("context", self.id)
+
+ def transition(self, r, antstates, i, j, j1=None):
+ return (None, r.getscore("context", self.id))
+
+ def finaltransition(self, state):
+ return 0.0
+
+ def rescore(self, ewords, score):
+ return score
+
+
+
+'''p(e|f)'''
+class EgivenF(ContextModel):
+
+ def __init__(self, context_manager, default=0.0):
+ ContextModel.__init__(self, context_manager)
+ self.contextual = False
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ prob = float(paircount)/float(fcount)
+ return -math.log10(prob)
+
+class CountEF(ContextModel):
+
+ def __init__(self, context_manager, default=0.0):
+ ContextModel.__init__(self, context_manager)
+ self.contextual = False
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ return math.log10(1.0 + float(paircount))
+
+class SampleCountF(ContextModel):
+
+ def __init__(self, context_manager, default=0.0):
+ ContextModel.__init__(self, context_manager)
+ self.contextual = False
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ return math.log10(1.0 + float(fsample_count))
+
+
+
+class EgivenFCoherent(ContextModel):
+
+ def __init__(self, context_manager, default=0.0):
+ ContextModel.__init__(self, context_manager)
+ self.contextual = False
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ prob = float(paircount)/float(fsample_count)
+ #print "paircount=",paircount," , fsample_count=",fsample_count,", prob=",prob
+ if (prob == 0.0): return 99.0
+ return -math.log10(prob)
+
+
+
+class CoherenceProb(ContextModel):
+
+ def __init__(self, context_manager, default=0.0):
+ ContextModel.__init__(self, context_manager)
+ self.contextual = False
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ prob = float(fcount)/float(fsample_count)
+ return -math.log10(prob)
+
+
+
+class MaxLexEgivenF(ContextModel):
+
+ def __init__(self, context_manager, ttable, col=0):
+ ContextModel.__init__(self, context_manager)
+ self.ttable = ttable
+ self.col = col
+ self.wordless = 0
+ self.initial = None
+ self.contextual = False
+
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ totalscore = 1.0
+ fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase))
+ fwords.append("NULL")
+ ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase))
+ for e in ewords:
+ maxScore = 0.0
+ for f in fwords:
+ score = self.ttable.get_score(f, e, self.col)
+ #print "score(MaxLexEgivenF) = ",score
+ if score > maxScore:
+ maxScore = score
+ totalscore *= maxScore
+ if totalscore == 0.0:
+ return 999
+ else:
+ return -math.log10(totalscore)
+
+
+class MaxLexFgivenE(ContextModel):
+
+ def __init__(self, context_manager, ttable, col=1):
+ ContextModel.__init__(self, context_manager)
+ self.ttable = ttable
+ self.col = col
+ self.wordless = 0
+ self.initial = None
+ self.contextual = False
+
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ totalscore = 1.0
+ fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase))
+ ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase))
+ ewords.append("NULL")
+ for f in fwords:
+ maxScore = 0.0
+ for e in ewords:
+ score = self.ttable.get_score(f, e, self.col)
+ #print "score(MaxLexFgivenE) = ",score
+ if score > maxScore:
+ maxScore = score
+ totalscore *= maxScore
+ if totalscore == 0.0:
+ return 999
+ else:
+ return -math.log10(totalscore)
+
+
+class IsSingletonF(ContextModel):
+
+ def __init__(self, context_manager, default=0.0):
+ ContextModel.__init__(self, context_manager)
+ self.contextual = False
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ return (fcount==1)
+
+
+class IsSingletonFE(ContextModel):
+
+ def __init__(self, context_manager, default=0.0):
+ ContextModel.__init__(self, context_manager)
+ self.contextual = False
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ return (paircount==1)
+
+class IsNotSingletonF(ContextModel):
+
+ def __init__(self, context_manager, default=0.0):
+ ContextModel.__init__(self, context_manager)
+ self.contextual = False
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ return (fcount>1)
+
+
+class IsNotSingletonFE(ContextModel):
+
+ def __init__(self, context_manager, default=0.0):
+ ContextModel.__init__(self, context_manager)
+ self.contextual = False
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ return (paircount>1)
+
+
+class IsFEGreaterThanZero(ContextModel):
+
+ def __init__(self, context_manager, default=0.0):
+ ContextModel.__init__(self, context_manager)
+ self.contextual = False
+
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count):
+ return (paircount > 0.01)
+
+
diff --git a/sa-extract/cstrmap.pxd b/sa-extract/cstrmap.pxd
new file mode 100644
index 00000000..53becbc6
--- /dev/null
+++ b/sa-extract/cstrmap.pxd
@@ -0,0 +1,12 @@
+cdef extern from "strmap.h":
+ ctypedef struct StrMap
+ StrMap* stringmap_new()
+ void stringmap_delete(StrMap *vocab)
+ int stringmap_index(StrMap *vocab, char *s)
+ char* stringmap_word(StrMap *vocab, int i)
+
+cdef class StringMap:
+ cdef StrMap *vocab
+ cdef char *word(self, int i)
+ cdef int index(self, char *s)
+
diff --git a/sa-extract/cstrmap.pyx b/sa-extract/cstrmap.pyx
new file mode 100644
index 00000000..d3883ea5
--- /dev/null
+++ b/sa-extract/cstrmap.pyx
@@ -0,0 +1,14 @@
+
+cdef class StringMap:
+ def __cinit__(self):
+ self.vocab = stringmap_new()
+
+ def __dealloc__(self):
+ stringmap_delete(self.vocab)
+
+ cdef char *word(self, int i):
+ return stringmap_word(self.vocab, i)
+
+ cdef int index(self, char *s):
+ return stringmap_index(self.vocab, s)
+
diff --git a/sa-extract/csuf.pxd b/sa-extract/csuf.pxd
new file mode 100644
index 00000000..f44167dd
--- /dev/null
+++ b/sa-extract/csuf.pxd
@@ -0,0 +1,11 @@
+cimport cdat
+cimport cintlist
+
+cdef class SuffixArray:
+ cdef cdat.DataArray darray
+ cdef cintlist.CIntList sa
+ cdef cintlist.CIntList ha
+ cdef __lookup_helper(self, int word_id, int offset, int low, int high)
+ cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint)
+ cdef __search_low(self, int word_id, int offset, int low, int high)
+ cdef __search_high(self, word_id, offset, low, high)
diff --git a/sa-extract/csuf.pyx b/sa-extract/csuf.pyx
new file mode 100644
index 00000000..64c44788
--- /dev/null
+++ b/sa-extract/csuf.pyx
@@ -0,0 +1,321 @@
+# csuf.pyx
+# Defines suffix arrays that can be directly written to/read from disk in binary format
+# Adam Lopez <alopez@cs.umd.edu>
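+#
+# Illustrative usage (file names are hypothetical):
+#   sa = SuffixArray("corpus.fr")           # construct from a text corpus
+#   sa.write_binary("corpus.fr.sa.bin")     # serialize for later reuse
+#   sa = SuffixArray("corpus.fr.sa.bin", from_binary=True)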
+
+import sys
+import log
+import cdat
+import cintlist
+import monitor
+
+from libc.stdio cimport FILE, fclose, fopen
+
+cdef class SuffixArray:
+
+ def __init__(self, filename, from_binary=False):
+ self.darray = cdat.DataArray()
+ self.sa = cintlist.CIntList()
+ self.ha = cintlist.CIntList()
+ if from_binary:
+ self.read_binary(filename)
+ else:
+ self.read_text(filename)
+
+
+ def __getitem__(self, i):
+ return self.sa.arr[i]
+
+
+ def getSentId(self, i):
+ return self.darray.getSentId(i)
+
+
+ def getSent(self, i):
+ return self.darray.getSent(i)
+
+
+ def getSentPos(self, loc):
+ return self.darray.getSentPos(loc)
+
+ def read_text(self, filename):
+ '''Constructs suffix array using the algorithm
+ of Larsson & Sadakane (1999)'''
+ cdef int V, N, i, j, h, a_i, n, current_run, skip
+ cdef cintlist.CIntList isa, word_count
+
+ self.darray = cdat.DataArray(filename, from_binary=False, use_sent_id=True)
+ N = len(self.darray)
+ V = len(self.darray.id2word)
+
+ self.sa = cintlist.CIntList(initial_len=N)
+ self.ha = cintlist.CIntList(initial_len=V+1)
+
+ isa = cintlist.CIntList(initial_len=N)
+ word_count = cintlist.CIntList(initial_len=V+1)
+
+ '''Step 1: bucket sort data'''
+ sort_start_time = monitor.cpu()
+ start_time = sort_start_time
+ for i from 0 <= i < N:
+ a_i = self.darray.data.arr[i]
+ word_count.arr[a_i] = word_count.arr[a_i] + 1
+
+ n = 0
+ for i from 0 <= i < V+1:
+ self.ha.arr[i] = n
+ n = n + word_count.arr[i]
+ word_count.arr[i] = 0
+
+ for i from 0 <= i < N:
+ a_i = self.darray.data.arr[i]
+ self.sa.arr[self.ha.arr[a_i] + word_count.arr[a_i]] = i
+ isa.arr[i] = self.ha.arr[a_i + 1] - 1 # bucket pointer is last index in bucket
+ word_count.arr[a_i] = word_count.arr[a_i] + 1
+
+ '''Determine size of initial runs'''
+ current_run = 0
+ for i from 0 <= i < V+1:
+ if i < V and self.ha.arr[i+1] - self.ha.arr[i] == 1:
+ current_run = current_run + 1
+ else:
+ if current_run > 0:
+ self.sa.arr[self.ha.arr[i] - current_run] = -current_run
+ current_run = 0
+
+ sys.stderr.write(" Bucket sort took %f seconds\n" % (monitor.cpu() - sort_start_time))
+
+ '''Step 2: prefix-doubling sort'''
+ h = 1
+ while self.sa.arr[0] != -N:
+ sort_start_time = monitor.cpu()
+ sys.stderr.write(" Refining, sort depth = %d\n" % (h,))
+ i = 0
+ skip = 0
+ while i < N:
+ if self.sa.arr[i] < 0:
+ #sys.stderr.write("Skip from %d to %d\n" % (i, i-self.sa.arr[i]-1))
+ skip = skip + self.sa.arr[i]
+ i = i - self.sa.arr[i]
+ else:
+ if skip < 0:
+ self.sa.arr[i+skip] = skip
+ skip = 0
+ j = isa.arr[self.sa.arr[i]]
+ #sys.stderr.write("Process from %d to %d (%d, %d, %d)\n" % (i, j, self.sa.arr[i], self.darray.data.arr[self.sa.arr[i]], isa.arr[self.sa.arr[i]]))
+ self.q3sort(i, j, h, isa)
+ i = j+1
+ if skip < 0:
+ self.sa.arr[i+skip] = skip
+ h = h * 2
+ sys.stderr.write(" Refinement took %f seconds\n" % (monitor.cpu() - sort_start_time))
+
+ '''Step 3: read off suffix array from inverse suffix array'''
+ sys.stderr.write(" Finalizing sort...\n")
+ for i from 0 <= i < N:
+ j = isa.arr[i]
+ self.sa.arr[j] = i
+ sys.stderr.write("Suffix array construction took %f seconds\n" % (monitor.cpu() - start_time))
+
+ def q3sort(self, int i, int j, int h, cintlist.CIntList isa, pad=""):
+ '''This is a ternary quicksort. It divides the array into
+ three partitions: items less than the pivot, items equal
+ to pivot, and items greater than pivot. The first and last
+ of these partitions are then recursively sorted'''
+ cdef int k, midpoint, pval, phead, ptail, tmp
+
+ if j-i < -1:
+ raise Exception("Unexpected condition found in q3sort: sort from %d to %d" % (i,j))
+ if j-i == -1: # recursive base case -- empty interval
+ return
+ if (j-i == 0): # recursive base case -- singleton interval
+ isa.arr[self.sa.arr[i]] = i
+ self.sa.arr[i] = -1
+ return
+
+ # NOTE: choosing the first item as a pivot value resulted in
+ # stack overflow for some very large buckets. I think there
+ # is a natural bias towards order due to the way the word ids
+ # are assigned; thus this resulted in the range to the left of
+ # the pivot being nearly empty. Therefore, choose the middle item.
+ # If the method of assigning word ids is changed, this method
+ # may need to be reconsidered as well.
+ midpoint = (i+j)/2
+ pval = isa.arr[self.sa.arr[midpoint] + h]
+ if i != midpoint:
+ tmp = self.sa.arr[midpoint]
+ self.sa.arr[midpoint] = self.sa.arr[i]
+ self.sa.arr[i] = tmp
+ phead = i
+ ptail = i
+
+ # find the three partitions. phead marks the first element
+ # of the middle partition, and ptail marks the last element
+ for k from i+1 <= k < j+1:
+ if isa.arr[self.sa.arr[k] + h] < pval:
+ if k > ptail+1:
+ tmp = self.sa.arr[phead]
+ self.sa.arr[phead] = self.sa.arr[k]
+ self.sa.arr[k] = self.sa.arr[ptail+1]
+ self.sa.arr[ptail+1] = tmp
+ else: # k == ptail+1
+ tmp = self.sa.arr[phead]
+ self.sa.arr[phead] = self.sa.arr[k]
+ self.sa.arr[k] = tmp
+ phead = phead + 1
+ ptail = ptail + 1
+ else:
+ if isa.arr[self.sa.arr[k] + h] == pval:
+ if k > ptail+1:
+ tmp = self.sa.arr[ptail+1]
+ self.sa.arr[ptail+1] = self.sa.arr[k]
+ self.sa.arr[k] = tmp
+ ptail = ptail + 1
+
+ # recursively sort smaller suffixes
+ self.q3sort(i, phead-1, h, isa, pad+" ")
+
+ # update suffixes with pivot value
+ # corresponds to update_group function in Larsson & Sadakane
+ for k from phead <= k < ptail+1:
+ isa.arr[self.sa.arr[k]] = ptail
+ if phead == ptail:
+ self.sa.arr[phead] = -1
+
+ # recursively sort larger suffixes
+ self.q3sort(ptail+1, j, h, isa, pad+" ")
+
+
+ def write_text(self, filename):
+ self.darray.write_text(filename)
+
+
+ def read_binary(self, filename):
+ cdef FILE *f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "r")
+ self.darray.read_handle(f)
+ self.sa.read_handle(f)
+ self.ha.read_handle(f)
+ fclose(f)
+
+
+ def write_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "w")
+ self.darray.write_handle(f)
+ self.sa.write_handle(f)
+ self.ha.write_handle(f)
+ fclose(f)
+
+
+ def write_enhanced(self, filename):
+ f = open(filename, "w")
+ self.darray.write_enhanced_handle(f)
+ for a_i in self.sa:
+ f.write("%d " % a_i)
+ f.write("\n")
+ for w_i in self.ha:
+ f.write("%d " % w_i)
+ f.write("\n")
+ f.close()
+
+
+ cdef __search_high(self, int word_id, int offset, int low, int high):
+ cdef int midpoint
+
+ if low >= high:
+ return high
+ midpoint = (high + low) / 2
+ if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id:
+ return self.__search_high(word_id, offset, midpoint+1, high)
+ else:
+ return self.__search_high(word_id, offset, low, midpoint)
+
+
+ cdef __search_low(self, int word_id, int offset, int low, int high):
+ cdef int midpoint
+
+ if low >= high:
+ return high
+ midpoint = (high + low) / 2
+ if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id:
+ return self.__search_low(word_id, offset, low, midpoint)
+ else:
+ return self.__search_low(word_id, offset, midpoint+1, high)
+
+
+ cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint):
+ return (self.__search_low(word_id, offset, low, midpoint),
+ self.__search_high(word_id, offset, midpoint, high))
+
+
+ cdef __lookup_helper(self, int word_id, int offset, int low, int high):
+ cdef int midpoint
+
+ if offset == 0:
+ return (self.ha.arr[word_id], self.ha.arr[word_id+1])
+ if low >= high:
+ return None
+
+ midpoint = (high + low) / 2
+ if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id:
+ return self.__get_range(word_id, offset, low, high, midpoint)
+ if self.darray.data.arr[self.sa.arr[midpoint] + offset] > word_id:
+ return self.__lookup_helper(word_id, offset, low, midpoint)
+ else:
+ return self.__lookup_helper(word_id, offset, midpoint+1, high)
+
+
+ def lookup(self, word, offset, int low, int high):
+ if low == -1:
+ low = 0
+ if high == -1:
+ high = len(self.sa)
+ if word in self.darray.word2id:
+ word_id = self.darray.word2id[word]
+ return self.__lookup_helper(word_id, offset, low, high)
+ else:
+ return None
+
+
+
+ def print_sa(self, isa):
+ '''Slow; use only in case of emergency'''
+ cdef int i, j, k, N
+ cdef cintlist.CIntList tmp_sa
+
+ N = len(self.sa)
+ for i from 0 <= i < N:
+ sys.stderr.write("%2d " % i)
+ sys.stderr.write("\n")
+ for i from 0 <= i < N:
+ sys.stderr.write("%2d " % self.darray.data.arr[i])
+ sys.stderr.write("\n")
+ for i from 0 <= i < N:
+ sys.stderr.write("%2d " % isa.arr[i])
+ sys.stderr.write("\n\n\n")
+
+ # Recover partially sorted array
+ tmp_sa = cintlist.CIntList(initial_len=N)
+ for i from 0 <= i < N:
+ j = isa.arr[i]
+ tmp_sa.arr[j] = i
+ for i from 0 <= i < N:
+ if self.sa.arr[i] > 0:
+ tmp_sa.arr[i] = self.sa.arr[i]
+
+ for i from 0 <= i < N:
+ j = tmp_sa.arr[i]
+ sys.stderr.write("%2d %2d | " % (i, self.sa.arr[i]))
+ for k from j <= k < N:
+ sys.stderr.write("%2d " % self.darray.data.arr[k])
+ sys.stderr.write("\n")
+ sys.stderr.write("\n")
+
+
+
+
+
diff --git a/sa-extract/cveb.pxd b/sa-extract/cveb.pxd
new file mode 100644
index 00000000..8967f8e3
--- /dev/null
+++ b/sa-extract/cveb.pxd
@@ -0,0 +1,15 @@
+cdef struct _VEB:
+ int top_universe_size
+ int num_bottom_bits
+ int max_val
+ int min_val
+ int size
+ void* top
+ void** bottom
+
+
+cdef class VEB:
+ cdef _VEB* veb
+ cdef int _findsucc(self, int i)
+ cdef int _insert(self, int i)
+ cdef int _first(self)
diff --git a/sa-extract/cveb.pyx b/sa-extract/cveb.pyx
new file mode 100644
index 00000000..ca87becc
--- /dev/null
+++ b/sa-extract/cveb.pyx
@@ -0,0 +1,390 @@
+#!/usr/bin/env python2.4
+'''This module implements a partial stratified tree (van Emde Boas, 1977).
+Only insert, findsucc, __iter__, and __contains__ are implemented.
+Delete is currently not supported.
+There is very little error-checking in this code -- it is designed
+to be used in the limited situation described in Lopez (EMNLP-CoNLL 2007),
+which doesn't cover all of the possible ways that you could misuse it
+(e.g. trying to insert a key larger than the universe size).
+Other notes -- this code is really rather ugly C code masquerading as
+Pyrex/Python. Virtual function calls are bypassed by hand in several
+places for the sake of efficiency, and other Python niceties are
+removed for the same reason.'''
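+# Illustrative usage from Python:
+#   v = VEB(1000)    # universe of keys 0..999
+#   v.insert(5); v.insert(42)
+#   42 in v          # True
+#   v.findsucc(5)    # 42 -- the smallest stored key strictly greater than 5
+#   list(v)          # [5, 42] -- iteration yields keys in increasing order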
+
+from libc.stdlib cimport malloc, free
+from libc.math cimport log, ceil
+from libc.string cimport memset
+
+cdef int MIN_BOTTOM_SIZE
+cdef int MIN_BOTTOM_BITS
+
+MIN_BOTTOM_SIZE = 32
+MIN_BOTTOM_BITS = 5
+
+cdef int lower_mask[32]
+cdef int i, mask
+
+for i from 0 <= i < MIN_BOTTOM_SIZE:
+ mask = (mask << 1) + 1
+ lower_mask[i] = mask
+
+
+cdef struct _BitSet:
+ long bitset
+ int min_val
+ int max_val
+ int size
+
+
+cdef _BitSet* new_BitSet():
+ cdef _BitSet* b
+
+ b = <_BitSet*> malloc(sizeof(_BitSet))
+ b.bitset = 0
+ b.min_val = -1
+ b.max_val = -1
+ b.size = 0
+ return b
+
+
+cdef int bitset_findsucc(_BitSet* b, int i):
+ cdef int bitset, mask
+ cdef int low, high, mid
+
+ if b.max_val == -1 or i >= b.max_val:
+ return -1
+ if i < b.min_val:
+ return b.min_val
+
+ bitset = b.bitset & ~lower_mask[i]
+ low = i+1
+ high = b.max_val+1
+ while low < high-1:
+ mid = (high + low)/2
+ mask = ~(lower_mask[high-1] ^ lower_mask[mid-1])
+ if bitset & mask == 0:
+ low = mid
+ else:
+ bitset = bitset & mask
+ high = mid
+ return low
+
+
+cdef int bitset_insert(_BitSet* b, int i):
+ cdef int val
+
+ val = 1 << i
+ if b.bitset & val == 0:
+ b.bitset = b.bitset | val
+ if b.size == 0:
+ b.min_val = i
+ b.max_val = i
+ else:
+ if i < b.min_val:
+ b.min_val = i
+ if i > b.max_val:
+ b.max_val = i
+ b.size = b.size + 1
+ return 1
+ return 0
+
+
+cdef int bitset_contains(_BitSet* b, int i):
+ cdef int val
+
+ val = 1 << i
+ if b.bitset & val == 0:
+ return 0
+ else:
+ return 1
+
+
+cdef class BitSetIterator:
+ cdef _BitSet* b
+ cdef int next_val
+
+ def __next__(self):
+ cdef int ret_val
+
+ if self.next_val == -1:
+ raise StopIteration()
+ ret_val = self.next_val
+ self.next_val = bitset_findsucc(self.b, ret_val)
+ return ret_val
+
+
+
+# This is a Python wrapper class to give access to the
+# (entirely C-implemented) _BitSet struct.
+# Very slow; use only for debugging
+cdef class BitSet:
+
+ cdef _BitSet* b
+
+ def __cinit__(self):
+ self.b = new_BitSet()
+
+ def __dealloc__(self):
+ free(self.b)
+
+ def __iter__(self):
+ cdef BitSetIterator it
+ it = BitSetIterator()
+ it.b = self.b
+ it.next_val = self.b.min_val
+ return it
+
+ def insert(self, i):
+ return bitset_insert(self.b, i)
+
+ def findsucc(self, i):
+ return bitset_findsucc(self.b, i)
+
+ def __str__(self):
+ return dec2bin(self.b.bitset)+" ("+str(self.b.size)+","+str(self.b.min_val)+","+str(self.b.max_val)+")"
+
+ def min(self):
+ return self.b.min_val
+
+ def max(self):
+ return self.b.max_val
+
+ def __len__(self):
+ return self.b.size
+
+ def __contains__(self, i):
+ return bool(bitset_contains(self.b, i))
+
+
+def dec2bin(i):
+ cdef d
+ result = ""
+ for d from 0 <= d < MIN_BOTTOM_SIZE:
+ if i & lower_mask[0] == 0:
+ result = "0"+result
+ else:
+ result = "1"+result
+ i = i >> 1
+ return result
+
+
+cdef _VEB* new_VEB(int n):
+ cdef _VEB* veb
+ cdef int num_bits, num_top_bits, i
+
+ veb = <_VEB*> malloc(sizeof(_VEB))
+
+ num_bits = int(ceil(log(n) / log(2)))
+ veb.num_bottom_bits = num_bits/2
+ if veb.num_bottom_bits < MIN_BOTTOM_BITS:
+ veb.num_bottom_bits = MIN_BOTTOM_BITS
+ veb.top_universe_size = (n >> veb.num_bottom_bits) + 1
+
+ veb.bottom = <void**> malloc(veb.top_universe_size * sizeof(void*))
+ memset(veb.bottom, 0, veb.top_universe_size * sizeof(void*))
+
+ if veb.top_universe_size > MIN_BOTTOM_SIZE:
+ veb.top = new_VEB(veb.top_universe_size)
+ else:
+ veb.top = new_BitSet()
+
+ veb.max_val = -1
+ veb.min_val = -1
+ veb.size = 0
+ return veb
+
+
+cdef int VEB_insert(_VEB* veb, int i):
+ cdef _VEB* subv
+ cdef _BitSet* subb
+ cdef int a, b, tmp
+
+ if veb.size == 0:
+ veb.min_val = i
+ veb.max_val = i
+ elif i == veb.min_val or i == veb.max_val:
+ return 0
+ else:
+ if i < veb.min_val:
+ tmp = i
+ i = veb.min_val
+ veb.min_val = tmp
+ a = i >> veb.num_bottom_bits
+ b = i & lower_mask[veb.num_bottom_bits-1]
+ if veb.bottom[a] == NULL:
+ if veb.top_universe_size > MIN_BOTTOM_SIZE:
+ subv = <_VEB*> veb.top
+ VEB_insert(subv, a)
+ else:
+ subb = <_BitSet*> veb.top
+ bitset_insert(subb, a)
+ if veb.num_bottom_bits > MIN_BOTTOM_BITS:
+ veb.bottom[a] = new_VEB(1 << veb.num_bottom_bits)
+ else:
+ veb.bottom[a] = new_BitSet()
+ if veb.num_bottom_bits > MIN_BOTTOM_BITS:
+ subv = <_VEB*> veb.bottom[a]
+ if VEB_insert(subv, b) == 0:
+ return 0
+ else:
+ subb = <_BitSet*> veb.bottom[a]
+ if bitset_insert(subb, b) == 0:
+ return 0
+
+ if i > veb.max_val:
+ veb.max_val = i
+ veb.size = veb.size + 1
+ return 1
+
+
+cdef del_VEB(_VEB* veb):
+ cdef int i
+
+ if veb.top_universe_size > MIN_BOTTOM_SIZE:
+ i = (<_VEB*> veb.top).min_val
+ else:
+ i = (<_BitSet*> veb.top).min_val
+
+ while i != -1:
+ if veb.num_bottom_bits > MIN_BOTTOM_BITS:
+ del_VEB(<_VEB*> veb.bottom[i])
+ else:
+ free(<_BitSet*> veb.bottom[i])
+
+ if veb.top_universe_size > MIN_BOTTOM_SIZE:
+ i = VEB_findsucc(<_VEB*> veb.top, i)
+ else:
+ i = bitset_findsucc(<_BitSet*> veb.top, i)
+
+ if veb.top_universe_size > MIN_BOTTOM_SIZE:
+ del_VEB(<_VEB*> veb.top)
+ else:
+ free(<_BitSet*> veb.top)
+ free(veb.bottom)
+ free(veb)
+
+
+cdef int VEB_findsucc(_VEB* veb, int i):
+ cdef _VEB* subv
+ cdef _BitSet* subb
+ cdef int a, b, j, c, found
+
+ if veb.max_val == -1 or i>=veb.max_val:
+ return -1
+ if i < veb.min_val:
+ return veb.min_val
+
+ a = i >> veb.num_bottom_bits
+ b = i & lower_mask[veb.num_bottom_bits-1]
+ found = 0
+ if veb.bottom[a] != NULL:
+ if veb.num_bottom_bits > MIN_BOTTOM_BITS:
+ subv = <_VEB*> veb.bottom[a]
+ if subv.max_val > b:
+ j = (a << veb.num_bottom_bits) + VEB_findsucc(subv, b)
+ found = 1
+ else:
+ subb = <_BitSet*> veb.bottom[a]
+ if subb.max_val > b:
+ j = (a << veb.num_bottom_bits) + bitset_findsucc(subb, b)
+ found = 1
+ if found==0:
+ if veb.top_universe_size > MIN_BOTTOM_SIZE:
+ subv = <_VEB*> veb.top
+ c = VEB_findsucc(subv, a)
+ else:
+ subb = <_BitSet*> veb.top
+ c = bitset_findsucc(subb, a)
+ if veb.num_bottom_bits > MIN_BOTTOM_BITS:
+ subv = <_VEB*> veb.bottom[c]
+ j = (c << veb.num_bottom_bits) + subv.min_val
+ else:
+ subb = <_BitSet*> veb.bottom[c]
+ j = (c << veb.num_bottom_bits) + subb.min_val
+ return j
+
+
+cdef int VEB_contains(_VEB* veb, int i):
+ cdef _VEB* subv
+ cdef _BitSet* subb
+ cdef int a, b
+
+ if veb.size == 0 or i < veb.min_val or i > veb.max_val:
+ return 0
+
+ if veb.min_val == i:
+ return 1
+ else:
+ if veb.size == 1:
+ return 0
+
+ a = i >> veb.num_bottom_bits
+ b = i & lower_mask[veb.num_bottom_bits-1]
+ if veb.bottom[a] == NULL:
+ return 0
+ else:
+ if veb.num_bottom_bits > MIN_BOTTOM_BITS:
+ subv = <_VEB*> veb.bottom[a]
+ return VEB_contains(subv, b)
+ else:
+ subb = <_BitSet*> veb.bottom[a]
+ return bitset_contains(subb, b)
+
+
+cdef class VEBIterator:
+ cdef _VEB* v
+ cdef int next_val
+
+ def __next__(self):
+ cdef int ret_val
+
+ if self.next_val == -1:
+ raise StopIteration()
+ ret_val = self.next_val
+ self.next_val = VEB_findsucc(self.v, ret_val)
+ return ret_val
+
+
+cdef class VEB:
+
+ def __init__(self, size):
+ pass
+
+ def __cinit__(self, int size):
+ self.veb = new_VEB(size)
+
+ def __dealloc__(self):
+ del_VEB(self.veb)
+
+ def __iter__(self):
+ cdef VEBIterator it
+ it = VEBIterator()
+ it.v = self.veb
+ it.next_val = self.veb.min_val
+ return it
+
+ def insert(self, i):
+ return VEB_insert(self.veb, i)
+
+ cdef int _insert(self, int i):
+ return VEB_insert(self.veb, i)
+
+ def findsucc(self, i):
+ return VEB_findsucc(self.veb, i)
+
+ cdef int _first(self):
+ return self.veb.min_val
+
+ cdef int _findsucc(self, int i):
+ return VEB_findsucc(self.veb, i)
+
+ def __len__(self):
+ return self.veb.size
+
+ def __contains__(self, i):
+ return VEB_contains(self.veb, i)
+
+
+
+
+
diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl
new file mode 100755
index 00000000..02fd7445
--- /dev/null
+++ b/sa-extract/escape-testset.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/perl -w
+
+use utf8;
+use strict;
+
+binmode(STDIN,":utf8");
+binmode(STDOUT,":utf8");
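+# Each input line is wrapped in an SGML <seg> element and the characters
+# &, < and > are escaped, e.g. "a < b" becomes: <seg id="0"> a &lt; b </seg>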
+
+my @fh = ();
+if (scalar @ARGV == 0) {
+ push @fh, \*STDIN;
+} else {
+ for my $file (@ARGV) {
+ my $f;
+ open $f, "<$file" or die "Can't read $file: $!\n";
+ binmode $f, ":utf8";
+ push @fh, $f;
+ }
+}
+
+my $id = -1;
+for my $f (@fh) {
+ while(<$f>) {
+ chomp;
+ die "Empty line in test set" if /^\s*$/;
+ die "Please remove <seg> tags from input:\n$_" if /^\s*<seg/i;
+ $id++;
+ s/&/\&amp;/g;
+ s/</\&lt;/g;
+ s/>/\&gt;/g;
+ print "<seg id=\"$id\"> $_ </seg>\n";
+ }
+}
+
+
diff --git a/sa-extract/example/README b/sa-extract/example/README
new file mode 100644
index 00000000..f6eac52b
--- /dev/null
+++ b/sa-extract/example/README
@@ -0,0 +1,8 @@
+Commands to compile a corpus and extract some grammars
+======================================================
+
+# compile
+../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini
+# extract
+cat test.de | ../escape-testset.pl | ../extractor.py -c extract.ini
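+# the grammars are written to per-sentence files named after grammar.out;
+# see the rule_file and per_sentence_grammar settings in extract.ini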
+
diff --git a/sa-extract/example/corpus.align.gz b/sa-extract/example/corpus.align.gz
new file mode 100644
index 00000000..741de7e4
--- /dev/null
+++ b/sa-extract/example/corpus.align.gz
Binary files differ
diff --git a/sa-extract/example/corpus.de.gz b/sa-extract/example/corpus.de.gz
new file mode 100644
index 00000000..0d66470a
--- /dev/null
+++ b/sa-extract/example/corpus.de.gz
Binary files differ
diff --git a/sa-extract/example/corpus.en.gz b/sa-extract/example/corpus.en.gz
new file mode 100644
index 00000000..28cb5c58
--- /dev/null
+++ b/sa-extract/example/corpus.en.gz
Binary files differ
diff --git a/sa-extract/example/test.de b/sa-extract/example/test.de
new file mode 100644
index 00000000..8923329f
--- /dev/null
+++ b/sa-extract/example/test.de
@@ -0,0 +1,10 @@
+dies ist der richtige ansatz für diejenigen in chinas politischer führung , die aus der who den maximalen nutzen für die unterstützung der inneren reform ziehen wollen .
+taiwan hat sich auch vorgenommen , konstruktiv zu sein - wenn china mitspielt .
+die stadt staaten hongkong und singapur verfolgen eine klarsichtige who - politik und konzentrieren sich auf markt zugänge und starke regeln .
+malaysia und thailand sind auch recht aktiv innerhalb der who , mit verschiedenen positionen , die vom frei handel bis zum protektionismus reichen .
+indonesien und die philippinen sind schwächer , überwältigt von politischer zusammen hanglosigkeit und ganz in anspruch genommen von den anstrengungen , das schlimmste zu hause zu verhüten , so dass nur geringe kräfte übrig bleiben , mit der stets anschwellenden und immer komplizierteren agenda der who fertig zu werden .
+die who steht vor einer wichtigen entscheidung .
+sie muss dringend den handel progressiv liberalisieren .
+eine starke führung seitens der usa ist erforderlich , damit die who in diese richtung gebracht werden kann und man gleichzeitig vermeidet , die zukunft nach dem muster der eu zu gestalten ( regel wucherung ) oder nach dem muster der uno ( macht lose gespräch runde ) .
+dies geschieht sicher besser unter bush , mit einem klaren bekenntnis zum offenen markt und einer aktiveren außen politik , als es unter irgendeiner demokratischen alternative geschehen könnte .
+robert zoellick , präsident bushs handel beauftragter , braucht aber verbündete .
diff --git a/sa-extract/example/test.ref.en b/sa-extract/example/test.ref.en
new file mode 100644
index 00000000..e50edcac
--- /dev/null
+++ b/sa-extract/example/test.ref.en
@@ -0,0 +1,10 @@
+this is the right approach for those in china 's leadership who wish to extract maximum benefits from the wto to bolster domestic reform .
+taiwan is also set to play a constructive role -- if mainland china plays along .
+the city states , hong kong and singapore , have clear - sighted wto policies , focusing on market access and strong rules .
+malaysia and thailand are also fairly active in the wto , with a mix of free - market and protectionist positions .
+indonesia and the philippines are weaker , overwhelmed by policy incoherence and fire - fighting at home , and with insufficient capacity to deal with the wto 's burgeoning and increasingly complicated agenda .
+the wto is at a crossroads .
+it sorely needs to liberalize trade progressively .
+strong us leadership is required to push the wto in this direction while avoiding an eu - style future ( regulatory overload ) or a un - style future ( an irrelevant talking shop ) .
+this is more likely under a bush administration with better open - market credentials and a more assertive foreign policy than any democratic alternative .
+however , robert zoellick , president bush 's trade representative , needs allies .
diff --git a/sa-extract/extract.ini b/sa-extract/extract.ini
new file mode 100644
index 00000000..56913245
--- /dev/null
+++ b/sa-extract/extract.ini
@@ -0,0 +1,116 @@
+# This .ini file extracts grammars to a file using
+# the pattern matching infrastructure.
+#
+# Does not do any decoding.
+#
+# Variables can be set using sa-system.pl
+#
+# Usage: extractor.py -c <this ini file> [-x <grammar file>]
+#
+# If the -x option is used, grammar will be written to the
+# specified file, otherwise it is written to $PWD/grammar.out
+#
+# NOTE: all information about rules is cached, so use generous
+# memory limits (rules themselves are not cached.)
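+#
+# A typical pipeline (cf. example/README) is:
+#   cat test.de | ./escape-testset.pl | ./extractor.py -c extract.ini -x grammar.out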
+
+import os
+import manager
+import clex
+import context_model
+import rulefactory
+import calignment
+import sys
+
+out_grammar_file = "grammar.out"
+if opts.extra:
+ out_grammar_file = opts.extra
+
+# *** these variables written by sa-system.pl. Do not modify ***
+lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz"
+f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin"
+e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin"
+a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin"
+lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin"
+max_len = 5
+max_nt = 2
+max_size=10
+min_gap=1
+rank1 = 100
+rank2 = 10
+precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin"
+
+# check for path errors
+if not os.path.exists(f_sa_file):
+ raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file)
+if not os.path.exists(e_file):
+ raise Exception("Cannot find compiled target language array file %s" % e_file)
+if not os.path.exists(a_file):
+ raise Exception("Cannot find compiled alignment file %s" % a_file)
+if not os.path.exists(lex_file):
+ raise Exception("Cannot find compiled lexical weights file %s" % lex_file)
+if not os.path.exists(precompute_file):
+ log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file)
+ precompute_file = None
+
+### Output options
+mark_phrases = False # show derivation as SGML markup in output
+mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!)
+
+# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose
+log.level = 1
+log.file = sys.stderr
+
+# pattern-matching stuff
+class PhonyGrammar: # saves us the cost of keeping the rules around
+ def add(self, thing):
+ pass
+
+local_grammar = PhonyGrammar()
+xcat="X"
+
+cm = manager.ContextManager(
+ f_sa_file,
+ e_file,
+ sampler=rulefactory.Sampler(300), # lower=faster, higher=better; improvements level off above 200-300 range, -1 = don't sample, use all data (VERY SLOW!)
+ rulefactory=rulefactory.HieroCachingRuleFactory(
+ alignment=calignment.Alignment( # compiled alignment object (REQUIRED)
+ a_file,
+ from_binary=True
+ ),
+ category="["+xcat+"]", # name of generic nonterminal used by Hiero
+ grammar=local_grammar, # do not change for extraction
+ max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1
+ max_initial_size=15, # maximum span of a grammar rule in TEST DATA
+ max_length=max_len, # maximum number of symbols (both T and NT) allowed in a rule
+ max_nonterminals=max_nt, # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
+ max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1
+ max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size
+ min_gap_size=1, # minimum span of a nonterminal in the RHS of a rule in TEST DATA
+ precompute_file=precompute_file, # filename of file containing precomputed collocations
+ precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20).
+ precompute_rank=rank1, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300)
+ require_aligned_terminal=True, # require extracted rules to have at least one aligned word
+ require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word
+ per_sentence_grammar=True, # generate a complete grammar for each input sentence
+ rule_file=out_grammar_file, # grammar is written to this file (sentence id is added to file name for per sentence grammars)
+ train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA
+ train_min_gap_size=min_gap, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
+ tight_phrases=True, # True if phrases should be tight, False otherwise (False seems to give better results but is slower)
+ ),
+ from_binary=True
+ )
+
+# lexical weighting tables
+tt = clex.CLex(lex_file, from_binary=True)
+
+# Only include features that depend on rule identity here
+add_model(context_model.EgivenFCoherent(cm), 0.125)
+add_model(context_model.SampleCountF(cm), 0.125)
+add_model(context_model.CountEF(cm), 0.125)
+add_model(context_model.MaxLexFgivenE(cm, tt), 0.125)
+add_model(context_model.MaxLexEgivenF(cm, tt), 0.125)
+add_model(context_model.IsSingletonF(cm), 0.125)
+add_model(context_model.IsSingletonFE(cm), 0.125)
+
+# grammars, search parameters and all that other stuff are irrelevant
+
diff --git a/sa-extract/extractor.py b/sa-extract/extractor.py
new file mode 100755
index 00000000..9d66ebf0
--- /dev/null
+++ b/sa-extract/extractor.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+# vim:expandtab:shiftwidth=4
+
+import sys, gc, monitor, sgml
+import optparse
+import model
+import log
+import cn
+
+models = []
+
+def add_model(m,w=0.0):
+ models.append(m)
+
+def extract_grammar(input):
+ confnet = cn.ConfusionNet(input)
+ meta = input.meta
+ for m in models:
+ m.input(confnet.columns, meta)
+
+if __name__ == "__main__":
+ optparser = optparse.OptionParser()
+ optparser.add_option("-c", "--config", dest="config", help="configuration module")
+ optparser.add_option("-x", "--extra", dest="extra", help="output grammar name override")
+ (opts,args) = optparser.parse_args()
+
+ if opts.config is None:
+ raise ValueError, "You must specify a configuration file."
+ else:
+ if log.level >= 1:
+ log.write("Reading configuration from %s\n" % opts.config)
+ execfile(opts.config)
+
+ if len(args) >= 1 and args[0] != "-":
+ input_file = file(args[0], "r")
+ else:
+ input_file = sys.stdin
+
+ if len(args) >= 2 and args[1] != "-":
+ output_file = file(args[1], "w")
+ else:
+ output_file = sys.stdout
+
+ gc.collect()
+ if log.level >= 1:
+ log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu()))
+ log.write("models: %s\n" % (" ".join(str(x.name) for x in models)))
+
+ sents = sgml.read_raw(input_file)
+ for sent in sents:
+ mark = sent.getmark()
+ if mark is not None:
+ (tag, attrs) = mark
+ if tag == "seg":
+ sent.unmark()
+ dattrs = sgml.attrs_to_dict(attrs)
+ sent.meta = attrs
+ extract_grammar(sent)
+
diff --git a/sa-extract/lcp.pyx b/sa-extract/lcp.pyx
new file mode 100644
index 00000000..a992d3ee
--- /dev/null
+++ b/sa-extract/lcp.pyx
@@ -0,0 +1,113 @@
+#!/usr/bin/env python2.4
+
+'''Compute the LCP array for a suffix array using the Kasai et al. algorithm.
+It can also be used to compute statistics such as the k most frequent n-grams.'''
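+# For example, for the text "banana", the adjacent sorted suffixes "ana" and
+# "anana" have a longest common prefix of length 3.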
+
+import sys
+
+cimport cintlist
+cimport csuf
+cimport cdat
+cimport cveb
+
+cdef class LCP:
+
+ cdef csuf.SuffixArray sa
+ cdef cintlist.CIntList lcp
+
+ def __init__(self, sa):
+ self._construct(sa)
+
+ cdef _construct(self, csuf.SuffixArray sa):
+ cdef int i, k, j, h, n
+ cdef cintlist.CIntList rank
+
+ sys.stderr.write("Constructing LCP array\n")
+ self.sa = sa
+ n = self.sa.sa.len
+ self.lcp = cintlist.CIntList(initial_len=n)
+
+ rank = cintlist.CIntList(initial_len=n)
+ for i from 0 <= i < n:
+ rank.arr[sa.sa.arr[i]] = i
+
+ h = 0
+ for i from 0 <= i < n:
+ k = rank.arr[i]
+ if k == 0:
+ self.lcp.arr[k] = -1
+ else:
+ j = sa.sa.arr[k-1]
+ while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]:
+ h = h+1
+ self.lcp.arr[k] = h
+ if h > 0:
+ h = h-1
+ sys.stderr.write("LCP array completed\n")
+
+
+ def compute_stats(self, max_n):
+ self._compute_stats(max_n)
+
+ cdef _compute_stats(self, int max_n):
+ '''Note: the output of this function is not exact. In
+ particular, the frequency associated with each word is
+ not guaranteed to be correct. This is due to a bit of
+ laxness in the design; the function is intended only to
+ obtain a list of the most frequent words; for this
+ purpose it is perfectly fine'''
+ cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid
+ cdef cintlist.CIntList run_start
+ cdef cintlist.CIntList ngram_start
+ cdef cveb.VEB veb
+
+ N = self.sa.sa.len
+
+ ngram_starts = []
+ for n from 0 <= n < max_n:
+ ngram_starts.append(cintlist.CIntList(initial_len=N))
+
+ run_start = cintlist.CIntList(initial_len=max_n)
+ veb = cveb.VEB(N)
+
+ for i from 0 <= i < N:
+ h = self.lcp.arr[i]
+ if h < 0:
+ h = 0
+ for n from h <= n < max_n:
+ rs = run_start.arr[n]
+ run_start.arr[n] = i
+ freq = i - rs
+ if freq > 1000: # arbitrary, but see note below
+ veb._insert(freq)
+ ngram_start = ngram_starts[n]
+ while ngram_start.arr[freq] > 0:
+ freq = freq + 1 # cheating a bit, should be ok for sparse histogram
+ ngram_start.arr[freq] = rs
+ i = veb.veb.min_val
+ while i != -1:
+ ii = veb._findsucc(i)
+ for n from 0 <= n < max_n:
+ ngram_start = ngram_starts[n]
+ iii = i
+ rs = ngram_start.arr[iii]
+ while (ii==-1 or iii < ii) and rs != 0:
+ j = self.sa.sa.arr[rs]
+ valid = 1
+ for k from 0 <= k < n+1:
+ if self.sa.darray.data.arr[j+k] < 2:
+ valid = 0
+ if valid:
+ ngram = ""
+ for k from 0 <= k < n+1:
+ ngram= ngram+ self.sa.darray.id2word[self.sa.darray.data.arr[j+k]] + " "
+ print i, n+1, ngram
+ iii = iii + 1
+ rs = ngram_start.arr[iii]
+ i = ii
+
+
+
+
+
diff --git a/sa-extract/lcp_ops.py b/sa-extract/lcp_ops.py
new file mode 100755
index 00000000..9df6e82a
--- /dev/null
+++ b/sa-extract/lcp_ops.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+import lcp
+import sys
+import optparse
+import csuf
+
+'''Run various computations using the LCP array'''
+def main():
+
+ optparser = optparse.OptionParser()
+# optparser.add_option("-c", "--config", dest="config", help="configuration module")
+ optparser.add_option("-s", "--sa-check", dest="sa_check", default=False, action="store_true")
+ optparser.add_option("-l", "--lcp-check", dest="lcp_check", default=False, action="store_true")
+ optparser.add_option("-t", "--stats", dest="stats", default=0, type="int", action="store")
+ optparser.add_option("-u", "--unigram", dest="uni_check", default=False, action="store_true")
+ optparser.add_option("-r", "--report-long-lcps", dest="long_lcp", type="int", default=0, action="store")
+ (opts,args) = optparser.parse_args()
+
+ if len(args) < 1:
+ print >>sys.stderr, "Usage: lcp_ops.py [opts] <sa file>"
+ sys.exit(1)
+
+ safile = args[0]
+ sa = csuf.SuffixArray(safile, from_binary=True)
+
+# if opts.sa_check:
+# check_sufarray(sa)
+
+ l = lcp.LCP(sa)
+
+ if opts.lcp_check:
+ print >>sys.stderr, "Checking LCP Array..."
+ l.check()
+ print >>sys.stderr, "Check finished"
+
+ if opts.stats > 0:
+ l.compute_stats(opts.stats)
+
+# if opts.uni_check:
+# if lcp is None:
+# lcp = LCP(sa)
+# unigram_stats(sa, lcp)
+#
+# if opts.long_lcp:
+# if lcp is None:
+# lcp = LCP(sa, opts.long_lcp)
+
+if __name__ == "__main__":
+ sys.exit(main())
+
+
diff --git a/sa-extract/log.py b/sa-extract/log.py
new file mode 100644
index 00000000..d4f96cb4
--- /dev/null
+++ b/sa-extract/log.py
@@ -0,0 +1,18 @@
+import sys
+
+level = 1
+file = sys.stderr
+
+def writeln(s="", l=0):
+ if level >= l:
+ file.write("%s\n" % s)
+ file.flush()
+
+def write(s, l=0):
+ if level >= l:
+ file.write(s)
+ file.flush()
+
+
+
+
diff --git a/sa-extract/manager.py b/sa-extract/manager.py
new file mode 100644
index 00000000..767192c1
--- /dev/null
+++ b/sa-extract/manager.py
@@ -0,0 +1,100 @@
+import csuf
+import cdat
+import cintlist
+
+class Sampler(object):
+ '''A Sampler implements a strategy for choosing
+ samples from a population range'''
+
+ def __init__(self):
+ pass
+
+ def registerContext(self, context_manager):
+ self.context_manager = context_manager
+
+ def sample(self, phrase_location):
+ return cintlist.CIntList()
+
+
+
+class Extractor(object):
+ '''Extractor is responsible for extracting rules
+ from a given context; once a sentence id/location
+ is found for the source fwords, extractor is
+ responsible for producing any matching rule(s).
+ Optionally, extractor may return an empty list'''
+
+ def __init__(self):
+ pass
+
+ def registerContext(self, context_manager):
+ self.context_manager = context_manager
+
+ def extract(self, fwords, loc):
+ return []
+
+
+
+class RuleFactory(object):
+ '''RuleFactory is a class that manages the
+ generation of translation rules, using the Context
+ and (optionally) any of its contained classes or
+ data. The RuleFactory is responsible for handling
+ any caching (i.e. when presented with an input
+ sentence, it may lookup a rule from its cache
+ rather than extracting a new rule)'''
+
+ def __init__(self):
+ self.num_lookups = 0
+ self.num_extractions = 0
+ self.num_rules = 0
+ self.time = 0.0
+
+
+ def registerContext(self, context_manager):
+ self.context_manager = context_manager
+
+
+ def input(self, fwords):
+ '''Manages the process of enumerating
+ rules for a given input sentence, and
+ looking them up with calls to Context,
+ Sampler, and Extractor'''
+ return []
+
+
+class ContextManager(object):
+
+ def __init__(self, ffile, efile, extractor=None, sampler=None, rulefactory=None, from_binary=False):
+ # NOTE: Extractor does not have a default value because
+ # the only nontrivial extractor right now depends on an
+ # alignment file
+
+ self.fsarray = csuf.SuffixArray(ffile, from_binary)
+ self.edarray = cdat.DataArray(efile, from_binary)
+
+ self.factory = rulefactory
+ self.factory.registerContext(self)
+
+ self.sampler = sampler
+ self.sampler.registerContext(self)
+
+ self.models = []
+ self.owner = None
+
+
+ def add_model(self, model):
+ if self.owner is None:
+ self.owner = model
+ model_id = len(self.models)
+ self.models.append(model)
+ return model_id
+
+
+ def input(self, model, fwords, meta):
+ if model != self.owner:
+ return
+ self.fwords = fwords
+ self.factory.input(self.fwords, meta)
+
+
+
diff --git a/sa-extract/model.py b/sa-extract/model.py
new file mode 100644
index 00000000..66c51051
--- /dev/null
+++ b/sa-extract/model.py
@@ -0,0 +1,12 @@
+
+class Model(object):
+ def __init__(self, name=None):
+ object.__init__(self)
+ if name is None:
+ self.name = self.__class__.__name__
+ else:
+ self.name = name
+
+ def input(self, fwords, meta):
+ pass
+
diff --git a/sa-extract/monitor.py b/sa-extract/monitor.py
new file mode 100644
index 00000000..eb0bed57
--- /dev/null
+++ b/sa-extract/monitor.py
@@ -0,0 +1,48 @@
+import os, resource
+
+def cpu():
+ return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+
+ resource.getrusage(resource.RUSAGE_SELF).ru_stime)
+
+# from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286222
+
+
+_proc_status = '/proc/%d/status' % os.getpid()
+
+_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0,
+ 'KB': 1024.0, 'MB': 1024.0*1024.0}
+
+def _VmB(VmKey):
+ '''Private.
+ '''
+ global _proc_status, _scale
+ # get pseudo file /proc/<pid>/status
+ try:
+ t = open(_proc_status)
+ v = t.read()
+ t.close()
+ except:
+ return 0.0 # non-Linux?
+ # get VmKey line e.g. 'VmRSS: 9999 kB\n ...'
+ i = v.index(VmKey)
+ v = v[i:].split(None, 3) # whitespace
+ if len(v) < 3:
+ return 0.0 # invalid format?
+ # convert Vm value to bytes
+ return float(v[1]) * _scale[v[2]]
+
+def memory(since=0.0):
+ '''Return memory usage in bytes.
+ '''
+ return _VmB('VmSize:') - since
+
+def resident(since=0.0):
+ '''Return resident memory usage in bytes.
+ '''
+ return _VmB('VmRSS:') - since
+
+
+def stacksize(since=0.0):
+ '''Return stack size in bytes.
+ '''
+ return _VmB('VmStk:') - since
diff --git a/sa-extract/precomputation.pxd b/sa-extract/precomputation.pxd
new file mode 100644
index 00000000..c75d5eef
--- /dev/null
+++ b/sa-extract/precomputation.pxd
@@ -0,0 +1,13 @@
+from libc.stdio cimport FILE
+
+cdef class Precomputation:
+ cdef int precompute_rank
+ cdef int precompute_secondary_rank
+ cdef int max_length
+ cdef int max_nonterminals
+ cdef int train_max_initial_size
+ cdef int train_min_gap_size
+ cdef precomputed_index
+ cdef precomputed_collocations
+ cdef read_map(self, FILE* f)
+ cdef write_map(self, m, FILE* f)
diff --git a/sa-extract/precomputation.pyx b/sa-extract/precomputation.pyx
new file mode 100644
index 00000000..ce4c21aa
--- /dev/null
+++ b/sa-extract/precomputation.pyx
@@ -0,0 +1,478 @@
+# precomputes a set of collocations by advancing over the text.
+# warning: nasty C code
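+# The precomputed "collocations" are pairs (and some triples) of frequent source
+# patterns that co-occur within train_max_initial_size words of each other,
+# separated by gaps of at least train_min_gap_size words; the positions at which
+# they co-occur are stored so that gappy patterns can be matched quickly later.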
+
+import log
+import monitor
+
+cimport csuf
+cimport cdat
+cimport cintlist
+
+from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
+from libc.stdlib cimport malloc, realloc, free
+from libc.string cimport memset, memcpy
+
+cdef struct _Trie_Node # forward decl
+
+cdef struct _Trie_Edge:
+ int val
+ _Trie_Node* node
+ _Trie_Edge* bigger
+ _Trie_Edge* smaller
+
+cdef struct _Trie_Node:
+ _Trie_Edge* root
+ int* arr
+ int arr_len
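+# Each trie node stores its outgoing edges in an (unbalanced) binary search tree
+# of _Trie_Edge structs keyed on val; arr holds the corpus positions recorded for
+# the pattern that ends at this node.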
+
+cdef _Trie_Node* new_trie_node():
+ cdef _Trie_Node* node
+ node = <_Trie_Node*> malloc(sizeof(_Trie_Node))
+ node.root = NULL
+ node.arr_len = 0
+ node.arr = <int*> malloc(0*sizeof(int))  # empty to start; grown via realloc
+ return node
+
+cdef _Trie_Edge* new_trie_edge(int val):
+ cdef _Trie_Edge* edge
+ edge = <_Trie_Edge*> malloc(sizeof(_Trie_Edge))
+ edge.node = new_trie_node()
+ edge.bigger = NULL
+ edge.smaller = NULL
+ edge.val = val
+ return edge
+
+cdef free_trie_node(_Trie_Node* node):
+ if node != NULL:
+ free_trie_edge(node.root)
+ free(node.arr)
+
+cdef free_trie_edge(_Trie_Edge* edge):
+ if edge != NULL:
+ free_trie_node(edge.node)
+ free_trie_edge(edge.bigger)
+ free_trie_edge(edge.smaller)
+
+cdef _Trie_Node* trie_find(_Trie_Node* node, int val):
+ cdef _Trie_Edge* cur
+ cur = node.root
+ while cur != NULL and cur.val != val:
+ if val > cur.val:
+ cur = cur.bigger
+ elif val < cur.val:
+ cur = cur.smaller
+ if cur == NULL:
+ return NULL
+ else:
+ return cur.node
+
+cdef trie_node_data_append(_Trie_Node* node, int val):
+ cdef int new_len
+ new_len = node.arr_len + 1
+ node.arr = <int*> realloc(node.arr, new_len*sizeof(int))
+ node.arr[node.arr_len] = val
+ node.arr_len = new_len
+
+cdef trie_node_data_extend(_Trie_Node* node, int* vals, int num_vals):
+ cdef int new_len
+ new_len = node.arr_len + num_vals
+ node.arr = <int*> realloc(node.arr, new_len*sizeof(int))
+ memcpy(node.arr + node.arr_len, vals, num_vals*sizeof(int))
+ node.arr_len = new_len
+
+
+cdef _Trie_Node* trie_insert(_Trie_Node* node, int val):
+ cdef _Trie_Edge** cur
+ cur = &node.root
+ while cur[0] != NULL and cur[0].val != val:
+ if val > cur[0].val:
+ cur = &cur[0].bigger
+ elif val < cur[0].val:
+ cur = &cur[0].smaller
+ if cur[0] == NULL:
+ cur[0] = new_trie_edge(val)
+ return cur[0].node
+
+cdef trie_node_to_map(_Trie_Node* node, result, prefix, int include_zeros):
+ cdef cintlist.CIntList arr
+
+ if include_zeros or node.arr_len > 0:
+ arr = cintlist.CIntList()
+ free(arr.arr)
+ arr.arr = <int*> malloc(node.arr_len * sizeof(int))
+ memcpy(arr.arr, node.arr, node.arr_len * sizeof(int))
+ arr.len = node.arr_len
+ arr.size = node.arr_len
+ result[prefix] = arr
+ trie_edge_to_map(node.root, result, prefix, include_zeros)
+
+cdef trie_edge_to_map(_Trie_Edge* edge, result, prefix, int include_zeros):
+ if edge != NULL:
+ trie_edge_to_map(edge.smaller, result, prefix, include_zeros)
+ trie_edge_to_map(edge.bigger, result, prefix, include_zeros)
+ prefix = prefix + (edge.val,)
+ trie_node_to_map(edge.node, result, prefix, include_zeros)
+
+cdef class TrieMap:
+
+ cdef _Trie_Node** root
+ cdef int V
+
+ def __init__(self, alphabet_size):
+ self.V = alphabet_size
+ self.root = <_Trie_Node**> malloc(self.V * sizeof(_Trie_Node*))
+ memset(self.root, 0, self.V * sizeof(_Trie_Node*))
+
+
+ def __dealloc__(self):
+ cdef int i
+ for i from 0 <= i < self.V:
+ if self.root[i] != NULL:
+ free_trie_node(self.root[i])
+ free(self.root)
+
+
+ def insert(self, pattern):
+ cdef int* p
+ cdef int i, l
+ l = len(pattern)
+ p = <int*> malloc(l*sizeof(int))
+ for i from 0 <= i < l:
+ p[i] = pattern[i]
+ self._insert(p,l)
+ free(p)
+
+
+ cdef _Trie_Node* _insert(self, int* pattern, int pattern_len):
+ cdef int i
+ cdef _Trie_Node* node
+ if self.root[pattern[0]] == NULL:
+ self.root[pattern[0]] = new_trie_node()
+ node = self.root[pattern[0]]
+ for i from 1 <= i < pattern_len:
+ node = trie_insert(node, pattern[i])
+ return node
+
+ def contains(self, pattern):
+ cdef int* p
+ cdef int i, l
+ cdef _Trie_Node* node
+ l = len(pattern)
+ p = <int*> malloc(l*sizeof(int))
+ for i from 0 <= i < l:
+ p[i] = pattern[i]
+ node = self._contains(p,l)
+ free(p)
+ if node == NULL:
+ return False
+ else:
+ return True
+
+ cdef _Trie_Node* _contains(self, int* pattern, int pattern_len):
+ cdef int i
+ cdef _Trie_Node* node
+ node = self.root[pattern[0]]
+ i = 1
+ while node != NULL and i < pattern_len:
+ node = trie_find(node, pattern[i])
+ i = i+1
+ return node
+
+ def toMap(self, flag):
+ cdef int i, include_zeros
+
+ if flag:
+ include_zeros=1
+ else:
+ include_zeros=0
+ result = {}
+ for i from 0 <= i < self.V:
+ if self.root[i] != NULL:
+ trie_node_to_map(self.root[i], result, (i,), include_zeros)
+ return result
+
+
+cdef class Precomputation:
+
+# Defined in .pxd file, here for reference:
+# cdef int precompute_rank
+# cdef int precompute_secondary_rank
+# cdef int max_length
+# cdef int max_nonterminals
+# cdef int train_max_initial_size
+# cdef int train_min_gap_size
+# cdef precomputed_index
+# cdef precomputed_collocations
+
+ def __init__(self, filename, sa=None, precompute_rank=1000, precompute_secondary_rank=20, max_length=5,
+ max_nonterminals=2, train_max_initial_size=10, train_min_gap_size=2, from_binary=False):
+ self.precompute_rank = precompute_rank
+ self.precompute_secondary_rank = precompute_secondary_rank
+ self.max_length = max_length
+ self.max_nonterminals = max_nonterminals
+ self.train_max_initial_size = train_max_initial_size
+ self.train_min_gap_size = train_min_gap_size
+ if from_binary:
+ self.read_binary(filename)
+ else:
+ self.precompute(filename, sa)
+
+
+ def read_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+ f = fopen(cfilename, "r")
+ fread(&(self.precompute_rank), sizeof(int), 1, f)
+ fread(&(self.precompute_secondary_rank), sizeof(int), 1, f)
+ fread(&(self.max_length), sizeof(int), 1, f)
+ fread(&(self.max_nonterminals), sizeof(int), 1, f)
+ fread(&(self.train_max_initial_size), sizeof(int), 1, f)
+ fread(&(self.train_min_gap_size), sizeof(int), 1, f)
+ self.precomputed_index = self.read_map(f)
+ self.precomputed_collocations = self.read_map(f)
+ fclose(f)
+
+
+ def write_binary(self, filename):
+ cdef FILE* f
+ cdef bytes bfilename = filename
+ cdef char* cfilename = bfilename
+
+ f = fopen(cfilename, "w")
+ fwrite(&(self.precompute_rank), sizeof(int), 1, f)
+ fwrite(&(self.precompute_secondary_rank), sizeof(int), 1, f)
+ fwrite(&(self.max_length), sizeof(int), 1, f)
+ fwrite(&(self.max_nonterminals), sizeof(int), 1, f)
+ fwrite(&(self.train_max_initial_size), sizeof(int), 1, f)
+ fwrite(&(self.train_min_gap_size), sizeof(int), 1, f)
+ self.write_map(self.precomputed_index, f)
+ self.write_map(self.precomputed_collocations, f)
+ fclose(f)
+
+
+ cdef write_map(self, m, FILE* f):
+ cdef int i, N
+ cdef cintlist.CIntList arr
+
+ N = len(m)
+ fwrite(&(N), sizeof(int), 1, f)
+ for pattern, val in m.iteritems():
+ N = len(pattern)
+ fwrite(&(N), sizeof(int), 1, f)
+ for word_id in pattern:
+ i = word_id
+ fwrite(&(i), sizeof(int), 1, f)
+ arr = val
+ arr.write_handle(f)
+
+
+ cdef read_map(self, FILE* f):
+ cdef int i, j, k, word_id, N
+ cdef cintlist.CIntList arr
+
+ m = {}
+ fread(&(N), sizeof(int), 1, f)
+ for j from 0 <= j < N:
+ fread(&(i), sizeof(int), 1, f)
+ key = ()
+ for k from 0 <= k < i:
+ fread(&(word_id), sizeof(int), 1, f)
+ key = key + (word_id,)
+ arr = cintlist.CIntList()
+ arr.read_handle(f)
+ m[key] = arr
+ return m
+
+
+ def precompute(self, filename, sa):
+ cdef int i, l, N, max_pattern_len, i1, l1, i2, l2, i3, l3, ptr1, ptr2, ptr3, is_super, sent_count, max_rank
+ cdef csuf.SuffixArray sarray
+ cdef cdat.DataArray darray
+ cdef cintlist.CIntList data, queue, cost_by_rank, count_by_rank
+ cdef TrieMap frequent_patterns, super_frequent_patterns, collocations
+ cdef _Trie_Node* node
+
+ sarray = sa
+ darray = sarray.darray
+ data = darray.data
+
+ frequent_patterns = TrieMap(len(darray.id2word))
+ super_frequent_patterns = TrieMap(len(darray.id2word))
+ collocations = TrieMap(len(darray.id2word))
+
+ I_set = set()
+ J_set = set()
+ J2_set = set()
+ IJ_set = set()
+ pattern_rank = {}
+
+ log.writeln("Precomputing frequent intersections\n", 1)
+ start_time = monitor.cpu()
+
+ max_pattern_len = 0
+ if filename is not None:
+ precompute_file = open(filename)
+ for rank, line in enumerate(precompute_file):
+ if rank >= self.precompute_rank:
+ break
+ phrase_words = line.split()[2:]
+ phrase = ()
+ for word in phrase_words:
+ phrase = phrase + (darray.word2id[word],)
+ max_pattern_len = max(max_pattern_len, len(phrase))
+ frequent_patterns.insert(phrase)
+ I_set.add(phrase)
+ pattern_rank[phrase] = rank
+ if rank < self.precompute_secondary_rank:
+ super_frequent_patterns.insert(phrase)
+ J_set.add(phrase)
+ precompute_file.close()
+
+ queue = cintlist.CIntList(increment=1000)
+
+ log.writeln(" Computing inverted indexes...", 1)
+ N = len(data)
+ for i from 0 <= i < N:
+ sa_word_id = data.arr[i]
+ if sa_word_id == 1:
+ queue._append(-1)
+ else:
+ for l from 1 <= l <= max_pattern_len:
+ node = frequent_patterns._contains(data.arr+i, l)
+ if node == NULL:
+ break
+ queue._append(i)
+ queue._append(l)
+ trie_node_data_append(node, i)
+
+ log.writeln(" Computing collocations...", 1)
+ N = len(queue)
+ ptr1 = 0
+ sent_count = 0
+ while ptr1 < N: # main loop
+ i1 = queue.arr[ptr1]
+ if i1 > -1:
+ l1 = queue.arr[ptr1+1]
+ ptr2 = ptr1 + 2
+ while ptr2 < N:
+ i2 = queue.arr[ptr2]
+ if i2 == -1 or i2 - i1 >= self.train_max_initial_size:
+ break
+ l2 = queue.arr[ptr2+1]
+ if i2 - i1 - l1 >= self.train_min_gap_size and i2 + l2 - i1 <= self.train_max_initial_size and l1+l2+1 <= self.max_length:
+ node = collocations._insert(data.arr+i1, l1)
+ node = trie_insert(node, -1)
+ for i from i2 <= i < i2+l2:
+ node = trie_insert(node, data.arr[i])
+ trie_node_data_append(node, i1)
+ trie_node_data_append(node, i2)
+ if super_frequent_patterns._contains(data.arr+i2, l2) != NULL:
+ if super_frequent_patterns._contains(data.arr+i1, l1) != NULL:
+ is_super = 1
+ else:
+ is_super = 0
+ ptr3 = ptr2 + 2
+ while ptr3 < N:
+ i3 = queue.arr[ptr3]
+ if i3 == -1 or i3 - i1 >= self.train_max_initial_size:
+ break
+ l3 = queue.arr[ptr3+1]
+ if i3 - i2 - l2 >= self.train_min_gap_size and i3 + l3 - i1 <= self.train_max_initial_size and l1+l2+l3+2 <= self.max_length:
+ if is_super or super_frequent_patterns._contains(data.arr+i3, l3) != NULL:
+ node = collocations._insert(data.arr+i1, l1)
+ node = trie_insert(node, -1)
+ for i from i2 <= i < i2+l2:
+ node = trie_insert(node, data.arr[i])
+ node = trie_insert(node, -1)
+ for i from i3 <= i < i3+l3:
+ node = trie_insert(node, data.arr[i])
+ trie_node_data_append(node, i1)
+ trie_node_data_append(node, i2)
+ trie_node_data_append(node, i3)
+ ptr3 = ptr3 + 2
+ ptr2 = ptr2 + 2
+ ptr1 = ptr1 + 2
+ else:
+ sent_count = sent_count + 1
+ if sent_count % 10000 == 0:
+ log.writeln(" %d sentences" % sent_count)
+ ptr1 = ptr1 + 1
+
+ self.precomputed_collocations = collocations.toMap(False)
+ self.precomputed_index = frequent_patterns.toMap(True)
+
+ x = 0
+ for pattern1 in J_set:
+ for pattern2 in J_set:
+ if len(pattern1) + len(pattern2) + 1 < self.max_length:
+ combined_pattern = pattern1 + (-1,) + pattern2
+ J2_set.add(combined_pattern)
+
+ for pattern1 in I_set:
+ for pattern2 in I_set:
+ x = x+1
+ if len(pattern1) + len(pattern2) + 1 <= self.max_length:
+ combined_pattern = pattern1 + (-1,) + pattern2
+ IJ_set.add(combined_pattern)
+
+ for pattern1 in I_set:
+ for pattern2 in J2_set:
+ x = x+2
+ if len(pattern1) + len(pattern2) + 1<= self.max_length:
+ combined_pattern = pattern1 + (-1,) + pattern2
+ IJ_set.add(combined_pattern)
+ combined_pattern = pattern2 + (-1,) + pattern1
+ IJ_set.add(combined_pattern)
+
+ N = len(pattern_rank)
+ cost_by_rank = cintlist.CIntList(initial_len=N)
+ count_by_rank = cintlist.CIntList(initial_len=N)
+ for pattern, arr in self.precomputed_collocations.iteritems():
+ if pattern not in IJ_set:
+ s = ""
+ for word_id in pattern:
+ if word_id == -1:
+ s = s + "X "
+ else:
+ s = s + darray.id2word[word_id] + " "
+ log.writeln("ERROR: unexpected pattern %s in set of precomputed collocations" % (s), 1)
+ else:
+ chunk = ()
+ max_rank = 0
+ arity = 0
+ for word_id in pattern:
+ if word_id == -1:
+ max_rank = max(max_rank, pattern_rank[chunk])
+ arity = arity + 1
+ chunk = ()
+ else:
+ chunk = chunk + (word_id,)
+ max_rank = max(max_rank, pattern_rank[chunk])
+ cost_by_rank.arr[max_rank] = cost_by_rank.arr[max_rank] + (4*len(arr))
+ count_by_rank.arr[max_rank] = count_by_rank.arr[max_rank] + (len(arr)/(arity+1))
+
+ cumul_cost = 0
+ cumul_count = 0
+ for i from 0 <= i < N:
+ cumul_cost = cumul_cost + cost_by_rank.arr[i]
+ cumul_count = cumul_count + count_by_rank.arr[i]
+ log.writeln("RANK %d\tCOUNT, COST: %d %d\tCUMUL: %d, %d" % (i, count_by_rank.arr[i], cost_by_rank.arr[i], cumul_count, cumul_cost))
+
+ num_found_patterns = len(self.precomputed_collocations)
+ for pattern in IJ_set:
+ if pattern not in self.precomputed_collocations:
+ self.precomputed_collocations[pattern] = cintlist.CIntList()
+
+ stop_time = monitor.cpu()
+ log.writeln("Precomputed collocations for %d patterns out of %d possible (upper bound %d)" % (num_found_patterns,len(self.precomputed_collocations),x))
+ log.writeln("Precomputed inverted index for %d patterns " % len(self.precomputed_index))
+ log.writeln("Precomputation took %f seconds" % (stop_time - start_time))
+ log.writeln("Detailed statistics:")
+
+
+
+
+
+
+
diff --git a/sa-extract/rule.pxd b/sa-extract/rule.pxd
new file mode 100644
index 00000000..c9c84e5c
--- /dev/null
+++ b/sa-extract/rule.pxd
@@ -0,0 +1,13 @@
+cdef class Phrase:
+ cdef int *syms
+ cdef int n, *varpos, n_vars
+ cdef public int chunkpos(self, int k)
+ cdef public int chunklen(self, int k)
+
+cdef class Rule:
+ cdef public int lhs
+ cdef readonly Phrase f, e
+ cdef float *cscores
+ cdef int n_scores
+ cdef public owner, word_alignments
+
diff --git a/sa-extract/rule.pyx b/sa-extract/rule.pyx
new file mode 100644
index 00000000..7cd3efda
--- /dev/null
+++ b/sa-extract/rule.pyx
@@ -0,0 +1,286 @@
+from libc.stdlib cimport malloc, calloc, realloc, free, strtof, strtol
+from libc.string cimport strsep, strcpy, strlen
+
+cdef extern from "strutil.h":
+ char *strstrsep(char **stringp, char *delim)
+ char *strip(char *s)
+ char **split(char *s, char *delim, int *pn)
+
+import sys
+
+import sym
+cimport sym
+cdef sym.Alphabet alphabet
+alphabet = sym.alphabet
+
+global span_limit
+span_limit = None
+
+cdef int bufsize
+cdef char *buf
+bufsize = 100
+buf = <char *>malloc(bufsize)
+cdef ensurebufsize(int size):
+ global buf, bufsize
+ if size > bufsize:
+ buf = <char *>realloc(buf, size*sizeof(char))
+ bufsize = size
+
+cdef class Phrase:
+ def __cinit__(self, words):
+ cdef int i, j, n, n_vars
+ cdef char **toks
+ cdef bytes bwords
+ cdef char* cwords
+
+ n_vars = 0
+ if type(words) is str:
+ ensurebufsize(len(words)+1)
+ bwords = words
+ cwords = bwords
+ strcpy(buf, cwords)
+ toks = split(buf, NULL, &n)
+ self.syms = <int *>malloc(n*sizeof(int))
+ for i from 0 <= i < n:
+ self.syms[i] = alphabet.fromstring(toks[i], 0)
+ if alphabet.isvar(self.syms[i]):
+ n_vars = n_vars + 1
+
+ else:
+ n = len(words)
+ self.syms = <int *>malloc(n*sizeof(int))
+ for i from 0 <= i < n:
+ self.syms[i] = words[i]
+ if alphabet.isvar(self.syms[i]):
+ n_vars = n_vars + 1
+ self.n = n
+ self.n_vars = n_vars
+ self.varpos = <int *>malloc(n_vars*sizeof(int))
+ j = 0
+ for i from 0 <= i < n:
+ if alphabet.isvar(self.syms[i]):
+ self.varpos[j] = i
+ j = j + 1
+
+ def __dealloc__(self):
+ free(self.syms)
+ free(self.varpos)
+
+ def __str__(self):
+ strs = []
+ cdef int i, s
+ for i from 0 <= i < self.n:
+ s = self.syms[i]
+ strs.append(alphabet.tostring(s))
+ return " ".join(strs)
+
+ def instantiable(self, i, j, n):
+ return span_limit is None or (j-i) <= span_limit
+
+ def handle(self):
+ """return a hashable representation that normalizes the ordering
+ of the nonterminal indices"""
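+ # e.g. if the nonterminals occur in the order X_2 ... X_1, they are renumbered
+ # so that the first variable encountered gets index 1, the second index 2, etc.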
+ norm = []
+ cdef int i, j, s
+ i = 1
+ j = 0
+ for j from 0 <= j < self.n:
+ s = self.syms[j]
+ if alphabet.isvar(s):
+ s = alphabet.setindex(s,i)
+ i = i + 1
+ norm.append(s)
+ return tuple(norm)
+
+ def strhandle(self):
+ strs = []
+ norm = []
+ cdef int i, j, s
+ i = 1
+ j = 0
+ for j from 0 <= j < self.n:
+ s = self.syms[j]
+ if alphabet.isvar(s):
+ s = alphabet.setindex(s,i)
+ i = i + 1
+ norm.append(alphabet.tostring(s))
+ return " ".join(norm)
+
+ def arity(self):
+ return self.n_vars
+
+ def getvarpos(self, i):
+ if 0 <= i < self.n_vars:
+ return self.varpos[i]
+ else:
+ raise IndexError
+
+ def getvar(self, i):
+ if 0 <= i < self.n_vars:
+ return self.syms[self.varpos[i]]
+ else:
+ raise IndexError
+
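+ # Chunks are the maximal runs of terminals between nonterminals; e.g. for
+ # the phrase "a b X c", chunk 0 is "a b" (position 0, length 2) and chunk 1
+ # is "c" (position 3, length 1).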
+ cdef int chunkpos(self, int k):
+ if k == 0:
+ return 0
+ else:
+ return self.varpos[k-1]+1
+
+ cdef int chunklen(self, int k):
+ if self.n_vars == 0:
+ return self.n
+ elif k == 0:
+ return self.varpos[0]
+ elif k == self.n_vars:
+ return self.n-self.varpos[k-1]-1
+ else:
+ return self.varpos[k]-self.varpos[k-1]-1
+
+ def clen(self, k):
+ return self.chunklen(k)
+
+ def getchunk(self, ci):
+ cdef int start, stop
+ start = self.chunkpos(ci)
+ stop = start+self.chunklen(ci)
+ chunk = []
+ for i from start <= i < stop:
+ chunk.append(self.syms[i])
+ return chunk
+
+ def __cmp__(self, other):
+ cdef Phrase otherp
+ cdef int i
+ otherp = other
+ for i from 0 <= i < min(self.n, otherp.n):
+ if self.syms[i] < otherp.syms[i]:
+ return -1
+ elif self.syms[i] > otherp.syms[i]:
+ return 1
+ if self.n < otherp.n:
+ return -1
+ elif self.n > otherp.n:
+ return 1
+ else:
+ return 0
+
+ def __hash__(self):
+ cdef int i
+ cdef unsigned h
+ h = 0
+ for i from 0 <= i < self.n:
+ if self.syms[i] > 0:
+ h = (h << 1) + self.syms[i]
+ else:
+ h = (h << 1) + -self.syms[i]
+ return h
+
+ def __len__(self):
+ return self.n
+
+ def __getitem__(self, i):
+ return self.syms[i]
+
+ def __iter__(self):
+ cdef int i
+ l = []
+ for i from 0 <= i < self.n:
+ l.append(self.syms[i])
+ return iter(l)
+
+ def subst(self, start, children):
+ cdef int i
+ for i from 0 <= i < self.n:
+ if alphabet.isvar(self.syms[i]):
+ start = start + children[alphabet.getindex(self.syms[i])-1]
+ else:
+ start = start + (self.syms[i],)
+ return start
+
+cdef class Rule:
+ def __cinit__(self, lhs, f, e, owner=None, scores=None, word_alignments=None):
+ cdef int i, n
+ cdef char *rest
+
+ self.word_alignments = word_alignments
+ if scores is None:
+ self.cscores = NULL
+ self.n_scores = 0
+ else:
+ n = len(scores)
+ self.cscores = <float *>malloc(n*sizeof(float))
+ self.n_scores = n
+ for i from 0 <= i < n:
+ self.cscores[i] = scores[i]
+
+ def __init__(self, lhs, f, e, owner=None, scores=None, word_alignments=None):
+ if not sym.isvar(lhs):
+ sys.stderr.write("error: lhs=%d\n" % lhs)
+ self.lhs = lhs
+ self.f = f
+ self.e = e
+ self.word_alignments = word_alignments
+
+ def __dealloc__(self):
+ if self.cscores != NULL:
+ free(self.cscores)
+
+ def __str__(self):
+ return self.to_line()
+
+ def __hash__(self):
+ return hash((self.lhs, self.f, self.e))
+
+ def __cmp__(self, Rule other):
+ return cmp((self.lhs, self.f, self.e, self.word_alignments), (other.lhs, other.f, other.e, other.word_alignments))
+
+ def __iadd__(self, Rule other):
+ if self.n_scores != other.n_scores:
+ raise ValueError
+ for i from 0 <= i < self.n_scores:
+ self.cscores[i] = self.cscores[i] + other.cscores[i]
+ return self
+
+ def fmerge(self, Phrase f):
+ if self.f == f:
+ self.f = f
+
+ def arity(self):
+ return self.f.arity()
+
+ def to_line(self):
+ scorestrs = []
+ for i from 0 <= i < self.n_scores:
+ scorestrs.append(str(self.cscores[i]))
+ fields = [alphabet.tostring(self.lhs), str(self.f), str(self.e), " ".join(scorestrs)]
+ if self.word_alignments is not None:
+ alignstr = []
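+ # each alignment link is stored packed into a single int as source_index*65536 + target_index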
+ for i from 0 <= i < len(self.word_alignments):
+ alignstr.append("%d-%d" % (self.word_alignments[i]/65536, self.word_alignments[i]%65536))
+ #for s,t in self.word_alignments:
+ #alignstr.append("%d-%d" % (s,t))
+ fields.append(" ".join(alignstr))
+
+ return " ||| ".join(fields)
+
+ property scores:
+ def __get__(self):
+ s = [None]*self.n_scores
+ for i from 0 <= i < self.n_scores:
+ s[i] = self.cscores[i]
+ return s
+
+ def __set__(self, s):
+ if self.cscores != NULL:
+ free(self.cscores)
+ self.cscores = <float *>malloc(len(s)*sizeof(float))
+ self.n_scores = len(s)
+ for i from 0 <= i < self.n_scores:
+ self.cscores[i] = s[i]
+
+def rule_copy(r):
+ r1 = Rule(r.lhs, r.f, r.e, r.owner, r.scores)
+ r1.word_alignments = r.word_alignments
+ return r1
+
diff --git a/sa-extract/rulefactory.pyx b/sa-extract/rulefactory.pyx
new file mode 100644
index 00000000..20ea80d2
--- /dev/null
+++ b/sa-extract/rulefactory.pyx
@@ -0,0 +1,2360 @@
+# Pyrex implementation of the algorithms described in
+# Lopez, EMNLP-CoNLL 2007
+# Much faster than the Python numbers reported there.
+# Note to reader: this code is closer to C than Python
+import sys
+import sym
+import log
+import rule
+import monitor
+import cintlist
+import csuf
+import cdat
+import cveb
+import precomputation
+import gc
+import cn
+import sgml
+
+cimport cmath
+cimport csuf
+cimport cdat
+cimport cintlist
+cimport rule
+cimport cveb
+cimport precomputation
+cimport calignment
+
+from libc.stdlib cimport malloc, realloc, free
+from libc.string cimport memset, memcpy
+from libc.math cimport fmod, ceil, floor
+
+cdef int PRECOMPUTE
+cdef int MERGE
+cdef int BAEZA_YATES
+
+PRECOMPUTE = 0
+MERGE = 1
+BAEZA_YATES = 2
+
+#cdef int node_count
+#node_count = 0
+
+cdef class TrieNode:
+ cdef public children
+ #cdef int id
+
+ def __init__(self):
+ self.children = {}
+ #self.id = node_count
+ #node_count += 1
+
+
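+# trie node augmented with the phrase it represents, that phrase's locations in the training text, and a suffix link, as used in the prefix-tree lookup of Lopez (2007)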
+cdef class ExtendedTrieNode(TrieNode):
+ cdef public phrase
+ cdef public phrase_location
+ cdef public suffix_link
+
+ def __init__(self, phrase=None, phrase_location=None, suffix_link=None):
+ TrieNode.__init__(self)
+ self.phrase = phrase
+ self.phrase_location = phrase_location
+ self.suffix_link = suffix_link
+
+
+cdef class TrieTable:
+ cdef public int extended
+ cdef public int count
+ cdef public root
+ def __cinit__(self, extended=False):
+ self.count = 0
+ self.extended = extended
+ if extended:
+ self.root = ExtendedTrieNode()
+ else:
+ self.root = TrieNode()
+
+# linked list structure for storing matches in BaselineRuleFactory
+cdef struct match_node:
+ int* match
+ match_node* next
+
+cdef class BaselineRuleFactory:
+
+ cdef grammar, context_manager
+ cdef int max_terminals, max_nonterminals
+ cdef int max_initial_size, train_max_initial_size
+ cdef int min_gap_size, train_min_gap_size
+ cdef int category
+ cdef int visit
+ cdef float intersect_time, extract_time
+ cdef ruleFile, timingFile
+ cdef int* last_visit1
+ cdef int* last_visit2
+ cdef match_node** intersector1
+ cdef match_node** intersector2
+ cdef csuf.SuffixArray sa
+ cdef cintlist.CIntList sent_id
+
+ def input(self, fwords):
+ flen = len(fwords)
+ start_time = monitor.cpu()
+ self.intersect_time = 0.0
+ self.extract_time = 0.0
+
+ pyro_phrase_count = 0
+ hiero_phrase_count = 0
+
+ frontier = []
+ for i in xrange(len(fwords)):
+ frontier.append((i, (), False))
+
+ while len(frontier) > 0:
+ this_iter_intersect_time = self.intersect_time
+ new_frontier = []
+ for i, prefix, is_shadow_path in frontier:
+
+ word_id = fwords[i][0][0]
+ #print "word_id = %i" % word_id
+ phrase = prefix + (word_id,)
+ str_phrase = map(sym.tostring, phrase)
+ hiero_phrase = rule.Phrase(phrase)
+
+ #log.writeln("pos %2d, '%s'" % (i, hiero_phrase))
+ self.lookup(hiero_phrase)
+ if hiero_phrase.arity() == 0:
+ pyro_phrase_count = pyro_phrase_count + 1
+ else:
+ hiero_phrase_count = hiero_phrase_count + 1
+
+ if len(phrase) - hiero_phrase.arity() < self.max_terminals and i+1 < len(fwords):
+ new_frontier.append((i+1, phrase, is_shadow_path))
+ if hiero_phrase.arity() < self.max_nonterminals:
+ xcat = sym.setindex(self.category, hiero_phrase.arity()+1)
+ for j in xrange(i+1+self.min_gap_size, min(i+self.max_initial_size, len(fwords))):
+ new_frontier.append((j, phrase+(xcat,), is_shadow_path))
+ log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time))
+ frontier = new_frontier
+ stop_time = monitor.cpu()
+ log.writeln("COUNT %d %d" % (pyro_phrase_count, hiero_phrase_count))
+
+
+ def lookup(self, phrase):
+ cdef int j, g, start, stop, sent_id, num_ranges, arity
+ cdef match_node** cur_intersector
+ cdef match_node** next_intersector
+ cdef match_node** tmp_intersector
+ cdef match_node* node
+ cdef match_node* cur_node
+ cdef match_node* prev_node
+ cdef match_node** node_ptr
+ cdef int* cur_visit
+ cdef int* next_visit
+ cdef int* tmp_visit
+ cdef int* chunklen
+
+ #print "\n\nLOOKUP\n\n"
+ ranges = []
+ sizes = []
+ arity = phrase.arity()
+ chunklen = <int *> malloc((arity+1)*sizeof(int)) # a phrase with arity gaps has arity+1 terminal chunks
+ for i from 0 <= i < arity+1:
+ chunk = phrase.getchunk(i)
+ chunklen[i] = len(chunk)
+ sa_range = (-1, -1) # -1,-1 selects the full suffix-array range, matching PhraseLocation's defaults
+ phr = ()
+ for offset, word_id in enumerate(chunk):
+ word = sym.tostring(word_id)
+ sa_range = self.context_manager.fsarray.lookup(word, offset, sa_range[0], sa_range[1])
+ if sa_range is None:
+ #log.writeln("Returned for phrase %s" % rule.Phrase(phr))
+ free(chunklen)
+ return
+ #log.writeln("Found range %s for phrase %s" % (sa_range, rule.Phrase(phr)))
+ ranges.append(sa_range)
+ sizes.append(sa_range[1]-sa_range[0])
+ if phrase.arity() == 0:
+ free(chunklen)
+ return
+
+ cur_intersector = self.intersector1
+ next_intersector = self.intersector2
+ cur_visit = self.last_visit1
+ next_visit = self.last_visit2
+
+ num_ranges = len(ranges)
+ for i from 0 <= i < num_ranges:
+ sa_range = ranges[i]
+ start_time = monitor.cpu()
+ self.visit = self.visit + 1
+ intersect_count = 0
+
+ start = sa_range[0]
+ stop = sa_range[1]
+ for j from start <= j < stop:
+ g = self.sa.sa.arr[j]
+ sent_id = self.sent_id.arr[g]
+ if i==0:
+ if next_visit[sent_id] != self.visit:
+ # clear intersector
+ node = next_intersector[sent_id]
+ next_intersector[sent_id] = NULL
+ while node != NULL:
+ prev_node = node
+ node = node.next
+ free(prev_node.match)
+ free(prev_node)
+ next_visit[sent_id] = self.visit
+ node_ptr = &(next_intersector[sent_id])
+ while node_ptr[0] != NULL:
+ node_ptr = &(node_ptr[0].next)
+ node_ptr[0] = <match_node*> malloc(sizeof(match_node))
+ node_ptr[0].match = <int *> malloc(sizeof(int))
+ node_ptr[0].match[0] = g
+ node_ptr[0].next = NULL
+ intersect_count = intersect_count + 1
+ else:
+ if cur_visit[sent_id] == self.visit-1:
+ cur_node = cur_intersector[sent_id]
+ while cur_node != NULL:
+ if g - cur_node.match[0] + chunklen[i] <= self.train_max_initial_size and g - cur_node.match[i-1] - chunklen[i-1] >= self.train_min_gap_size:
+ if next_visit[sent_id] != self.visit:
+ # clear intersector -- note that we only do this if we've got something to put there
+ node = next_intersector[sent_id]
+ next_intersector[sent_id] = NULL
+ while node != NULL:
+ prev_node = node
+ node = node.next
+ free(prev_node.match)
+ free(prev_node)
+ next_visit[sent_id] = self.visit
+ node_ptr = &(next_intersector[sent_id])
+ while node_ptr[0] != NULL:
+ node_ptr = &(node_ptr[0].next)
+ node_ptr[0] = <match_node*> malloc(sizeof(match_node))
+ node_ptr[0].match = <int *> malloc((i+1) * sizeof(int))
+ memcpy(node_ptr[0].match, cur_node.match, i*sizeof(int))
+ node_ptr[0].match[i] = g
+ node_ptr[0].next = NULL
+ intersect_count = intersect_count + 1
+ cur_node = cur_node.next
+ tmp_intersector = cur_intersector
+ cur_intersector = next_intersector
+ next_intersector = tmp_intersector
+
+ tmp_visit = cur_visit
+ cur_visit = next_visit
+ next_visit = tmp_visit
+
+ intersect_time = monitor.cpu() - start_time
+ if i > 0:
+ log.writeln("INT %d %d %d %d %f baseline" %
+ (arity, prev_intersect_count, sa_range[1]-sa_range[0],
+ intersect_count, intersect_time))
+ if intersect_count == 0:
+ free(chunklen)
+ return None
+ prev_intersect_count = intersect_count
+ free(chunklen)
+
+
+
+ def __init__(self, max_terminals=5, max_nonterminals=2,
+ max_initial_size=10, train_max_initial_size=10,
+ min_gap_size=1, train_min_gap_size=2,
+ category='[PHRASE]', grammar=None,
+ ruleFile=None, timingFile=None):
+ self.grammar = grammar
+ self.max_terminals = max_terminals
+ self.max_nonterminals = max_nonterminals
+ self.max_initial_size = max_initial_size
+ self.train_max_initial_size = train_max_initial_size
+ self.min_gap_size = min_gap_size
+ self.train_min_gap_size = train_min_gap_size
+ self.category = sym.fromstring(category)
+ self.ruleFile = ruleFile
+ self.timingFile = timingFile
+ self.visit = 0
+
+
+ def registerContext(self, context_manager):
+ cdef int num_sents
+ self.context_manager = context_manager
+ self.sa = context_manager.fsarray
+ self.sent_id = self.sa.darray.sent_id
+
+ num_sents = len(self.sa.darray.sent_index)
+ self.last_visit1 = <int *> malloc(num_sents * sizeof(int))
+ memset(self.last_visit1, 0, num_sents * sizeof(int))
+
+ self.last_visit2 = <int *> malloc(num_sents * sizeof(int))
+ memset(self.last_visit2, 0, num_sents * sizeof(int))
+
+ self.intersector1 = <match_node **> malloc(num_sents * sizeof(match_node*))
+ memset(self.intersector1, 0, num_sents * sizeof(match_node*))
+
+ self.intersector2 = <match_node **> malloc(num_sents * sizeof(match_node*))
+ memset(self.intersector2, 0, num_sents * sizeof(match_node*))
+
+
+# encodes information needed to find a (hierarchical) phrase
+# in the text. If phrase is contiguous, that's just a range
+# in the suffix array; if discontiguous, it is the set of
+# actual locations (packed into an array)
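+# (each occurrence of a pattern with num_subpatterns chunks is stored as num_subpatterns consecutive start positions in arr)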
+cdef class PhraseLocation:
+ cdef int sa_low
+ cdef int sa_high
+ cdef int arr_low
+ cdef int arr_high
+ cdef cintlist.CIntList arr
+ cdef int num_subpatterns
+
+ # returns true if sent_id is contained
+ cdef int contains(self, int sent_id):
+ return 1
+
+ def __init__(self, sa_low=-1, sa_high=-1, arr_low=-1, arr_high=-1, arr=None, num_subpatterns=1):
+ self.sa_low = sa_low
+ self.sa_high = sa_high
+ self.arr_low = arr_low
+ self.arr_high = arr_high
+ self.arr = arr
+ self.num_subpatterns = num_subpatterns
+
+
+
+cdef class Sampler:
+ '''A Sampler implements the logic for choosing
+ samples from a population range'''
+
+ cdef int sampleSize
+ cdef context_manager
+ cdef cintlist.CIntList sa
+
+ def __init__(self, sampleSize=0):
+ self.sampleSize = sampleSize
+ if sampleSize > 0:
+ log.writeln("Sampling strategy: uniform, max sample size = %d" % sampleSize, 1)
+ else:
+ log.writeln("Sampling strategy: no sampling", 1)
+
+ def registerContext(self, context_manager):
+ self.context_manager = context_manager
+ self.sa = (<csuf.SuffixArray> context_manager.fsarray).sa
+
+
+ def sample(self, PhraseLocation phrase_location):
+ '''Returns a sample of the locations for
+ the phrase. If there are less than self.sampleSize
+ locations, return all of them; otherwise, return
+ up to self.sampleSize locations. In the latter case,
+ we choose to sample UNIFORMLY -- that is, the locations
+ are chosen at uniform intervals over the entire set, rather
+ than randomly. This makes the algorithm deterministic, which
+ is good for things like MERT'''
+ cdef cintlist.CIntList sample
+ cdef double i, stepsize
+ cdef int num_locations, val, j
+
+ sample = cintlist.CIntList()
+ if phrase_location.arr is None:
+ num_locations = phrase_location.sa_high - phrase_location.sa_low
+ if self.sampleSize == -1 or num_locations <= self.sampleSize:
+ sample._extend_arr(self.sa.arr + phrase_location.sa_low, num_locations)
+ else:
+ stepsize = float(num_locations)/float(self.sampleSize)
+ i = phrase_location.sa_low
+ while i < phrase_location.sa_high and sample.len < self.sampleSize:
+ '''Note: int(i) not guaranteed to have the desired
+ effect, according to the python documentation'''
+ if fmod(i,1.0) > 0.5:
+ val = int(ceil(i))
+ else:
+ val = int(floor(i))
+ sample._append(self.sa.arr[val])
+ i = i + stepsize
+ else:
+ num_locations = (phrase_location.arr_high - phrase_location.arr_low) / phrase_location.num_subpatterns
+ if self.sampleSize == -1 or num_locations <= self.sampleSize:
+ sample = phrase_location.arr
+ else:
+ stepsize = float(num_locations)/float(self.sampleSize)
+ i = phrase_location.arr_low
+ while i < num_locations and sample.len < self.sampleSize * phrase_location.num_subpatterns:
+ '''Note: int(i) not guaranteed to have the desired
+ effect, according to the python documentation'''
+ if fmod(i,1.0) > 0.5:
+ val = int(ceil(i))
+ else:
+ val = int(floor(i))
+ j = phrase_location.arr_low + (val*phrase_location.num_subpatterns)
+ sample._extend_arr(phrase_location.arr.arr + j, phrase_location.num_subpatterns)
+ i = i + stepsize
+ return sample
+
+
+cdef long nGramCount(PhraseLocation loc):
+ return (loc.arr_high - loc.arr_low)/ loc.num_subpatterns
+
+
+# struct used to encapsulate a single matching
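+# arr points into the packed location array; [start, end) is this matching's slice of it, size is its number of start positions (subpatterns), and sent_id is the sentence containing them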
+cdef struct Matching:
+ int* arr
+ int start
+ int end
+ int sent_id
+ int size
+
+
+cdef void assign_matching(Matching* m, int* arr, int start, int step, int* sent_id_arr):
+ m.arr = arr
+ m.start = start
+ m.end = start + step
+ m.sent_id = sent_id_arr[arr[start]]
+ m.size = step
+
+
+cdef int* append_combined_matching(int* arr, Matching* loc1, Matching* loc2,
+ int offset_by_one, int num_subpatterns, int* result_len):
+ cdef int i, new_len
+
+ new_len = result_len[0] + num_subpatterns
+ arr = <int*> realloc(arr, new_len*sizeof(int))
+
+ for i from 0 <= i < loc1.size:
+ arr[result_len[0]+i] = loc1.arr[loc1.start+i]
+ if num_subpatterns > loc1.size:
+ arr[new_len-1] = loc2.arr[loc2.end-1]
+ result_len[0] = new_len
+ return arr
+
+
+cdef int* extend_arr(int* arr, int* arr_len, int* appendix, int appendix_len):
+ cdef int new_len
+
+ new_len = arr_len[0] + appendix_len
+ arr = <int*> realloc(arr, new_len*sizeof(int))
+ memcpy(arr+arr_len[0], appendix, appendix_len*sizeof(int))
+ arr_len[0] = new_len
+ return arr
+
+
+#cdef matching2str(Matching* m):
+# cdef int i
+# cdef result
+
+# result = "("
+# for i from m.start <= i < m.end:
+# result = result + str(m.arr[i]) + " "
+# result = result + ")"
+# return result
+
+
+cdef int median(int low, int high, int step):
+ return low + (((high - low)/step)/2)*step
+
+
+cdef void findComparableMatchings(int low, int high, int* arr, int step, int loc, int* loc_minus, int* loc_plus):
+ # Returns (minus, plus) indices for the portion of the array
+ # in which all matchings have the same first index as the one
+ # starting at loc
+ loc_plus[0] = loc + step
+ while loc_plus[0] < high and arr[loc_plus[0]] == arr[loc]:
+ loc_plus[0] = loc_plus[0] + step
+ loc_minus[0] = loc
+ while loc_minus[0]-step >= low and arr[loc_minus[0]-step] == arr[loc]:
+ loc_minus[0] = loc_minus[0] - step
+
+
+cdef class HieroCachingRuleFactory:
+ '''This RuleFactory implements a caching
+ method using TrieTable, which makes phrase
+ generation somewhat speedier -- phrases only
+ need to be extracted once (however, it is
+ quite possible they need to be scored
+ for each input sentence, for contextual models)'''
+
+ cdef rules, grammar, context_manager
+
+ cdef int max_chunks
+ cdef int max_target_chunks
+ cdef int max_length
+ cdef int max_target_length
+ cdef int max_nonterminals
+ cdef int max_initial_size
+ cdef int train_max_initial_size
+ cdef int min_gap_size
+ cdef int train_min_gap_size
+ cdef int category
+
+ cdef cacheBetweenSents
+ cdef precomputed_index
+ cdef precomputed_collocations
+ cdef precompute_file
+ cdef max_rank
+ cdef int precompute_rank, precompute_secondary_rank
+ cdef useBaezaYates
+ cdef use_index
+ cdef use_collocations
+ cdef float by_slack_factor
+
+ cdef per_sentence_grammar
+ cdef rule_filehandler
+ cdef rule_file
+ cdef pruned_rule_file
+ cdef extract_file
+ cdef sample_file
+ cdef search_file
+ cdef timingFile
+ cdef log_int_stats
+ cdef prev_norm_prefix
+ cdef float intersect_time, extract_time
+ cdef csuf.SuffixArray fsa
+ cdef cdat.DataArray fda
+ cdef cdat.DataArray eda
+
+ cdef calignment.Alignment alignment
+ cdef cintlist.CIntList eid2symid
+ cdef cintlist.CIntList fid2symid
+ cdef int tight_phrases
+ cdef int require_aligned_terminal
+ cdef int require_aligned_chunks
+
+ cdef cintlist.CIntList findexes
+ cdef cintlist.CIntList findexes1
+
+ cdef int excluded_sent_id # exclude a sentence id
+
+ def __init__(self,
+ alignment=None, # compiled alignment object (REQUIRED)
+ by_slack_factor=1.0, # parameter for double-binary search; doesn't seem to matter much
+ category="[PHRASE]", # name of generic nonterminal used by Hiero
+ cacheBetweenSents=False, # prevent flushing of tree between sents; use carefully or you'll run out of memory
+ extract_file=None, # print raw extracted rules to this file
+ grammar=None, # empty grammar object -- must be supplied from outside (REQUIRED)
+ log_int_stats=False, # prints timing data on intersections to stderr
+ max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1
+ max_initial_size=10, # maximum span of a grammar rule in TEST DATA
+ max_length=5, # maximum number of symbols (both T and NT) allowed in a rule
+ max_nonterminals=2, # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
+ max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1
+ max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size
+ min_gap_size=2, # minimum span of a nonterminal in the RHS of a rule in TEST DATA
+ precompute_file=None, # filename of file containing precomputed collocations
+ precompute_secondary_rank=20, # maximum frequency rank of patterns used to compute triples (don't set higher than 20).
+ precompute_rank=100, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300)
+ pruned_rule_file=None, # if specified, pruned grammars will be written to this filename
+ require_aligned_terminal=True, # require extracted rules to have at least one aligned word
+ require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word
+ per_sentence_grammar=True, # generate grammar files for each input segment
+ rule_file=None, # UNpruned grammars will be written to this filename
+ sample_file=None, # Sampling statistics will be written to this filename
+ search_file=None, # lookup statistics will be written to this filename
+ train_max_initial_size=10, # maximum span of a grammar rule extracted from TRAINING DATA
+ train_min_gap_size=2, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
+ tight_phrases=False, # True if phrases should be tight, False otherwise (False == slower but better results)
+ timingFile=None, # timing statistics will be written to this filename
+ useBaezaYates=True, # True to require use of double-binary alg, false otherwise
+ use_collocations=True, # True to enable use of precomputed collocations
+ use_index=True # True to enable use of precomputed inverted indices
+ ):
+ '''Note: we make a distinction between the min_gap_size
+ and max_initial_size used in test and train. The latter
+ are represented by train_min_gap_size and train_max_initial_size,
+ respectively. This is because Chiang's model does not require
+ them to be the same, so we do not require it either.'''
+ self.rules = TrieTable(True) # cache
+ self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation())
+ self.grammar = grammar
+ if alignment is None:
+ raise Exception("Must specify an alignment object")
+ self.alignment = alignment
+
+ self.excluded_sent_id = -1
+
+ # grammar parameters and settings
+ # NOTE: setting max_nonterminals > 2 is not currently supported in Hiero
+ self.max_length = max_length
+ self.max_nonterminals = max_nonterminals
+ self.max_initial_size = max_initial_size
+ self.train_max_initial_size = train_max_initial_size
+ self.min_gap_size = min_gap_size
+ self.train_min_gap_size = train_min_gap_size
+ self.category = sym.fromstring(category)
+
+ if max_chunks is None:
+ self.max_chunks = self.max_nonterminals + 1
+ else:
+ self.max_chunks = max_chunks
+
+ if max_target_chunks is None:
+ self.max_target_chunks = self.max_nonterminals + 1
+ else:
+ self.max_target_chunks = max_target_chunks
+
+ if max_target_length is None:
+ self.max_target_length = max_initial_size
+ else:
+ self.max_target_length = max_target_length
+
+ # algorithmic parameters and settings
+ self.cacheBetweenSents = not per_sentence_grammar
+ self.precomputed_collocations = {}
+ self.precomputed_index = {}
+ self.use_index = use_index
+ self.use_collocations = use_collocations
+ self.max_rank = {}
+ self.precompute_file = precompute_file
+ self.precompute_rank = precompute_rank
+ self.precompute_secondary_rank = precompute_secondary_rank
+ self.useBaezaYates = useBaezaYates
+ self.by_slack_factor = by_slack_factor
+ if tight_phrases:
+ self.tight_phrases = 1
+ else:
+ self.tight_phrases = 0
+
+ if require_aligned_chunks:
+ # one condition is a stronger version of the other.
+ self.require_aligned_chunks = 1
+ self.require_aligned_terminal = 1
+ elif require_aligned_terminal:
+ self.require_aligned_chunks = 0
+ self.require_aligned_terminal = 1
+ else:
+ self.require_aligned_chunks = 0
+ self.require_aligned_terminal = 0
+
+
+ self.per_sentence_grammar = per_sentence_grammar
+ if not self.per_sentence_grammar:
+ self.rule_filehandler = open(rule_file, "w")
+ # diagnostics
+ #if rule_file is None:
+ # self.rule_file = None
+ self.rule_file = rule_file
+ if extract_file is None:
+ self.extract_file = None
+ else:
+ self.extract_file = open(extract_file, "w")
+ if sample_file is None:
+ self.sample_file = None
+ else:
+ self.sample_file = open(sample_file, "w")
+ if search_file is None:
+ self.search_file = None
+ else:
+ self.search_file = open(search_file, "w")
+ self.pruned_rule_file = pruned_rule_file
+ self.timingFile = timingFile
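+ # grow the target projection [e_low, e_high) and the back-projected source span [f_back_low, f_back_high) until they are mutually consistent (a fixed point), honoring the gap-size, length and sentence-boundary constraints; returns 1 on success, 0 if any constraint is violated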
+ self.log_int_stats = log_int_stats
+ self.prev_norm_prefix = ()
+
+ self.findexes = cintlist.CIntList(initial_len=10)
+ self.findexes1 = cintlist.CIntList(initial_len=10)
+
+ def registerContext(self, context_manager):
+ '''This gives the RuleFactory access to the Context object.
+ Here we also use it to precompute the most expensive intersections
+ in the corpus quickly.'''
+ self.context_manager = context_manager
+ self.fsa = context_manager.fsarray
+ self.fda = self.fsa.darray
+ self.eda = context_manager.edarray
+ self.fid2symid = self.set_idmap(self.fda)
+ self.eid2symid = self.set_idmap(self.eda)
+ self.precompute()
+
+ cdef set_idmap(self, cdat.DataArray darray):
+ cdef int word_id, new_word_id, N
+ cdef cintlist.CIntList idmap
+
+ N = len(darray.id2word)
+ idmap = cintlist.CIntList(initial_len=N)
+ for word_id from 0 <= word_id < N:
+ new_word_id = sym.fromstring(darray.id2word[word_id], terminal=True)
+ idmap.arr[word_id] = new_word_id
+ return idmap
+
+
+ def pattern2phrase(self, pattern):
+ # pattern is a tuple, which we must convert to a hiero rule.Phrase
+ result = ()
+ arity = 0
+ for word_id in pattern:
+ if word_id == -1:
+ arity = arity + 1
+ new_id = sym.setindex(self.category, arity)
+ else:
+ new_id = sym.fromstring(self.fda.id2word[word_id])
+ result = result + (new_id,)
+ return rule.Phrase(result)
+
+ def pattern2phrase_plus(self, pattern):
+ # returns a list containing both the pattern, and pattern
+ # suffixed/prefixed with the NT category.
+ patterns = []
+ result = ()
+ arity = 0
+ for word_id in pattern:
+ if word_id == -1:
+ arity = arity + 1
+ new_id = sym.setindex(self.category, arity)
+ else:
+ new_id = sym.fromstring(self.fda.id2word[word_id])
+ result = result + (new_id,)
+ patterns.append(rule.Phrase(result))
+ patterns.append(rule.Phrase(result + (sym.setindex(self.category, 1),)))
+ patterns.append(rule.Phrase((sym.setindex(self.category, 1),) + result))
+ return patterns
+
+ def precompute(self):
+ cdef precomputation.Precomputation pre
+
+ if self.precompute_file is not None:
+ start_time = monitor.cpu()
+ log.write("Reading precomputed data from file %s... " % self.precompute_file, 1)
+ pre = precomputation.Precomputation(self.precompute_file, from_binary=True)
+ # check parameters of precomputation -- some are critical and some are not
+ if pre.max_nonterminals != self.max_nonterminals:
+ log.writeln("\nWARNING: Precomputation done with max nonterminals %d, decoder uses %d" % (pre.max_nonterminals, self.max_nonterminals))
+ if pre.max_length != self.max_length:
+ log.writeln("\nWARNING: Precomputation done with max rule length %d, decoder uses %d" % (pre.max_length, self.max_length))
+ if pre.train_max_initial_size != self.train_max_initial_size:
+ log.writeln("\nERROR: Precomputation done with max initial size %d, decoder uses %d" % (pre.train_max_initial_size, self.train_max_initial_size))
+ raise Exception("Parameter mismatch with precomputed data")
+ if pre.train_min_gap_size != self.train_min_gap_size:
+ log.writeln("\nERROR: Precomputation done with min gap size %d, decoder uses %d" % (pre.train_min_gap_size, self.train_min_gap_size))
+ raise Exception("Parameter mismatch with precomputed data")
+ log.writeln("done.", 1)
+ if self.use_index:
+ log.write("Converting %d hash keys on precomputed inverted index... " % (len(pre.precomputed_index)), 1)
+ for pattern, arr in pre.precomputed_index.iteritems():
+ phrases = self.pattern2phrase_plus(pattern)
+ for phrase in phrases:
+ self.precomputed_index[phrase] = arr
+ log.writeln("done.", 1)
+ if self.use_collocations:
+ log.write("Converting %d hash keys on precomputed collocations... " % (len(pre.precomputed_collocations)), 1)
+ for pattern, arr in pre.precomputed_collocations.iteritems():
+ phrase = self.pattern2phrase(pattern)
+ self.precomputed_collocations[phrase] = arr
+ log.writeln("done.", 1)
+ stop_time = monitor.cpu()
+ log.writeln("Processing precomputations took %f seconds" % (stop_time - start_time), 1)
+
+
+ def getPrecomputedCollocation(self, phrase):
+ if phrase in self.precomputed_collocations:
+ arr = self.precomputed_collocations[phrase]
+ return PhraseLocation(arr=arr, arr_low=0, arr_high=len(arr), num_subpatterns=phrase.arity()+1)
+ return None
+
+
+ cdef int* baezaYatesHelper(self, int low1, int high1, int* arr1, int step1,
+ int low2, int high2, int* arr2, int step2,
+ int offset_by_one, int len_last, int num_subpatterns, int* result_len):
+ cdef int i1, i2, j1, j2, med1, med2, med1_plus, med1_minus, med2_minus, med2_plus
+ cdef int d_first, qsetsize, dsetsize, tmp, search_low, search_high
+ cdef int med_result_len, low_result_len, high_result_len
+ cdef long comparison
+ cdef int* result
+ cdef int* low_result
+ cdef int* med_result
+ cdef int* high_result
+ cdef Matching loc1, loc2
+
+ result = <int*> malloc(0*sizeof(int*))
+# log.writeln("%sBY: [%d, %d, %d] [%d, %d, %d]" % (pad, low1, high1, step1, low2, high2, step2,), 5)
+
+ d_first = 0
+ if high1 - low1 > high2 - low2:
+# log.writeln("%sD first" % (pad), 5)
+ d_first = 1
+# else:
+# log.writeln("%sQ first" % (pad), 5)
+
+# '''First, check to see if we are at any of the
+# recursive base cases'''
+#
+# '''Case 1: one of the sets is empty'''
+ if low1 >= high1 or low2 >= high2:
+# log.writeln("%sRETURN: set is empty" % (pad), 5)
+ return result
+
+# '''Case 2: sets are non-overlapping'''
+ assign_matching(&loc1, arr1, high1-step1, step1, self.fda.sent_id.arr)
+ assign_matching(&loc2, arr2, low2, step2, self.fda.sent_id.arr)
+ if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == -1:
+# log.writeln("%s %s < %s" % (pad, tuple(arr1[high1-step1:high1]), tuple(arr2[low2:low2+step2])),5)
+# log.writeln("%sRETURN: non-overlapping sets" % (pad), 5)
+ return result
+
+ assign_matching(&loc1, arr1, low1, step1, self.fda.sent_id.arr)
+ assign_matching(&loc2, arr2, high2-step2, step2, self.fda.sent_id.arr)
+ if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1:
+# log.writeln("%s %s > %s" % (pad, tuple(arr1[low1:low1+step1]), tuple(arr2[high2-step2:high2])),5)
+# log.writeln("%sRETURN: non-overlapping sets" % (pad), 5)
+ return result
+
+ # Case 3: the query set and data set do not meet the size-mismatch constraint;
+ # fall back to a linear merge (mergeHelper) in this case
+ qsetsize = (high1-low1) / step1
+ dsetsize = (high2-low2) / step2
+ if d_first:
+ tmp = qsetsize
+ qsetsize = dsetsize
+ dsetsize = tmp
+
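+ # size heuristic: if by_slack_factor * |Q| * log2(|D|) exceeds |D|, one binary search per query element costs more than a single linear merge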
+ if self.by_slack_factor * qsetsize * cmath.log(dsetsize) / cmath.log(2) > dsetsize:
+ free(result)
+ return self.mergeHelper(low1, high1, arr1, step1, low2, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, result_len)
+
+ # binary search. There are two flavors, depending on
+ # whether the queryset or dataset is first
+ if d_first:
+ med2 = median(low2, high2, step2)
+ assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr)
+
+ search_low = low1
+ search_high = high1
+ while search_low < search_high:
+ med1 = median(search_low, search_high, step1)
+ findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus)
+ comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last)
+ if comparison == -1:
+ search_low = med1_plus
+ elif comparison == 1:
+ search_high = med1_minus
+ else:
+ break
+ else:
+ med1 = median(low1, high1, step1)
+ findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus)
+
+ search_low = low2
+ search_high = high2
+ while search_low < search_high:
+ med2 = median(search_low, search_high, step2)
+ assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr)
+ comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last)
+ if comparison == -1:
+ search_high = med2
+ elif comparison == 1:
+ search_low = med2 + step2
+ else:
+ break
+
+ med_result_len = 0
+ med_result = <int*> malloc(0*sizeof(int*))
+ if search_high > search_low:
+# '''Then there is a match for the median element of Q'''
+#
+# '''What we want to find is the group of all bindings in the first set
+# s.t. their first element == the first element of med1. Then we
+# want to store the bindings for all of those elements. We can
+# subsequently throw all of them away.'''
+ med2_minus = med2
+ med2_plus = med2 + step2
+ i1 = med1_minus
+ while i1 < med1_plus:
+ assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr)
+ while med2_minus-step2 >= low2:
+ assign_matching(&loc2, arr2, med2_minus-step2, step2, self.fda.sent_id.arr)
+ if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) < 1:
+ med2_minus = med2_minus - step2
+ else:
+ break
+ i2 = med2_minus
+ while i2 < high2:
+ assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr)
+ comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last)
+ if comparison == 0:
+ pass
+ med_result = append_combined_matching(med_result, &loc1, &loc2, offset_by_one, num_subpatterns, &med_result_len)
+ if comparison == -1:
+ break
+ i2 = i2 + step2
+ if i2 > med2_plus:
+ med2_plus = i2
+ i1 = i1 + step1
+
+ tmp = med1_minus
+ med1_minus = med1_plus
+ med1_plus = tmp
+ else:
+ # No match; need to figure out the point of division in D and Q
+ med2_minus = med2
+ med2_plus = med2
+ if d_first:
+ med2_minus = med2_minus + step2
+ if comparison == -1:
+ med1_minus = med1_plus
+ if comparison == 1:
+ med1_plus = med1_minus
+ else:
+ tmp = med1_minus
+ med1_minus = med1_plus
+ med1_plus = tmp
+ if comparison == 1:
+ med2_minus = med2_minus + step2
+ med2_plus = med2_plus + step2
+
+ low_result_len = 0
+ low_result = self.baezaYatesHelper(low1, med1_plus, arr1, step1, low2, med2_plus, arr2, step2, offset_by_one, len_last, num_subpatterns, &low_result_len)
+ high_result_len = 0
+ high_result = self.baezaYatesHelper(med1_minus, high1, arr1, step1, med2_minus, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, &high_result_len)
+
+ result = extend_arr(result, result_len, low_result, low_result_len)
+ result = extend_arr(result, result_len, med_result, med_result_len)
+ result = extend_arr(result, result_len, high_result, high_result_len)
+ free(low_result)
+ free(med_result)
+ free(high_result)
+
+ return result
+
+
+
+ cdef long compareMatchingsSet(self, int i1_minus, int i1_plus, int* arr1, int step1,
+ Matching* loc2, int offset_by_one, int len_last):
+# '''Compares a *set* of bindings, all with the same first element,
+# to a single binding. Returns -1 if all comparisons == -1, 1 if all
+# comparisons == 1, and 0 otherwise.'''
+ cdef int i1, comparison, prev_comparison
+ cdef Matching l1_stack
+ cdef Matching* loc1
+
+ loc1 = &l1_stack
+
+ i1 = i1_minus
+ while i1 < i1_plus:
+ assign_matching(loc1, arr1, i1, step1, self.fda.sent_id.arr)
+ comparison = self.compare_matchings(loc1, loc2, offset_by_one, len_last)
+ if comparison == 0:
+ prev_comparison = 0
+ break
+ elif i1 == i1_minus:
+ prev_comparison = comparison
+ else:
+ if comparison != prev_comparison:
+ prev_comparison = 0
+ break
+ i1 = i1 + step1
+ return prev_comparison
+
+
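+ # returns 1 if loc2 starts too early to combine with loc1 (e.g. the gap is below train_min_gap_size), -1 if loc2 lies too far right (e.g. the combined span would exceed train_max_initial_size), and 0 if the two matchings can be combined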
+ cdef long compare_matchings(self, Matching* loc1, Matching* loc2, int offset_by_one, int len_last):
+ cdef int i
+
+ if loc1.sent_id > loc2.sent_id:
+ return 1
+ if loc2.sent_id > loc1.sent_id:
+ return -1
+
+ if loc1.size == 1 and loc2.size == 1:
+ if loc2.arr[loc2.start] - loc1.arr[loc1.start] <= self.train_min_gap_size:
+ return 1
+
+ elif offset_by_one:
+ for i from 1 <= i < loc1.size:
+ if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i-1]:
+ return 1
+ if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i-1]:
+ return -1
+
+ else:
+ if loc1.arr[loc1.start]+1 > loc2.arr[loc2.start]:
+ return 1
+ if loc1.arr[loc1.start]+1 < loc2.arr[loc2.start]:
+ return -1
+
+ for i from 1 <= i < loc1.size:
+ if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i]:
+ return 1
+ if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i]:
+ return -1
+
+ if loc2.arr[loc2.end-1] + len_last - loc1.arr[loc1.start] > self.train_max_initial_size:
+ return -1
+ return 0
+
+
+ cdef int* mergeHelper(self, int low1, int high1, int* arr1, int step1,
+ int low2, int high2, int* arr2, int step2,
+ int offset_by_one, int len_last, int num_subpatterns, int* result_len):
+ cdef int i1, i2, j1, j2
+ cdef long comparison
+ cdef int* result
+ cdef Matching loc1, loc2
+# cdef int i
+
+# pad = " "
+# log.writeln("->mergeHelper", 5)
+
+ result_len[0] = 0
+ result = <int*> malloc(0*sizeof(int))
+
+ i1 = low1
+ i2 = low2
+# if log.level==5:
+# log.writeln("%sMERGE lists [%d,%d,%d] and [%d,%d,%d]" % (pad,low1,high1,step1,low2,high2,step2), 5)
+# log.writeln("%soffset_by_one: %d, len_last: %d" % (pad, offset_by_one, len_last), 5)
+# log.write("[")
+# for i from low1 <= i < high1:
+# log.write("%d, " % arr1.arr[i],5)
+# log.writeln("]")
+# log.write("[")
+# for i from low2 <= i < high2:
+# log.write("%d, " % arr2.arr[i],5)
+# log.writeln("]")
+ while i1 < high1 and i2 < high2:
+
+# '''First, pop all unneeded loc2's off the stack'''
+ assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr)
+# if log.level==5:
+# log.writeln("%s TOP1 %s" % (pad,matching2str(loc1)),5)
+ while i2 < high2:
+ assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr)
+ if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1:
+# if log.level==5:
+# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5)
+# log.writeln("%s POP2 %s" % (pad,matching2str(loc2)),5)
+ i2 = i2 + step2
+ else:
+ break
+
+# '''Next: process all loc1's with the same starting val'''
+ j1 = i1
+ while i1 < high1 and arr1[j1] == arr1[i1]:
+ assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr)
+ j2 = i2
+ while j2 < high2:
+ assign_matching(&loc2, arr2, j2, step2, self.fda.sent_id.arr)
+ comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last)
+ if comparison == 0:
+# if log.level==5:
+# log.writeln("%s %s == %s" % (pad,matching2str(loc1),matching2str(loc2)),5)
+ result = append_combined_matching(result, &loc1, &loc2, offset_by_one, num_subpatterns, result_len)
+ if comparison == 1:
+# if log.level==5:
+# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5)
+ pass
+ if comparison == -1:
+# if log.level==5:
+# log.writeln("%s %s < %s" % (pad,matching2str(loc1),matching2str(loc2)),5)
+ break
+ else:
+ j2 = j2 + step2
+# if log.level==5:
+# log.writeln("%s POP1 %s" % (pad,matching2str(loc1)),5)
+ i1 = i1 + step1
+
+# log.writeln("<-mergeHelper", 5)
+ return result
+
+
+ cdef void sortPhraseLoc(self, cintlist.CIntList arr, PhraseLocation loc, rule.Phrase phrase):
+ cdef int i, j
+ cdef cveb.VEB veb
+ cdef cintlist.CIntList result
+
+ if phrase in self.precomputed_index:
+ loc.arr = self.precomputed_index[phrase]
+ else:
+ loc.arr = cintlist.CIntList(initial_len=loc.sa_high-loc.sa_low)
+ veb = cveb.VEB(arr.len)
+ for i from loc.sa_low <= i < loc.sa_high:
+ veb._insert(arr.arr[i])
+ i = veb.veb.min_val
+ for j from 0 <= j < loc.sa_high-loc.sa_low:
+ loc.arr.arr[j] = i
+ i = veb._findsucc(i)
+ loc.arr_low = 0
+ loc.arr_high = loc.arr.len
+
+
+ cdef intersectHelper(self, rule.Phrase prefix, rule.Phrase suffix,
+ PhraseLocation prefix_loc, PhraseLocation suffix_loc, int algorithm):
+
+ cdef cintlist.CIntList arr1, arr2, result
+ cdef int low1, high1, step1, low2, high2, step2, offset_by_one, len_last, num_subpatterns, result_len
+ cdef int* result_ptr
+ cdef csuf.SuffixArray suf
+
+ result_len = 0
+
+ if sym.isvar(suffix[0]):
+ offset_by_one = 1
+ else:
+ offset_by_one = 0
+
+ len_last = len(suffix.getchunk(suffix.arity()))
+
+ if prefix_loc.arr is None:
+ suf = self.context_manager.fsarray
+ self.sortPhraseLoc(suf.sa, prefix_loc, prefix)
+ arr1 = prefix_loc.arr
+ low1 = prefix_loc.arr_low
+ high1 = prefix_loc.arr_high
+ step1 = prefix_loc.num_subpatterns
+
+ if suffix_loc.arr is None:
+ suf = self.context_manager.fsarray
+ self.sortPhraseLoc(suf.sa, suffix_loc, suffix)
+ arr2 = suffix_loc.arr
+ low2 = suffix_loc.arr_low
+ high2 = suffix_loc.arr_high
+ step2 = suffix_loc.num_subpatterns
+
+ num_subpatterns = prefix.arity()+1
+
+ if algorithm == MERGE:
+ result_ptr = self.mergeHelper(low1, high1, arr1.arr, step1,
+ low2, high2, arr2.arr, step2,
+ offset_by_one, len_last, num_subpatterns, &result_len)
+ else:
+ result_ptr = self.baezaYatesHelper(low1, high1, arr1.arr, step1,
+ low2, high2, arr2.arr, step2,
+ offset_by_one, len_last, num_subpatterns, &result_len)
+
+ if result_len == 0:
+ free(result_ptr)
+ return None
+ else:
+ result = cintlist.CIntList()
+ free(result.arr)
+ result.arr = result_ptr
+ result.len = result_len
+ result.size = result_len
+ return PhraseLocation(arr_low=0, arr_high=result_len, arr=result, num_subpatterns=num_subpatterns)
+
+ cdef loc2str(self, PhraseLocation loc):
+ cdef int i, j
+ result = "{"
+ i = 0
+ while i < loc.arr_high:
+ result = result + "("
+ for j from i <= j < i + loc.num_subpatterns:
+ result = result + ("%d " %loc.arr[j])
+ result = result + ")"
+ i = i + loc.num_subpatterns
+ result = result + "}"
+ return result
+
+# cdef compareResults(self, PhraseLocation loc1, PhraseLocation loc2, phrase, type1, type2):
+# cdef i
+# if loc1 is None and type1=="pre":
+# return
+# if loc1 is None:
+# if loc2 is None or loc2.arr_high == 0:
+# return
+# if loc2 is None:
+# if loc1.arr_high == 0:
+# return
+# if loc1.arr_high != loc2.arr_high:
+# log.writeln("ERROR: %d vs %d (%s vs %s)" % (loc1.arr_high, loc2.arr_high, type1, type2))
+# #log.writeln(" %s" % self.loc2str(loc2))
+# if loc1.arr_high == 0:
+# return
+# elif loc1.num_subpatterns != loc2.num_subpatterns:
+# log.writeln("ERROR 2: %d vs %d (%d v %d) %s" % (loc1.num_subpatterns, loc2.num_subpatterns, loc1.arr_high, loc2.arr_high, phrase))
+# for i from 0 <= i < loc1.arr_high:
+# if loc1.arr[i] != loc2.arr[i]:
+# log.writeln("ERROR 3")
+#
+ cdef PhraseLocation intersect(self, prefix_node, suffix_node, rule.Phrase phrase):
+ cdef rule.Phrase prefix, suffix
+ cdef PhraseLocation prefix_loc, suffix_loc, result
+
+ start_time = monitor.cpu()
+ prefix = prefix_node.phrase
+ suffix = suffix_node.phrase
+ prefix_loc = prefix_node.phrase_location
+ suffix_loc = suffix_node.phrase_location
+
+ result = self.getPrecomputedCollocation(phrase)
+ if result is not None:
+ intersect_method = "precomputed"
+
+ if result is None:
+ if self.useBaezaYates:
+ result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, BAEZA_YATES)
+ intersect_method="double binary"
+ else:
+ result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, MERGE)
+ intersect_method="merge"
+ stop_time = monitor.cpu()
+ intersect_time = stop_time - start_time
+ if self.log_int_stats:
+ if intersect_method == "precomputed":
+ sort1 = "none"
+ sort2 = "none"
+ else:
+ if prefix in self.precomputed_index:
+ sort1 = "index"
+ else:
+ sort1 = "veb"
+ if suffix in self.precomputed_index:
+ sort2 = "index"
+ else:
+ sort2 = "veb"
+ result_len=0
+ if result is not None:
+ result_len = len(result.arr)/result.num_subpatterns
+ rank = 0
+# if phrase in self.max_rank:
+# rank = self.max_rank[phrase]
+# else:
+# rank = self.precompute_rank + 10
+ log.writeln("INT %d %d %d %d %d %f %d %s %s %s" %
+ (len(prefix)+1 - prefix.arity(), prefix.arity(),
+ nGramCount(prefix_node.phrase_location),
+ nGramCount(suffix_node.phrase_location),
+ result_len, intersect_time, rank, intersect_method, sort1, sort2))
+ return result
+
+ def advance(self, frontier, res, fwords):
+ nf = []
+ for (toskip, (i, alt, pathlen)) in frontier:
+ spanlen = fwords[i][alt][2]
+ if (toskip == 0):
+ #log.writeln("RES: (%d %d %d)" % (i, alt, pathlen), 3)
+ res.append((i, alt, pathlen))
+ ni = i + spanlen
+ #log.writeln("proc: %d (%d %d %d) sl=%d ni=%d len(fwords)=%d" % (toskip, i, alt, pathlen, spanlen, ni, len(fwords)), 3)
+ if (ni < len(fwords) and (pathlen + 1) < self.max_initial_size):
+ for na in xrange(len(fwords[ni])):
+ nf.append((toskip - 1, (ni, na, pathlen + 1)))
+ if (len(nf) > 0):
+ return self.advance(nf, res, fwords)
+ else:
+ return res
+
+ def get_all_nodes_isteps_away(self, skip, i, spanlen, pathlen, fwords, next_states, reachable_buffer):
+ frontier = []
+ if (i+spanlen+skip >= len(next_states)):
+ return frontier
+ #print "get_all_nodes_isteps_away from %i" % (i)
+ key = tuple([i,spanlen])
+ reachable = []
+ if (key in reachable_buffer):
+ reachable = reachable_buffer[key]
+ else:
+ reachable = self.reachable(fwords, i, spanlen)
+ reachable_buffer[key] = reachable
+ #print "reachable(from=%i,dist=%i) = " % (i,spanlen)
+ #print reachable
+ for nextreachable in reachable:
+ for next_id in next_states[nextreachable]:
+ jump = self.shortest(fwords,i,next_id)
+ #print "checking next_id = %i, pathlen[sofar] = %i, jump = %i" % (next_id,pathlen,jump)
+ #if (next_id - (i+spanlen)) < skip:
+ if jump < skip:
+ continue
+ #if next_id-(i-pathlen) < self.max_initial_size:
+ if pathlen+jump <= self.max_initial_size:
+ for alt_id in xrange(len(fwords[next_id])):
+ if (fwords[next_id][alt_id][0] != cn.epsilon):
+ #frontier.append((next_id,alt_id,next_id-(i-pathlen)));
+ #print "finding the shortest from %i to %i" % (i, next_id)
+ newel = (next_id,alt_id,pathlen+jump)
+ if newel not in frontier:
+ frontier.append((next_id,alt_id,pathlen+jump))
+ #print "appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump)
+ #else:
+ #print "NOT appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump)
+ #else:
+ #print "next_id = %s is aborted\n" % next_id
+ #print "returning frontier"
+ #print frontier
+ return frontier
+
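+ # positions reachable from ifrom by consuming exactly 'dist' non-epsilon tokens of the input lattice (epsilon arcs are followed for free)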
+ def reachable(self, fwords, ifrom, dist):
+ #print "inside reachable(%i,%i)" % (ifrom,dist)
+ ret = []
+ if (ifrom >= len(fwords)):
+ return ret
+ for alt_id in xrange(len(fwords[ifrom])):
+ if (fwords[ifrom][alt_id][0] == cn.epsilon):
+ ret.extend(self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist))
+ else:
+ if (dist==0):
+ if (ifrom not in ret):
+ ret.append(ifrom)
+ else:
+ for ifromchild in self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist-1):
+ if (ifromchild not in ret):
+ ret.append(ifromchild)
+
+ return ret
+
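+ # minimum number of non-epsilon tokens on any lattice path from ifrom to ito (returns 1000 if ito is unreachable)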
+ def shortest(self, fwords, ifrom, ito):
+ min = 1000
+ #print "shortest ifrom=%i, ito=%i" % (ifrom,ito)
+ if (ifrom > ito):
+ return min
+ if (ifrom == ito):
+ return 0
+ for alt_id in xrange(len(fwords[ifrom])):
+ currmin = self.shortest(fwords,ifrom+fwords[ifrom][alt_id][2],ito)
+ if (fwords[ifrom][alt_id][0] != cn.epsilon):
+ currmin += 1
+ if (currmin<min):
+ min = currmin
+ return min
+
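+ # collect, in sorted order, the lattice positions reachable from curr_idx whose distance (non-epsilon tokens consumed) lies between min_dist and max_initial_size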
+ def get_next_states(self, _columns, curr_idx, min_dist=2):
+ result = []
+ candidate = [[curr_idx,0]]
+
+ while len(candidate) > 0:
+ curr = candidate.pop()
+ if curr[0] >= len(_columns):
+ continue
+ if curr[0] not in result and min_dist <= curr[1] <= self.max_initial_size:
+ result.append(curr[0])
+ curr_col = _columns[curr[0]]
+ for alt in curr_col:
+ next_id = curr[0]+alt[2]
+ jump = 1
+ if (alt[0] == cn.epsilon):
+ jump = 0
+ if next_id not in result and min_dist <= curr[1]+jump <= self.max_initial_size+1:
+ candidate.append([next_id,curr[1]+jump])
+ return sorted(result)
+
+ def input(self, fwords, meta):
+ '''When this function is called on the RuleFactory,
+ it looks up all of the rules that can be used to translate
+ the input sentence'''
+ cdef int i, j, k, flen, arity, num_subpatterns, num_samples
+ cdef float start_time
+ cdef PhraseLocation phrase_location
+ cdef cintlist.CIntList sample, chunklen
+ cdef Matching matching
+ cdef rule.Phrase hiero_phrase
+
+ #fwords = [ ((1,0.0,1),), fwords1 ] #word id for <s> = 1, cost = 0.0, next = 1
+ #print fwords
+ flen = len(fwords)
+ #print "length = %i" % flen
+ start_time = monitor.cpu()
+ self.intersect_time = 0.0
+ self.extract_time = 0.0
+ nodes_isteps_away_buffer = {}
+ hit = 0
+ reachable_buffer = {}
+ #print "id = ",meta
+ #print "rule_file = ",self.rule_file
+ dattrs = sgml.attrs_to_dict(meta)
+ id = dattrs.get('id', 'NOID')
+ if self.per_sentence_grammar:
+ self.rule_filehandler = open(self.rule_file+'.'+id, 'w')
+ self.excluded_sent_id = int(dattrs.get('exclude', '-1'))
+
+ #print "max_initial_size = %i" % self.max_initial_size
+
+ if not self.cacheBetweenSents:
+ self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation())
+ self.grammar.root = [None, {}]
+
+ frontier = []
+ for i in xrange(len(fwords)):
+ for alt in xrange(0, len(fwords[i])):
+ if fwords[i][alt][0] != cn.epsilon:
+ frontier.append((i, i, alt, 0, self.rules.root, (), False))
+
+ xroot = None
+ x1 = sym.setindex(self.category, 1)
+ if x1 in self.rules.root.children:
+ xroot = self.rules.root.children[x1]
+ else:
+ xroot = ExtendedTrieNode(suffix_link=self.rules.root, phrase_location=PhraseLocation())
+ self.rules.root.children[x1] = xroot
+
+ for i in xrange(self.min_gap_size, len(fwords)):
+ for alt in xrange(0, len(fwords[i])):
+ if fwords[i][alt][0] != cn.epsilon:
+ frontier.append((i-self.min_gap_size, i, alt, self.min_gap_size, xroot, (x1,), True))
+ '''for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier:
+ if len(prefix)>0:
+ print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path
+ else:
+ print k, i, alt, pathlen, node, prefix, is_shadow_path'''
+
+ #for wid in xrange(1000):
+ # print "%i = %s" % (wid, sym.tostring(wid))
+ next_states = []
+ for i in xrange(len(fwords)):
+ next_states.append(self.get_next_states(fwords,i,self.min_gap_size))
+ #print "next state of %i" % i
+ #print next_states[i]
+
+ while len(frontier) > 0:
+ #print "frontier = %i" % len(frontier)
+ this_iter_intersect_time = self.intersect_time
+ new_frontier = []
+ for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier:
+ #print "looking at: "
+ #if len(prefix)>0:
+ # print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path
+ #else:
+ # print k, i, alt, pathlen, node, prefix, is_shadow_path
+ word_id = fwords[i][alt][0]
+ spanlen = fwords[i][alt][2]
+ #print "word_id = %i, %s" % (word_id, sym.tostring(word_id))
+ # to prevent .. [X] </S>
+ #print "prefix = ",prefix
+ #if word_id == 2 and len(prefix)>=2:
+ #print "at the end: %s" % (prefix[len(prefix)-1])
+ #if prefix[len(prefix)-1]<0:
+ #print "break"
+ #continue
+ #print "continuing"
+ #if pathlen + spanlen > self.max_initial_size:
+ #continue
+ # TODO get rid of k -- pathlen is replacing it
+ if word_id == cn.epsilon:
+ #print "skipping because word_id is epsilon"
+ if i+spanlen >= len(fwords):
+ continue
+ for nualt in xrange(0,len(fwords[i+spanlen])):
+ frontier.append((k, i+spanlen, nualt, pathlen, node, prefix, is_shadow_path))
+ continue
+
+ phrase = prefix + (word_id,)
+ str_phrase = map(sym.tostring, phrase)
+ hiero_phrase = rule.Phrase(phrase)
+ arity = hiero_phrase.arity()
+
+ #print "pos %2d, node %5d, '%s'" % (i, node.id, hiero_phrase)
+ if self.search_file is not None:
+ self.search_file.write("%s\n" % hiero_phrase)
+
+ lookup_required = False
+ if word_id in node.children:
+ if node.children[word_id] is None:
+ #print "Path dead-ends at this node\n"
+ continue
+ else:
+ #print "Path continues at this node\n"
+ node = node.children[word_id]
+ else:
+ if node.suffix_link is None:
+ #print "Current node is root; lookup required\n"
+ lookup_required = True
+ else:
+ if word_id in node.suffix_link.children:
+ if node.suffix_link.children[word_id] is None:
+ #print "Suffix link reports path is dead end\n"
+ node.children[word_id] = None
+ continue
+ else:
+ #print "Suffix link indicates lookup is required\n"
+ lookup_required = True
+ else:
+ #print "ERROR: We never get here\n"
+ raise Exception("Keyword trie error")
+ #new_frontier.append((k, i, alt, pathlen, node, prefix, is_shadow_path))
+ #print "checking whether lookup_required\n"
+ if lookup_required:
+ new_node = None
+ if is_shadow_path:
+ #print "Extending shadow path for %s \n"
+ # on the shadow path we don't do any search, we just use info from suffix link
+ new_node = ExtendedTrieNode(phrase_location=node.suffix_link.children[word_id].phrase_location,
+ suffix_link=node.suffix_link.children[word_id],
+ phrase=hiero_phrase)
+ else:
+ if arity > 0:
+ #print "Intersecting for %s because of arity > 0\n" % hiero_phrase
+ phrase_location = self.intersect(node, node.suffix_link.children[word_id], hiero_phrase)
+ else:
+ #print "Suffix array search for %s" % hiero_phrase
+ phrase_location = node.phrase_location
+ sa_range = self.context_manager.fsarray.lookup(str_phrase[-1], len(str_phrase)-1, phrase_location.sa_low, phrase_location.sa_high)
+ if sa_range is not None:
+ phrase_location = PhraseLocation(sa_low=sa_range[0], sa_high=sa_range[1])
+ else:
+ phrase_location = None
+
+ if phrase_location is None:
+ node.children[word_id] = None
+ #print "Search failed\n"
+ continue
+ #print "Search succeeded\n"
+ suffix_link = self.rules.root
+ if node.suffix_link is not None:
+ suffix_link = node.suffix_link.children[word_id]
+ new_node = ExtendedTrieNode(phrase_location=phrase_location,
+ suffix_link=suffix_link,
+ phrase=hiero_phrase)
+ node.children[word_id] = new_node
+ node = new_node
+ #print "Added node %d with suffix link %d\n" % (node.id, node.suffix_link.id)
+
+ '''Automatically add a trailing X node, if allowed --
+ This should happen before we get to extraction (so that
+ the node will exist if needed)'''
+ if arity < self.max_nonterminals:
+ xcat_index = arity+1
+ xcat = sym.setindex(self.category, xcat_index)
+ suffix_link_xcat_index = xcat_index
+ if is_shadow_path:
+ suffix_link_xcat_index = xcat_index-1
+ suffix_link_xcat = sym.setindex(self.category, suffix_link_xcat_index)
+ node.children[xcat] = ExtendedTrieNode(phrase_location=node.phrase_location,
+ suffix_link=node.suffix_link.children[suffix_link_xcat],
+ phrase= rule.Phrase(phrase + (xcat,)))
+ #log.writeln("Added node %d with suffix link %d (for X)" % (node.children[xcat].id, node.children[xcat].suffix_link.id), 4)
+
+ # sample from range
+ if not is_shadow_path:
+ #print "is_not_shadow_path"
+ sample = self.context_manager.sampler.sample(node.phrase_location)
+ #print "node.phrase_location %s" % str(node.phrase_location)
+ #print "sample.len = %i" % len(sample)
+ num_subpatterns = (<PhraseLocation> node.phrase_location).num_subpatterns
+ chunklen = cintlist.CIntList(initial_len=num_subpatterns)
+ for j from 0 <= j < num_subpatterns:
+ chunklen.arr[j] = hiero_phrase.chunklen(j)
+ extracts = []
+ j = 0
+ extract_start = monitor.cpu()
+ '''orig_tight_phrases = self.tight_phrases
+ orig_require_aligned_terminal = self.require_aligned_terminal
+ orig_require_aligned_chunks = self.require_aligned_chunks
+ if k==0 or i==len(fwords)-1:
+ self.tight_phrases = 0
+ self.require_aligned_terminal = 0
+ self.require_aligned_chunks = 0'''
+ while j < sample.len:
+ extract = []
+
+ assign_matching(&matching, sample.arr, j, num_subpatterns, self.fda.sent_id.arr)
+ '''print "tight_phrase = "
+ print self.tight_phrases
+ print "require_aligned_terminal = "
+ print self.require_aligned_terminal
+ print "require_aligned_chunks = "
+ print self.require_aligned_chunks'''
+
+ extract = self.extract(hiero_phrase, &matching, chunklen.arr, num_subpatterns)
+ extracts.extend(extract)
+ j = j + num_subpatterns
+ '''self.tight_phrases = orig_tight_phrases
+ self.require_aligned_terminal = orig_require_aligned_terminal
+ self.require_aligned_chunks = orig_require_aligned_chunks'''
+ num_samples = sample.len/num_subpatterns
+ extract_stop = monitor.cpu()
+ self.extract_time = self.extract_time + extract_stop - extract_start
+ #print "extract.size = %i" % len(extracts)
+ if len(extracts) > 0:
+ fphrases = {}
+ fals = {}
+ fcount = {}
+ for f, e, count, als in extracts:
+ fcount.setdefault(f, 0.0)
+ fcount[f] = fcount[f] + count
+ fphrases.setdefault(f, {})
+ fphrases[f].setdefault(e, {})
+ #fphrases[f][e] = fphrases[f][e] + count
+ fphrases[f][e].setdefault(als,0.0)
+ fphrases[f][e][als] = fphrases[f][e][als] + count
+ #print "f,e,als ",f," : ",e," : ",als," count = ",fphrases[f][e][als]
+ #fals[str(f)+" ||| "+str(e)] = als
+ for f, elist in fphrases.iteritems():
+ #print "f = '%s'" % f
+ #if (str(f) in ['<s>','</s>','<s> [X,1]','[X,1] </s>']):
+ # print "rejected"
+ # continue
+ f_margin = fcount[f]
+ for e, alslist in elist.iteritems():
+ alignment = None
+ count = 0
+ for als, currcount in alslist.iteritems():
+ #print "als = ",als,", count = ",currcount
+ if currcount > count:
+ alignment = als
+ count = currcount
+ #alignment = fals[str(f)+" ||| "+str(e)]
+ #print "selected = ",alignment," with count = ",count
+ scores = []
+ for m in self.context_manager.models:
+ scores.append(m.compute_contextless_score(f, e, count, fcount[f], num_samples))
+ r = rule.Rule(self.category, f, e, scores=scores, owner="context", word_alignments = alignment)
+ self.grammar.add(r)
+ if self.rule_filehandler is not None:
+ self.rule_filehandler.write("%s\n" % r.to_line())
+ #print "adding a rule = %s" % r
+
+ #if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+spanlen < self.max_initial_size:
+ if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+1 <= self.max_initial_size:
+ #to prevent [X] </s>
+ #print "lexicalized"
+ for alt_id in xrange(len(fwords[i+spanlen])):
+ #if (fwords[i+spanlen][alt_id][2]+pathlen+spanlen <= self.max_initial_size):
+ #new_frontier.append((k, i+spanlen, alt_id, pathlen + spanlen, node, phrase, is_shadow_path))
+ #print "alt_id = %d\n" % alt_id
+ new_frontier.append((k, i+spanlen, alt_id, pathlen + 1, node, phrase, is_shadow_path))
+ #print (k, i+spanlen, alt_id, pathlen + spanlen, node, map(sym.tostring,phrase), is_shadow_path)
+ #print "end lexicalized"
+ num_subpatterns = arity
+ if not is_shadow_path:
+ num_subpatterns = num_subpatterns + 1
+ #to avoid <s> X ... we want <s> next to a lexicalized item
+ #if k>0 and i<len(fwords)-1 and len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks:
+ if len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks:
+ #print "entered condition"
+ xcat = sym.setindex(self.category, arity+1)
+ xnode = node.children[xcat]
+ #frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, spanlen, pathlen, fwords, next_states)
+ # I put spanlen=1 below
+ key = tuple([self.min_gap_size, i, 1, pathlen])
+ frontier_nodes = []
+ if (key in nodes_isteps_away_buffer):
+ frontier_nodes = nodes_isteps_away_buffer[key]
+ else:
+ frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, 1, pathlen, fwords, next_states, reachable_buffer)
+ nodes_isteps_away_buffer[key] = frontier_nodes
+
+ #print "new frontier:\n"
+ for (i, alt, pathlen) in frontier_nodes:
+ #if (pathlen+fwords[i][alt][2] <= self.max_initial_size):
+ new_frontier.append((k, i, alt, pathlen, xnode, phrase +(xcat,), is_shadow_path))
+ #print k, i, alt, pathlen, node, map(sym.tostring,phrase +(xcat,)), is_shadow_path
+ #print "all end\n";
+ #else:
+ #print "no new frontier1\n";
+ #else :
+ #print "no new frontier2\n"
+ if self.log_int_stats:
+ log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time))
+ frontier = new_frontier
+
+ stop_time = monitor.cpu()
+ log.writeln("Total time for rule lookup, extraction, and scoring = %f seconds" % (stop_time - start_time))
+ #log.writeln(" Intersect time = %f seconds" % self.intersect_time)
+ gc.collect()
+ log.writeln(" Extract time = %f seconds" % self.extract_time)
+ if self.pruned_rule_file:
+ self.grammar.dump(self.pruned_rule_file)
+ if self.per_sentence_grammar:
+ self.rule_filehandler.close()
+
+# else:
+# self.rule_filehandler.write("###EOS_"+ id +"\n")
+
+
+ cdef int find_fixpoint(self,
+ int f_low, f_high,
+ int* f_links_low, int* f_links_high,
+ int* e_links_low, int* e_links_high,
+ int e_in_low, int e_in_high,
+ int* e_low, int* e_high,
+ int* f_back_low, int* f_back_high,
+ int f_sent_len, int e_sent_len,
+ int max_f_len, int max_e_len,
+ int min_fx_size, int min_ex_size,
+ int max_new_x,
+ int allow_low_x, int allow_high_x,
+ int allow_arbitrary_x, int write_log):
+ cdef int e_low_prev, e_high_prev, f_low_prev, f_high_prev, new_x, new_low_x, new_high_x
+
+ e_low[0] = e_in_low
+ e_high[0] = e_in_high
+ self.find_projection(f_low, f_high, f_links_low, f_links_high, e_low, e_high)
+ if e_low[0] == -1:
+ # low-priority corner case: if phrase w is unaligned,
+ # but we don't require aligned terminals, then returning
+ # an error here might prevent extraction of allowed
+ # rule X -> X_1 w X_2 / X_1 X_2. This is probably
+ # not worth the bother, though.
+ #print "find_fixpoint0"
+ return 0
+ elif e_in_low != -1 and e_low[0] != e_in_low:
+ if e_in_low - e_low[0] < min_ex_size:
+ e_low[0] = e_in_low - min_ex_size
+ if e_low[0] < 0:
+ #print "find_fixpoint1"
+ return 0
+
+ if e_high[0] - e_low[0] > max_e_len:
+ #print "find_fixpoint2"
+ return 0
+ elif e_in_high != -1 and e_high[0] != e_in_high:
+ if e_high[0] - e_in_high < min_ex_size:
+ e_high[0] = e_in_high + min_ex_size
+ if e_high[0] > e_sent_len:
+ #print "find_fixpoint3"
+ return 0
+
+ f_back_low[0] = -1
+ f_back_high[0] = -1
+ f_low_prev = f_low
+ f_high_prev = f_high
+ new_x = 0
+ new_low_x = 0
+ new_high_x = 0
+
+ while True:
+
+ if f_back_low[0] == -1:
+ self.find_projection(e_low[0], e_high[0], e_links_low, e_links_high, f_back_low, f_back_high)
+ else:
+ self.find_projection(e_low[0], e_low_prev, e_links_low, e_links_high, f_back_low, f_back_high)
+ self.find_projection(e_high_prev, e_high[0], e_links_low, e_links_high, f_back_low, f_back_high)
+
+ if f_back_low[0] > f_low:
+ f_back_low[0] = f_low
+
+ if f_back_high[0] < f_high:
+ f_back_high[0] = f_high
+
+ if f_back_low[0] == f_low_prev and f_back_high[0] == f_high_prev:
+ return 1
+
+ if allow_low_x == 0 and f_back_low[0] < f_low:
+# log.writeln(" FAIL: f phrase is not tight")
+ #print " FAIL: f phrase is not tight"
+ return 0
+
+ if f_back_high[0] - f_back_low[0] > max_f_len:
+# log.writeln(" FAIL: f back projection is too wide")
+ #print " FAIL: f back projection is too wide"
+ return 0
+
+ if allow_high_x == 0 and f_back_high[0] > f_high:
+# log.writeln(" FAIL: extension on high side not allowed")
+ #print " FAIL: extension on high side not allowed"
+ return 0
+
+ if f_low != f_back_low[0]:
+ if new_low_x == 0:
+ if new_x >= max_new_x:
+# log.writeln(" FAIL: extension required on low side violates max # of gaps")
+ #print " FAIL: extension required on low side violates max # of gaps"
+ return 0
+ else:
+ new_x = new_x + 1
+ new_low_x = 1
+ if f_low - f_back_low[0] < min_fx_size:
+ f_back_low[0] = f_low - min_fx_size
+ if f_back_high[0] - f_back_low[0] > max_f_len:
+# log.writeln(" FAIL: extension required on low side violates max initial length")
+ #print " FAIL: extension required on low side violates max initial length"
+ return 0
+ if f_back_low[0] < 0:
+# log.writeln(" FAIL: extension required on low side violates sentence boundary")
+ #print " FAIL: extension required on low side violates sentence boundary"
+ return 0
+
+ if f_high != f_back_high[0]:
+ if new_high_x == 0:
+ if new_x >= max_new_x:
+# log.writeln(" FAIL: extension required on high side violates max # of gaps")
+ #print " FAIL: extension required on high side violates max # of gaps"
+ return 0
+ else:
+ new_x = new_x + 1
+ new_high_x = 1
+ if f_back_high[0] - f_high < min_fx_size:
+ f_back_high[0] = f_high + min_fx_size
+ if f_back_high[0] - f_back_low[0] > max_f_len:
+# log.writeln(" FAIL: extension required on high side violates max initial length")
+ #print " FAIL: extension required on high side violates max initial length"
+ return 0
+ if f_back_high[0] > f_sent_len:
+# log.writeln(" FAIL: extension required on high side violates sentence boundary")
+ #print " FAIL: extension required on high side violates sentence boundary"
+ return 0
+
+ e_low_prev = e_low[0]
+ e_high_prev = e_high[0]
+
+ self.find_projection(f_back_low[0], f_low_prev, f_links_low, f_links_high, e_low, e_high)
+ self.find_projection(f_high_prev, f_back_high[0], f_links_low, f_links_high, e_low, e_high)
+ if e_low[0] == e_low_prev and e_high[0] == e_high_prev:
+ return 1
+ if allow_arbitrary_x == 0:
+# log.writeln(" FAIL: arbitrary expansion not permitted")
+ #print " FAIL: arbitrary expansion not permitted"
+ return 0
+ if e_high[0] - e_low[0] > max_e_len:
+# log.writeln(" FAIL: re-projection violates sentence max phrase length")
+ #print " FAIL: re-projection violates sentence max phrase length"
+ return 0
+ f_low_prev = f_back_low[0]
+ f_high_prev = f_back_high[0]
+
+
+ cdef find_projection(self, int in_low, int in_high, int* in_links_low, int* in_links_high,
+ int* out_low, int* out_high):
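+ # Project the half-open span [in_low, in_high) through the alignment
+ # links, widening *out_low/*out_high to cover every linked position
+ # on the other side; unaligned positions are skipped.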
+ cdef int i
+ for i from in_low <= i < in_high:
+ if in_links_low[i] != -1:
+ if out_low[0] == -1 or in_links_low[i] < out_low[0]:
+ out_low[0] = in_links_low[i]
+ if out_high[0] == -1 or in_links_high[i] > out_high[0]:
+ out_high[0] = in_links_high[i]
+
+
+ cdef int* int_arr_extend(self, int* arr, int* arr_len, int* data, int data_len):
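+ # Append data_len ints from data onto arr, growing it with realloc
+ # and updating *arr_len; returns the (possibly moved) array.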
+ cdef int new_len
+ new_len = arr_len[0] + data_len
+ arr = <int*> realloc(arr, new_len*sizeof(int))
+ memcpy(arr+arr_len[0], data, data_len*sizeof(int))
+ arr_len[0] = new_len
+ return arr
+
+
+ cdef extract_phrases(self, int e_low, int e_high, int* e_gap_low, int* e_gap_high, int* e_links_low, int num_gaps,
+ int f_low, int f_high, int* f_gap_low, int* f_gap_high, int* f_links_low,
+ int sent_id, int e_sent_len, int e_sent_start):
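+ # Enumerate every target-side phrase consistent with the target span
+ # [e_low, e_high) and the given gap structure.  When tight_phrases is
+ # off, phrase and gap boundaries may slide over unaligned target
+ # words, so several candidates can be produced for one source phrase.
+ # Returns a list of (target Phrase, index list) pairs, filtered by
+ # max_target_length and max_target_chunks.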
+ cdef int i, j, k, m, n, *e_gap_order, e_x_low, e_x_high, e_x_gap_low, e_x_gap_high
+ cdef int *e_gaps1, *e_gaps2, len1, len2, step, num_chunks
+ cdef cintlist.CIntList ephr_arr
+ cdef result
+
+ #print "inside extract_phrases"
+ #print "f_low=%d, f_high=%d" % (f_low,f_high)
+ result = []
+ len1 = 0
+ e_gaps1 = <int*> malloc(0)
+ ephr_arr = cintlist.CIntList()
+
+ e_gap_order = <int*> malloc(num_gaps*sizeof(int))
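+ # order the gaps by their target-side start position (insertion sort)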
+ if num_gaps > 0:
+ e_gap_order[0] = 0
+ for i from 1 <= i < num_gaps:
+ for j from 0 <= j < i:
+ if e_gap_low[i] < e_gap_low[j]:
+ for k from j <= k < i:
+ e_gap_order[k+1] = e_gap_order[k]
+ e_gap_order[j] = i
+ break
+ else:
+ e_gap_order[i] = i
+
+ e_x_low = e_low
+ e_x_high = e_high
+ if self.tight_phrases == 0:
+ while e_x_low > 0 and e_high - e_x_low < self.train_max_initial_size and e_links_low[e_x_low-1] == -1:
+ e_x_low = e_x_low - 1
+ while e_x_high < e_sent_len and e_x_high - e_low < self.train_max_initial_size and e_links_low[e_x_high] == -1:
+ e_x_high = e_x_high + 1
+
+ for i from e_x_low <= i <= e_low:
+ e_gaps1 = self.int_arr_extend(e_gaps1, &len1, &i, 1)
+
+ for i from 0 <= i < num_gaps:
+ e_gaps2 = <int*> malloc(0)
+ len2 = 0
+
+ j = e_gap_order[i]
+ e_x_gap_low = e_gap_low[j]
+ e_x_gap_high = e_gap_high[j]
+ if self.tight_phrases == 0:
+ while e_x_gap_low > e_x_low and e_links_low[e_x_gap_low-1] == -1:
+ e_x_gap_low = e_x_gap_low - 1
+ while e_x_gap_high < e_x_high and e_links_low[e_x_gap_high] == -1:
+ e_x_gap_high = e_x_gap_high + 1
+
+ k = 0
+ step = 1+(i*2)
+ while k < len1:
+ for m from e_x_gap_low <= m <= e_gap_low[j]:
+ if m >= e_gaps1[k+step-1]:
+ for n from e_gap_high[j] <= n <= e_x_gap_high:
+ if n-m >= 1: # extractor.py doesn't restrict target-side gap length
+ e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+k, step)
+ e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &m, 1)
+ e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &n, 1)
+ k = k + step
+ free(e_gaps1)
+ e_gaps1 = e_gaps2
+ len1 = len2
+
+ step = 1+(num_gaps*2)
+ e_gaps2 = <int*> malloc(0)
+ len2 = 0
+ for i from e_high <= i <= e_x_high:
+ j = 0
+ while j < len1:
+ if i - e_gaps1[j] <= self.train_max_initial_size and i >= e_gaps1[j+step-1]:
+ e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+j, step)
+ e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &i, 1)
+ j = j + step
+ free(e_gaps1)
+ e_gaps1 = e_gaps2
+ len1 = len2
+
+ step = (num_gaps+1)*2
+ i = 0
+
+ while i < len1:
+ ephr_arr._clear()
+ num_chunks = 0
+ indexes = []
+ for j from 0 <= j < num_gaps+1:
+ if e_gaps1[i+2*j] < e_gaps1[i+(2*j)+1]:
+ num_chunks = num_chunks + 1
+ for k from e_gaps1[i+2*j] <= k < e_gaps1[i+(2*j)+1]:
+ indexes.append(k)
+ ephr_arr._append(self.eid2symid[self.eda.data.arr[e_sent_start+k]])
+ if j < num_gaps:
+ indexes.append(sym.setindex(self.category, e_gap_order[j]+1))
+ ephr_arr._append(sym.setindex(self.category, e_gap_order[j]+1))
+ i = i + step
+ if ephr_arr.len <= self.max_target_length and num_chunks <= self.max_target_chunks:
+ result.append((rule.Phrase(ephr_arr),indexes))
+
+ free(e_gaps1)
+ free(e_gap_order)
+ return result
+
+ cdef create_alignments(self, int* sent_links, int num_links, findexes, eindexes):
+ #print "create_alignments"
+ #s = "sent_links = "
+ #i = 0
+ #while (i < num_links*2):
+ # s = s+"%d-%d " % (sent_links[i],sent_links[i+1])
+ # i += 2
+ #print s
+ #print findexes
+ #print eindexes
+
+ ret = cintlist.CIntList()
+ for i in xrange(len(findexes)):
+ s = findexes[i]
+ if (s<0):
+ continue
+ idx = 0
+ while (idx < num_links*2):
+ if (sent_links[idx] == s):
+ j = eindexes.index(sent_links[idx+1])
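+ # pack the alignment point as one int: source position i in the
+ # high 16 bits, target position j in the low 16 bits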
+ ret.append(i*65536+j)
+ idx += 2
+ return ret
+
+ cdef extract(self, rule.Phrase phrase, Matching* matching, int* chunklen, int num_chunks):
+ cdef int* sent_links, *e_links_low, *e_links_high, *f_links_low, *f_links_high
+ cdef int *f_gap_low, *f_gap_high, *e_gap_low, *e_gap_high, num_gaps, gap_start
+ cdef int i, j, k, e_i, f_i, num_links, num_aligned_chunks, met_constraints
+ cdef int f_low, f_high, e_low, e_high, f_back_low, f_back_high
+ cdef int e_sent_start, e_sent_end, f_sent_start, f_sent_end, e_sent_len, f_sent_len
+ cdef int e_word_count, f_x_low, f_x_high, e_x_low, e_x_high, phrase_len
+ cdef float pair_count
+ cdef float available_mass
+ cdef extracts, phrase_list
+ cdef cintlist.CIntList fphr_arr
+ cdef rule.Phrase fphr
+ cdef reason_for_failure
+
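+ # Extract all rules licensed by this occurrence of the source phrase:
+ # find the aligned target span with find_fixpoint, emit the basic
+ # rule, then try variants with an additional nonterminal on the left,
+ # on the right, and on both sides.  Each extracted pair carries a
+ # fractional count of available_mass / (number of target phrases).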
+ fphr_arr = cintlist.CIntList()
+ phrase_len = phrase.n
+ extracts = []
+ sent_links = self.alignment._get_sent_links(matching.sent_id, &num_links)
+
+ e_sent_start = self.eda.sent_index.arr[matching.sent_id]
+ e_sent_end = self.eda.sent_index.arr[matching.sent_id+1]
+ e_sent_len = e_sent_end - e_sent_start - 1
+ f_sent_start = self.fda.sent_index.arr[matching.sent_id]
+ f_sent_end = self.fda.sent_index.arr[matching.sent_id+1]
+ f_sent_len = f_sent_end - f_sent_start - 1
+ available_mass = 1.0
+ if matching.sent_id == self.excluded_sent_id:
+ available_mass = 0.0
+
+ self.findexes1.reset()
+ sofar = 0
+ for i in xrange(num_chunks):
+ for j in xrange(chunklen[i]):
+ self.findexes1.append(matching.arr[matching.start+i]+j-f_sent_start)
+ sofar += 1
+ if (i+1<num_chunks):
+ self.findexes1.append(phrase[sofar])
+ sofar += 1
+
+
+ e_links_low = <int*> malloc(e_sent_len*sizeof(int))
+ e_links_high = <int*> malloc(e_sent_len*sizeof(int))
+ f_links_low = <int*> malloc(f_sent_len*sizeof(int))
+ f_links_high = <int*> malloc(f_sent_len*sizeof(int))
+ f_gap_low = <int*> malloc((num_chunks+1)*sizeof(int))
+ f_gap_high = <int*> malloc((num_chunks+1)*sizeof(int))
+ e_gap_low = <int*> malloc((num_chunks+1)*sizeof(int))
+ e_gap_high = <int*> malloc((num_chunks+1)*sizeof(int))
+ memset(f_gap_low, 0, (num_chunks+1)*sizeof(int))
+ memset(f_gap_high, 0, (num_chunks+1)*sizeof(int))
+ memset(e_gap_low, 0, (num_chunks+1)*sizeof(int))
+ memset(e_gap_high, 0, (num_chunks+1)*sizeof(int))
+
+ reason_for_failure = ""
+
+ for i from 0 <= i < e_sent_len:
+ e_links_low[i] = -1
+ e_links_high[i] = -1
+ for i from 0 <= i < f_sent_len:
+ f_links_low[i] = -1
+ f_links_high[i] = -1
+
+ # this is really inefficient -- might be good to
+ # somehow replace with binary search just for the f
+ # links that we care about (but then how to look up
+ # when we want to check something on the e side?)
+ i = 0
+ while i < num_links*2:
+ f_i = sent_links[i]
+ e_i = sent_links[i+1]
+ if f_links_low[f_i] == -1 or f_links_low[f_i] > e_i:
+ f_links_low[f_i] = e_i
+ if f_links_high[f_i] == -1 or f_links_high[f_i] < e_i + 1:
+ f_links_high[f_i] = e_i + 1
+ if e_links_low[e_i] == -1 or e_links_low[e_i] > f_i:
+ e_links_low[e_i] = f_i
+ if e_links_high[e_i] == -1 or e_links_high[e_i] < f_i + 1:
+ e_links_high[e_i] = f_i + 1
+ i = i + 2
+
+ als = []
+ for x in xrange(matching.start,matching.end):
+ al = (matching.arr[x]-f_sent_start,f_links_low[matching.arr[x]-f_sent_start])
+ als.append(al)
+ # check all source-side alignment constraints
+ met_constraints = 1
+ if self.require_aligned_terminal:
+ num_aligned_chunks = 0
+ for i from 0 <= i < num_chunks:
+ for j from 0 <= j < chunklen[i]:
+ if f_links_low[matching.arr[matching.start+i]+j-f_sent_start] > -1:
+ num_aligned_chunks = num_aligned_chunks + 1
+ break
+ if num_aligned_chunks == 0:
+ reason_for_failure = "No aligned terminals"
+ met_constraints = 0
+ if self.require_aligned_chunks and num_aligned_chunks < num_chunks:
+ reason_for_failure = "Unaligned chunk"
+ met_constraints = 0
+
+ if met_constraints and self.tight_phrases:
+ # outside edge constraints are checked later
+ for i from 0 <= i < num_chunks-1:
+ if f_links_low[matching.arr[matching.start+i]+chunklen[i]-f_sent_start] == -1:
+ reason_for_failure = "Gaps are not tight phrases"
+ met_constraints = 0
+ break
+ if f_links_low[matching.arr[matching.start+i+1]-1-f_sent_start] == -1:
+ reason_for_failure = "Gaps are not tight phrases"
+ met_constraints = 0
+ break
+
+ f_low = matching.arr[matching.start] - f_sent_start
+ f_high = matching.arr[matching.start + matching.size - 1] + chunklen[num_chunks-1] - f_sent_start
+ if met_constraints:
+
+ if self.find_fixpoint(f_low, f_high, f_links_low, f_links_high, e_links_low, e_links_high,
+ -1, -1, &e_low, &e_high, &f_back_low, &f_back_high, f_sent_len, e_sent_len,
+ self.train_max_initial_size, self.train_max_initial_size,
+ self.train_min_gap_size, 0,
+ self.max_nonterminals - num_chunks + 1, 1, 1, 0, 0):
+ gap_error = 0
+ num_gaps = 0
+
+ if f_back_low < f_low:
+ f_gap_low[0] = f_back_low
+ f_gap_high[0] = f_low
+ num_gaps = 1
+ gap_start = 0
+ phrase_len = phrase_len+1
+ if phrase_len > self.max_length:
+ gap_error = 1
+ if self.tight_phrases:
+ if f_links_low[f_back_low] == -1 or f_links_low[f_low-1] == -1:
+ gap_error = 1
+ reason_for_failure = "Inside edges of preceding subphrase are not tight"
+ else:
+ gap_start = 1
+ if self.tight_phrases and f_links_low[f_low] == -1:
+ # this is not a hard error. we can't extract this phrase
+ # but we still might be able to extract a superphrase
+ met_constraints = 0
+
+ for i from 0 <= i < matching.size - 1:
+ f_gap_low[1+i] = matching.arr[matching.start+i] + chunklen[i] - f_sent_start
+ f_gap_high[1+i] = matching.arr[matching.start+i+1] - f_sent_start
+ num_gaps = num_gaps + 1
+
+ if f_high < f_back_high:
+ f_gap_low[gap_start+num_gaps] = f_high
+ f_gap_high[gap_start+num_gaps] = f_back_high
+ num_gaps = num_gaps + 1
+ phrase_len = phrase_len+1
+ if phrase_len > self.max_length:
+ gap_error = 1
+ if self.tight_phrases:
+ if f_links_low[f_back_high-1] == -1 or f_links_low[f_high] == -1:
+ gap_error = 1
+ reason_for_failure = "Inside edges of following subphrase are not tight"
+ else:
+ if self.tight_phrases and f_links_low[f_high-1] == -1:
+ met_constraints = 0
+
+ if gap_error == 0:
+ e_word_count = e_high - e_low
+ for i from 0 <= i < num_gaps: # check integrity of subphrases
+ if self.find_fixpoint(f_gap_low[gap_start+i], f_gap_high[gap_start+i],
+ f_links_low, f_links_high, e_links_low, e_links_high,
+ -1, -1, e_gap_low+gap_start+i, e_gap_high+gap_start+i,
+ f_gap_low+gap_start+i, f_gap_high+gap_start+i,
+ f_sent_len, e_sent_len,
+ self.train_max_initial_size, self.train_max_initial_size,
+ 0, 0, 0, 0, 0, 0, 0) == 0:
+ gap_error = 1
+ reason_for_failure = "Subphrase [%d, %d] failed integrity check" % (f_gap_low[gap_start+i], f_gap_high[gap_start+i])
+ break
+
+ if gap_error == 0:
+ i = 1
+ self.findexes.reset()
+ if f_back_low < f_low:
+ fphr_arr._append(sym.setindex(self.category, i))
+ i = i+1
+ self.findexes.append(sym.setindex(self.category, i))
+ self.findexes.extend(self.findexes1)
+ for j from 0 <= j < phrase.n:
+ if sym.isvar(phrase.syms[j]):
+ fphr_arr._append(sym.setindex(self.category, i))
+ i = i + 1
+ else:
+ fphr_arr._append(phrase.syms[j])
+ if f_back_high > f_high:
+ fphr_arr._append(sym.setindex(self.category, i))
+ self.findexes.append(sym.setindex(self.category, i))
+
+ fphr = rule.Phrase(fphr_arr)
+ if met_constraints:
+ phrase_list = self.extract_phrases(e_low, e_high, e_gap_low + gap_start, e_gap_high + gap_start, e_links_low, num_gaps,
+ f_back_low, f_back_high, f_gap_low + gap_start, f_gap_high + gap_start, f_links_low,
+ matching.sent_id, e_sent_len, e_sent_start)
+ #print "e_low=%d, e_high=%d, gap_start=%d, num_gaps=%d, f_back_low=%d, f_back_high=%d" & (e_low, e_high, gap_start, num_gaps, f_back_low, f_back_high)
+ if len(phrase_list) > 0:
+ pair_count = available_mass / len(phrase_list)
+ else:
+ pair_count = 0
+ reason_for_failure = "Didn't extract anything from [%d, %d] -> [%d, %d]" % (f_back_low, f_back_high, e_low, e_high)
+ for (phrase2,eindexes) in phrase_list:
+ als1 = self.create_alignments(sent_links,num_links,self.findexes,eindexes)
+ extracts.append((fphr, phrase2, pair_count, tuple(als1)))
+ if self.extract_file:
+ self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high))
+ #print "extract_phrases1: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high)
+
+ if (num_gaps < self.max_nonterminals and
+ phrase_len < self.max_length and
+ f_back_high - f_back_low + self.train_min_gap_size <= self.train_max_initial_size):
+ if (f_back_low == f_low and
+ f_low >= self.train_min_gap_size and
+ ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_back_high-1] != -1))):
+ f_x_low = f_low-self.train_min_gap_size
+ met_constraints = 1
+ if self.tight_phrases:
+ while f_x_low >= 0 and f_links_low[f_x_low] == -1:
+ f_x_low = f_x_low - 1
+ if f_x_low < 0 or f_back_high - f_x_low > self.train_max_initial_size:
+ met_constraints = 0
+
+ if (met_constraints and
+ self.find_fixpoint(f_x_low, f_back_high,
+ f_links_low, f_links_high, e_links_low, e_links_high,
+ e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high,
+ f_sent_len, e_sent_len,
+ self.train_max_initial_size, self.train_max_initial_size,
+ 1, 1, 1, 1, 0, 1, 0) and
+ ((not self.tight_phrases) or f_links_low[f_x_low] != -1) and
+ self.find_fixpoint(f_x_low, f_low, # check integrity of new subphrase
+ f_links_low, f_links_high, e_links_low, e_links_high,
+ -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high,
+ f_sent_len, e_sent_len,
+ self.train_max_initial_size, self.train_max_initial_size,
+ 0, 0, 0, 0, 0, 0, 0)):
+ fphr_arr._clear()
+ i = 1
+ self.findexes.reset()
+ self.findexes.append(sym.setindex(self.category, i))
+ fphr_arr._append(sym.setindex(self.category, i))
+ i = i+1
+ self.findexes.extend(self.findexes1)
+ for j from 0 <= j < phrase.n:
+ if sym.isvar(phrase.syms[j]):
+ fphr_arr._append(sym.setindex(self.category, i))
+ i = i + 1
+ else:
+ fphr_arr._append(phrase.syms[j])
+ if f_back_high > f_high:
+ fphr_arr._append(sym.setindex(self.category, i))
+ self.findexes.append(sym.setindex(self.category, i))
+ fphr = rule.Phrase(fphr_arr)
+ phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+1,
+ f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, matching.sent_id,
+ e_sent_len, e_sent_start)
+ if len(phrase_list) > 0:
+ pair_count = available_mass / len(phrase_list)
+ else:
+ pair_count = 0
+ for phrase2,eindexes in phrase_list:
+ als2 = self.create_alignments(sent_links,num_links,self.findexes,eindexes)
+ extracts.append((fphr, phrase2, pair_count, tuple(als2)))
+ if self.extract_file:
+ self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high))
+ #print "extract_phrases2: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)
+
+ if (f_back_high == f_high and
+ f_sent_len - f_high >= self.train_min_gap_size and
+ ((not self.tight_phrases) or (f_links_low[f_high] != -1 and f_links_low[f_back_low] != -1))):
+ f_x_high = f_high+self.train_min_gap_size
+ met_constraints = 1
+ if self.tight_phrases:
+ while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1:
+ f_x_high = f_x_high + 1
+ if f_x_high > f_sent_len or f_x_high - f_back_low > self.train_max_initial_size:
+ met_constraints = 0
+
+ if (met_constraints and
+ self.find_fixpoint(f_back_low, f_x_high,
+ f_links_low, f_links_high, e_links_low, e_links_high,
+ e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high,
+ f_sent_len, e_sent_len,
+ self.train_max_initial_size, self.train_max_initial_size,
+ 1, 1, 1, 0, 1, 1, 0) and
+ ((not self.tight_phrases) or f_links_low[f_x_high-1] != -1) and
+ self.find_fixpoint(f_high, f_x_high,
+ f_links_low, f_links_high, e_links_low, e_links_high,
+ -1, -1, e_gap_low+gap_start+num_gaps, e_gap_high+gap_start+num_gaps,
+ f_gap_low+gap_start+num_gaps, f_gap_high+gap_start+num_gaps,
+ f_sent_len, e_sent_len,
+ self.train_max_initial_size, self.train_max_initial_size,
+ 0, 0, 0, 0, 0, 0, 0)):
+ fphr_arr._clear()
+ i = 1
+ self.findexes.reset()
+ if f_back_low < f_low:
+ fphr_arr._append(sym.setindex(self.category, i))
+ i = i+1
+ self.findexes.append(sym.setindex(self.category, i))
+ self.findexes.extend(self.findexes1)
+ for j from 0 <= j < phrase.n:
+ if sym.isvar(phrase.syms[j]):
+ fphr_arr._append(sym.setindex(self.category, i))
+ i = i + 1
+ else:
+ fphr_arr._append(phrase.syms[j])
+ fphr_arr._append(sym.setindex(self.category, i))
+ self.findexes.append(sym.setindex(self.category, i))
+ fphr = rule.Phrase(fphr_arr)
+ phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low+gap_start, e_gap_high+gap_start, e_links_low, num_gaps+1,
+ f_x_low, f_x_high, f_gap_low+gap_start, f_gap_high+gap_start, f_links_low,
+ matching.sent_id, e_sent_len, e_sent_start)
+ if len(phrase_list) > 0:
+ pair_count = available_mass / len(phrase_list)
+ else:
+ pair_count = 0
+ for phrase2, eindexes in phrase_list:
+ als3 = self.create_alignments(sent_links,num_links,self.findexes,eindexes)
+ extracts.append((fphr, phrase2, pair_count, tuple(als3)))
+ if self.extract_file:
+ self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high))
+ #print "extract_phrases3: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)
+ if (num_gaps < self.max_nonterminals - 1 and
+ phrase_len+1 < self.max_length and
+ f_back_high == f_high and
+ f_back_low == f_low and
+ f_back_high - f_back_low + (2*self.train_min_gap_size) <= self.train_max_initial_size and
+ f_low >= self.train_min_gap_size and
+ f_high <= f_sent_len - self.train_min_gap_size and
+ ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_high] != -1))):
+
+ met_constraints = 1
+ f_x_low = f_low-self.train_min_gap_size
+ if self.tight_phrases:
+ while f_x_low >= 0 and f_links_low[f_x_low] == -1:
+ f_x_low = f_x_low - 1
+ if f_x_low < 0:
+ met_constraints = 0
+
+ f_x_high = f_high+self.train_min_gap_size
+ if self.tight_phrases:
+ while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1:
+ f_x_high = f_x_high + 1
+ if f_x_high > f_sent_len or f_x_high - f_x_low > self.train_max_initial_size:
+ met_constraints = 0
+
+ if (met_constraints and
+ self.find_fixpoint(f_x_low, f_x_high,
+ f_links_low, f_links_high, e_links_low, e_links_high,
+ e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high,
+ f_sent_len, e_sent_len,
+ self.train_max_initial_size, self.train_max_initial_size,
+ 1, 1, 2, 1, 1, 1, 1) and
+ ((not self.tight_phrases) or (f_links_low[f_x_low] != -1 and f_links_low[f_x_high-1] != -1)) and
+ self.find_fixpoint(f_x_low, f_low,
+ f_links_low, f_links_high, e_links_low, e_links_high,
+ -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high,
+ f_sent_len, e_sent_len,
+ self.train_max_initial_size, self.train_max_initial_size,
+ 0, 0, 0, 0, 0, 0, 0) and
+ self.find_fixpoint(f_high, f_x_high,
+ f_links_low, f_links_high, e_links_low, e_links_high,
+ -1, -1, e_gap_low+1+num_gaps, e_gap_high+1+num_gaps,
+ f_gap_low+1+num_gaps, f_gap_high+1+num_gaps,
+ f_sent_len, e_sent_len,
+ self.train_max_initial_size, self.train_max_initial_size,
+ 0, 0, 0, 0, 0, 0, 0)):
+ fphr_arr._clear()
+ i = 1
+ self.findexes.reset()
+ self.findexes.append(sym.setindex(self.category, i))
+ fphr_arr._append(sym.setindex(self.category, i))
+ i = i+1
+ self.findexes.extend(self.findexes1)
+ for j from 0 <= j < phrase.n:
+ if sym.isvar(phrase.syms[j]):
+ fphr_arr._append(sym.setindex(self.category, i))
+ i = i + 1
+ else:
+ fphr_arr._append(phrase.syms[j])
+ fphr_arr._append(sym.setindex(self.category, i))
+ self.findexes.append(sym.setindex(self.category, i))
+ fphr = rule.Phrase(fphr_arr)
+ phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+2,
+ f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low,
+ matching.sent_id, e_sent_len, e_sent_start)
+ if len(phrase_list) > 0:
+ pair_count = available_mass / len(phrase_list)
+ else:
+ pair_count = 0
+ for phrase2, eindexes in phrase_list:
+ als4 = self.create_alignments(sent_links,num_links,self.findexes,eindexes)
+ extracts.append((fphr, phrase2, pair_count, tuple(als4)))
+ if self.extract_file:
+ self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high))
+ #print "extract_phrases4 %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)
+ else:
+ reason_for_failure = "Unable to extract basic phrase"
+
+ free(sent_links)
+ free(f_links_low)
+ free(f_links_high)
+ free(e_links_low)
+ free(e_links_high)
+ free(f_gap_low)
+ free(f_gap_high)
+ free(e_gap_low)
+ free(e_gap_high)
+
+ if self.sample_file is not None:
+ self.sample_file.write("%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure))
+
+ #print "%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure)
+
+
+ return extracts
+
diff --git a/sa-extract/sa-compile.pl b/sa-extract/sa-compile.pl
new file mode 100755
index 00000000..1cae83a7
--- /dev/null
+++ b/sa-extract/sa-compile.pl
@@ -0,0 +1,322 @@
+#!/usr/bin/env perl
+
+use strict;
+use Getopt::Long;
+
+my $cwd; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $cwd = cwd(); }
+
+my $rootdir = `dirname $0`; chomp $rootdir;
+my $compile = "$rootdir/compile_bin.py";
+my $lcp = "$rootdir/lcp_ops.py";
+die "Can't find $compile" unless -f $compile;
+die "Can't execute $compile" unless -x $compile;
+
+sub print_help;
+sub cleanup;
+
+my $alignment;
+my $bitext;
+my $catalog;
+my $dryrun = 0;
+my $group;
+my $help = 0;
+my $ini = "$rootdir/extract.ini";
+my $lm;
+my $precomp;
+my $no_ini = 0;
+my $remove;
+my $type;
+my $local_only = 1;
+my $output;
+
+# Process command-line options
+if (GetOptions(
+ "alignment=s" => \$alignment,
+ "bitext=s" => \$bitext,
+ "help" => \$help,
+ "ini=s" => \$ini,
+ "output=s" => \$output,
+ "precomp-options=s" => \$precomp,
+ "no-ini" => \$no_ini,
+) == 0 || $help == 1 || @ARGV > 0){
+ print_help;
+ die "\n";
+}
+
+open(INI, $ini) or die "Can't read $ini: $!";
+
+$bitext || die "You must specify a bitext with -b\n";
+$alignment || die "You must specify an alignment with -a\n";
+
+my $top_dir;
+if (defined $output) {
+ $top_dir = $output;
+} else {
+ $top_dir = "$cwd/sa-compiled";
+}
+
+my $type_dir = "$top_dir";
+
+my $bitext_name;
+my $bitext_f_file;
+my $bitext_e_file;
+my $bitext_dir;
+if ($bitext){
+ if ($bitext =~ /(.*)=(.*),(.*)/){
+ $bitext_name = $1;
+ $bitext_f_file = $2;
+ $bitext_e_file = $3;
+ -e $bitext_f_file || die "Could not find file $bitext_f_file\n";
+ -e $bitext_e_file || die "Could not find file $bitext_e_file\n";
+ } else {
+ $bitext_name = $bitext;
+ }
+
+ $bitext_dir = "$type_dir/bitext/$bitext_name";
+ if ($bitext_f_file){
+ if (-e $bitext_dir) {
+ die "Bitext $bitext_name already exists\n";
+ }
+ } else {
+ unless (-e $bitext_dir){
+ die "No bitext $bitext_name. You must specify bitext files with -b\n";
+ }
+ }
+}
+
+my $max_nt = 2;
+my $max_len = 5;
+my $max_size = 15;
+my $min_gap = 1;
+my $rank1 = 100;
+my $rank2 = 10;
+my $precomp_file;
+if ($precomp){
+ unless ($bitext_name){
+ die "You must specify a bitext with -b if using -p\n";
+ }
+ my @precomp_args = split(/,/, $precomp);
+ my $precomp_arg;
+ for $precomp_arg (@precomp_args){
+ if ($precomp_arg =~ /(.*)=(.*)/){
+ my $key = $1;
+ my $value = $2;
+ unless ($value =~ /^\d+$/){
+ die "Value for -p option must be a positive integer, found $value\n";
+ }
+ if ($key eq "max-len"){ $max_len = $value; }
+ elsif ($key eq "max-nt"){ $max_nt = $value; }
+ elsif ($key eq "max-size"){ $max_size = $value; }
+ elsif ($key eq "min-gap"){ $min_gap = $value; }
+ elsif ($key eq "rank1"){ $rank1 = $value; }
+ elsif ($key eq "rank2"){ $rank2 = $value; }
+ else{
+ die "Unknown option $key given for -p\n";
+ }
+ } else {
+ die "When using -p, you must specify key-value pairs using syntax: <key1>=<value1>,...,<keyN>=<valueN>\n";
+ }
+ }
+}
+my $precomp_compile_needed = 0;
+if ($bitext_name){
+ $precomp_file = "$bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin";
+ unless (-e $precomp_file){
+ $precomp_compile_needed = 1;
+ }
+}
+
+my $alignment_name;
+my $alignment_file;
+my $alignment_dir;
+if ($alignment){
+ $bitext || die "Specified alignment $alignment without specifying bitext using -b\n";
+ if ($alignment =~ /(.*)=(.*)/){
+ $alignment_name = $1;
+ $alignment_file = $2;
+ -e $alignment_file || die "Could not find file $alignment_file\n";
+ } else {
+ $alignment_name = $alignment;
+ }
+
+ $alignment_dir = "$bitext_dir/a/$alignment_name";
+ if ($alignment_file){
+ if (-e $alignment_dir){
+ die "Alignment $alignment_name already exists for bitext $bitext_name\n";
+ }
+ } else {
+ unless (-e $alignment_dir){
+ die "No alignment $alignment_name for bitext $bitext_name\n";
+ }
+ }
+}
+
+if ($bitext_name){
+ print STDERR " Bitext = $bitext_name";
+ if ($bitext_f_file){
+ print STDERR " from files $bitext_f_file and $bitext_e_file\n";
+ } else {
+ print STDERR "\n";
+ }
+} else {
+ print STDERR " No bitext\n";
+}
+if ($precomp_compile_needed){
+ print STDERR " Precompilation needed: max-len=$max_len, max-nt=$max_nt, max-size=$max_size, min-gap=$min_gap, rank1=$rank1, rank2=$rank2\n";
+}
+if ($alignment_name){
+ print STDERR " Alignment = $alignment_name";
+ if ($alignment_file){
+ print STDERR " from file $alignment_file\n";
+ }
+} else {
+ print STDERR " No alignment\n";
+}
+
+my $script;
+my $compile_dir;
+$SIG{INT} = "cleanup";
+$SIG{TERM} = "cleanup";
+$SIG{HUP} = "cleanup";
+
+ if ($bitext_e_file || $precomp_compile_needed || $alignment_file){
+ my $compiled_e_file;
+ my $compiled_f_file;
+
+ $compile_dir = $top_dir;
+ my $compile_top_dir = "$compile_dir";
+
+ my $compile_bitext_dir = "$compile_top_dir/bitext/$bitext_name";
+ if ($bitext_e_file){
+ `mkdir -p $compile_bitext_dir`;
+ print STDERR "\nCompiling bitext (f side)...\n";
+ `$compile -s $bitext_f_file $compile_bitext_dir/f.sa.bin`;
+ die "Command failed: $!" unless $? == 0;
+ print STDERR "\nCompiling bitext (e side)...\n";
+ `$compile -d $bitext_e_file $compile_bitext_dir/e.bin`;
+ die "Command failed: $!" unless $? == 0;
+
+ $compiled_f_file = "$compile_bitext_dir/f.sa.bin";
+ $compiled_e_file = "$compile_bitext_dir/e.bin";
+ } else { # bitext already compiled
+ $compiled_f_file = "$bitext_dir/f.sa.bin";
+ $compiled_e_file = "$bitext_dir/e.bin";
+ }
+
+ if ($precomp_compile_needed){
+ `mkdir -p $compile_bitext_dir`;
+ my $top_stats_file = "$compile_bitext_dir/f.top.$rank1";
+ my $compiled_precomp_file = "$compile_bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin";
+ my $cmd = "$lcp -t 4 $compiled_f_file | sort -nr | head -$rank1 > $top_stats_file";
+ print STDERR "$cmd\n";
+ `$cmd`;
+ die "Command failed: $cmd" unless $? == 0;
+ `$compile -r max-len=$max_len max-nt=$max_nt max-size=$max_size min-gap=$min_gap rank1=$rank1 rank2=$rank2 sa=$compiled_f_file $top_stats_file $compiled_precomp_file`;
+ die "Command failed: $!" unless $? == 0;
+ }
+
+ if ($alignment_file){
+ my $compile_alignment_dir = "$compile_top_dir/bitext/$bitext_name/a/$alignment_name";
+ `mkdir -p $compile_alignment_dir`;
+ print STDERR "\nCompiling alignment...\n";
+ my $cmd= "$compile -a $alignment_file $compile_alignment_dir/a.bin";
+ print STDERR " $cmd\n";
+ `$cmd`;
+ die "Command failed: $!" unless $? == 0;
+
+ print STDERR "\nCompiling lexical weights file...\n";
+ $cmd="$compile -x $compiled_f_file $compiled_e_file $compile_alignment_dir/a.bin $compile_alignment_dir/lex.bin";
+ print STDERR " $cmd\n";
+ `$cmd`;
+ die "Command failed: $!" unless $? == 0;
+ }
+
+ chdir $compile_dir;
+ print STDERR "Compiling done: $compile_dir\n";
+ }
+
+ unless ($no_ini){
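+ # Rewrite the template ini so its file paths and grammar parameters
+ # point at the files compiled above; the result goes to stdout.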
+ my $line;
+ while($line=<INI>){
+ $line =~ s/^([^#]*a_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/a.bin$3/;
+ $line =~ s/^([^#]*lex_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/lex.bin$3/;
+ $line =~ s/^([^#]*f_sa_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/f.sa.bin$3/;
+ $line =~ s/^([^#]*e_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/e.bin$3/;
+ $line =~ s/^([^#]*precompute_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin$3/;
+
+ $line =~ s/^([^#]*max_len\s*=\s*)(.*)(\s*)$/$1$max_len$3/;
+ $line =~ s/^([^#]*max_nt\s*=\s*)(.*)(\s*)$/$1$max_nt$3/;
+ $line =~ s/^([^#]*max_size\s*=\s*)(.*)(\s*)$/$1$max_size$3/;
+ $line =~ s/^([^#]*min_gap\s*=\s*)(.*)(\s*)$/$1$min_gap$3/;
+ $line =~ s/^([^#]*rank1\s*=\s*)(.*)(\s*)$/$1$rank1$3/;
+ $line =~ s/^([^#]*rank2\s*=\s*)(.*)(\s*)$/$1$rank2$3/;
+
+ print $line;
+ }
+ }
+
+exit(0);
+
+sub cleanup {
+ die "Cleanup.\n";
+}
+
+sub print_help
+{
+ my $name = `basename $0`; chomp $name;
+ print << "Help";
+
+usage: $name [options]
+
+ Manage compilation of SA-Hiero files and creation of ini files.
+ In the default usage, the command deploys a set of files needed
+ to create a system, and writes an ini for the system on stdout.
+
+options:
+
+ -a, --alignment <name>[=<filename>]
+ Name of an alignment of a bitext (which must be specified
+ with -b). If a filename is given, the alignment file will be
+ deployed under that name; otherwise an alignment already
+ compiled under that name is expected.
+
+ -b, --bitext <name>[=<f file>,<e file>]
+ Name of a bitext. If filenames are given, the files will be
+ deployed under that name; otherwise a bitext already compiled
+ under that name is expected.
+
+ -h, --help
+ Prints this message.
+
+ -i, --ini <filename>
+ Use a specific ini file as the template for a system, rather than
+ the default ini file.
+
+ -p, --precomp-options <key1>=<value1>[,<key2>=<value2>,...,<keyN>=<valueN>]
+ Set parameters of the grammar. These must be set through $name
+ because several of them affect the precomputation. There are six keys that can
+ be set:
+ max-len: maximum number of symbols (T and NT) in a grammar rule
+ max-nt: maximum number of nonterminals in a grammar rule
+ max-size: maximum span of a grammar rule extracted from training
+ min-gap: minimum gap spanned by a nonterminal in training
+ rank1: number of frequent words to precompute collocations for.
+ rank2: number of super-frequent words to precompute triple
+ collocations for.
+ All values must be positive integers. If not specified, defaults are:
+ max-len = 5
+ max-nt = 2 (>2 not supported)
+ max-size = 15
+ min-gap = 1
+ rank1 = 100 (>300 not recommended)
+ rank2 = 10 (>10 not recommended)
+
+ -n, --no-ini
+ Do not generate an ini file on stdout; only compile and deploy
+ the model files.
+
+ -o, --output <directory>
+ Write the compiled model to this directory (default: sa-compiled
+ under the current working directory).
+
+Help
+}
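+
+# Example invocation (file and corpus names below are illustrative only):
+#   sa-compile.pl -b mydata=corpus.fr,corpus.en -a gdfa=corpus.align -o ./sa-compiled > mysystem.ini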
diff --git a/sa-extract/setup.cfg b/sa-extract/setup.cfg
new file mode 100644
index 00000000..8f696136
--- /dev/null
+++ b/sa-extract/setup.cfg
@@ -0,0 +1,2 @@
+[build_ext]
+inplace=1
diff --git a/sa-extract/setup.py b/sa-extract/setup.py
new file mode 100644
index 00000000..cdcbfb54
--- /dev/null
+++ b/sa-extract/setup.py
@@ -0,0 +1,45 @@
+from distutils.core import setup, Extension
+from distutils.util import get_platform
+import os.path
+
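+# Each Cython-generated C source below is built as its own extension
+# module, so setup() is invoked once per module.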
+cstrmap_module = Extension('cstrmap', sources = ['cstrmap.c', 'strmap.cc'])
+setup (name = 'CStringMap', version = '1.0', description = 'C string->int map', ext_modules = [cstrmap_module])
+
+rule_module = Extension('rule',
+ sources = ['rule.c', 'strutil.c'])
+setup (name = 'Rule', version = '1.0', description = 'rule class', ext_modules = [rule_module])
+
+sym_module = Extension('sym',
+ sources = ['sym.c'])
+setup (name = 'Sym', version = '1.0', description = 'symbol class', ext_modules = [sym_module])
+
+cdat_module = Extension('cdat', sources = ['cdat.c'])
+setup(name = "CDat", version = '1.0', description = 'C Data class', ext_modules = [cdat_module])
+
+cintlist_module = Extension('cintlist', sources = ['cintlist.c'])
+setup(name = "CIntList", version = '1.0', description = 'C int array/list class', ext_modules = [cintlist_module])
+
+cfloatlist_module = Extension('cfloatlist', sources = ['cfloatlist.c'])
+setup(name = "CFloatList", version = '1.0', description = 'C float array/list class', ext_modules = [cfloatlist_module])
+
+calignment_module = Extension('calignment', sources = ['calignment.c'])
+setup(name = "CAlignment", version = '1.0', description = 'C alignment class', ext_modules = [calignment_module])
+
+csuf_module = Extension('csuf', sources = ['csuf.c'])
+setup(name = "CSuffixArray", version = '1.0', description = 'C suffix array class', ext_modules = [csuf_module])
+
+clex_module = Extension('clex', sources = ['clex.c'])
+setup(name = "CLex", version = '1.0', description = 'C lexical class', ext_modules = [clex_module])
+
+factory_module = Extension('rulefactory', sources = ['rulefactory.c'])
+setup(name = "RuleFactory", version = '1.0', description = 'C rule factory classes', ext_modules = [factory_module])
+
+cveb_module = Extension('cveb', sources = ['cveb.c'])
+setup(name = "CVEB", version = '1.0', description = 'C impl. of van Emde Boas tree', ext_modules = [cveb_module])
+
+lcp_module = Extension('lcp', sources = ['lcp.c'])
+setup(name = "LCP", version = '1.0', description = 'C impl. of LCP', ext_modules = [lcp_module])
+
+precomp_module = Extension('precomputation', sources = ['precomputation.c'])
+setup(name = "Precomputation", version = '1.0', description = 'Precomputation Algorithm', ext_modules = [precomp_module])
+
diff --git a/sa-extract/sgml.py b/sa-extract/sgml.py
new file mode 100644
index 00000000..2db8b5dc
--- /dev/null
+++ b/sa-extract/sgml.py
@@ -0,0 +1,194 @@
+import sys, sgmllib, xml.sax.saxutils, sym
+
+def attrs_to_str(d):
+ if len(d) == 0:
+ return ""
+ l = [""]+["%s=%s" % (name, xml.sax.saxutils.quoteattr(value)) for (name, value) in d]
+ return " ".join(l)
+
+def attrs_to_dict(a):
+ d = {}
+ for (name, value) in a:
+ if d.has_key(name.lower()):
+ raise ValueError, "duplicate attribute names"
+ d[name.lower()] = value
+ return d
+
+class Sentence(object):
+ def __init__(self, words=None, meta=None):
+ if words is not None:
+ self.words = list(words)
+ else:
+ self.words = []
+ if meta is not None:
+ self.meta = meta
+ else:
+ self.meta = []
+
+ def copy(self):
+ return Sentence(self.words, list(self.meta))
+
+ def mark(self, tag, attrs):
+ self.meta.append((tag, attrs, 0, len(self.words)))
+
+ def getmark(self):
+ if len(self.meta) > 0:
+ (tag, attrs, i, j) = self.meta[-1]
+ if i == 0 and j == len(self.words):
+ return (tag, attrs)
+ else:
+ return None
+ else:
+ return None
+
+ def unmark(self):
+ mark = self.getmark()
+ if mark is not None:
+ self.meta = self.meta[:-1]
+ return mark
+
+ def __cmp__(self, other):
+ return cmp((self.words, self.meta), (other.words, other.meta))
+
+ def __str__(self):
+ def cmp_spans((tag1,attr1,i1,j1),(tag2,attr2,i2,j2)):
+ if i1==i2<=j1==j2:
+ return 0
+ elif i2<=i1<=j1<=j2:
+ return -1
+ elif i1<=i2<=j2<=j1:
+ return 1
+ else:
+ return cmp((i1,j1),(i2,j2)) # don't care
+ # this guarantees that equal spans will come out nested
+ # we want the later spans to be outer
+ # this relies on stable sort
+ open = [[] for i in xrange(len(self.words)+1)]
+ # there seems to be a bug still with empty spans
+ empty = [[] for i in xrange(len(self.words)+1)]
+ close = [[] for j in xrange(len(self.words)+1)]
+ for (tag,attrs,i,j) in sorted(self.meta, cmp=cmp_spans):
+ if i == j:
+ # do we want these to nest?
+ empty[i].append("<%s%s></%s>\n" % (tag, attrs_to_str(attrs), tag))
+ else:
+ open[i].append("<%s%s>\n" % (tag, attrs_to_str(attrs)))
+ close[j].append("</%s>\n" % tag)
+
+ result = []
+ if len(empty[0]) > 0:
+ result.extend(empty[0])
+ for i in xrange(len(self.words)):
+ if i > 0:
+ result.append(" ")
+ result.extend(reversed(open[i]))
+ result.append(xml.sax.saxutils.escape(sym.tostring(self.words[i])))
+ result.extend(close[i+1])
+ if len(empty[i+1]) > 0:
+ result.extend(empty[i+1])
+
+ return "".join(result)
+
+ def __add__(self, other):
+ if type(other) in (list, tuple):
+ return Sentence(self.words + list(other), self.meta)
+ else:
+ othermeta = [(tag, attrs, i+len(self.words), j+len(self.words)) for (tag, attrs, i, j) in other.meta]
+ return Sentence(self.words + other.words, self.meta+othermeta)
+
+def read_raw(f):
+ """Read a raw file into a list of Sentences."""
+ if type(f) is str:
+ f = file(f, "r")
+ i = 0
+ line = f.readline()
+ while line != "":
+ sent = process_sgml_line(line, i)
+ mark = sent.getmark()
+ if mark is None:
+ sent.mark('seg', [('id',str(i))])
+ else:
+ (tag, attrs) = mark
+ if tag == "seg" and not attrs_to_dict(attrs).has_key('id'):
+ x = ('id',str(i))
+ attrs.append(x)
+ sent.mark('seg', attrs)
+ if tag != "seg":
+ sent.mark('seg', [('id',str(i))])
+ yield sent
+ i += 1
+ line = f.readline()
+
+def process_sgml_line(line, id=None):
+ p = DatasetParser(None)
+ p.pos = 0
+ p.words = []
+ p.meta = []
+ p.feed(line)
+ p.close()
+ sent = Sentence(p.words, p.meta)
+ return sent
+
+class DatasetParser(sgmllib.SGMLParser):
+ def __init__(self, set):
+ sgmllib.SGMLParser.__init__(self)
+ self.words = None
+ self.mystack = []
+ self.string = None
+ self.convref = {"amp":"&", "lt":"<", "gt":">", "quot":'"', "squot":"'"}
+ def close(self):
+ self.convert()
+ sgmllib.SGMLParser.close(self)
+
+ def handle_starttag(self, tag, method, attrs):
+ thing = method(attrs)
+ self.mystack.append(thing)
+
+ def handle_endtag(self, tag, method):
+ thing = self.mystack.pop()
+ method(thing)
+
+ def unknown_starttag(self, tag, attrs):
+ thing = self.start(tag, attrs)
+ self.mystack.append(thing)
+
+ def unknown_endtag(self, tag):
+ thing = self.mystack.pop()
+ self.end(tag, thing)
+
+ def start(self, tag, attrs):
+ self.convert()
+ if self.words is not None:
+ return (tag, attrs, self.pos, None)
+ else:
+ return None
+
+ def convert(self):
+ if self.words is not None and self.string is not None:
+ words = self.string.split()
+ self.pos += len(words)
+ self.words.extend(words)
+ self.string = None
+
+ def end(self, tag, thing):
+ self.convert()
+ if self.words is not None:
+ (tag, attrs, i, j) = thing
+ self.meta.append((tag, attrs, i, self.pos))
+
+ def handle_data(self, s):
+ if self.words is not None:
+ if (self.string is None):
+ self.string = s
+ else:
+ self.string += s
+
+ def handle_entityref(self, ref):
+ # s=self.convert_entityref(ref) # if python 2.5
+ s=self.convref[ref]
+ if self.words is not None:
+ if (self.string is None):
+ self.string = s
+ else:
+ self.string += s
+
diff --git a/sa-extract/strmap.cc b/sa-extract/strmap.cc
new file mode 100644
index 00000000..5040477e
--- /dev/null
+++ b/sa-extract/strmap.cc
@@ -0,0 +1,232 @@
+#include "strmap.h"
+
+#include <vector>
+#include <string>
+#include <tr1/unordered_map>
+#include <stdint.h>
+
+using namespace std;
+using namespace std::tr1;
+
+#undef HAVE_64_BITS
+
+#if INTPTR_MAX == INT32_MAX
+# define HAVE_64_BITS 0
+#elif INTPTR_MAX >= INT64_MAX
+# define HAVE_64_BITS 1
+#else
+# error "couldn't tell if HAVE_64_BITS from INTPTR_MAX INT32_MAX INT64_MAX"
+#endif
+
+typedef uintptr_t MurmurInt;
+
+// MurmurHash2, by Austin Appleby
+
+static const uint32_t DEFAULT_SEED=2654435769U;
+
+#if HAVE_64_BITS
+//MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED);
+
+inline uint64_t MurmurHash64( const void * key, int len, unsigned int seed=DEFAULT_SEED )
+{
+ const uint64_t m = 0xc6a4a7935bd1e995ULL;
+ const int r = 47;
+
+ uint64_t h = seed ^ (len * m);
+
+ const uint64_t * data = (const uint64_t *)key;
+ const uint64_t * end = data + (len/8);
+
+ while(data != end)
+ {
+ uint64_t k = *data++;
+
+ k *= m;
+ k ^= k >> r;
+ k *= m;
+
+ h ^= k;
+ h *= m;
+ }
+
+ const unsigned char * data2 = (const unsigned char*)data;
+
+ switch(len & 7)
+ {
+ case 7: h ^= uint64_t(data2[6]) << 48;
+ case 6: h ^= uint64_t(data2[5]) << 40;
+ case 5: h ^= uint64_t(data2[4]) << 32;
+ case 4: h ^= uint64_t(data2[3]) << 24;
+ case 3: h ^= uint64_t(data2[2]) << 16;
+ case 2: h ^= uint64_t(data2[1]) << 8;
+ case 1: h ^= uint64_t(data2[0]);
+ h *= m;
+ };
+
+ h ^= h >> r;
+ h *= m;
+ h ^= h >> r;
+
+ return h;
+}
+
+inline uint32_t MurmurHash32(void const *key, int len, uint32_t seed=DEFAULT_SEED)
+{
+ return (uint32_t) MurmurHash64(key,len,seed);
+}
+
+inline MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED)
+{
+ return MurmurHash64(key,len,seed);
+}
+
+#else
+// 32-bit
+
+// Note - This code makes a few assumptions about how your machine behaves -
+// 1. We can read a 4-byte value from any address without crashing
+// 2. sizeof(int) == 4
+inline uint32_t MurmurHash32 ( const void * key, int len, uint32_t seed=DEFAULT_SEED)
+{
+ // 'm' and 'r' are mixing constants generated offline.
+ // They're not really 'magic', they just happen to work well.
+
+ const uint32_t m = 0x5bd1e995;
+ const int r = 24;
+
+ // Initialize the hash to a 'random' value
+
+ uint32_t h = seed ^ len;
+
+ // Mix 4 bytes at a time into the hash
+
+ const unsigned char * data = (const unsigned char *)key;
+
+ while(len >= 4)
+ {
+ uint32_t k = *(uint32_t *)data;
+
+ k *= m;
+ k ^= k >> r;
+ k *= m;
+
+ h *= m;
+ h ^= k;
+
+ data += 4;
+ len -= 4;
+ }
+
+ // Handle the last few bytes of the input array
+
+ switch(len)
+ {
+ case 3: h ^= data[2] << 16;
+ case 2: h ^= data[1] << 8;
+ case 1: h ^= data[0];
+ h *= m;
+ };
+
+ // Do a few final mixes of the hash to ensure the last few
+ // bytes are well-incorporated.
+
+ h ^= h >> 13;
+ h *= m;
+ h ^= h >> 15;
+
+ return h;
+}
+
+inline MurmurInt MurmurHash ( const void * key, int len, uint32_t seed=DEFAULT_SEED) {
+ return MurmurHash32(key,len,seed);
+}
+
+// 64-bit hash for 32-bit platforms
+
+inline uint64_t MurmurHash64 ( const void * key, int len, uint32_t seed=DEFAULT_SEED)
+{
+ const uint32_t m = 0x5bd1e995;
+ const int r = 24;
+
+ uint32_t h1 = seed ^ len;
+ uint32_t h2 = 0;
+
+ const uint32_t * data = (const uint32_t *)key;
+
+ while(len >= 8)
+ {
+ uint32_t k1 = *data++;
+ k1 *= m; k1 ^= k1 >> r; k1 *= m;
+ h1 *= m; h1 ^= k1;
+ len -= 4;
+
+ uint32_t k2 = *data++;
+ k2 *= m; k2 ^= k2 >> r; k2 *= m;
+ h2 *= m; h2 ^= k2;
+ len -= 4;
+ }
+
+ if(len >= 4)
+ {
+ uint32_t k1 = *data++;
+ k1 *= m; k1 ^= k1 >> r; k1 *= m;
+ h1 *= m; h1 ^= k1;
+ len -= 4;
+ }
+
+ switch(len)
+ {
+ case 3: h2 ^= ((unsigned char*)data)[2] << 16;
+ case 2: h2 ^= ((unsigned char*)data)[1] << 8;
+ case 1: h2 ^= ((unsigned char*)data)[0];
+ h2 *= m;
+ };
+
+ h1 ^= h2 >> 18; h1 *= m;
+ h2 ^= h1 >> 22; h2 *= m;
+ h1 ^= h2 >> 17; h1 *= m;
+ h2 ^= h1 >> 19; h2 *= m;
+
+ uint64_t h = h1;
+
+ h = (h << 32) | h2;
+
+ return h;
+}
+
+#endif
+//32bit
+
+struct MurmurHasher {
+ size_t operator()(const string& s) const {
+ return MurmurHash(s.c_str(), s.size());
+ }
+};
+
+struct StrMap {
+ StrMap() { keys_.reserve(10000); keys_.push_back("<bad0>"); map_[keys_[0]] = 0; }
+ unordered_map<string, int, MurmurHasher> map_;
+ vector<string> keys_;
+};
+
+StrMap* stringmap_new() {
+ return new StrMap;
+}
+
+void stringmap_delete(StrMap *vocab) {
+ delete vocab;
+}
+
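+// Index 0 is reserved for the dummy "<bad0>" entry, so the value 0
+// that map_[s] default-constructs for an unseen key can be used to
+// detect that the key is new.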
+int stringmap_index(StrMap *vocab, char *s) {
+ int& cell = vocab->map_[s];
+ if (!cell) {
+ cell = vocab->keys_.size();
+ vocab->keys_.push_back(s);
+ }
+ return cell;
+}
+
+char* stringmap_word(StrMap *vocab, int i) {
+ return const_cast<char *>(vocab->keys_[i].c_str());
+}
+
diff --git a/sa-extract/strmap.h b/sa-extract/strmap.h
new file mode 100644
index 00000000..a218a4c0
--- /dev/null
+++ b/sa-extract/strmap.h
@@ -0,0 +1,22 @@
+#ifndef _STRMAP_H_
+#define _STRMAP_H_
+
+#ifdef __cplusplus
+ extern "C" {
+#else
+ typedef struct StrMap StrMap; /* dummy type to stand in for class */
+#endif
+
+struct StrMap;
+
+StrMap* stringmap_new();
+void stringmap_delete(StrMap *vocab);
+int stringmap_index(StrMap *vocab, char *s);
+char* stringmap_word(StrMap *vocab, int i);
+
+#ifdef __cplusplus
+ }
+#endif
+
+
+#endif
diff --git a/sa-extract/strutil.c b/sa-extract/strutil.c
new file mode 100644
index 00000000..456de87a
--- /dev/null
+++ b/sa-extract/strutil.c
@@ -0,0 +1,63 @@
+#include <string.h>
+#include <stdlib.h>
+
+/* Like strsep(3) except that the delimiter is a string, not a set of characters.
+*/
+char *strstrsep(char **stringp, const char *delim) {
+ char *match, *save;
+ save = *stringp;
+ if (*stringp == NULL)
+ return NULL;
+ match = strstr(*stringp, delim);
+ if (match == NULL) {
+ *stringp = NULL;
+ return save;
+ }
+ *match = '\0';
+ *stringp = match + strlen(delim);
+ return save;
+}
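+/* Illustrative use (splits on a multi-character delimiter):
+ *   char buf[] = "a ||| b ||| c";
+ *   char *rest = buf, *tok;
+ *   while ((tok = strstrsep(&rest, " ||| ")) != NULL)
+ *       ;  // tok is "a", then "b", then "c"
+ */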
+
+static char **words = NULL;
+static int max_words;
+char **split(char *s, const char *delim, int *pn) {
+ int i;
+ char *tok, *rest;
+
+ if (words == NULL) {
+ max_words = 10;
+ words = malloc(max_words*sizeof(char *));
+ }
+ i = 0;
+ rest = s;
+ while ((tok = (delim ? strstrsep(&rest, delim) : strsep(&rest, " \t\n"))) != NULL) {
+ if (!delim && !*tok) // empty token
+ continue;
+ while (i+1 >= max_words) {
+ max_words *= 2;
+ words = realloc(words, max_words*sizeof(char *));
+ }
+ words[i] = tok;
+ i++;
+ }
+ words[i] = NULL;
+ if (pn != NULL)
+ *pn = i;
+ return words;
+}
+
+inline int isspace(char c) {
+ return (c == ' ' || c == '\t' || c == '\n');
+}
+
+char *strip(char *s) {
+ int n;
+ while (isspace(*s) && *s != '\0')
+ s++;
+ n = strlen(s);
+ while (n > 0 && isspace(s[n-1])) {
+ s[n-1] = '\0';
+ n--;
+ }
+ return s;
+}
diff --git a/sa-extract/strutil.h b/sa-extract/strutil.h
new file mode 100644
index 00000000..94a77033
--- /dev/null
+++ b/sa-extract/strutil.h
@@ -0,0 +1,8 @@
+#ifndef STRUTIL_H
+#define STRUTIL_H
+
+char *strstrsep(char **stringp, const char *delim);
+char *strip(char *s);
+char **split(char *s, const char *delim, int *pn);
+
+#endif
diff --git a/sa-extract/sym.pxd b/sa-extract/sym.pxd
new file mode 100644
index 00000000..d0650f46
--- /dev/null
+++ b/sa-extract/sym.pxd
@@ -0,0 +1,17 @@
+cimport cstrmap
+
+cdef class Alphabet:
+ cdef readonly cstrmap.StringMap terminals, nonterminals
+ cdef int first_nonterminal, last_nonterminal
+ cdef int isvar(self, int sym)
+ cdef int isword(self, int sym)
+ cdef int getindex(self, int sym)
+ cdef int setindex(self, int sym, int ind)
+ cdef int clearindex(self, int sym)
+ cdef int match(self, int sym1, int sym2)
+ cdef char* tocat(self, int sym)
+ cdef int fromcat(self, char *s)
+ cdef char* tostring(self, int sym)
+ cdef int fromstring(self, char *s, int terminal)
+
+
diff --git a/sa-extract/sym.pyx b/sa-extract/sym.pyx
new file mode 100644
index 00000000..264cfcd9
--- /dev/null
+++ b/sa-extract/sym.pyx
@@ -0,0 +1,155 @@
+from libc.string cimport strrchr, strstr, strcpy, strlen
+from libc.stdlib cimport malloc, realloc, strtol
+
+cdef int index_shift, index_mask, n_index
+index_shift = 3
+n_index = 1<<index_shift
+index_mask = (1<<index_shift)-1
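+# Symbol encoding: terminals are nonnegative StringMap indices;
+# nonterminals are negative.  For a nonterminal sym, -sym holds the
+# category id (+1) in the bits above index_shift and the coindexation
+# index (the 1 in [X,1]) in the low index_shift bits.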
+cdef id2sym
+id2sym = {}
+
+cdef class Alphabet:
+ def __cinit__(self):
+ self.terminals = cstrmap.StringMap()
+ self.nonterminals = cstrmap.StringMap()
+
+ def __init__(self):
+ self.first_nonterminal = -1
+
+ def __dealloc__(self):
+ pass
+
+ cdef int isvar(self, int sym):
+ return sym < 0
+
+ cdef int isword(self, int sym):
+ return sym >= 0
+
+ cdef int getindex(self, int sym):
+ return -sym & index_mask
+
+ cdef int setindex(self, int sym, int ind):
+ return -(-sym & ~index_mask | ind)
+
+ cdef int clearindex(self, int sym):
+ return -(-sym& ~index_mask)
+
+ cdef int match(self, int sym1, int sym2):
+ return self.clearindex(sym1) == self.clearindex(sym2)
+
+ cdef char* tocat(self, int sym):
+ return self.nonterminals.word((-sym >> index_shift)-1)
+
+ cdef int fromcat(self, char *s):
+ cdef int i
+ i = self.nonterminals.index(s)
+ if self.first_nonterminal == -1:
+ self.first_nonterminal = i
+ if i > self.last_nonterminal:
+ self.last_nonterminal = i
+ return -(i+1 << index_shift)
+
+ cdef char* tostring(self, int sym):
+ cdef int ind
+ if self.isvar(sym):
+ if sym in id2sym:
+ return id2sym[sym]
+
+ ind = self.getindex(sym)
+ if ind > 0:
+ id2sym[sym] = "[%s,%d]" % (self.tocat(sym), ind)
+ else:
+ id2sym[sym] = "[%s]" % self.tocat(sym)
+ return id2sym[sym]
+
+ else:
+ return self.terminals.word(sym)
+
+ cdef int fromstring(self, char *s, int terminal):
+ """Warning: this method is allowed to alter s."""
+ cdef char *comma
+ cdef int n
+ n = strlen(s)
+ cdef char *sep
+ sep = strstr(s,"_SEP_")
+ if n >= 3 and s[0] == c'[' and s[n-1] == c']' and sep == NULL:
+ if terminal:
+ s1 = "\\"+s
+ return self.terminals.index(s1)
+ s[n-1] = c'\0'
+ s = s + 1
+ comma = strrchr(s, c',')
+ if comma != NULL:
+ comma[0] = c'\0'
+ return self.setindex(self.fromcat(s), strtol(comma+1, NULL, 10))
+ else:
+ return self.fromcat(s)
+ else:
+ return self.terminals.index(s)
+
+# Expose the Alphabet methods as module-level Python functions for backward compatibility
+
+alphabet = Alphabet()
+
+cdef Alphabet calphabet
+calphabet = alphabet
+
+def isvar(int sym):
+ return calphabet.isvar(sym)
+
+def isword(int sym):
+ return calphabet.isword(sym)
+
+def getindex(int sym):
+ return calphabet.getindex(sym)
+
+def setindex(int sym, int ind):
+ return calphabet.setindex(sym, ind)
+
+def clearindex(int sym):
+ return calphabet.clearindex(sym)
+
+def match(int sym1, int sym2):
+ return calphabet.match(sym1, sym2) != 0
+
+def totag(int sym):
+ return calphabet.tocat(sym)
+
+def fromtag(s):
+ s = s.upper()
+ return calphabet.fromcat(s)
+
+def tostring(sym):
+ if type(sym) is str:
+ return sym
+ else:
+ return calphabet.tostring(sym)
+
+cdef int bufsize
+cdef char *buf
+bufsize = 100
+buf = <char *>malloc(bufsize)
+cdef ensurebufsize(int size):
+ global buf, bufsize
+ if size > bufsize:
+ buf = <char *>realloc(buf, size*sizeof(char))
+ bufsize = size
+
+def fromstring(s, terminal=False):
+ cdef bytes bs
+ cdef char* cs
+ if terminal:
+ return calphabet.fromstring(s, 1)
+ else:
+ bs = s
+ cs = bs
+ ensurebufsize(len(s)+1)
+ strcpy(buf, cs)
+ return calphabet.fromstring(buf, 0)
+
+def nonterminals():
+ cdef i
+ l = []
+ for i from calphabet.first_nonterminal <= i <= calphabet.last_nonterminal:
+ l.append(-(i+1 << index_shift))
+ return l