From 8e5fad9bcbadf36bbab3c1c5b053e3c8f7dddbce Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 06:29:50 +0000 Subject: lopez suffix array extractor with copyrighted david chiang code excised --- sa-extract/README | 50 + sa-extract/calignment.pxd | 10 + sa-extract/calignment.pyx | 128 ++ sa-extract/cdat.pxd | 12 + sa-extract/cdat.pyx | 178 +++ sa-extract/cfloatlist.pxd | 10 + sa-extract/cfloatlist.pyx | 93 ++ sa-extract/cintlist.pxd | 15 + sa-extract/cintlist.pyx | 196 +++ sa-extract/clex.pyx | 460 +++++++ sa-extract/cmath.pxd | 2 + sa-extract/cn.py | 164 +++ sa-extract/compile_bin.py | 148 +++ sa-extract/context_model.py | 234 ++++ sa-extract/cstrmap.pxd | 12 + sa-extract/cstrmap.pyx | 14 + sa-extract/csuf.pxd | 11 + sa-extract/csuf.pyx | 321 +++++ sa-extract/cveb.pxd | 15 + sa-extract/cveb.pyx | 390 ++++++ sa-extract/example/README | 8 + sa-extract/example/corpus.align.gz | Bin 0 -> 829334 bytes sa-extract/example/corpus.de.gz | Bin 0 -> 1724393 bytes sa-extract/example/corpus.en.gz | Bin 0 -> 1457711 bytes sa-extract/example/test.de | 10 + sa-extract/example/test.ref.en | 10 + sa-extract/extract.ini | 116 ++ sa-extract/extractor.py | 60 + sa-extract/lcp.pyx | 113 ++ sa-extract/lcp_ops.py | 52 + sa-extract/log.py | 18 + sa-extract/manager.py | 100 ++ sa-extract/model.py | 12 + sa-extract/monitor.py | 48 + sa-extract/precomputation.pxd | 13 + sa-extract/precomputation.pyx | 478 ++++++++ sa-extract/rule.pxd | 13 + sa-extract/rule.pyx | 286 +++++ sa-extract/rulefactory.pyx | 2360 ++++++++++++++++++++++++++++++++++++ sa-extract/sa-compile.pl | 322 +++++ sa-extract/setup.cfg | 2 + sa-extract/setup.py | 45 + sa-extract/sgml.py | 194 +++ sa-extract/strmap.cc | 232 ++++ sa-extract/strmap.h | 22 + sa-extract/strutil.c | 63 + sa-extract/strutil.h | 8 + sa-extract/sym.pxd | 17 + sa-extract/sym.pyx | 155 +++ 49 files changed, 7220 insertions(+) create mode 100644 sa-extract/README create mode 100644 sa-extract/calignment.pxd create mode 100644 sa-extract/calignment.pyx create mode 100644 sa-extract/cdat.pxd create mode 100644 sa-extract/cdat.pyx create mode 100644 sa-extract/cfloatlist.pxd create mode 100644 sa-extract/cfloatlist.pyx create mode 100644 sa-extract/cintlist.pxd create mode 100644 sa-extract/cintlist.pyx create mode 100644 sa-extract/clex.pyx create mode 100644 sa-extract/cmath.pxd create mode 100644 sa-extract/cn.py create mode 100755 sa-extract/compile_bin.py create mode 100644 sa-extract/context_model.py create mode 100644 sa-extract/cstrmap.pxd create mode 100644 sa-extract/cstrmap.pyx create mode 100644 sa-extract/csuf.pxd create mode 100644 sa-extract/csuf.pyx create mode 100644 sa-extract/cveb.pxd create mode 100644 sa-extract/cveb.pyx create mode 100644 sa-extract/example/README create mode 100644 sa-extract/example/corpus.align.gz create mode 100644 sa-extract/example/corpus.de.gz create mode 100644 sa-extract/example/corpus.en.gz create mode 100644 sa-extract/example/test.de create mode 100644 sa-extract/example/test.ref.en create mode 100644 sa-extract/extract.ini create mode 100755 sa-extract/extractor.py create mode 100644 sa-extract/lcp.pyx create mode 100755 sa-extract/lcp_ops.py create mode 100644 sa-extract/log.py create mode 100644 sa-extract/manager.py create mode 100644 sa-extract/model.py create mode 100644 sa-extract/monitor.py create mode 100644 sa-extract/precomputation.pxd create mode 100644 sa-extract/precomputation.pyx create mode 100644 sa-extract/rule.pxd create mode 100644 sa-extract/rule.pyx create mode 100644 sa-extract/rulefactory.pyx 
create mode 100755 sa-extract/sa-compile.pl create mode 100644 sa-extract/setup.cfg create mode 100644 sa-extract/setup.py create mode 100644 sa-extract/sgml.py create mode 100644 sa-extract/strmap.cc create mode 100644 sa-extract/strmap.h create mode 100644 sa-extract/strutil.c create mode 100644 sa-extract/strutil.h create mode 100644 sa-extract/sym.pxd create mode 100644 sa-extract/sym.pyx diff --git a/sa-extract/README b/sa-extract/README new file mode 100644 index 00000000..f43e58cc --- /dev/null +++ b/sa-extract/README @@ -0,0 +1,50 @@ +SUFFIX-ARRAY-EXTRACT README + Feb 1, 2012 + +Written by Adam Lopez, repackaged by Chris Dyer. + +Originally based on parts of Hiero, by David Chiang, but these dependencies +have been removed or rewritten. + + +BUILD INSTRUCTIONS +============================================================================== + +Requirements: + Python 2.7 or later (http://www.python.org) + Cython 0.14.1 or later (http://cython.org/) + +- Edit Makefile to set the location of Python/Cython, then do: + + make + + +COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT +============================================================================== +- Run sa-compile.pl to compile the training data and generate an extract.ini + file (which is written to STDOUT): + + sa-compile.pl -b bitext_name=source.fr,target.en \ + -a alignment_name=alignment.txt > extract.ini + + +EXTRACTION OF PER-SENTENCE GRAMMARS +============================================================================== +- Example: + cat test.fr | extractor.py -c extract.ini + + +EXTRACTION OF COMPLETE TEST-SET GRAMMARS +============================================================================== +Edit the generated extract.ini file and change per_sentence_grammar +to False. Then, run extraction as normal. + +Note: extracting a single grammar for an entire test set will consume more +memory during extraction and (probably) during decoding. + + +EXAMPLE +============================================================================== +- See example/ and the README therein. + + diff --git a/sa-extract/calignment.pxd b/sa-extract/calignment.pxd new file mode 100644 index 00000000..a7d3001f --- /dev/null +++ b/sa-extract/calignment.pxd @@ -0,0 +1,10 @@ +cimport cintlist +from libc.stdio cimport FILE + +cdef class Alignment: + + cdef cintlist.CIntList links + cdef cintlist.CIntList sent_index + cdef int link(self, int i, int j) + cdef _unlink(self, int link, int* f, int* e) + cdef int* _get_sent_links(self, int sent_id, int* num_links) diff --git a/sa-extract/calignment.pyx b/sa-extract/calignment.pyx new file mode 100644 index 00000000..976fcd66 --- /dev/null +++ b/sa-extract/calignment.pyx @@ -0,0 +1,128 @@ +import log +import gzip +import cintlist + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free + +# Note: Callison-Burch uses short instead of int. +# We have the space for our corpus, so this is not a problem; +# May need to revisit if things get really tight, though.
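+# For reference, a minimal pure-Python sketch of the link-packing scheme used by
+# the Alignment class below (illustration only, not part of this module; the
+# pack/unpack helper names are hypothetical). A link (i, j) is stored as a single
+# int, i*65536 + j, which assumes both indices fit in 16 bits:
+#
+#   def pack(i, j):
+#       return i * 65536 + j        # encode one alignment link as an int
+#
+#   def unpack(link):
+#       return divmod(link, 65536)  # recover (i, j)
+#
+#   assert unpack(pack(3, 7)) == (3, 7)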
+cdef class Alignment: + + + cdef int link(self, int i, int j): + '''Integerizes an alignment link pair''' + return i*65536 + j + + + def unlink(self, link): + '''De-integerizes an alignment link pair''' + return (link/65536, link%65536) + + + cdef _unlink(self, int link, int* f, int* e): + f[0] = link/65536 + e[0] = link%65536 + + + def get_sent_links(self, int sent_id): + cdef cintlist.CIntList sent_links + cdef int* arr + cdef int arr_len + + sent_links = cintlist.CIntList() + arr = self._get_sent_links(sent_id, &arr_len) + sent_links._extend_arr(arr, arr_len*2) + free(arr) + return sent_links + + + cdef int* _get_sent_links(self, int sent_id, int* num_links): + cdef int* sent_links + cdef int i, start, end + + start = self.sent_index.arr[sent_id] + end = self.sent_index.arr[sent_id+1] + num_links[0] = end - start + sent_links = malloc(2*num_links[0]*sizeof(int)) + for i from 0 <= i < num_links[0]: + self._unlink(self.links.arr[start + i], sent_links + (2*i), sent_links + (2*i) + 1) + return sent_links + + + def __cinit__(self, filename, from_binary=False): + self.links = cintlist.CIntList(1000,1000) + self.sent_index = cintlist.CIntList(1000,1000) + log.writeln("Reading alignment from file %s" % filename) + if from_binary: + self.read_binary(filename) + else: + self.read_text(filename) + + + def read_text(self, filename): + if filename[-2:] == "gz": + f = gzip.GzipFile(filename) + else: + f = open(filename) + for line in f: + self.sent_index.append(len(self.links)) + pairs = line.split() + for pair in pairs: + (i, j) = map(int, pair.split('-')) + self.links.append(self.link(i, j)) + self.sent_index.append(len(self.links)) + + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.links.read_handle(f) + self.sent_index.read_handle(f) + fclose(f) + + + def write_text(self, filename): + f = open(filename, "w") + sent_num = 0 + for i, link in enumerate(self.links): + while i >= self.sent_index[sent_num]: + f.write("\n") + sent_num = sent_num + 1 + f.write("%d-%d " % self.unlink(link)) + f.write("\n") + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.links.write_handle(f) + self.sent_index.write_handle(f) + fclose(f) + + + def write_enhanced(self, filename): + f = open(filename, "w") + sent_num = 1 + for link in self.links: + f.write("%d " % link) + f.write("\n") + for i in self.sent_index: + f.write("%d " % i) + f.write("\n") + + + def alignment(self, i): + '''Return all (e,f) pairs for sentence i''' + cdef int j, start, end + result = [] + start = self.sent_index.arr[i] + end = self.sent_index.arr[i+1] + for j from start <= j < end: + result.append(self.unlink(self.links.arr[j])) + return result diff --git a/sa-extract/cdat.pxd b/sa-extract/cdat.pxd new file mode 100644 index 00000000..b686f611 --- /dev/null +++ b/sa-extract/cdat.pxd @@ -0,0 +1,12 @@ +cimport cintlist +from libc.stdio cimport FILE + +cdef class DataArray: + cdef word2id + cdef id2word + cdef cintlist.CIntList data + cdef cintlist.CIntList sent_id + cdef cintlist.CIntList sent_index + cdef use_sent_id + cdef void write_handle(self, FILE* f) + cdef void read_handle(self, FILE* f) diff --git a/sa-extract/cdat.pyx b/sa-extract/cdat.pyx new file mode 100644 index 00000000..57d3ad63 --- /dev/null +++ b/sa-extract/cdat.pyx @@ -0,0 +1,178 @@ +# cdat.pyx +# Defines "data arrays" that can be directly written to/read from disk 
in binary format +# In particular, the array itself is written/read directly as a glob of binary data +# Adam Lopez + +import sys +import gzip +import log +import cintlist + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, strcpy, strlen + +cdef class DataArray: + + def __init__(self, filename=None, from_binary=False, use_sent_id=False): + self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1} + self.id2word = ["END_OF_FILE", "END_OF_LINE"] + self.data = cintlist.CIntList(1000,1000) + self.sent_id = cintlist.CIntList(1000,1000) + self.sent_index = cintlist.CIntList(1000,1000) + self.use_sent_id = use_sent_id + + if filename is not None: + if from_binary: + self.read_binary(filename) + else: + self.read_text(filename) + + + def __len__(self): + return len(self.data) + + + def getSentId(self, i): + return self.sent_id.arr[i] + + + def getSent(self, i): + cdef int j, start, stop + sent = [] + start = self.sent_index.arr[i] + stop = self.sent_index.arr[i+1] + for i from start <= i < stop: + sent.append(self.id2word[self.data.arr[i]]) + return sent + + + def getSentPos(self, loc): + return loc - self.sent_index.arr[self.sent_id.arr[loc]] + + + def get_id(self, word): + if not word in self.word2id: + self.word2id[word] = len(self.id2word) + self.id2word.append(word) + return self.word2id[word] + + + def get_word(self, id): + return self.id2word[id] + + + def write_text(self, filename): + f = open(filename, "w") + for w_id in self.data: + if w_id > 1: + f.write("%s " % self.get_word(w_id)) + if w_id == 1: + f.write("\n") + f.close() + + + def read_text(self, filename): + cdef int word_count + + if filename[-2:] == "gz": + file = gzip.GzipFile(filename) + else: + file = open(filename) + word_count = 0 + for line_num, line in enumerate(file): + self.sent_index.append(word_count) + for word in line.split(): + self.data.append(self.get_id(word)) + if self.use_sent_id: + self.sent_id.append(line_num) + word_count = word_count + 1 + self.data.append(1) + if self.use_sent_id: + self.sent_id.append(line_num) + word_count = word_count + 1 + self.data.append(0) + self.sent_index.append(word_count) + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.read_handle(f) + fclose(f) + + + cdef void read_handle(self, FILE* f): + cdef int num_words + cdef int word_len + cdef char* c_word + cdef bytes py_word + self.data.read_handle(f) + self.sent_index.read_handle(f) + self.sent_id.read_handle(f) + fread(&(num_words), sizeof(int), 1, f) + for i in xrange(num_words): + fread(&(word_len), sizeof(int), 1, f) + c_word = malloc (word_len * sizeof(char)) + fread(c_word, sizeof(char), word_len, f) + py_word = c_word + free(c_word) + self.word2id[py_word] = len(self.id2word) + self.id2word.append(py_word) + if len(self.sent_id) == 0: + self.use_sent_id = False + else: + self.use_sent_id = True + + + cdef void write_handle(self, FILE* f): + cdef int word_len + cdef int num_words + cdef char* c_word + + self.data.write_handle(f) + self.sent_index.write_handle(f) + self.sent_id.write_handle(f) + num_words = len(self.id2word) - 2 + fwrite(&(num_words), sizeof(int), 1, f) + for word in self.id2word[2:]: + c_word = word + word_len = strlen(c_word) + 1 + fwrite(&(word_len), sizeof(int), 1, f) + fwrite(c_word, sizeof(char), word_len, f) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* 
cfilename = bfilename + f = fopen(cfilename, "w") + self.write_handle(f) + fclose(f) + + + def write_enhanced_handle(self, f): + for i in self.data: + f.write("%d " %i) + f.write("\n") + for i in self.sent_index: + f.write("%d " %i) + f.write("\n") + for i in self.sent_id: + f.write("%d " %i) + f.write("\n") + for word in self.id2word: + f.write("%s %d " % (word, self.word2id[word])) + f.write("\n") + + + + def write_enhanced(self, filename): + f = open(filename, "w") + self.write_enhanced_handle(self, f) + f.close() + + + + diff --git a/sa-extract/cfloatlist.pxd b/sa-extract/cfloatlist.pxd new file mode 100644 index 00000000..026f2739 --- /dev/null +++ b/sa-extract/cfloatlist.pxd @@ -0,0 +1,10 @@ +from libc.stdio cimport FILE + +cdef class CFloatList: + cdef int size + cdef int increment + cdef int len + cdef float* arr + cdef void write_handle(self, FILE* f) + cdef void read_handle(self, FILE* f) + cdef void set(self, int i, float v) diff --git a/sa-extract/cfloatlist.pyx b/sa-extract/cfloatlist.pyx new file mode 100644 index 00000000..18a0ef2a --- /dev/null +++ b/sa-extract/cfloatlist.pyx @@ -0,0 +1,93 @@ +# conveniencelist.pyx +# defines int arrays in C, with some convenience methods +# for reading arrays directly as globs directly from disk. +# Adam Lopez + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, strcpy, strlen + +cdef class CFloatList: + + def __cinit__(self, size=0, increment=1, initial_len=0): + if initial_len > size: + size = initial_len + self.arr = malloc(size*sizeof(float)) + memset(self.arr, 0, initial_len*sizeof(float)) + + + def __init__(self, size=0, increment=1, initial_len=0): + self.size = size + if initial_len > size: + self.size = initial_len + self.increment = increment + self.len = initial_len + + + def __dealloc__(self): + free(self.arr) + + + def __getitem__(self, i): + j = i + if i<0: + j = self.len + i + if j<0 or j>=self.len: + raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len)) + return self.arr[j] + + + cdef void set(self, int i, float v): + j = i + if i<0: + j = self.len + i + if j<0 or j>=self.len: + raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len)) + self.arr[j] = v + + def __setitem__(self, i, val): + self.set(i, val) + + def __len__(self): + return self.len + + + def append(self, float val): + if self.len == self.size: + self.size = self.size + self.increment + self.arr = realloc(self.arr, self.size*sizeof(float)) + self.arr[self.len] = val + self.len = self.len + 1 + + + cdef void write_handle(self, FILE* f): + fwrite(&(self.len), sizeof(float), 1, f) + fwrite(self.arr, sizeof(float), self.len, f) + + + def write(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.write_handle(f) + fclose(f) + + + cdef void read_handle(self, FILE* f): + free(self.arr) + fread(&(self.len), sizeof(float), 1, f) + self.arr = malloc(self.len * sizeof(float)) + self.size = self.len + fread(self.arr, sizeof(float), self.len, f) + + + def read(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.read_handle(f) + fclose(f) + + diff --git a/sa-extract/cintlist.pxd b/sa-extract/cintlist.pxd new file mode 100644 index 00000000..8a3a655c --- /dev/null +++ b/sa-extract/cintlist.pxd @@ -0,0 +1,15 @@ +from libc.stdio cimport FILE + +cdef class CIntList: + cdef int size 
+ cdef int increment + cdef int len + cdef int* arr + cdef void write_handle(self, FILE* f) + cdef void read_handle(self, FILE* f) + cdef void _append(self, int val) + cdef void _extend(self, CIntList other) + cdef void _extend_arr(self, int* other, int other_len) + cdef void _clear(self) + cdef void set(self, int i, int val) + diff --git a/sa-extract/cintlist.pyx b/sa-extract/cintlist.pyx new file mode 100644 index 00000000..9d0a058e --- /dev/null +++ b/sa-extract/cintlist.pyx @@ -0,0 +1,196 @@ +# cintlist.pyx +# defines int arrays in C, with some convenience methods +# for reading arrays as globs directly from disk. +# Adam Lopez + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, memcpy + +cdef class CIntList: + + def __cinit__(self, size=0, increment=1, initial_len=0): + if initial_len > size: + size = initial_len + self.arr = malloc(size*sizeof(int)) + memset(self.arr, 0, initial_len*sizeof(int)) + + def __str__(self): + ret = "CIntList[" + for idx in xrange(self.size): + if idx>0: + ret += "," + ret += str(self.arr[idx]) + ret += "]" + ret += "len=" + ret += self.len + return ret + + def index(self, val): + for i in xrange(self.len): + if self.arr[i] == val: + return i + + return IndexError + + def partition(self,start,end): + pivot = self.arr[end] + bottom = start-1 + top = end + done = 0 + while not done: + while not done: + bottom += 1 + if bottom == top: + done = 1 + break + if self.arr[bottom] > pivot: + self.arr[top] = self.arr[bottom] + break + while not done: + top -= 1 + if top == bottom: + done = 1 + break + if self.arr[top] < pivot: + self.arr[bottom] = self.arr[top] + break + self.arr[top] = pivot + return top + + def _doquicksort(self,start,end): + if start < end: + split = self.partition(start,end) + self._doquicksort(start,split-1) + self._doquicksort(split+1,end) + else: + return + + def sort(self): + self._doquicksort(0,self.len-1) + + def reset(self): + self.len = 0 + + def __init__(self, size=0, increment=1, initial_len=0): + self.size = size + if initial_len > size: + self.size = initial_len + self.increment = increment + self.len = initial_len + + + def __dealloc__(self): + free(self.arr) + + + def __getitem__(self, index): + cdef int i, j, k + + if type(index) == int: + j = index + if j < 0: + j = self.len + j + if j<0 or j>=self.len: + raise IndexError("Requested index %d of %d-length CIntList" % (index, self.len)) + return self.arr[j] + elif type(index) == slice: + i = index.start + j = index.stop + if i < 0: + i = self.len + i + if j < 0: + j = self.len + j + if i < 0 or i >= self.len or j < 0 or j > self.len: + raise IndexError("Requested index %d:%d of %d-length CIntList" % (index.start, index.stop, self.len)) + result = () + for k from i <= k < j: + result = result + (self.arr[k],) + return result + else: + raise IndexError("Illegal key type %s for CIntList" % (type(index))) + + cdef void set(self, int i, int val): + j = i + if i<0: + j = self.len + i + if j<0 or j>=self.len: + raise IndexError("Requested index %d of %d-length IntList" % (i, self.len)) + if type(val) != int: + raise TypeError + self.arr[j] = val + + + def __setitem__(self, i, val): + self.set(i, val) + + def __len__(self): + return self.len + + def getSize(self): + return self.size + + def append(self, int val): + self._append(val) + + cdef void _append(self, int val): + if self.len == self.size: + self.size = self.size + self.increment + self.arr = realloc(self.arr, self.size*sizeof(int)) + 
self.arr[self.len] = val + self.len = self.len + 1 + + def extend(self, other): + self._extend(other) + + + cdef void _extend(self, CIntList other): + self._extend_arr(other.arr, other.len) + + + cdef void _extend_arr(self, int* other, int other_len): + if self.size < self.len + other_len: + self.size = self.len + other_len + self.arr = realloc(self.arr, self.size*sizeof(int)) + memcpy(self.arr+self.len, other, other_len*sizeof(int)) + self.len = self.len + other_len + + + cdef void _clear(self): + free(self.arr) + self.len = 0 + self.size = 0 + self.arr = malloc(0) + + + cdef void write_handle(self, FILE* f): + fwrite(&(self.len), sizeof(int), 1, f) + fwrite(self.arr, sizeof(int), self.len, f) + + + def write(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.write_handle(f) + fclose(f) + + + cdef void read_handle(self, FILE* f): + (self.arr) + fread(&(self.len), sizeof(int), 1, f) + self.arr = malloc(self.len * sizeof(int)) + self.size = self.len + fread(self.arr, sizeof(int), self.len, f) + + + def read(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.read_handle(f) + fclose(f) + + diff --git a/sa-extract/clex.pyx b/sa-extract/clex.pyx new file mode 100644 index 00000000..fa30caad --- /dev/null +++ b/sa-extract/clex.pyx @@ -0,0 +1,460 @@ +# clex.pyx +# defines bilexical dictionaries in C, with some convenience methods +# for reading arrays directly as globs directly from disk. +# Adam Lopez + +import gzip +import sys +import context_model + +cimport cintlist +cimport cfloatlist +cimport calignment +cimport csuf +cimport cdat + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, strcpy, strlen + +cdef struct _node: + _node* smaller + _node* bigger + int key + int val + +cdef _node* new_node(int key): + cdef _node* n + n = <_node*> malloc(sizeof(_node)) + n.smaller = NULL + n.bigger = NULL + n.key = key + n.val = 0 + return n + + +cdef del_node(_node* n): + if n.smaller != NULL: + del_node(n.smaller) + if n.bigger != NULL: + del_node(n.bigger) + free(n) + + +cdef int* get_val(_node* n, int key): + if key == n.key: + return &n.val + elif key < n.key: + if n.smaller == NULL: + n.smaller = new_node(key) + return &(n.smaller.val) + return get_val(n.smaller, key) + else: + if n.bigger == NULL: + n.bigger = new_node(key) + return &(n.bigger.val) + return get_val(n.bigger, key) + + +cdef class CLex: + + cdef cfloatlist.CFloatList col1 + cdef cfloatlist.CFloatList col2 + cdef cintlist.CIntList f_index + cdef cintlist.CIntList e_index + cdef id2eword, id2fword, eword2id, fword2id + + def __init__(self, filename, from_binary=False, + from_data=False, earray=None, fsarray=None): + self.id2eword = [] + self.id2fword = [] + self.eword2id = {} + self.fword2id = {} + self.e_index = cintlist.CIntList() + self.f_index = cintlist.CIntList() + self.col1 = cfloatlist.CFloatList() + self.col2 = cfloatlist.CFloatList() + if from_binary: + self.read_binary(filename) + else: + if from_data: + self.compute_from_data(fsarray, earray, filename) + else: + self.read_text(filename) + '''print "self.eword2id" + print "=============" + for x in self.eword2id: + print x + print "self.fword2id" + print "=============" + for x in self.fword2id: + print x + print "-------------"''' + + + cdef compute_from_data(self, csuf.SuffixArray fsa, cdat.DataArray eda, 
calignment.Alignment aa): + cdef int sent_id, num_links, l, i, j, f_i, e_j, I, J, V_E, V_F, num_pairs + cdef int *fsent, *esent, *alignment, *links, *ealigned, *faligned + cdef _node** dict + cdef int *fmargin, *emargin, *count + cdef bytes word + cdef int null_word + + null_word = 0 + for word in fsa.darray.id2word: # I miss list comprehensions + self.id2fword.append(word) + self.id2fword[null_word] = "NULL" + for id, word in enumerate(self.id2fword): + self.fword2id[word] = id + + for word in eda.id2word: + self.id2eword.append(word) + self.id2eword[null_word] = "NULL" + for id, word in enumerate(self.id2eword): + self.eword2id[word] = id + + num_pairs = 0 + + V_E = len(eda.id2word) + V_F = len(fsa.darray.id2word) + fmargin = malloc(V_F*sizeof(int)) + emargin = malloc(V_E*sizeof(int)) + memset(fmargin, 0, V_F*sizeof(int)) + memset(emargin, 0, V_E*sizeof(int)) + + dict = <_node**> malloc(V_F*sizeof(_node*)) + memset(dict, 0, V_F*sizeof(_node*)) + + num_sents = len(fsa.darray.sent_index) + for sent_id from 0 <= sent_id < num_sents-1: + + fsent = fsa.darray.data.arr + fsa.darray.sent_index.arr[sent_id] + I = fsa.darray.sent_index.arr[sent_id+1] - fsa.darray.sent_index.arr[sent_id] - 1 + faligned = malloc(I*sizeof(int)) + memset(faligned, 0, I*sizeof(int)) + + esent = eda.data.arr + eda.sent_index.arr[sent_id] + J = eda.sent_index.arr[sent_id+1] - eda.sent_index.arr[sent_id] - 1 + ealigned = malloc(J*sizeof(int)) + memset(ealigned, 0, J*sizeof(int)) + + links = aa._get_sent_links(sent_id, &num_links) + + for l from 0 <= l < num_links: + i = links[l*2] + j = links[l*2+1] + if i >= I or j >= J: + sys.stderr.write(" %d-%d out of bounds (I=%d,J=%d) in line %d\n" % (i,j,I,J,sent_id+1)) + assert i < I + assert j < J + f_i = fsent[i] + e_j = esent[j] + fmargin[f_i] = fmargin[f_i]+1 + emargin[e_j] = emargin[e_j]+1 + if dict[f_i] == NULL: + dict[f_i] = new_node(e_j) + dict[f_i].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[f_i], e_j) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + # add count + faligned[i] = 1 + ealigned[j] = 1 + for i from 0 <= i < I: + if faligned[i] == 0: + f_i = fsent[i] + fmargin[f_i] = fmargin[f_i] + 1 + emargin[null_word] = emargin[null_word] + 1 + if dict[f_i] == NULL: + dict[f_i] = new_node(null_word) + dict[f_i].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[f_i], null_word) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + for j from 0 <= j < J: + if ealigned[j] == 0: + e_j = esent[j] + fmargin[null_word] = fmargin[null_word] + 1 + emargin[e_j] = emargin[e_j] + 1 + if dict[null_word] == NULL: + dict[null_word] = new_node(e_j) + dict[null_word].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[null_word], e_j) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + free(links) + free(faligned) + free(ealigned) + self.f_index = cintlist.CIntList(initial_len=V_F) + self.e_index = cintlist.CIntList(initial_len=num_pairs) + self.col1 = cfloatlist.CFloatList(initial_len=num_pairs) + self.col2 = cfloatlist.CFloatList(initial_len=num_pairs) + + num_pairs = 0 + for i from 0 <= i < V_F: + #self.f_index[i] = num_pairs + self.f_index.set(i, num_pairs) + if dict[i] != NULL: + self._add_node(dict[i], &num_pairs, float(fmargin[i]), emargin) + del_node(dict[i]) + free(fmargin) + free(emargin) + free(dict) + return + + + cdef _add_node(self, _node* n, int* num_pairs, float fmargin, int* emargin): + cdef int loc + if n.smaller != NULL: + 
self._add_node(n.smaller, num_pairs, fmargin, emargin) + loc = num_pairs[0] + self.e_index.set(loc, n.key) + self.col1.set(loc, float(n.val)/fmargin) + self.col2.set(loc, float(n.val)/float(emargin[n.key])) + num_pairs[0] = loc + 1 + if n.bigger != NULL: + self._add_node(n.bigger, num_pairs, fmargin, emargin) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.f_index.write_handle(f) + self.e_index.write_handle(f) + self.col1.write_handle(f) + self.col2.write_handle(f) + self.write_wordlist(self.id2fword, f) + self.write_wordlist(self.id2eword, f) + fclose(f) + + + cdef write_wordlist(self, wordlist, FILE* f): + cdef int word_len + cdef int num_words + cdef char* c_word + + num_words = len(wordlist) + fwrite(&(num_words), sizeof(int), 1, f) + for word in wordlist: + c_word = word + word_len = strlen(c_word) + 1 + fwrite(&(word_len), sizeof(int), 1, f) + fwrite(c_word, sizeof(char), word_len, f) + + + cdef read_wordlist(self, word2id, id2word, FILE* f): + cdef int num_words + cdef int word_len + cdef char* c_word + cdef bytes py_word + + fread(&(num_words), sizeof(int), 1, f) + for i from 0 <= i < num_words: + fread(&(word_len), sizeof(int), 1, f) + c_word = malloc (word_len * sizeof(char)) + fread(c_word, sizeof(char), word_len, f) + py_word = c_word + free(c_word) + word2id[py_word] = len(id2word) + id2word.append(py_word) + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.f_index.read_handle(f) + self.e_index.read_handle(f) + self.col1.read_handle(f) + self.col2.read_handle(f) + self.read_wordlist(self.fword2id, self.id2fword, f) + self.read_wordlist(self.eword2id, self.id2eword, f) + fclose(f) + + + def get_e_id(self, eword): + if eword not in self.eword2id: + e_id = len(self.id2eword) + self.id2eword.append(eword) + self.eword2id[eword] = e_id + return self.eword2id[eword] + + + def get_f_id(self, fword): + if fword not in self.fword2id: + f_id = len(self.id2fword) + self.id2fword.append(fword) + self.fword2id[fword] = f_id + return self.fword2id[fword] + + + def read_text(self, filename): + cdef i, j, w, e_id, f_id, n_f, n_e, N + cdef cintlist.CIntList fcount + + fcount = cintlist.CIntList() + if filename[-2:] == "gz": + f = gzip.GzipFile(filename) + else: + f = open(filename) + + # first loop merely establishes size of array objects + sys.stderr.write("Initial read...\n") + for line in f: + (fword, eword, score1, score2) = line.split() + f_id = self.get_f_id(fword) + e_id = self.get_e_id(eword) + while f_id >= len(fcount): + fcount.append(0) + fcount.arr[f_id] = fcount.arr[f_id] + 1 + + # Allocate space for dictionary in arrays + N = 0 + n_f = len(fcount) + self.f_index = cintlist.CIntList(initial_len=n_f+1) + for i from 0 <= i < n_f: + self.f_index.arr[i] = N + N = N + fcount.arr[i] + fcount.arr[i] = 0 + self.f_index.arr[n_f] = N + self.e_index = cintlist.CIntList(initial_len=N) + self.col1 = cfloatlist.CFloatList(initial_len=N) + self.col2 = cfloatlist.CFloatList(initial_len=N) + + # Re-read file, placing words into buckets + sys.stderr.write("Bucket sort...\n") + f.seek(0) + for line in f: + (fword, eword, score1, score2) = line.split() + f_id = self.get_f_id(fword) + e_id = self.get_e_id(eword) + index = self.f_index.arr[f_id] + fcount.arr[f_id] + fcount.arr[f_id] = fcount.arr[f_id] + 1 + self.e_index.arr[index] = int(e_id) + self.col1[index] = float(score1) + 
self.col2[index] = float(score2) + f.close() + + sys.stderr.write("Final sort...\n") + # Sort buckets by eword + for b from 0 <= b < n_f: + i = self.f_index.arr[b] + j = self.f_index.arr[b+1] + self.qsort(i,j, "") + + + cdef swap(self, int i, int j): + cdef int itmp + cdef float ftmp + + if i == j: + return + + itmp = self.e_index.arr[i] + self.e_index.arr[i] = self.e_index.arr[j] + self.e_index.arr[j] = itmp + + ftmp = self.col1.arr[i] + self.col1.arr[i] = self.col1.arr[j] + self.col1.arr[j] = ftmp + + ftmp = self.col2.arr[i] + self.col2.arr[i] = self.col2.arr[j] + self.col2.arr[j] = ftmp + + + cdef qsort(self, int i, int j, pad): + cdef int pval, p + + if i > j: + raise Exception("Sort error in CLex") + if i == j: #empty interval + return + if i == j-1: # singleton interval + return + + p = (i+j)/2 + pval = self.e_index.arr[p] + self.swap(i, p) + p = i + for k from i+1 <= k < j: + if pval >= self.e_index.arr[k]: + self.swap(p+1, k) + self.swap(p, p+1) + p = p + 1 + self.qsort(i,p, pad+" ") + self.qsort(p+1,j, pad+" ") + + + def write_enhanced(self, filename): + f = open(filename, "w") + for i in self.f_index: + f.write("%d " % i) + f.write("\n") + for i, s1, s2 in zip(self.e_index, self.col1, self.col2): + f.write("%d %f %f " % (i, s1, s2)) + f.write("\n") + for i, w in enumerate(self.id2fword): + f.write("%d %s " % (i, w)) + f.write("\n") + for i, w in enumerate(self.id2eword): + f.write("%d %s " % (i, w)) + f.write("\n") + f.close() + + + def get_score(self, fword, eword, col): + cdef e_id, f_id, low, high, midpoint, val + #print "get_score fword=",fword,", eword=",eword,", col=",col + + if eword not in self.eword2id: + return None + if fword not in self.fword2id: + return None + f_id = self.fword2id[fword] + e_id = self.eword2id[eword] + low = self.f_index.arr[f_id] + high = self.f_index.arr[f_id+1] + while high - low > 0: + midpoint = (low+high)/2 + val = self.e_index.arr[midpoint] + if val == e_id: + if col == 0: + return self.col1.arr[midpoint] + if col == 1: + return self.col2.arr[midpoint] + if val > e_id: + high = midpoint + if val < e_id: + low = midpoint + 1 + return None + + + def write_text(self, filename): + """Note: does not guarantee writing the dictionary in the original order""" + cdef i, N, e_id, f_id + + f = open(filename, "w") + N = len(self.e_index) + f_id = 0 + for i from 0 <= i < N: + while self.f_index.arr[f_id+1] == i: + f_id = f_id + 1 + e_id = self.e_index.arr[i] + score1 = self.col1.arr[i] + score2 = self.col2.arr[i] + f.write("%s %s %.6f %.6f\n" % (self.id2fword[f_id], self.id2eword[e_id], score1, score2)) + f.close() + + diff --git a/sa-extract/cmath.pxd b/sa-extract/cmath.pxd new file mode 100644 index 00000000..3aaaa2a3 --- /dev/null +++ b/sa-extract/cmath.pxd @@ -0,0 +1,2 @@ +cdef extern from "math.h": + double log(double) diff --git a/sa-extract/cn.py b/sa-extract/cn.py new file mode 100644 index 00000000..e534783f --- /dev/null +++ b/sa-extract/cn.py @@ -0,0 +1,164 @@ +# cn.py +# Chris Dyer +# Copyright (c) 2006 University of Maryland. + +# vim:tabstop=4:autoindent:expandtab + +import sys +import math +import sym +import log +import sgml + +epsilon = sym.fromstring('*EPS*'); + +class CNStats(object): + def __init__(self): + self.read = 0 + self.colls = 0 + self.words = 0 + + def collect(self, cn): + self.read += 1 + self.colls += cn.get_length() + for col in cn.columns: + self.words += len(col) + + def __str__(self): + return "confusion net statistics:\n succ. read: %d\n columns: %d\n words: %d\n avg. words/column:\t%f\n avg. 
cols/sent:\t%f\n\n" % (self.read, self.colls, self.words, float(self.words)/float(self.colls), float(self.colls)/float(self.read)) + +class ConfusionNet(object): + def __init__(self, sent): + object.__init__(self) + if (len(sent.words) == 0): + self.columns = () + return # empty line, it happens + line = sent.words[0] + if (line.startswith("(((")): + if (len(sent.words) > 1): + log.write("Bad sentence: %s\n" % (line)) + assert(len(sent.words) == 1) # make sure there are no spaces in your confusion nets! + line = "((('',1.0,1),),"+line[1:len(line)-1]+"(('',1.0,1),))" + cols = eval(line) + res = [] + for col in cols: + x = [] + for alt in col: + costs = alt[1] + if (type(costs) != type((1,2))): + costs=(float(costs),) + j=[] + for c in costs: + j.append(float(c)) + cost = tuple(j) + spanlen = 1 + if (len(alt) == 3): + spanlen = alt[2] + x.append((sym.fromstring(alt[0],terminal=True), None, spanlen)) + res.append(tuple(x)) + self.columns = tuple(res) + else: # convert a string of input into a CN + res = []; + res.append(((sym.fromstring('',terminal=True), None, 1), )) + for word in sent.words: + res.append(((sym.fromstring(word,terminal=True), None, 1), )); # (alt=word, cost=0.0) + res.append(((sym.fromstring('',terminal=True), None, 1), )) + self.columns = tuple(res) + + def is_epsilon(self, position): + x = self.columns[position[0]][position[1]][0] + return x == epsilon + + def compute_epsilon_run_length(self, cn_path): + if (len(cn_path) == 0): + return 0 + x = len(cn_path) - 1 + res = 0 + ''' -1 denotes a non-terminal ''' + while (x >= 0 and cn_path[x][0] >= 0 and self.is_epsilon(cn_path[x])): + res += 1 + x -= 1 + return res + + def compute_cn_cost(self, cn_path): + c = None + for (col, row) in cn_path: + if (col >= 0): + if c is None: + c = self.columns[col][row][1].clone() + else: + c += self.columns[col][row][1] + return c + + def get_column(self, col): + return self.columns[col] + + def get_length(self): + return len(self.columns) + + def __str__(self): + r = "conf net: %d\n" % (len(self.columns),) + i = 0 + for col in self.columns: + r += "%d -- " % i + i += 1 + for alternative in col: + r += "(%s, %s, %s) " % (sym.tostring(alternative[0]), alternative[1], alternative[2]) + r += "\n" + return r + + def listdown(_columns, col = 0): + # output all the possible sentences out of the self lattice + # will be used by the "dumb" adaptation of lattice decoding with suffix array + result = [] + for entry in _columns[col]: + if col+entry[2]+1<=len(_columns) : + for suffix in self.listdown(_columns,col+entry[2]): + result.append(entry[0]+' '+suffix) + #result.append(entry[0]+' '+suffix) + else: + result.append(entry[0]) + #result.append(entry[0]) + return result + + def next(self,_columns,curr_idx, min_dist=1): + # can be used only when prev_id is defined + result = [] + #print "curr_idx=%i\n" % curr_idx + if curr_idx+min_dist >= len(_columns): + return result + for alt_idx in xrange(len(_columns[curr_idx])): + alt = _columns[curr_idx][alt_idx] + #print "checking %i alternative : " % alt_idx + #print "%s %f %i\n" % (alt[0],alt[1],alt[2]) + #print alt + if alt[2] "+ + "\n\nNote: -d,-s,-a, and -p are mutually exclusive") + parser.add_option("-d", "--data-array", + action="store_true", default=False, + dest="da", help="Compile file into data array (default)") + parser.add_option("-s", "--suffix-array", + action="store_true", default=False, + dest="sa", help="Compile file into suffix array") + parser.add_option("-a", "--alignment", + action="store_true", default=False, + dest="a", help="Compile 
file into alignment") + parser.add_option("-l", "--lexical", + action="store_true", default=False, + dest="l", help="Compile file into lex file") + parser.add_option("-x", "--compute_lexical", action="store", nargs=2, + dest="lex_args", help="Compute lex file from data", + metavar=" ") + parser.add_option("-p", "--parse", + action="store_true", default=False, + dest="p", help="Compile file into parse") + parser.add_option("-b", "--binary-infile", + action="store_true", default=False, + dest="bin", help="Input file is binary (default: text)") + parser.add_option("-t", "--text-outfile", + action="store_true", default=False, + dest="text", help="Output file is text (default: binary)") + parser.add_option("-e", "--enhanced-outfile", + action="store_true", default=False, + dest="enhanced", help="Output file is enhanced text (default: binary)") + parser.add_option("-r", action="store", nargs=7, + dest="precomp_args", help="Precompute collocations (Hiero only)", + metavar="max-len= max-nt= max-size= min-gap= rank1= rank2= sa=") + (options, args) = parser.parse_args() + + filetype_opts = [options.da, options.sa, options.a, options.p] + + if (len(filter(lambda x: x, filetype_opts))) > 1 or len(args) != 2: + parser.print_help() + sys.exit(1) + + (infilename, outfilename) = args + if options.bin: + bin = " binary" + else: + bin = "" + + start_time = monitor.cpu() + if options.precomp_args: + if options.bin: + obj = precomputation.Precomputation(infilename, from_binary=True) + else: + keys = set(["max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa"]) + precomp_opts = {} + sys.stderr.write("Precomputing statistics for list %s\n" % infilename) + for pair in options.precomp_args: + (key, val) = pair.split("=") + if key in keys: + keys.remove(key) + if key != "sa": + val = int(val) + precomp_opts[key] = val + else: + sys.stderr.write("Unknown keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2)\n" % key) + return 1 + sa = csuf.SuffixArray(precomp_opts["sa"], True) + obj = precomputation.Precomputation(infilename, sa, + precompute_rank=precomp_opts["rank1"], + precompute_secondary_rank=precomp_opts["rank2"], + max_length=precomp_opts["max-len"], + max_nonterminals=precomp_opts["max-nt"], + train_max_initial_size=precomp_opts["max-size"], + train_min_gap_size=precomp_opts["min-gap"]) + elif options.sa: + sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin)) + obj = csuf.SuffixArray(infilename, options.bin) + elif options.a: + sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, bin)) + obj = calignment.Alignment(infilename, options.bin) + elif options.p: + sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin)) + obj = parse.ParseArray(infilename, options.bin) + elif options.l: + sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin)) + obj = clex.CLex(infilename, options.bin) + elif options.lex_args: + ffile = options.lex_args[0] + efile = options.lex_args[1] + sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile)) + fsarray = csuf.SuffixArray(ffile, True) + earray = cdat.DataArray(efile, True) + aarray = calignment.Alignment(infilename, True) + obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray) + else: + sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin)) + obj = cdat.DataArray(infilename, options.bin) + + sys.stderr.write(" Total time for read: %f\n" % (monitor.cpu() - start_time)) + start_time = 
monitor.cpu() + if options.text: + sys.stderr.write("Writing text file %s...\n" % outfilename) + obj.write_text(outfilename) + elif options.enhanced: + sys.stderr.write("Writing enhanced text file %s...\n" % outfilename) + obj.write_enhanced(outfilename) + else: + sys.stderr.write("Writing binary file %s...\n" % outfilename) + obj.write_binary(outfilename) + sys.stderr.write("Finished.\n") + sys.stderr.write(" Total time for write: %f\n" % (monitor.cpu() - start_time)) + + mem_use = float(monitor.memory()) + metric = "B" + if mem_use / 1000 > 1: + mem_use /= 1000 + metric = "KB" + if mem_use / 1000 > 1: + mem_use /= 1000 + metric = "MB" + if mem_use / 1000 > 1: + mem_use /= 1000 + metric = "GB" + sys.stderr.write(" Memory usage: %.1f%s\n" % (mem_use, metric)) + + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/sa-extract/context_model.py b/sa-extract/context_model.py new file mode 100644 index 00000000..8cb6c174 --- /dev/null +++ b/sa-extract/context_model.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +import sys +import model +import sym +import log +import math + +class ContextModel(model.Model): + '''A ContextModel is one that is computed using information + from the Context object''' + + def __init__(self, context_manager, default=0.0): + model.Model.__init__(self) + self.wordless = 0 + self.initial = None + self.default = default + self.context_manager = context_manager + self.id = self.context_manager.add_model(self) + + '''The next feature is true if the model depends in + some way on the entire input sentence; that is, if + it cannot be scored when created, but must be scored + no earlier than during the input method (note that + this is less strict than stateful)''' + self.contextual = True + ''' It may seem somewhat counterintuitive that a + ContextModel can be non-contextual, but a good + example is the rule probabilites; although these + are computed using the Context object, they don't + really depend in any way on context''' + + + '''inherited from model.Model, called once for each input sentence''' + def input(self, fwords, meta): + # all ContextModels must make this call + self.context_manager.input(self, fwords, meta) + + + '''This function will be called via the input method + only for contextual models''' + def compute_contextual_score(self, r): + return 0.0 + + '''This function is only called on rule creation for + contextless models''' + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return 0.0 + + '''Stateless models should not need to + override this function, unless they define + something for model.TO_GOAL''' + def transition (self, r, antstates, i, j, j1=None): + return (None, 0.0) + + def estimate(self, r): + return r.getscore("context", self.id) + + def transition(self, r, antstates, i, j, j1=None): + return (None, r.getscore("context", self.id)) + + def finaltransition(self, state): + return 0.0 + + def rescore(self, ewords, score): + return score + + + +'''p(e|f)''' +class EgivenF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + prob = float(paircount)/float(fcount) + return -math.log10(prob) + +class CountEF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, 
paircount, fcount, fsample_count): + return math.log10(1.0 + float(paircount)) + +class SampleCountF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return math.log10(1.0 + float(fsample_count)) + + + +class EgivenFCoherent(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + prob = float(paircount)/float(fsample_count) + #print "paircount=",paircount," , fsample_count=",fsample_count,", prob=",prob + if (prob == 0.0): return 99.0 + return -math.log10(prob) + + + +class CoherenceProb(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + prob = float(fcount)/float(fsample_count) + return -math.log10(prob) + + + +class MaxLexEgivenF(ContextModel): + + def __init__(self, context_manager, ttable, col=0): + ContextModel.__init__(self, context_manager) + self.ttable = ttable + self.col = col + self.wordless = 0 + self.initial = None + self.contextual = False + + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + totalscore = 1.0 + fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase)) + fwords.append("NULL") + ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase)) + for e in ewords: + maxScore = 0.0 + for f in fwords: + score = self.ttable.get_score(f, e, self.col) + #print "score(MaxLexEgivenF) = ",score + if score > maxScore: + maxScore = score + totalscore *= maxScore + if totalscore == 0.0: + return 999 + else: + return -math.log10(totalscore) + + +class MaxLexFgivenE(ContextModel): + + def __init__(self, context_manager, ttable, col=1): + ContextModel.__init__(self, context_manager) + self.ttable = ttable + self.col = col + self.wordless = 0 + self.initial = None + self.contextual = False + + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + totalscore = 1.0 + fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase)) + ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase)) + ewords.append("NULL") + for f in fwords: + maxScore = 0.0 + for e in ewords: + score = self.ttable.get_score(f, e, self.col) + #print "score(MaxLexFgivenE) = ",score + if score > maxScore: + maxScore = score + totalscore *= maxScore + if totalscore == 0.0: + return 999 + else: + return -math.log10(totalscore) + + +class IsSingletonF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (fcount==1) + + +class IsSingletonFE(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount==1) + +class IsNotSingletonF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + 
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (fcount>1) + + +class IsNotSingletonFE(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount>1) + + +class IsFEGreaterThanZero(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount > 0.01) + + diff --git a/sa-extract/cstrmap.pxd b/sa-extract/cstrmap.pxd new file mode 100644 index 00000000..53becbc6 --- /dev/null +++ b/sa-extract/cstrmap.pxd @@ -0,0 +1,12 @@ +cdef extern from "strmap.h": + ctypedef struct StrMap + StrMap* stringmap_new() + void stringmap_delete(StrMap *vocab) + int stringmap_index(StrMap *vocab, char *s) + char* stringmap_word(StrMap *vocab, int i) + +cdef class StringMap: + cdef StrMap *vocab + cdef char *word(self, int i) + cdef int index(self, char *s) + diff --git a/sa-extract/cstrmap.pyx b/sa-extract/cstrmap.pyx new file mode 100644 index 00000000..d3883ea5 --- /dev/null +++ b/sa-extract/cstrmap.pyx @@ -0,0 +1,14 @@ + +cdef class StringMap: + def __cinit__(self): + self.vocab = stringmap_new() + + def __dealloc__(self): + stringmap_delete(self.vocab) + + cdef char *word(self, int i): + return stringmap_word(self.vocab, i) + + cdef int index(self, char *s): + return stringmap_index(self.vocab, s) + diff --git a/sa-extract/csuf.pxd b/sa-extract/csuf.pxd new file mode 100644 index 00000000..f44167dd --- /dev/null +++ b/sa-extract/csuf.pxd @@ -0,0 +1,11 @@ +cimport cdat +cimport cintlist + +cdef class SuffixArray: + cdef cdat.DataArray darray + cdef cintlist.CIntList sa + cdef cintlist.CIntList ha + cdef __lookup_helper(self, int word_id, int offset, int low, int high) + cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint) + cdef __search_low(self, int word_id, int offset, int low, int high) + cdef __search_high(self, word_id, offset, low, high) diff --git a/sa-extract/csuf.pyx b/sa-extract/csuf.pyx new file mode 100644 index 00000000..64c44788 --- /dev/null +++ b/sa-extract/csuf.pyx @@ -0,0 +1,321 @@ +# csuf.pyx +# Defines suffix arrays that can be directly written to/read from disk in binary format +# Adam Lopez + +import sys +import log +import cdat +import cintlist +import monitor + +from libc.stdio cimport FILE, fclose, fopen + +cdef class SuffixArray: + + def __init__(self, filename, from_binary=False): + self.darray = cdat.DataArray() + self.sa = cintlist.CIntList() + self.ha = cintlist.CIntList() + if from_binary: + self.read_binary(filename) + else: + self.read_text(filename) + + + def __getitem__(self, i): + return self.sa.arr[i] + + + def getSentId(self, i): + return self.darray.getSentId(i) + + + def getSent(self, i): + return self.darray.getSent(i) + + + def getSentPos(self, loc): + return self.darray.getSentPos(loc) + + def read_text(self, filename): + '''Constructs suffix array using the algorithm + of Larsson & Sadahkane (1999)''' + cdef int V, N, i, j, h, a_i, n, current_run, skip + cdef cintlist.CIntList isa, word_count + + self.darray = cdat.DataArray(filename, from_binary=False, use_sent_id=True) + N = len(self.darray) + V = len(self.darray.id2word) + + self.sa = cintlist.CIntList(initial_len=N) + self.ha = 
cintlist.CIntList(initial_len=V+1) + + isa = cintlist.CIntList(initial_len=N) + word_count = cintlist.CIntList(initial_len=V+1) + + '''Step 1: bucket sort data''' + sort_start_time = monitor.cpu() + start_time = sort_start_time + for i from 0 <= i < N: + a_i = self.darray.data.arr[i] + word_count.arr[a_i] = word_count.arr[a_i] + 1 + + n = 0 + for i from 0 <= i < V+1: + self.ha.arr[i] = n + n = n + word_count.arr[i] + word_count.arr[i] = 0 + + for i from 0 <= i < N: + a_i = self.darray.data.arr[i] + self.sa.arr[self.ha.arr[a_i] + word_count.arr[a_i]] = i + isa.arr[i] = self.ha.arr[a_i + 1] - 1 # bucket pointer is last index in bucket + word_count.arr[a_i] = word_count.arr[a_i] + 1 + + '''Determine size of initial runs''' + current_run = 0 + for i from 0 <= i < V+1: + if i < V and self.ha.arr[i+1] - self.ha.arr[i] == 1: + current_run = current_run + 1 + else: + if current_run > 0: + self.sa.arr[self.ha.arr[i] - current_run] = -current_run + current_run = 0 + + sys.stderr.write(" Bucket sort took %f seconds\n" % (monitor.cpu() - sort_start_time)) + + '''Step 2: prefix-doubling sort''' + h = 1 + while self.sa.arr[0] != -N: + sort_start_time = monitor.cpu() + sys.stderr.write(" Refining, sort depth = %d\n" % (h,)) + i = 0 + skip = 0 + while i < N: + if self.sa.arr[i] < 0: + #sys.stderr.write("Skip from %d to %d\n" % (i, i-self.sa.arr[i]-1)) + skip = skip + self.sa.arr[i] + i = i - self.sa.arr[i] + else: + if skip < 0: + self.sa.arr[i+skip] = skip + skip = 0 + j = isa.arr[self.sa.arr[i]] + #sys.stderr.write("Process from %d to %d (%d, %d, %d)\n" % (i, j, self.sa.arr[i], self.darray.data.arr[self.sa.arr[i]], isa.arr[self.sa.arr[i]])) + self.q3sort(i, j, h, isa) + i = j+1 + if skip < 0: + self.sa.arr[i+skip] = skip + h = h * 2 + sys.stderr.write(" Refinement took %f seconds\n" % (monitor.cpu() - sort_start_time)) + + '''Step 3: read off suffix array from inverse suffix array''' + sys.stderr.write(" Finalizing sort...\n") + for i from 0 <= i < N: + j = isa.arr[i] + self.sa.arr[j] = i + sys.stderr.write("Suffix array construction took %f seconds\n" % (monitor.cpu() - start_time)) + + def q3sort(self, int i, int j, int h, cintlist.CIntList isa, pad=""): + '''This is a ternary quicksort. It divides the array into + three partitions: items less than the pivot, items equal + to pivot, and items greater than pivot. The first and last + of these partitions are then recursively sorted''' + cdef int k, midpoint, pval, phead, ptail, tmp + + if j-i < -1: + raise Exception("Unexpected condition found in q3sort: sort from %d to %d" % (i,j)) + if j-i == -1: # recursive base case -- empty interval + return + if (j-i == 0): # recursive base case -- singleton interval + isa.arr[self.sa.arr[i]] = i + self.sa.arr[i] = -1 + return + + # NOTE: choosing the first item as a pivot value resulted in + # stack overflow for some very large buckets. I think there + # is a natural bias towards order due the way the word ids are + # assigned; thus this resulted in the range to the left of the + # pivot being nearly empty. Therefore, choose the middle item. + # If the method of assigning word_id's is changed, this method + # may need to be reconsidered as well. + midpoint = (i+j)/2 + pval = isa.arr[self.sa.arr[midpoint] + h] + if i != midpoint: + tmp = self.sa.arr[midpoint] + self.sa.arr[midpoint] = self.sa.arr[i] + self.sa.arr[i] = tmp + phead = i + ptail = i + + # find the three partitions. 
phead marks the first element + # of the middle partition, and ptail marks the last element + for k from i+1 <= k < j+1: + if isa.arr[self.sa.arr[k] + h] < pval: + if k > ptail+1: + tmp = self.sa.arr[phead] + self.sa.arr[phead] = self.sa.arr[k] + self.sa.arr[k] = self.sa.arr[ptail+1] + self.sa.arr[ptail+1] = tmp + else: # k == ptail+1 + tmp = self.sa.arr[phead] + self.sa.arr[phead] = self.sa.arr[k] + self.sa.arr[k] = tmp + phead = phead + 1 + ptail = ptail + 1 + else: + if isa.arr[self.sa.arr[k] + h] == pval: + if k > ptail+1: + tmp = self.sa.arr[ptail+1] + self.sa.arr[ptail+1] = self.sa.arr[k] + self.sa.arr[k] = tmp + ptail = ptail + 1 + + # recursively sort smaller suffixes + self.q3sort(i, phead-1, h, isa, pad+" ") + + # update suffixes with pivot value + # corresponds to update_group function in Larsson & Sadakane + for k from phead <= k < ptail+1: + isa.arr[self.sa.arr[k]] = ptail + if phead == ptail: + self.sa.arr[phead] = -1 + + # recursively sort larger suffixes + self.q3sort(ptail+1, j, h, isa, pad+" ") + + + def write_text(self, filename): + self.darray.write_text(filename) + + + def read_binary(self, filename): + cdef FILE *f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.darray.read_handle(f) + self.sa.read_handle(f) + self.ha.read_handle(f) + fclose(f) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.darray.write_handle(f) + self.sa.write_handle(f) + self.ha.write_handle(f) + fclose(f) + + + def write_enhanced(self, filename): + f = open(filename, "w") + self.darray.write_enhanced_handle(f) + for a_i in self.sa: + f.write("%d " % a_i) + f.write("\n") + for w_i in self.ha: + f.write("%d " % w_i) + f.write("\n") + f.close() + + + cdef __search_high(self, word_id, offset, low, high): + cdef int midpoint + + if low >= high: + return high + midpoint = (high + low) / 2 + if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: + return self.__search_high(word_id, offset, midpoint+1, high) + else: + return self.__search_high(word_id, offset, low, midpoint) + + + cdef __search_low(self, int word_id, int offset, int low, int high): + cdef int midpoint + + if low >= high: + return high + midpoint = (high + low) / 2 + if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: + return self.__search_low(word_id, offset, low, midpoint) + else: + return self.__search_low(word_id, offset, midpoint+1, high) + + + cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint): + return (self.__search_low(word_id, offset, low, midpoint), + self.__search_high(word_id, offset, midpoint, high)) + + + cdef __lookup_helper(self, int word_id, int offset, int low, int high): + cdef int midpoint + + if offset == 0: + return (self.ha.arr[word_id], self.ha.arr[word_id+1]) + if low >= high: + return None + + midpoint = (high + low) / 2 + if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: + return self.__get_range(word_id, offset, low, high, midpoint) + if self.darray.data.arr[self.sa.arr[midpoint] + offset] > word_id: + return self.__lookup_helper(word_id, offset, low, midpoint) + else: + return self.__lookup_helper(word_id, offset, midpoint+1, high) + + + def lookup(self, word, offset, int low, int high): + if low == -1: + low = 0 + if high == -1: + high = len(self.sa) + if word in self.darray.word2id: + word_id = self.darray.word2id[word] + return self.__lookup_helper(word_id, offset, 
low, high) + else: + return None + + + + def print_sa(self, isa): + '''Slow; Use only in case of emergency''' + cdef int i, j, k, N + cdef cintlist.CIntList tmp_sa + + N = len(self.sa) + for i from 0 <= i < N: + sys.stderr.write("%2d " % i) + sys.stderr.write("\n") + for i from 0 <= i < N: + sys.stderr.write("%2d " % self.darray.data.arr[i]) + sys.stderr.write("\n") + for i from 0 <= i < N: + sys.stderr.write("%2d " % isa.arr[i]) + sys.stderr.write("\n\n\n") + + # Recover partially sorted array + tmp_sa = cintlist.CIntList(initial_len=N) + for i from 0 <= i < N: + j = isa.arr[i] + tmp_sa.arr[j] = i + for i from 0 <= i < N: + if self.sa.arr[i] > 0: + tmp_sa.arr[i] = self.sa.arr[i] + + for i from 0 <= i < N: + j = tmp_sa.arr[i] + sys.stderr.write("%2d %2d | " % (i, self.sa.arr[i])) + for k from j <= k < N: + sys.stderr.write("%2d " % self.darray.data.arr[k]) + sys.stderr.write("\n") + sys.stderr.write("\n") + + + + + diff --git a/sa-extract/cveb.pxd b/sa-extract/cveb.pxd new file mode 100644 index 00000000..8967f8e3 --- /dev/null +++ b/sa-extract/cveb.pxd @@ -0,0 +1,15 @@ +cdef struct _VEB: + int top_universe_size + int num_bottom_bits + int max_val + int min_val + int size + void* top + void** bottom + + +cdef class VEB: + cdef _VEB* veb + cdef int _findsucc(self, int i) + cdef int _insert(self, int i) + cdef int _first(self) diff --git a/sa-extract/cveb.pyx b/sa-extract/cveb.pyx new file mode 100644 index 00000000..ca87becc --- /dev/null +++ b/sa-extract/cveb.pyx @@ -0,0 +1,390 @@ +#!/usr/bin/env python2.4 +'''This module implements a partial stratified tree (van Emde Boas, 1977). +Only insert findsucc, __iter__, and __contains__ are implemented. +Delete is currently not supported. +There is very little error-checking in this code -- it is designed +to be used in the limited situation described in Lopez (EMNLP-CoNLL 2007), +which doesn't cover all of the possible ways that you could misuse it +(e.g. trying to insert a key larger than the universe size) +Other notes -- this code is really rather ugly C code masquerading as +Pyrex/Python. 
Virtual function calls are bypassed by hand in several +places for the sake of efficiency, and other Python niceties are +removed for the same reason.''' + +from libc.stdlib cimport malloc, free +from libc.math cimport log, ceil +from libc.string cimport memset + +cdef int MIN_BOTTOM_SIZE +cdef int MIN_BOTTOM_BITS + +MIN_BOTTOM_SIZE = 32 +MIN_BOTTOM_BITS = 5 + +cdef int lower_mask[32] +cdef int i, mask + +for i from 0 <= i < MIN_BOTTOM_SIZE: + mask = (mask << 1) + 1 + lower_mask[i] = mask + + +cdef struct _BitSet: + long bitset + int min_val + int max_val + int size + + +cdef _BitSet* new_BitSet(): + cdef _BitSet* b + + b = <_BitSet*> malloc(sizeof(_BitSet)) + b.bitset = 0 + b.min_val = -1 + b.max_val = -1 + b.size = 0 + return b + + +cdef int bitset_findsucc(_BitSet* b, int i): + cdef int bitset, mask + cdef int low, high, mid + + if b.max_val == -1 or i >= b.max_val: + return -1 + if i < b.min_val: + return b.min_val + + bitset = b.bitset & ~lower_mask[i] + low = i+1 + high = b.max_val+1 + while low < high-1: + mid = (high + low)/2 + mask = ~(lower_mask[high-1] ^ lower_mask[mid-1]) + if bitset & mask == 0: + low = mid + else: + bitset = bitset & mask + high = mid + return low + + +cdef int bitset_insert(_BitSet* b, int i): + cdef int val + + val = 1 << i + if b.bitset & val == 0: + b.bitset = b.bitset | val + if b.size == 0: + b.min_val = i + b.max_val = i + else: + if i < b.min_val: + b.min_val = i + if i > b.max_val: + b.max_val = i + b.size = b.size + 1 + return 1 + return 0 + + +cdef int bitset_contains(_BitSet* b, int i): + cdef int val + + val = 1 << i + if b.bitset & val == 0: + return 0 + else: + return 1 + + +cdef class BitSetIterator: + cdef _BitSet* b + cdef int next_val + + def __next__(self): + cdef int ret_val + + if self.next_val == -1: + raise StopIteration() + ret_val = self.next_val + self.next_val = bitset_findsucc(self.b, ret_val) + return ret_val + + + +# This is a Python wrapper class to give access to the +# (entirely C-implemented) _BitSet struct. 
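+#
+# A minimal usage sketch from pure Python (the values are illustrative
+# only; the methods used are exactly the ones defined on the class
+# below, and findsucc(i) returns the smallest inserted element that is
+# strictly greater than i):
+#
+#   b = BitSet()
+#   b.insert(3)
+#   b.insert(7)
+#   b.findsucc(3)    # -> 7
+#   7 in b           # -> True
+#   list(b)          # -> [3, 7]  (iteration is in increasing order)
+#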
+# Very slow; use only for debugging +cdef class BitSet: + + cdef _BitSet* b + + def __cinit__(self): + self.b = new_BitSet() + + def __dealloc__(self): + free(self.b) + + def __iter__(self): + cdef BitSetIterator it + it = BitSetIterator() + it.b = self.b + it.next_val = self.b.min_val + return it + + def insert(self, i): + return bitset_insert(self.b, i) + + def findsucc(self, i): + return bitset_findsucc(self.b, i) + + def __str__(self): + return dec2bin(self.b.bitset)+" ("+str(self.b.size)+","+str(self.b.min_val)+","+str(self.b.max_val)+")" + + def min(self): + return self.b.min_val + + def max(self): + return self.b.max_val + + def __len__(self): + return self.b.size + + def __contains__(self, i): + return bool(bitset_contains(self.b, i)) + + +def dec2bin(i): + cdef d + result = "" + for d from 0 <= d < MIN_BOTTOM_SIZE: + if i & lower_mask[0] == 0: + result = "0"+result + else: + result = "1"+result + i = i >> 1 + return result + + +cdef _VEB* new_VEB(int n): + cdef _VEB* veb + cdef int num_bits, num_top_bits, i + + veb = <_VEB*> malloc(sizeof(_VEB)) + + num_bits = int(ceil(log(n) / log(2))) + veb.num_bottom_bits = num_bits/2 + if veb.num_bottom_bits < MIN_BOTTOM_BITS: + veb.num_bottom_bits = MIN_BOTTOM_BITS + veb.top_universe_size = (n >> veb.num_bottom_bits) + 1 + + veb.bottom = malloc(veb.top_universe_size * sizeof(void*)) + memset(veb.bottom, 0, veb.top_universe_size * sizeof(void*)) + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + veb.top = new_VEB(veb.top_universe_size) + else: + veb.top = new_BitSet() + + veb.max_val = -1 + veb.min_val = -1 + veb.size = 0 + return veb + + +cdef int VEB_insert(_VEB* veb, int i): + cdef _VEB* subv + cdef _BitSet* subb + cdef int a, b, tmp + + if veb.size == 0: + veb.min_val = i + veb.max_val = i + elif i == veb.min_val or i == veb.max_val: + return 0 + else: + if i < veb.min_val: + tmp = i + i = veb.min_val + veb.min_val = tmp + a = i >> veb.num_bottom_bits + b = i & lower_mask[veb.num_bottom_bits-1] + if veb.bottom[a] == NULL: + if veb.top_universe_size > MIN_BOTTOM_SIZE: + subv = <_VEB*> veb.top + VEB_insert(subv, a) + else: + subb = <_BitSet*> veb.top + bitset_insert(subb, a) + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + veb.bottom[a] = new_VEB(1 << veb.num_bottom_bits) + else: + veb.bottom[a] = new_BitSet() + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> veb.bottom[a] + if VEB_insert(subv, b) == 0: + return 0 + else: + subb = <_BitSet*> veb.bottom[a] + if bitset_insert(subb, b) == 0: + return 0 + + if i > veb.max_val: + veb.max_val = i + veb.size = veb.size + 1 + return 1 + + +cdef del_VEB(_VEB* veb): + cdef int i + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + i = (<_VEB*> veb.top).min_val + else: + i = (<_BitSet*> veb.top).min_val + + while i != -1: + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + del_VEB(<_VEB*> veb.bottom[i]) + else: + free(<_BitSet*> veb.bottom[i]) + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + i = VEB_findsucc(<_VEB*> veb.top, i) + else: + i = bitset_findsucc(<_BitSet*> veb.top, i) + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + del_VEB(<_VEB*> veb.top) + else: + free(<_BitSet*> veb.top) + free(veb.bottom) + free(veb) + + +cdef int VEB_findsucc(_VEB* veb, int i): + cdef _VEB* subv + cdef _BitSet* subb + cdef int a, b, j, c, found + + if veb.max_val == -1 or i>=veb.max_val: + return -1 + if i < veb.min_val: + return veb.min_val + + a = i >> veb.num_bottom_bits + b = i & lower_mask[veb.num_bottom_bits-1] + found = 0 + if veb.bottom[a] != NULL: + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> 
veb.bottom[a] + if subv.max_val > b: + j = (a << veb.num_bottom_bits) + VEB_findsucc(subv, b) + found = 1 + else: + subb = <_BitSet*> veb.bottom[a] + if subb.max_val > b: + j = (a << veb.num_bottom_bits) + bitset_findsucc(subb, b) + found = 1 + if found==0: + if veb.top_universe_size > MIN_BOTTOM_SIZE: + subv = <_VEB*> veb.top + c = VEB_findsucc(subv, a) + else: + subb = <_BitSet*> veb.top + c = bitset_findsucc(subb, a) + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> veb.bottom[c] + j = (c << veb.num_bottom_bits) + subv.min_val + else: + subb = <_BitSet*> veb.bottom[c] + j = (c << veb.num_bottom_bits) + subb.min_val + return j + + +cdef int VEB_contains(_VEB* veb, int i): + cdef _VEB* subv + cdef _BitSet* subb + cdef int a, b + + if veb.size == 0 or i < veb.min_val or i > veb.max_val: + return 0 + + if veb.min_val == i: + return 1 + else: + if veb.size == 1: + return 0 + + a = i >> veb.num_bottom_bits + b = i & lower_mask[veb.num_bottom_bits-1] + if veb.bottom[a] == NULL: + return 0 + else: + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> veb.bottom[a] + return VEB_contains(subv, b) + else: + subb = <_BitSet*> veb.bottom[a] + return bitset_contains(subb, b) + + +cdef class VEBIterator: + cdef _VEB* v + cdef int next_val + + def __next__(self): + cdef int ret_val + + if self.next_val == -1: + raise StopIteration() + ret_val = self.next_val + self.next_val = VEB_findsucc(self.v, ret_val) + return ret_val + + +cdef class VEB: + + def __init__(self, size): + pass + + def __cinit__(self, int size): + self.veb = new_VEB(size) + + def __dealloc__(self): + del_VEB(self.veb) + + def __iter__(self): + cdef VEBIterator it + it = VEBIterator() + it.v = self.veb + it.next_val = self.veb.min_val + return it + + def insert(self, i): + return VEB_insert(self.veb, i) + + cdef int _insert(self, int i): + return VEB_insert(self.veb, i) + + def findsucc(self, i): + return VEB_findsucc(self.veb, i) + + cdef int _first(self): + return self.veb.min_val + + cdef int _findsucc(self, int i): + return VEB_findsucc(self.veb, i) + + def __len__(self): + return self.veb.size + + def __contains__(self, i): + return VEB_contains(self.veb, i) + + + + + diff --git a/sa-extract/example/README b/sa-extract/example/README new file mode 100644 index 00000000..9819ba5f --- /dev/null +++ b/sa-extract/example/README @@ -0,0 +1,8 @@ +Commands to compile a corpus and extract some grammars +====================================================== + +# compile +../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini +# extract +cat test.de | ../extractor.py -c extract.ini + diff --git a/sa-extract/example/corpus.align.gz b/sa-extract/example/corpus.align.gz new file mode 100644 index 00000000..741de7e4 Binary files /dev/null and b/sa-extract/example/corpus.align.gz differ diff --git a/sa-extract/example/corpus.de.gz b/sa-extract/example/corpus.de.gz new file mode 100644 index 00000000..0d66470a Binary files /dev/null and b/sa-extract/example/corpus.de.gz differ diff --git a/sa-extract/example/corpus.en.gz b/sa-extract/example/corpus.en.gz new file mode 100644 index 00000000..28cb5c58 Binary files /dev/null and b/sa-extract/example/corpus.en.gz differ diff --git a/sa-extract/example/test.de b/sa-extract/example/test.de new file mode 100644 index 00000000..8923329f --- /dev/null +++ b/sa-extract/example/test.de @@ -0,0 +1,10 @@ +dies ist der richtige ansatz für diejenigen in chinas politischer führung , die aus der who den maximalen nutzen für die unterstützung der inneren reform 
ziehen wollen . +taiwan hat sich auch vorgenommen , konstruktiv zu sein - wenn china mitspielt . +die stadt staaten hongkong und singapur verfolgen eine klarsichtige who - politik und konzentrieren sich auf markt zugänge und starke regeln . +malaysia und thailand sind auch recht aktiv innerhalb der who , mit verschiedenen positionen , die vom frei handel bis zum protektionismus reichen . +indonesien und die philippinen sind schwächer , überwältigt von politischer zusammen hanglosigkeit und ganz in anspruch genommen von den anstrengungen , das schlimmste zu hause zu verhüten , so dass nur geringe kräfte übrig bleiben , mit der stets anschwellenden und immer komplizierteren agenda der who fertig zu werden . +die who steht vor einer wichtigen entscheidung . +sie muss dringend den handel progressiv liberalisieren . +eine starke führung seitens der usa ist erforderlich , damit die who in diese richtung gebracht werden kann und man gleichzeitig vermeidet , die zukunft nach dem muster der eu zu gestalten ( regel wucherung ) oder nach dem muster der uno ( macht lose gespräch runde ) . +dies geschieht sicher besser unter bush , mit einem klaren bekenntnis zum offenen markt und einer aktiveren außen politik , als es unter irgendeiner demokratischen alternative geschehen könnte . +robert zoellick , präsident bushs handel beauftragter , braucht aber verbündete . diff --git a/sa-extract/example/test.ref.en b/sa-extract/example/test.ref.en new file mode 100644 index 00000000..e50edcac --- /dev/null +++ b/sa-extract/example/test.ref.en @@ -0,0 +1,10 @@ +this is the right approach for those in china 's leadership who wish to extract maximum benefits from the wto to bolster domestic reform . +taiwan is also set to play a constructive role -- if mainland china plays along . +the city states , hong kong and singapore , have clear - sighted wto policies , focusing on market access and strong rules . +malaysia and thailand are also fairly active in the wto , with a mix of free - market and protectionist positions . +indonesia and the philippines are weaker , overwhelmed by policy incoherence and fire - fighting at home , and with insufficient capacity to deal with the wto 's burgeoning and increasingly complicated agenda . +the wto is at a crossroads . +it sorely needs to liberalize trade progressively . +strong us leadership is required to push the wto in this direction while avoiding an eu - style future ( regulatory overload ) or a un - style future ( an irrelevant talking shop ) . +this is more likely under a bush administration with better open - market credentials and a more assertive foreign policy than any democratic alternative . +however , robert zoellick , president bush 's trade representative , needs allies . diff --git a/sa-extract/extract.ini b/sa-extract/extract.ini new file mode 100644 index 00000000..56913245 --- /dev/null +++ b/sa-extract/extract.ini @@ -0,0 +1,116 @@ +# This .ini file extracts grammars to a file using +# the pattern matching infrastructure. +# +# Does not do any decoding. +# +# Variables can be set using sa-system.pl +# +# Usage: decoder.py -c [-x ] +# +# If the -x option is used, grammar will be written to the +# specified file, otherwise it is written to $PWD/grammar.out +# +# NOTE: all information about rules is cached, so use generous +# memory limits (rules themselves are not cached.) 
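+#
+# Illustrative invocation (the file names here are examples, not fixed
+# values; extractor.py reads sentences from stdin and takes this file
+# via -c, with -x overriding the output grammar name):
+#
+#   cat test.de | ./extractor.py -c extract.ini -x grammar.de-en
+#
+# With per_sentence_grammar=True (set further down in this file), the
+# sentence id is appended to the grammar file name, so each input
+# sentence gets its own grammar file.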
+ +import os +import manager +import clex +import context_model +import rulefactory +import calignment +import sys + +out_grammar_file = "grammar.out" +if opts.extra: + out_grammar_file = opts.extra + +# *** these variables written by sa-system.pl. Do not modify *** +lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz" +f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin" +e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin" +a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin" +lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin" +max_len = 5 +max_nt = 2 +max_size=10 +min_gap=1 +rank1 = 100 +rank2 = 10 +precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin" + +# check for path errors +if not os.path.exists(f_sa_file): + raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file) +if not os.path.exists(e_file): + raise Exception("Cannot find compiled target language array file %s" % e_file) +if not os.path.exists(a_file): + raise Exception("Cannot find compiled alignment file %s" % a_file) +if not os.path.exists(lex_file): + raise Exception("Cannot find compiled lexical weights file %s" % lex_file) +if not os.path.exists(precompute_file): + log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file) + precompute_file = None + +### Output options +mark_phrases = False # show derivation as SGML markup in output +mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!) + +# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose +log.level = 1 +log.file = sys.stderr + +# pattern-matching stuff +class PhonyGrammar: # saves us the cost of keeping the rules around + def add(self, thing): + pass + +local_grammar = PhonyGrammar() +xcat="X" + +cm = manager.ContextManager( + f_sa_file, + e_file, + sampler=rulefactory.Sampler(300), # lower=faster, higher=better; improvements level off above 200-300 range, -1 = don't sample, use all data (VERY SLOW!) + rulefactory=rulefactory.HieroCachingRuleFactory( + alignment=calignment.Alignment( # compiled alignment object (REQUIRED) + a_file, + from_binary=True + ), + category="["+xcat+"]", # name of generic nonterminal used by Hiero + grammar=local_grammar, # do not change for extraction + max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 + max_initial_size=15, # maximum span of a grammar rule in TEST DATA + max_length=max_len, # maximum number of symbols (both T and NT) allowed in a rule + max_nonterminals=max_nt, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) + max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1 + max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size + min_gap_size=1, # minimum span of a nonterminal in the RHS of a rule in TEST DATA + precompute_file=precompute_file, # filename of file containing precomputed collocations + precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). 
+ precompute_rank=rank1, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) + require_aligned_terminal=True, # require extracted rules to have at least one aligned word + require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word + per_sentence_grammar=True, # generate a complete grammar for each input sentence + rule_file=out_grammar_file, # grammar is written to this file (sentence id is added to file name for per sentence grammars) + train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA + train_min_gap_size=min_gap, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA + tight_phrases=True, # True if phrases should be tight, False otherwise (False seems to give better results but is slower) + ), + from_binary=True + ) + +# lexical weighting tables +tt = clex.CLex(lex_file, from_binary=True) + +# Only include features that depend on rule identity here +add_model(context_model.EgivenFCoherent(cm), 0.125) +add_model(context_model.SampleCountF(cm), 0.125) +add_model(context_model.CountEF(cm), 0.125) +add_model(context_model.MaxLexFgivenE(cm, tt), 0.125) +add_model(context_model.MaxLexEgivenF(cm, tt), 0.125) +add_model(context_model.IsSingletonF(cm), 0.125) +add_model(context_model.IsSingletonFE(cm), 0.125) + +# grammars, search parameters and all that other stuff are irrelevant + diff --git a/sa-extract/extractor.py b/sa-extract/extractor.py new file mode 100755 index 00000000..9d66ebf0 --- /dev/null +++ b/sa-extract/extractor.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +# vim:expandtab:shiftwidth=4 + +import sys, gc, monitor, sgml +import optparse +import model +import log +import cn + +models = [] + +def add_model(m,w=0.0): + models.append(m) + +def extract_grammar(input): + confnet = cn.ConfusionNet(input) + meta = input.meta + for m in models: + m.input(confnet.columns, meta) + +if __name__ == "__main__": + optparser = optparse.OptionParser() + optparser.add_option("-c", "--config", dest="config", help="configuration module") + optparser.add_option("-x", "--extra", dest="extra", help="output grammar name override") + (opts,args) = optparser.parse_args() + + if opts.config is None: + raise ValueError, "You must specify a configuration file." + else: + if log.level >= 1: + log.write("Reading configuration from %s\n" % opts.config) + execfile(opts.config) + + if len(args) >= 1 and args[0] != "-": + input_file = file(args[0], "r") + else: + input_file = sys.stdin + + if len(args) >= 2 and args[1] != "-": + output_file = file(args[1], "w") + else: + output_file = sys.stdout + + gc.collect() + if log.level >= 1: + log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu())) + log.write("models: %s\n" % (" ".join(str(x.name) for x in models))) + + sents = sgml.read_raw(input_file) + for sent in sents: + mark = sent.getmark() + if mark is not None: + (tag, attrs) = mark + if tag == "seg": + sent.unmark() + dattrs = sgml.attrs_to_dict(attrs) + sent.meta = attrs + extract_grammar(sent) + diff --git a/sa-extract/lcp.pyx b/sa-extract/lcp.pyx new file mode 100644 index 00000000..a992d3ee --- /dev/null +++ b/sa-extract/lcp.pyx @@ -0,0 +1,113 @@ +#!/usr/bin/env python2.4 + +'''Compute LCP array for a suffix array using the Kasai et al. 
algorithm''' +'''Can also be used to compute statistics such +as k most frequent n-grams''' + +import sys + +cimport cintlist +cimport csuf +cimport cdat +cimport cveb + +cdef class LCP: + + cdef csuf.SuffixArray sa + cdef cintlist.CIntList lcp + + def __init__(self, sa): + self._construct(sa) + + cdef _construct(self, csuf.SuffixArray sa): + cdef int i, k, j, h, n + cdef cintlist.CIntList rank + + sys.stderr.write("Constructing LCP array\n") + self.sa = sa + n = self.sa.sa.len + self.lcp = cintlist.CIntList(initial_len=n) + + rank = cintlist.CIntList(initial_len=n) + for i from 0 <= i < n: + rank.arr[sa.sa.arr[i]] = i + + h = 0 + for i from 0 <= i < n: + k = rank.arr[i] + if k == 0: + self.lcp.arr[k] = -1 + else: + j = sa.sa.arr[k-1] + while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]: + h = h+1 + self.lcp.arr[k] = h + if h > 0: + h = h-1 + sys.stderr.write("LCP array completed\n") + + + def compute_stats(self, max_n): + self._compute_stats(max_n) + + cdef _compute_stats(self, int max_n): + '''Note: the output of this function is not exact. In + particular, the frequency associated with each word is + not guaranteed to be correct. This is due to a bit of + laxness in the design; the function is intended only to + obtain a list of the most frequent words; for this + purpose it is perfectly fine''' + cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid + cdef cintlist.CIntList run_start + cdef cintlist.CIntList ngram_start + cdef cveb.VEB veb + + N = self.sa.sa.len + + ngram_starts = [] + for n from 0 <= n < max_n: + ngram_starts.append(cintlist.CIntList(initial_len=N)) + + run_start = cintlist.CIntList(initial_len=max_n) + veb = cveb.VEB(N) + + for i from 0 <= i < N: + h = self.lcp.arr[i] + if h < 0: + h = 0 + for n from h <= n < max_n: + rs = run_start.arr[n] + run_start.arr[n] = i + freq = i - rs + if freq > 1000: # arbitrary, but see note below + veb._insert(freq) + ngram_start = ngram_starts[n] + while ngram_start.arr[freq] > 0: + freq = freq + 1 # cheating a bit, should be ok for sparse histogram + ngram_start.arr[freq] = rs + i = veb.veb.min_val + while i != -1: + ii = veb._findsucc(i) + for n from 0 <= n < max_n: + ngram_start = ngram_starts[n] + iii = i + rs = ngram_start.arr[iii] + while (ii==-1 or iii < ii) and rs != 0: + j = self.sa.sa.arr[rs] + valid = 1 + for k from 0 <= k < n+1: + if self.sa.darray.data.arr[j+k] < 2: + valid = 0 + if valid: + ngram = "" + for k from 0 <= k < n+1: + ngram= ngram+ self.sa.darray.id2word[self.sa.darray.data.arr[j+k]] + " " + print i, n+1, ngram + iii = iii + 1 + rs = ngram_start.arr[iii] + i = ii + + + + + diff --git a/sa-extract/lcp_ops.py b/sa-extract/lcp_ops.py new file mode 100755 index 00000000..9df6e82a --- /dev/null +++ b/sa-extract/lcp_ops.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python + +import lcp +import sys +import optparse +import csuf + +'''Run various computations using the LCP array''' +def main(): + + optparser = optparse.OptionParser() +# optparser.add_option("-c", "--config", dest="config", help="configuration module") + optparser.add_option("-s", "--sa-check", dest="sa_check", default=False, action="store_true") + optparser.add_option("-l", "--lcp-check", dest="lcp_check", default=False, action="store_true") + optparser.add_option("-t", "--stats", dest="stats", default=0, type="int", action="store") + optparser.add_option("-u", "--unigram", dest="uni_check", default=False, action="store_true") + optparser.add_option("-r", "--report-long-lcps", dest="long_lcp", type="int", default=0, action="store") + 
(opts,args) = optparser.parse_args() + + if len(args) < 1: + print >>sys.stderr, "Usage: lcp.py [opts] " + sys.exit(1) + + safile = args[0] + sa = csuf.SuffixArray(safile, from_binary=True) + +# if opts.sa_check: +# check_sufarray(sa) + + l = lcp.LCP(sa) + + if opts.lcp_check: + print >>sys.stderr, "Checking LCP Array..." + l.check() + print >>sys.stderr, "Check finished" + + if opts.stats > 0: + l.compute_stats(opts.stats) + +# if opts.uni_check: +# if lcp is None: +# lcp = LCP(sa) +# unigram_stats(sa, lcp) +# +# if opts.long_lcp: +# if lcp is None: +# lcp = LCP(sa, opts.long_lcp) + +if __name__ == "__main__": + sys.exit(main()) + + diff --git a/sa-extract/log.py b/sa-extract/log.py new file mode 100644 index 00000000..d4f96cb4 --- /dev/null +++ b/sa-extract/log.py @@ -0,0 +1,18 @@ +import sys + +level = 1 +file = sys.stderr + +def writeln(s="", l=0): + if level >= l: + file.write("%s\n" % s) + file.flush() + +def write(s, l=0): + if level >= l: + file.write(s) + file.flush() + + + + diff --git a/sa-extract/manager.py b/sa-extract/manager.py new file mode 100644 index 00000000..767192c1 --- /dev/null +++ b/sa-extract/manager.py @@ -0,0 +1,100 @@ +import csuf +import cdat + +class Sampler(object): + '''A Sampler implements a logic for choosing + samples from a population range''' + + def __init__(self): + pass + + def registerContext(self, context_manager): + self.context_manager = context_manager + + def sample(self, phrase_location): + return cintlist.CIntList() + + + +class Extractor(object): + '''Extractor is responsible for extracting rules + from a given context; once a sentence id/location + is found for the source fwords, extractor is + responsible for producing any matching rule(s). + Optionally, extractor may return an empty list''' + + def __init__(self): + pass + + def registerContext(self, context_manager): + self.context_manager = context_manager + + def extract(self, fwords, loc): + return [] + + + +class RuleFactory(object): + '''RuleFactory is a class that manages the + generation of translation rules, using the Context + and (optionally) any of its contained classes or + data. The RuleFactory is responsible for handling + any caching (i.e. 
when presented with an input + sentence, it may lookup a rule from its cache + rather than extracting a new rule)''' + + def __init__(self): + self.num_lookups = 0 + self.num_extractions = 0 + self.num_rules = 0 + self.time = 0.0 + + + def registerContext(self, context_manager): + self.context_manager = context_manager + + + def input(self, fwords): + '''Manages the process of enumerating + rules for a given input sentence, and + looking them with calls to Context, + Sampler, and Extractor''' + return [] + + +class ContextManager(object): + + def __init__(self, ffile, efile, extractor=None, sampler=None, rulefactory=None, from_binary=False): + # NOTE: Extractor does not have a default value because + # the only nontrivial extractor right now depends on an + # alignment file + + self.fsarray = csuf.SuffixArray(ffile, from_binary) + self.edarray = cdat.DataArray(efile, from_binary) + + self.factory = rulefactory + self.factory.registerContext(self) + + self.sampler = sampler + self.sampler.registerContext(self) + + self.models = [] + self.owner = None + + + def add_model(self, model): + if self.owner is None: + self.owner = model + model_id = len(self.models) + self.models.append(model) + return model_id + + + def input(self, model, fwords, meta): + if model != self.owner: + return + self.fwords = fwords + self.factory.input(self.fwords, meta) + + + diff --git a/sa-extract/model.py b/sa-extract/model.py new file mode 100644 index 00000000..66c51051 --- /dev/null +++ b/sa-extract/model.py @@ -0,0 +1,12 @@ + +class Model(object): + def __init__(self, name=None): + object.__init__(self) + if name is None: + self.name = self.__class__.__name__ + else: + self.name = name + + def input(self, fwords, meta): + pass + diff --git a/sa-extract/monitor.py b/sa-extract/monitor.py new file mode 100644 index 00000000..eb0bed57 --- /dev/null +++ b/sa-extract/monitor.py @@ -0,0 +1,48 @@ +import os, resource + +def cpu(): + return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+ + resource.getrusage(resource.RUSAGE_SELF).ru_stime) + +# from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286222 + + +_proc_status = '/proc/%d/status' % os.getpid() + +_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0, + 'KB': 1024.0, 'MB': 1024.0*1024.0} + +def _VmB(VmKey): + '''Private. + ''' + global _proc_status, _scale + # get pseudo file /proc//status + try: + t = open(_proc_status) + v = t.read() + t.close() + except: + return 0.0 # non-Linux? + # get VmKey line e.g. 'VmRSS: 9999 kB\n ...' + i = v.index(VmKey) + v = v[i:].split(None, 3) # whitespace + if len(v) < 3: + return 0.0 # invalid format? + # convert Vm value to bytes + return float(v[1]) * _scale[v[2]] + +def memory(since=0.0): + '''Return memory usage in bytes. + ''' + return _VmB('VmSize:') - since + +def resident(since=0.0): + '''Return resident memory usage in bytes. + ''' + return _VmB('VmRSS:') - since + + +def stacksize(since=0.0): + '''Return stack size in bytes. 
+ ''' + return _VmB('VmStk:') - since diff --git a/sa-extract/precomputation.pxd b/sa-extract/precomputation.pxd new file mode 100644 index 00000000..c75d5eef --- /dev/null +++ b/sa-extract/precomputation.pxd @@ -0,0 +1,13 @@ +from libc.stdio cimport FILE + +cdef class Precomputation: + cdef int precompute_rank + cdef int precompute_secondary_rank + cdef int max_length + cdef int max_nonterminals + cdef int train_max_initial_size + cdef int train_min_gap_size + cdef precomputed_index + cdef precomputed_collocations + cdef read_map(self, FILE* f) + cdef write_map(self, m, FILE* f) diff --git a/sa-extract/precomputation.pyx b/sa-extract/precomputation.pyx new file mode 100644 index 00000000..ce4c21aa --- /dev/null +++ b/sa-extract/precomputation.pyx @@ -0,0 +1,478 @@ +# precomputes a set of collocations by advancing over the text. +# warning: nasty C code + +import log +import monitor + +cimport csuf +cimport cdat +cimport cintlist + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, memcpy + +cdef struct _Trie_Node # forward decl + +cdef struct _Trie_Edge: + int val + _Trie_Node* node + _Trie_Edge* bigger + _Trie_Edge* smaller + +cdef struct _Trie_Node: + _Trie_Edge* root + int* arr + int arr_len + +cdef _Trie_Node* new_trie_node(): + cdef _Trie_Node* node + node = <_Trie_Node*> malloc(sizeof(_Trie_Node)) + node.root = NULL + node.arr_len = 0 + node.arr = malloc(sizeof(0*sizeof(int))) + return node + +cdef _Trie_Edge* new_trie_edge(int val): + cdef _Trie_Edge* edge + edge = <_Trie_Edge*> malloc(sizeof(_Trie_Edge)) + edge.node = new_trie_node() + edge.bigger = NULL + edge.smaller = NULL + edge.val = val + return edge + +cdef free_trie_node(_Trie_Node* node): + if node != NULL: + free_trie_edge(node.root) + free(node.arr) + +cdef free_trie_edge(_Trie_Edge* edge): + if edge != NULL: + free_trie_node(edge.node) + free_trie_edge(edge.bigger) + free_trie_edge(edge.smaller) + +cdef _Trie_Node* trie_find(_Trie_Node* node, int val): + cdef _Trie_Edge* cur + cur = node.root + while cur != NULL and cur.val != val: + if val > cur.val: + cur = cur.bigger + elif val < cur.val: + cur = cur.smaller + if cur == NULL: + return NULL + else: + return cur.node + +cdef trie_node_data_append(_Trie_Node* node, int val): + cdef int new_len + new_len = node.arr_len + 1 + node.arr = realloc(node.arr, new_len*sizeof(int)) + node.arr[node.arr_len] = val + node.arr_len = new_len + +cdef trie_node_data_extend(_Trie_Node* node, int* vals, int num_vals): + cdef int new_len + new_len = node.arr_len + num_vals + node.arr = realloc(node.arr, new_len*sizeof(int)) + memcpy(node.arr + node.arr_len, vals, num_vals*sizeof(int)) + node.arr_len = new_len + + +cdef _Trie_Node* trie_insert(_Trie_Node* node, int val): + cdef _Trie_Edge** cur + cur = &node.root + while cur[0] != NULL and cur[0].val != val: + if val > cur[0].val: + cur = &cur[0].bigger + elif val < cur[0].val: + cur = &cur[0].smaller + if cur[0] == NULL: + cur[0] = new_trie_edge(val) + return cur[0].node + +cdef trie_node_to_map(_Trie_Node* node, result, prefix, int include_zeros): + cdef cintlist.CIntList arr + + if include_zeros or node.arr_len > 0: + arr = cintlist.CIntList() + free(arr.arr) + arr.arr = malloc(node.arr_len * sizeof(int)) + memcpy(arr.arr, node.arr, node.arr_len * sizeof(int)) + arr.len = node.arr_len + arr.size = node.arr_len + result[prefix] = arr + trie_edge_to_map(node.root, result, prefix, include_zeros) + +cdef trie_edge_to_map(_Trie_Edge* edge, result, 
prefix, int include_zeros): + if edge != NULL: + trie_edge_to_map(edge.smaller, result, prefix, include_zeros) + trie_edge_to_map(edge.bigger, result, prefix, include_zeros) + prefix = prefix + (edge.val,) + trie_node_to_map(edge.node, result, prefix, include_zeros) + +cdef class TrieMap: + + cdef _Trie_Node** root + cdef int V + + def __init__(self, alphabet_size): + self.V = alphabet_size + self.root = <_Trie_Node**> malloc(self.V * sizeof(_Trie_Node*)) + memset(self.root, 0, self.V * sizeof(_Trie_Node*)) + + + def __dealloc__(self): + cdef int i + for i from 0 <= i < self.V: + if self.root[i] != NULL: + free_trie_node(self.root[i]) + free(self.root) + + + def insert(self, pattern): + cdef int* p + cdef int i, l + l = len(pattern) + p = malloc(l*sizeof(int)) + for i from 0 <= i < l: + p[i] = pattern[i] + self._insert(p,l) + free(p) + + + cdef _Trie_Node* _insert(self, int* pattern, int pattern_len): + cdef int i + cdef _Trie_Node* node + if self.root[pattern[0]] == NULL: + self.root[pattern[0]] = new_trie_node() + node = self.root[pattern[0]] + for i from 1 <= i < pattern_len: + node = trie_insert(node, pattern[i]) + return node + + def contains(self, pattern): + cdef int* p + cdef int i, l + cdef _Trie_Node* node + l = len(pattern) + p = malloc(l*sizeof(int)) + for i from 0 <= i < l: + p[i] = pattern[i] + node = self._contains(p,l) + free(p) + if node == NULL: + return False + else: + return True + + cdef _Trie_Node* _contains(self, int* pattern, int pattern_len): + cdef int i + cdef _Trie_Node* node + node = self.root[pattern[0]] + i = 1 + while node != NULL and i < pattern_len: + node = trie_find(node, pattern[i]) + i = i+1 + return node + + def toMap(self, flag): + cdef int i, include_zeros + + if flag: + include_zeros=1 + else: + include_zeros=0 + result = {} + for i from 0 <= i < self.V: + if self.root[i] != NULL: + trie_node_to_map(self.root[i], result, (i,), include_zeros) + return result + + +cdef class Precomputation: + +# Defined in .pxd file, here for reference: +# cdef int precompute_rank +# cdef int precompute_secondary_rank +# cdef int max_length +# cdef int max_nonterminals +# cdef int train_max_initial_size +# cdef int train_min_gap_size +# cdef precomputed_index +# cdef precomputed_collocations + + def __init__(self, filename, sa=None, precompute_rank=1000, precompute_secondary_rank=20, max_length=5, + max_nonterminals=2, train_max_initial_size=10, train_min_gap_size=2, from_binary=False): + self.precompute_rank = precompute_rank + self.precompute_secondary_rank = precompute_secondary_rank + self.max_length = max_length + self.max_nonterminals = max_nonterminals + self.train_max_initial_size = train_max_initial_size + self.train_min_gap_size = train_min_gap_size + if from_binary: + self.read_binary(filename) + else: + self.precompute(filename, sa) + + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + fread(&(self.precompute_rank), sizeof(int), 1, f) + fread(&(self.precompute_secondary_rank), sizeof(int), 1, f) + fread(&(self.max_length), sizeof(int), 1, f) + fread(&(self.max_nonterminals), sizeof(int), 1, f) + fread(&(self.train_max_initial_size), sizeof(int), 1, f) + fread(&(self.train_min_gap_size), sizeof(int), 1, f) + self.precomputed_index = self.read_map(f) + self.precomputed_collocations = self.read_map(f) + fclose(f) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + + f = fopen(cfilename, 
"w") + fwrite(&(self.precompute_rank), sizeof(int), 1, f) + fwrite(&(self.precompute_secondary_rank), sizeof(int), 1, f) + fwrite(&(self.max_length), sizeof(int), 1, f) + fwrite(&(self.max_nonterminals), sizeof(int), 1, f) + fwrite(&(self.train_max_initial_size), sizeof(int), 1, f) + fwrite(&(self.train_min_gap_size), sizeof(int), 1, f) + self.write_map(self.precomputed_index, f) + self.write_map(self.precomputed_collocations, f) + fclose(f) + + + cdef write_map(self, m, FILE* f): + cdef int i, N + cdef cintlist.CIntList arr + + N = len(m) + fwrite(&(N), sizeof(int), 1, f) + for pattern, val in m.iteritems(): + N = len(pattern) + fwrite(&(N), sizeof(int), 1, f) + for word_id in pattern: + i = word_id + fwrite(&(i), sizeof(int), 1, f) + arr = val + arr.write_handle(f) + + + cdef read_map(self, FILE* f): + cdef int i, j, k, word_id, N + cdef cintlist.CIntList arr + + m = {} + fread(&(N), sizeof(int), 1, f) + for j from 0 <= j < N: + fread(&(i), sizeof(int), 1, f) + key = () + for k from 0 <= k < i: + fread(&(word_id), sizeof(int), 1, f) + key = key + (word_id,) + arr = cintlist.CIntList() + arr.read_handle(f) + m[key] = arr + return m + + + def precompute(self, filename, sa): + cdef int i, l, N, max_pattern_len, i1, l1, i2, l2, i3, l3, ptr1, ptr2, ptr3, is_super, sent_count, max_rank + cdef csuf.SuffixArray sarray + cdef cdat.DataArray darray + cdef cintlist.CIntList data, queue, cost_by_rank, count_by_rank + cdef TrieMap frequent_patterns, super_frequent_patterns, collocations + cdef _Trie_Node* node + + sarray = sa + darray = sarray.darray + data = darray.data + + frequent_patterns = TrieMap(len(darray.id2word)) + super_frequent_patterns = TrieMap(len(darray.id2word)) + collocations = TrieMap(len(darray.id2word)) + + I_set = set() + J_set = set() + J2_set = set() + IJ_set = set() + pattern_rank = {} + + log.writeln("Precomputing frequent intersections\n", 1) + start_time = monitor.cpu() + + max_pattern_len = 0 + if filename is not None: + precompute_file = open(filename) + for rank, line in enumerate(precompute_file): + if rank >= self.precompute_rank: + break + phrase_words = line.split()[2:] + phrase = () + for word in phrase_words: + phrase = phrase + (darray.word2id[word],) + max_pattern_len = max(max_pattern_len, len(phrase)) + frequent_patterns.insert(phrase) + I_set.add(phrase) + pattern_rank[phrase] = rank + if rank < self.precompute_secondary_rank: + super_frequent_patterns.insert(phrase) + J_set.add(phrase) + precompute_file.close() + + queue = cintlist.CIntList(increment=1000) + + log.writeln(" Computing inverted indexes...", 1) + N = len(data) + for i from 0 <= i < N: + sa_word_id = data.arr[i] + if sa_word_id == 1: + queue._append(-1) + else: + for l from 1 <= l <= max_pattern_len: + node = frequent_patterns._contains(data.arr+i, l) + if node == NULL: + break + queue._append(i) + queue._append(l) + trie_node_data_append(node, i) + + log.writeln(" Computing collocations...", 1) + N = len(queue) + ptr1 = 0 + sent_count = 0 + while ptr1 < N: # main loop + i1 = queue.arr[ptr1] + if i1 > -1: + l1 = queue.arr[ptr1+1] + ptr2 = ptr1 + 2 + while ptr2 < N: + i2 = queue.arr[ptr2] + if i2 == -1 or i2 - i1 >= self.train_max_initial_size: + break + l2 = queue.arr[ptr2+1] + if i2 - i1 - l1 >= self.train_min_gap_size and i2 + l2 - i1 <= self.train_max_initial_size and l1+l2+1 <= self.max_length: + node = collocations._insert(data.arr+i1, l1) + node = trie_insert(node, -1) + for i from i2 <= i < i2+l2: + node = trie_insert(node, data.arr[i]) + trie_node_data_append(node, i1) + 
trie_node_data_append(node, i2) + if super_frequent_patterns._contains(data.arr+i2, l2) != NULL: + if super_frequent_patterns._contains(data.arr+i1, l1) != NULL: + is_super = 1 + else: + is_super = 0 + ptr3 = ptr2 + 2 + while ptr3 < N: + i3 = queue.arr[ptr3] + if i3 == -1 or i3 - i1 >= self.train_max_initial_size: + break + l3 = queue.arr[ptr3+1] + if i3 - i2 - l2 >= self.train_min_gap_size and i3 + l3 - i1 <= self.train_max_initial_size and l1+l2+l3+2 <= self.max_length: + if is_super or super_frequent_patterns._contains(data.arr+i3, l3) != NULL: + node = collocations._insert(data.arr+i1, l1) + node = trie_insert(node, -1) + for i from i2 <= i < i2+l2: + node = trie_insert(node, data.arr[i]) + node = trie_insert(node, -1) + for i from i3 <= i < i3+l3: + node = trie_insert(node, data.arr[i]) + trie_node_data_append(node, i1) + trie_node_data_append(node, i2) + trie_node_data_append(node, i3) + ptr3 = ptr3 + 2 + ptr2 = ptr2 + 2 + ptr1 = ptr1 + 2 + else: + sent_count = sent_count + 1 + if sent_count % 10000 == 0: + log.writeln(" %d sentences" % sent_count) + ptr1 = ptr1 + 1 + + self.precomputed_collocations = collocations.toMap(False) + self.precomputed_index = frequent_patterns.toMap(True) + + x = 0 + for pattern1 in J_set: + for pattern2 in J_set: + if len(pattern1) + len(pattern2) + 1 < self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + J2_set.add(combined_pattern) + + for pattern1 in I_set: + for pattern2 in I_set: + x = x+1 + if len(pattern1) + len(pattern2) + 1 <= self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + IJ_set.add(combined_pattern) + + for pattern1 in I_set: + for pattern2 in J2_set: + x = x+2 + if len(pattern1) + len(pattern2) + 1<= self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + IJ_set.add(combined_pattern) + combined_pattern = pattern2 + (-1,) + pattern1 + IJ_set.add(combined_pattern) + + N = len(pattern_rank) + cost_by_rank = cintlist.CIntList(initial_len=N) + count_by_rank = cintlist.CIntList(initial_len=N) + for pattern, arr in self.precomputed_collocations.iteritems(): + if pattern not in IJ_set: + s = "" + for word_id in pattern: + if word_id == -1: + s = s + "X " + else: + s = s + darray.id2word[word_id] + " " + log.writeln("ERROR: unexpected pattern %s in set of precomputed collocations" % (s), 1) + else: + chunk = () + max_rank = 0 + arity = 0 + for word_id in pattern: + if word_id == -1: + max_rank = max(max_rank, pattern_rank[chunk]) + arity = arity + 1 + chunk = () + else: + chunk = chunk + (word_id,) + max_rank = max(max_rank, pattern_rank[chunk]) + cost_by_rank.arr[max_rank] = cost_by_rank.arr[max_rank] + (4*len(arr)) + count_by_rank.arr[max_rank] = count_by_rank.arr[max_rank] + (len(arr)/(arity+1)) + + cumul_cost = 0 + cumul_count = 0 + for i from 0 <= i < N: + cumul_cost = cumul_cost + cost_by_rank.arr[i] + cumul_count = cumul_count + count_by_rank.arr[i] + log.writeln("RANK %d\tCOUNT, COST: %d %d\tCUMUL: %d, %d" % (i, count_by_rank.arr[i], cost_by_rank.arr[i], cumul_count, cumul_cost)) + + num_found_patterns = len(self.precomputed_collocations) + for pattern in IJ_set: + if pattern not in self.precomputed_collocations: + self.precomputed_collocations[pattern] = cintlist.CIntList() + + stop_time = monitor.cpu() + log.writeln("Precomputed collocations for %d patterns out of %d possible (upper bound %d)" % (num_found_patterns,len(self.precomputed_collocations),x)) + log.writeln("Precomputed inverted index for %d patterns " % len(self.precomputed_index)) + log.writeln("Precomputation took %f seconds" % 
(stop_time - start_time)) + log.writeln("Detailed statistics:") + + + + + + + diff --git a/sa-extract/rule.pxd b/sa-extract/rule.pxd new file mode 100644 index 00000000..c9c84e5c --- /dev/null +++ b/sa-extract/rule.pxd @@ -0,0 +1,13 @@ +cdef class Phrase: + cdef int *syms + cdef int n, *varpos, n_vars + cdef public int chunkpos(self, int k) + cdef public int chunklen(self, int k) + +cdef class Rule: + cdef public int lhs + cdef readonly Phrase f, e + cdef float *cscores + cdef int n_scores + cdef public owner, word_alignments + diff --git a/sa-extract/rule.pyx b/sa-extract/rule.pyx new file mode 100644 index 00000000..7cd3efda --- /dev/null +++ b/sa-extract/rule.pyx @@ -0,0 +1,286 @@ +from libc.stdlib cimport malloc, calloc, realloc, free, strtof, strtol +from libc.string cimport strsep, strcpy, strlen + +cdef extern from "strutil.h": + char *strstrsep(char **stringp, char *delim) + char *strip(char *s) + char **split(char *s, char *delim, int *pn) + +import sys + +import sym +cimport sym +cdef sym.Alphabet alphabet +alphabet = sym.alphabet + +global span_limit +span_limit = None + +cdef int bufsize +cdef char *buf +bufsize = 100 +buf = malloc(bufsize) +cdef ensurebufsize(int size): + global buf, bufsize + if size > bufsize: + buf = realloc(buf, size*sizeof(char)) + bufsize = size + +cdef class Phrase: + def __cinit__(self, words): + cdef int i, j, n, n_vars + cdef char **toks + cdef bytes bwords + cdef char* cwords + + n_vars = 0 + if type(words) is str: + ensurebufsize(len(words)+1) + bwords = words + cwords = bwords + strcpy(buf, cwords) + toks = split(buf, NULL, &n) + self.syms = malloc(n*sizeof(int)) + for i from 0 <= i < n: + self.syms[i] = alphabet.fromstring(toks[i], 0) + if alphabet.isvar(self.syms[i]): + n_vars = n_vars + 1 + + else: + n = len(words) + self.syms = malloc(n*sizeof(int)) + for i from 0 <= i < n: + self.syms[i] = words[i] + if alphabet.isvar(self.syms[i]): + n_vars = n_vars + 1 + self.n = n + self.n_vars = n_vars + self.varpos = malloc(n_vars*sizeof(int)) + j = 0 + for i from 0 <= i < n: + if alphabet.isvar(self.syms[i]): + self.varpos[j] = i + j = j + 1 + + def __dealloc__(self): + free(self.syms) + free(self.varpos) + + def __str__(self): + strs = [] + cdef int i, s + for i from 0 <= i < self.n: + s = self.syms[i] + strs.append(alphabet.tostring(s)) + return " ".join(strs) + + def instantiable(self, i, j, n): + return span_limit is None or (j-i) <= span_limit + + def handle(self): + """return a hashable representation that normalizes the ordering + of the nonterminal indices""" + norm = [] + cdef int i, j, s + i = 1 + j = 0 + for j from 0 <= j < self.n: + s = self.syms[j] + if alphabet.isvar(s): + s = alphabet.setindex(s,i) + i = i + 1 + norm.append(s) + return tuple(norm) + + def strhandle(self): + strs = [] + norm = [] + cdef int i, j, s + i = 1 + j = 0 + for j from 0 <= j < self.n: + s = self.syms[j] + if alphabet.isvar(s): + s = alphabet.setindex(s,i) + i = i + 1 + norm.append(alphabet.tostring(s)) + return " ".join(norm) + + def arity(self): + return self.n_vars + + def getvarpos(self, i): + if 0 <= i < self.n_vars: + return self.varpos[i] + else: + raise IndexError + + def getvar(self, i): + if 0 <= i < self.n_vars: + return self.syms[self.varpos[i]] + else: + raise IndexError + + cdef int chunkpos(self, int k): + if k == 0: + return 0 + else: + return self.varpos[k-1]+1 + + cdef int chunklen(self, int k): + if self.n_vars == 0: + return self.n + elif k == 0: + return self.varpos[0] + elif k == self.n_vars: + return self.n-self.varpos[k-1]-1 + else: + return 
self.varpos[k]-self.varpos[k-1]-1 + + def clen(self, k): + return self.chunklen(k) + + def getchunk(self, ci): + cdef int start, stop + start = self.chunkpos(ci) + stop = start+self.chunklen(ci) + chunk = [] + for i from start <= i < stop: + chunk.append(self.syms[i]) + return chunk + + def __cmp__(self, other): + cdef Phrase otherp + cdef int i + otherp = other + for i from 0 <= i < min(self.n, otherp.n): + if self.syms[i] < otherp.syms[i]: + return -1 + elif self.syms[i] > otherp.syms[i]: + return 1 + if self.n < otherp.n: + return -1 + elif self.n > otherp.n: + return 1 + else: + return 0 + + def __hash__(self): + cdef int i + cdef unsigned h + h = 0 + for i from 0 <= i < self.n: + if self.syms[i] > 0: + h = (h << 1) + self.syms[i] + else: + h = (h << 1) + -self.syms[i] + return h + + def __len__(self): + return self.n + + def __getitem__(self, i): + return self.syms[i] + + def __iter__(self): + cdef int i + l = [] + for i from 0 <= i < self.n: + l.append(self.syms[i]) + return iter(l) + + def subst(self, start, children): + cdef int i + for i from 0 <= i < self.n: + if alphabet.isvar(self.syms[i]): + start = start + children[alphabet.getindex(self.syms[i])-1] + else: + start = start + (self.syms[i],) + return start + +cdef class Rule: + def __cinit__(self, lhs, f, e, owner=None, scores=None, word_alignments=None): + cdef int i, n + cdef char *rest + + self.word_alignments = word_alignments + if scores is None: + self.cscores = NULL + self.n_scores = 0 + else: + n = len(scores) + self.cscores = malloc(n*sizeof(float)) + self.n_scores = n + for i from 0 <= i < n: + self.cscores[i] = scores[i] + + def __init__(self, lhs, f, e, owner=None, scores=None, word_alignments=None): + if not sym.isvar(lhs): + sys.stderr.write("error: lhs=%d\n" % lhs) + self.lhs = lhs + self.f = f + self.e = e + self.word_alignments = word_alignments + + def __dealloc__(self): + if self.cscores != NULL: + free(self.cscores) + + def __str__(self): + return self.to_line() + + def __hash__(self): + return hash((self.lhs, self.f, self.e)) + + def __cmp__(self, Rule other): + return cmp((self.lhs, self.f, self.e, self.word_alignments), (other.lhs, other.f, other.e, self.word_alignments)) + + def __iadd__(self, Rule other): + if self.n_scores != other.n_scores: + raise ValueError + for i from 0 <= i < self.n_scores: + self.cscores[i] = self.cscores[i] + other.cscores[i] + return self + + def fmerge(self, Phrase f): + if self.f == f: + self.f = f + + def arity(self): + return self.f.arity() + + def to_line(self): + scorestrs = [] + for i from 0 <= i < self.n_scores: + scorestrs.append(str(self.cscores[i])) + fields = [alphabet.tostring(self.lhs), str(self.f), str(self.e), " ".join(scorestrs)] + if self.word_alignments is not None: + alignstr = [] + for i from 0 <= i < len(self.word_alignments): + alignstr.append("%d-%d" % (self.word_alignments[i]/65536, self.word_alignments[i]%65536)) + #for s,t in self.word_alignments: + #alignstr.append("%d-%d" % (s,t)) + fields.append(" ".join(alignstr)) + + return " ||| ".join(fields) + + property scores: + def __get__(self): + s = [None]*self.n_scores + for i from 0 <= i < self.n_scores: + s[i] = self.cscores[i] + return s + + def __set__(self, s): + if self.cscores != NULL: + free(self.cscores) + self.cscores = malloc(len(s)*sizeof(float)) + self.n_scores = len(s) + for i from 0 <= i < self.n_scores: + self.cscores[i] = s[i] + +def rule_copy(r): + r1 = Rule(r.lhs, r.f, r.e, r.owner, r.scores) + r1.word_alignments = r.word_alignments + return r1 + diff --git 
a/sa-extract/rulefactory.pyx b/sa-extract/rulefactory.pyx new file mode 100644 index 00000000..20ea80d2 --- /dev/null +++ b/sa-extract/rulefactory.pyx @@ -0,0 +1,2360 @@ +# Pyrex implementation of the algorithms described in +# Lopez, EMNLP-CoNLL 2007 +# Much faster than the Python numbers reported there. +# Note to reader: this code is closer to C than Python +import sys +import sym +import log +import rule +import monitor +import cintlist +import csuf +import cdat +import cveb +import precomputation +import gc +import cn +import sgml + +cimport cmath +cimport csuf +cimport cdat +cimport cintlist +cimport rule +cimport cveb +cimport precomputation +cimport calignment + +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, memcpy +from libc.math cimport fmod, ceil, floor + +cdef int PRECOMPUTE +cdef int MERGE +cdef int BAEZA_YATES + +PRECOMPUTE = 0 +MERGE = 1 +BAEZA_YATES = 2 + +#cdef int node_count +#node_count = 0 + +cdef class TrieNode: + cdef public children + #cdef int id + + def __init__(self): + self.children = {} + #self.id = node_count + #node_count += 1 + + +cdef class ExtendedTrieNode(TrieNode): + cdef public phrase + cdef public phrase_location + cdef public suffix_link + + def __init__(self, phrase=None, phrase_location=None, suffix_link=None): + TrieNode.__init__(self) + self.phrase = phrase + self.phrase_location = phrase_location + self.suffix_link = suffix_link + + +cdef class TrieTable: + cdef public int extended + cdef public int count + cdef public root + def __cinit__(self, extended=False): + self.count = 0 + self.extended = extended + if extended: + self.root = ExtendedTrieNode() + else: + self.root = TrieNode() + +# linked list structure for storing matches in BaselineRuleFactory +cdef struct match_node: + int* match + match_node* next + +cdef class BaselineRuleFactory: + + cdef grammar, context_manager + cdef int max_terminals, max_nonterminals + cdef int max_initial_size, train_max_initial_size + cdef int min_gap_size, train_min_gap_size + cdef int category + cdef int visit + cdef float intersect_time, extract_time + cdef ruleFile, timingFile + cdef int* last_visit1 + cdef int* last_visit2 + cdef match_node** intersector1 + cdef match_node** intersector2 + cdef csuf.SuffixArray sa + cdef cintlist.CIntList sent_id + + def input(self, fwords): + flen = len(fwords) + start_time = monitor.cpu() + self.intersect_time = 0.0 + self.extract_time = 0.0 + + pyro_phrase_count = 0 + hiero_phrase_count = 0 + + frontier = [] + for i in xrange(len(fwords)): + frontier.append((i, (), False)) + + while len(frontier) > 0: + this_iter_intersect_time = self.intersect_time + new_frontier = [] + for i, prefix, is_shadow_path in frontier: + + word_id = fwords[i][0][0] + #print "word_id = %i" % word_id + phrase = prefix + (word_id,) + str_phrase = map(sym.tostring, phrase) + hiero_phrase = rule.Phrase(phrase) + + #log.writeln("pos %2d, '%s'" % (i, hiero_phrase)) + self.lookup(hiero_phrase) + if hiero_phrase.arity() == 0: + pyro_phrase_count = pyro_phrase_count + 1 + else: + hiero_phrase_count = hiero_phrase_count + 1 + + if len(phrase) - hiero_phrase.arity() < self.max_terminals and i+1 < len(fwords): + new_frontier.append((i+1, phrase, is_shadow_path)) + if hiero_phrase.arity() < self.max_nonterminals: + xcat = sym.setindex(self.category, hiero_phrase.arity()+1) + for j in xrange(i+1+self.min_gap_size, min(i+self.max_initial_size, len(fwords))): + new_frontier.append((j, phrase+(xcat,), is_shadow_path)) + log.writeln("This iteration intersect time = %f" % 
(self.intersect_time - this_iter_intersect_time)) + frontier = new_frontier + stop_time = monitor.cpu() + log.writeln("COUNT %d %d" % (pyro_phrase_count, hiero_phrase_count)) + + + def lookup(self, phrase): + cdef int j, g, start, stop, sent_id, num_ranges, arity + cdef match_node** cur_intersector + cdef match_node** next_intersector + cdef match_node** tmp_intersector + cdef match_node* node + cdef match_node* cur_node + cdef match_node* prev_node + cdef match_node** node_ptr + cdef int* cur_visit + cdef int* next_visit + cdef int* tmp_visit + cdef int* chunklen + + #print "\n\nLOOKUP\n\n" + ranges = [] + sizes = [] + arity = phrase.arity() + chunklen = malloc(arity*sizeof(int)) + for i from 0 <= i < arity+1: + chunk = phrase.getchunk(i) + chunklen[i] = len(chunk) + sa_range = None + phr = () + for offset, word_id in enumerate(chunk): + word = sym.tostring(word_id) + sa_range = self.context_manager.fsarray.lookup(word, offset, sa_range[0], sa_range[1]) + if sa_range is None: + #log.writeln("Returned for phrase %s" % rule.Phrase(phr)) + return + #log.writeln("Found range %s for phrase %s" % (sa_range, rule.Phrase(phr))) + ranges.append(sa_range) + sizes.append(sa_range[1]-sa_range[0]) + if phrase.arity() == 0: + return + + cur_intersector = self.intersector1 + next_intersector = self.intersector2 + cur_visit = self.last_visit1 + next_visit = self.last_visit2 + + num_ranges = len(ranges) + for i from 0 <= i < num_ranges: + sa_range = ranges[i] + start_time = monitor.cpu() + self.visit = self.visit + 1 + intersect_count = 0 + + start = sa_range[0] + stop = sa_range[1] + for j from start <= j < stop: + g = self.sa.sa.arr[j] + sent_id = self.sent_id.arr[g] + if i==0: + if next_visit[sent_id] != self.visit: + # clear intersector + node = next_intersector[sent_id] + next_intersector[sent_id] = NULL + while node != NULL: + prev_node = node + node = node.next + free(prev_node.match) + free(prev_node) + next_visit[sent_id] = self.visit + node_ptr = &(next_intersector[sent_id]) + while node_ptr[0] != NULL: + node_ptr = &(node_ptr[0].next) + node_ptr[0] = malloc(sizeof(match_node)) + node_ptr[0].match = malloc(sizeof(int)) + node_ptr[0].match[0] = g + node_ptr[0].next = NULL + intersect_count = intersect_count + 1 + else: + if cur_visit[sent_id] == self.visit-1: + cur_node = cur_intersector[sent_id] + while cur_node != NULL: + if g - cur_node.match[0] + chunklen[i] <= self.train_max_initial_size and g - cur_node.match[i-1] - chunklen[i-1] >= self.train_min_gap_size: + if next_visit[sent_id] != self.visit: + # clear intersector -- note that we only do this if we've got something to put there + node = next_intersector[sent_id] + next_intersector[sent_id] = NULL + while node != NULL: + prev_node = node + node = node.next + free(prev_node.match) + free(prev_node) + next_visit[sent_id] = self.visit + node_ptr = &(next_intersector[sent_id]) + while node_ptr[0] != NULL: + node_ptr = &(node_ptr[0].next) + node_ptr[0] = malloc(sizeof(match_node)) + node_ptr[0].match = malloc((i+1) * sizeof(int)) + memcpy(node_ptr[0].match, cur_node.match, i*sizeof(int)) + node_ptr[0].match[i] = g + node_ptr[0].next = NULL + intersect_count = intersect_count + 1 + cur_node = cur_node.next + tmp_intersector = cur_intersector + cur_intersector = next_intersector + next_intersector = tmp_intersector + + tmp_visit = cur_visit + cur_visit = next_visit + next_visit = tmp_visit + + intersect_time = monitor.cpu() - start_time + if i > 0: + log.writeln("INT %d %d %d %d %f baseline" % + (arity, prev_intersect_count, 
sa_range[1]-sa_range[0], + intersect_count, intersect_time)) + if intersect_count == 0: + return None + prev_intersect_count = intersect_count + free(chunklen) + + + + def __init__(self, max_terminals=5, max_nonterminals=2, + max_initial_size=10, train_max_initial_size=10, + min_gap_size=1, train_min_gap_size=2, + category='[PHRASE]', grammar=None, + ruleFile=None, timingFile=None): + self.grammar = grammar + self.max_terminals = max_terminals + self.max_nonterminals = max_nonterminals + self.max_initial_size = max_initial_size + self.train_max_initial_size = train_max_initial_size + self.min_gap_size = min_gap_size + self.train_min_gap_size = train_min_gap_size + self.category = sym.fromstring(category) + self.ruleFile = ruleFile + self.timingFile = timingFile + self.visit = 0 + + + def registerContext(self, context_manager): + cdef int num_sents + self.context_manager = context_manager + self.sa = context_manager.fsarray + self.sent_id = self.sa.darray.sent_id + + num_sents = len(self.sa.darray.sent_index) + self.last_visit1 = malloc(num_sents * sizeof(int)) + memset(self.last_visit1, 0, num_sents * sizeof(int)) + + self.last_visit2 = malloc(num_sents * sizeof(int)) + memset(self.last_visit2, 0, num_sents * sizeof(int)) + + self.intersector1 = malloc(num_sents * sizeof(match_node*)) + memset(self.intersector1, 0, num_sents * sizeof(match_node*)) + + self.intersector2 = malloc(num_sents * sizeof(match_node*)) + memset(self.intersector2, 0, num_sents * sizeof(match_node*)) + + +# encodes information needed to find a (hierarchical) phrase +# in the text. If phrase is contiguous, that's just a range +# in the suffix array; if discontiguous, it is the set of +# actual locations (packed into an array) +cdef class PhraseLocation: + cdef int sa_low + cdef int sa_high + cdef int arr_low + cdef int arr_high + cdef cintlist.CIntList arr + cdef int num_subpatterns + + # returns true if sent_id is contained + cdef int contains(self, int sent_id): + return 1 + + def __init__(self, sa_low=-1, sa_high=-1, arr_low=-1, arr_high=-1, arr=None, num_subpatterns=1): + self.sa_low = sa_low + self.sa_high = sa_high + self.arr_low = arr_low + self.arr_high = arr_high + self.arr = arr + self.num_subpatterns = num_subpatterns + + + +cdef class Sampler: + '''A Sampler implements a logic for choosing + samples from a population range''' + + cdef int sampleSize + cdef context_manager + cdef cintlist.CIntList sa + + def __init__(self, sampleSize=0): + self.sampleSize = sampleSize + if sampleSize > 0: + log.writeln("Sampling strategy: uniform, max sample size = %d" % sampleSize, 1) + else: + log.writeln("Sampling strategy: no sampling", 1) + + def registerContext(self, context_manager): + self.context_manager = context_manager + self.sa = ( context_manager.fsarray).sa + + + def sample(self, PhraseLocation phrase_location): + '''Returns a sample of the locations for + the phrase. If there are less than self.sampleSize + locations, return all of them; otherwise, return + up to self.sampleSize locations. In the latter case, + we choose to sample UNIFORMLY -- that is, the locations + are chosen at uniform intervals over the entire set, rather + than randomly. 
This makes the algorithm deterministic, which + is good for things like MERT''' + cdef cintlist.CIntList sample + cdef double i, stepsize + cdef int num_locations, val, j + + sample = cintlist.CIntList() + if phrase_location.arr is None: + num_locations = phrase_location.sa_high - phrase_location.sa_low + if self.sampleSize == -1 or num_locations <= self.sampleSize: + sample._extend_arr(self.sa.arr + phrase_location.sa_low, num_locations) + else: + stepsize = float(num_locations)/float(self.sampleSize) + i = phrase_location.sa_low + while i < phrase_location.sa_high and sample.len < self.sampleSize: + '''Note: int(i) not guaranteed to have the desired + effect, according to the python documentation''' + if fmod(i,1.0) > 0.5: + val = int(ceil(i)) + else: + val = int(floor(i)) + sample._append(self.sa.arr[val]) + i = i + stepsize + else: + num_locations = (phrase_location.arr_high - phrase_location.arr_low) / phrase_location.num_subpatterns + if self.sampleSize == -1 or num_locations <= self.sampleSize: + sample = phrase_location.arr + else: + stepsize = float(num_locations)/float(self.sampleSize) + i = phrase_location.arr_low + while i < num_locations and sample.len < self.sampleSize * phrase_location.num_subpatterns: + '''Note: int(i) not guaranteed to have the desired + effect, according to the python documentation''' + if fmod(i,1.0) > 0.5: + val = int(ceil(i)) + else: + val = int(floor(i)) + j = phrase_location.arr_low + (val*phrase_location.num_subpatterns) + sample._extend_arr(phrase_location.arr.arr + j, phrase_location.num_subpatterns) + i = i + stepsize + return sample + + +cdef long nGramCount(PhraseLocation loc): + return (loc.arr_high - loc.arr_low)/ loc.num_subpatterns + + +# struct used to encapsulate a single matching +cdef struct Matching: + int* arr + int start + int end + int sent_id + int size + + +cdef void assign_matching(Matching* m, int* arr, int start, int step, int* sent_id_arr): + m.arr = arr + m.start = start + m.end = start + step + m.sent_id = sent_id_arr[arr[start]] + m.size = step + + +cdef int* append_combined_matching(int* arr, Matching* loc1, Matching* loc2, + int offset_by_one, int num_subpatterns, int* result_len): + cdef int i, new_len + + new_len = result_len[0] + num_subpatterns + arr = realloc(arr, new_len*sizeof(int)) + + for i from 0 <= i < loc1.size: + arr[result_len[0]+i] = loc1.arr[loc1.start+i] + if num_subpatterns > loc1.size: + arr[new_len-1] = loc2.arr[loc2.end-1] + result_len[0] = new_len + return arr + + +cdef int* extend_arr(int* arr, int* arr_len, int* appendix, int appendix_len): + cdef int new_len + + new_len = arr_len[0] + appendix_len + arr = realloc(arr, new_len*sizeof(int)) + memcpy(arr+arr_len[0], appendix, appendix_len*sizeof(int)) + arr_len[0] = new_len + return arr + + +#cdef matching2str(Matching* m): +# cdef int i +# cdef result + +# result = "(" +# for i from m.start <= i < m.end: +# result = result + str(m.arr[i]) + " " +# result = result + ")" +# return result + + +cdef int median(int low, int high, int step): + return low + (((high - low)/step)/2)*step + + +cdef void findComparableMatchings(int low, int high, int* arr, int step, int loc, int* loc_minus, int* loc_plus): + # Returns (minus, plus) indices for the portion of the array + # in which all matchings have the same first index as the one + # starting at loc + loc_plus[0] = loc + step + while loc_plus[0] < high and arr[loc_plus[0]] == arr[loc]: + loc_plus[0] = loc_plus[0] + step + loc_minus[0] = loc + while loc_minus[0]-step >= low and arr[loc_minus[0]-step] == 
arr[loc]: + loc_minus[0] = loc_minus[0] - step + + +cdef class HieroCachingRuleFactory: + '''This RuleFactory implements a caching + method using TrieTable, which makes phrase + generation somewhat speedier -- phrases only + need to be extracted once (however, it is + quite possible they need to be scored + for each input sentence, for contextual models)''' + + cdef rules, grammar, context_manager + + cdef int max_chunks + cdef int max_target_chunks + cdef int max_length + cdef int max_target_length + cdef int max_nonterminals + cdef int max_initial_size + cdef int train_max_initial_size + cdef int min_gap_size + cdef int train_min_gap_size + cdef int category + + cdef cacheBetweenSents + cdef precomputed_index + cdef precomputed_collocations + cdef precompute_file + cdef max_rank + cdef int precompute_rank, precompute_secondary_rank + cdef useBaezaYates + cdef use_index + cdef use_collocations + cdef float by_slack_factor + + cdef per_sentence_grammar + cdef rule_filehandler + cdef rule_file + cdef pruned_rule_file + cdef extract_file + cdef sample_file + cdef search_file + cdef timingFile + cdef log_int_stats + cdef prev_norm_prefix + cdef float intersect_time, extract_time + cdef csuf.SuffixArray fsa + cdef cdat.DataArray fda + cdef cdat.DataArray eda + + cdef calignment.Alignment alignment + cdef cintlist.CIntList eid2symid + cdef cintlist.CIntList fid2symid + cdef int tight_phrases + cdef int require_aligned_terminal + cdef int require_aligned_chunks + + cdef cintlist.CIntList findexes + cdef cintlist.CIntList findexes1 + + cdef int excluded_sent_id # exclude a sentence id + + def __init__(self, + alignment=None, # compiled alignment object (REQUIRED) + by_slack_factor=1.0, # parameter for double-binary search; doesn't seem to matter much + category="[PHRASE]", # name of generic nonterminal used by Hiero + cacheBetweenSents=False, # prevent flushing of tree between sents; use carefully or you'll run out of memory + extract_file=None, # print raw extracted rules to this file + grammar=None, # empty grammar object -- must be supplied from outside (REQUIRED) + log_int_stats=False, # prints timing data on intersections to stderr + max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 + max_initial_size=10, # maximum span of a grammar rule in TEST DATA + max_length=5, # maximum number of symbols (both T and NT) allowed in a rule + max_nonterminals=2, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) + max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1 + max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size + min_gap_size=2, # minimum span of a nonterminal in the RHS of a rule in TEST DATA + precompute_file=None, # filename of file containing precomputed collocations + precompute_secondary_rank=20, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). 
+ precompute_rank=100, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) + pruned_rule_file=None, # if specified, pruned grammars will be written to this filename + require_aligned_terminal=True, # require extracted rules to have at least one aligned word + require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word + per_sentence_grammar=True, # generate grammar files for each input segment + rule_file=None, # UNpruned grammars will be written to this filename + sample_file=None, # Sampling statistics will be written to this filename + search_file=None, # lookup statistics will be written to this filename + train_max_initial_size=10, # maximum span of a grammar rule extracted from TRAINING DATA + train_min_gap_size=2, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA + tight_phrases=False, # True if phrases should be tight, False otherwise (False == slower but better results) + timingFile=None, # timing statistics will be written to this filename + useBaezaYates=True, # True to require use of double-binary alg, false otherwise + use_collocations=True, # True to enable used of precomputed collocations + use_index=True # True to enable use of precomputed inverted indices + ): + '''Note: we make a distinction between the min_gap_size + and max_initial_size used in test and train. The latter + are represented by train_min_gap_size and train_max_initial_size, + respectively. This is because Chiang's model does not require + them to be the same, therefore we don't either.''' + self.rules = TrieTable(True) # cache + self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) + self.grammar = grammar + if alignment is None: + raise Exception("Must specify an alignment object") + self.alignment = alignment + + self.excluded_sent_id = -1 + + # grammar parameters and settings + # NOTE: setting max_nonterminals > 2 is not currently supported in Hiero + self.max_length = max_length + self.max_nonterminals = max_nonterminals + self.max_initial_size = max_initial_size + self.train_max_initial_size = train_max_initial_size + self.min_gap_size = min_gap_size + self.train_min_gap_size = train_min_gap_size + self.category = sym.fromstring(category) + + if max_chunks is None: + self.max_chunks = self.max_nonterminals + 1 + else: + self.max_chunks = max_chunks + + if max_target_chunks is None: + self.max_target_chunks = self.max_nonterminals + 1 + else: + self.max_target_chunks = max_target_chunks + + if max_target_length is None: + self.max_target_length = max_initial_size + else: + self.max_target_length = max_target_length + + # algorithmic parameters and settings + self.cacheBetweenSents = not per_sentence_grammar + self.precomputed_collocations = {} + self.precomputed_index = {} + self.use_index = use_index + self.use_collocations = use_collocations + self.max_rank = {} + self.precompute_file = precompute_file + self.precompute_rank = precompute_rank + self.precompute_secondary_rank = precompute_secondary_rank + self.useBaezaYates = useBaezaYates + self.by_slack_factor = by_slack_factor + if tight_phrases: + self.tight_phrases = 1 + else: + self.tight_phrases = 0 + + if require_aligned_chunks: + # one condition is a stronger version of the other. 
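+ # (requiring every chunk to contain an aligned word implies requiring at
+ # least one aligned terminal overall, so both flags are switched on
+ # together here; illustrative summary of the three possible settings,
+ # matching the if/elif/else below:
+ #   require_aligned_chunks=True                -> require_aligned_chunks=1, require_aligned_terminal=1
+ #   only require_aligned_terminal=True         -> require_aligned_chunks=0, require_aligned_terminal=1
+ #   both False                                 -> require_aligned_chunks=0, require_aligned_terminal=0)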
+ self.require_aligned_chunks = 1 + self.require_aligned_terminal = 1 + elif require_aligned_terminal: + self.require_aligned_chunks = 0 + self.require_aligned_terminal = 1 + else: + self.require_aligned_chunks = 0 + self.require_aligned_terminal = 0 + + + self.per_sentence_grammar = per_sentence_grammar + if not self.per_sentence_grammar: + self.rule_filehandler = open(rule_file, "w") + # diagnostics + #if rule_file is None: + # self.rule_file = None + self.rule_file = rule_file + if extract_file is None: + self.extract_file = None + else: + self.extract_file = open(extract_file, "w") + if sample_file is None: + self.sample_file = None + else: + self.sample_file = open(sample_file, "w") + if search_file is None: + self.search_file = None + else: + self.search_file = open(search_file, "w") + self.pruned_rule_file = pruned_rule_file + self.timingFile = timingFile + self.log_int_stats = log_int_stats + self.prev_norm_prefix = () + + self.findexes = cintlist.CIntList(initial_len=10) + self.findexes1 = cintlist.CIntList(initial_len=10) + + def registerContext(self, context_manager): + '''This gives the RuleFactory access to the Context object. + Here we also use it to precompute the most expensive intersections + in the corpus quickly.''' + self.context_manager = context_manager + self.fsa = context_manager.fsarray + self.fda = self.fsa.darray + self.eda = context_manager.edarray + self.fid2symid = self.set_idmap(self.fda) + self.eid2symid = self.set_idmap(self.eda) + self.precompute() + + cdef set_idmap(self, cdat.DataArray darray): + cdef int word_id, new_word_id, N + cdef cintlist.CIntList idmap + + N = len(darray.id2word) + idmap = cintlist.CIntList(initial_len=N) + for word_id from 0 <= word_id < N: + new_word_id = sym.fromstring(darray.id2word[word_id], terminal=True) + idmap.arr[word_id] = new_word_id + return idmap + + + def pattern2phrase(self, pattern): + # pattern is a tuple, which we must convert to a hiero rule.Phrase + result = () + arity = 0 + for word_id in pattern: + if word_id == -1: + arity = arity + 1 + new_id = sym.setindex(self.category, arity) + else: + new_id = sym.fromstring(self.fda.id2word[word_id]) + result = result + (new_id,) + return rule.Phrase(result) + + def pattern2phrase_plus(self, pattern): + # returns a list containing both the pattern, and pattern + # suffixed/prefixed with the NT category. + patterns = [] + result = () + arity = 0 + for word_id in pattern: + if word_id == -1: + arity = arity + 1 + new_id = sym.setindex(self.category, arity) + else: + new_id = sym.fromstring(self.fda.id2word[word_id]) + result = result + (new_id,) + patterns.append(rule.Phrase(result)) + patterns.append(rule.Phrase(result + (sym.setindex(self.category, 1),))) + patterns.append(rule.Phrase((sym.setindex(self.category, 1),) + result)) + return patterns + + def precompute(self): + cdef precomputation.Precomputation pre + + if self.precompute_file is not None: + start_time = monitor.cpu() + log.write("Reading precomputed data from file %s... 
" % self.precompute_file, 1) + pre = precomputation.Precomputation(self.precompute_file, from_binary=True) + # check parameters of precomputation -- some are critical and some are not + if pre.max_nonterminals != self.max_nonterminals: + log.writeln("\nWARNING: Precomputation done with max nonterminals %d, decoder uses %d" % (pre.max_nonterminals, self.max_nonterminals)) + if pre.max_length != self.max_length: + log.writeln("\nWARNING: Precomputation done with max terminals %d, decoder uses %d" % (pre.max_length, self.max_length)) + if pre.train_max_initial_size != self.train_max_initial_size: + log.writeln("\nERROR: Precomputation done with max initial size %d, decoder uses %d" % (pre.train_max_initial_size, self.train_max_initial_size)) + raise Exception("Parameter mismatch with precomputed data") + if pre.train_min_gap_size != self.train_min_gap_size: + log.writeln("\nERROR: Precomputation done with min gap size %d, decoder uses %d" % (pre.train_min_gap_size, self.train_min_gap_size)) + raise Exception("Parameter mismatch with precomputed data") + log.writeln("done.", 1) + if self.use_index: + log.write("Converting %d hash keys on precomputed inverted index... " % (len(pre.precomputed_index)), 1) + for pattern, arr in pre.precomputed_index.iteritems(): + phrases = self.pattern2phrase_plus(pattern) + for phrase in phrases: + self.precomputed_index[phrase] = arr + log.writeln("done.", 1) + if self.use_collocations: + log.write("Converting %d hash keys on precomputed collocations... " % (len(pre.precomputed_collocations)), 1) + for pattern, arr in pre.precomputed_collocations.iteritems(): + phrase = self.pattern2phrase(pattern) + self.precomputed_collocations[phrase] = arr + log.writeln("done.", 1) + stop_time = monitor.cpu() + log.writeln("Processing precomputations took %f seconds" % (stop_time - start_time), 1) + + + def getPrecomputedCollocation(self, phrase): + if phrase in self.precomputed_collocations: + arr = self.precomputed_collocations[phrase] + return PhraseLocation(arr=arr, arr_low=0, arr_high=len(arr), num_subpatterns=phrase.arity()+1) + return None + + + cdef int* baezaYatesHelper(self, int low1, int high1, int* arr1, int step1, + int low2, int high2, int* arr2, int step2, + int offset_by_one, int len_last, int num_subpatterns, int* result_len): + cdef int i1, i2, j1, j2, med1, med2, med1_plus, med1_minus, med2_minus, med2_plus + cdef int d_first, qsetsize, dsetsize, tmp, search_low, search_high + cdef int med_result_len, low_result_len, high_result_len + cdef long comparison + cdef int* result + cdef int* low_result + cdef int* med_result + cdef int* high_result + cdef Matching loc1, loc2 + + result = malloc(0*sizeof(int*)) +# log.writeln("%sBY: [%d, %d, %d] [%d, %d, %d]" % (pad, low1, high1, step1, low2, high2, step2,), 5) + + d_first = 0 + if high1 - low1 > high2 - low2: +# log.writeln("%sD first" % (pad), 5) + d_first = 1 +# else: +# log.writeln("%sQ first" % (pad), 5) + +# '''First, check to see if we are at any of the +# recursive base cases''' +# +# '''Case 1: one of the sets is empty''' + if low1 >= high1 or low2 >= high2: +# log.writeln("%sRETURN: set is empty" % (pad), 5) + return result + +# '''Case 2: sets are non-overlapping''' + assign_matching(&loc1, arr1, high1-step1, step1, self.fda.sent_id.arr) + assign_matching(&loc2, arr2, low2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == -1: +# log.writeln("%s %s < %s" % (pad, tuple(arr1[high1-step1:high1]), tuple(arr2[low2:low2+step2])),5) +# log.writeln("%sRETURN: 
non-overlapping sets" % (pad), 5) + return result + + assign_matching(&loc1, arr1, low1, step1, self.fda.sent_id.arr) + assign_matching(&loc2, arr2, high2-step2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: +# log.writeln("%s %s > %s" % (pad, tuple(arr1[low1:low1+step1]), tuple(arr2[high2-step2:high2])),5) +# log.writeln("%sRETURN: non-overlapping sets" % (pad), 5) + return result + + # Case 3: query set and data set do not meet size mismatch constraints; + # We use mergesort instead in this case + qsetsize = (high1-low1) / step1 + dsetsize = (high2-low2) / step2 + if d_first: + tmp = qsetsize + qsetsize = dsetsize + dsetsize = tmp + + if self.by_slack_factor * qsetsize * cmath.log(dsetsize) / cmath.log(2) > dsetsize: + free(result) + return self.mergeHelper(low1, high1, arr1, step1, low2, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, result_len) + + # binary search. There are two flavors, depending on + # whether the queryset or dataset is first + if d_first: + med2 = median(low2, high2, step2) + assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) + + search_low = low1 + search_high = high1 + while search_low < search_high: + med1 = median(search_low, search_high, step1) + findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) + comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) + if comparison == -1: + search_low = med1_plus + elif comparison == 1: + search_high = med1_minus + else: + break + else: + med1 = median(low1, high1, step1) + findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) + + search_low = low2 + search_high = high2 + while search_low < search_high: + med2 = median(search_low, search_high, step2) + assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) + comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) + if comparison == -1: + search_high = med2 + elif comparison == 1: + search_low = med2 + step2 + else: + break + + med_result_len = 0 + med_result = malloc(0*sizeof(int*)) + if search_high > search_low: +# '''Then there is a match for the median element of Q''' +# +# '''What we want to find is the group of all bindings in the first set +# s.t. their first element == the first element of med1. Then we +# want to store the bindings for all of those elements. 
We can +# subsequently throw all of them away.''' + med2_minus = med2 + med2_plus = med2 + step2 + i1 = med1_minus + while i1 < med1_plus: + assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) + while med2_minus-step2 >= low2: + assign_matching(&loc2, arr2, med2_minus-step2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) < 1: + med2_minus = med2_minus - step2 + else: + break + i2 = med2_minus + while i2 < high2: + assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) + comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) + if comparison == 0: + pass + med_result = append_combined_matching(med_result, &loc1, &loc2, offset_by_one, num_subpatterns, &med_result_len) + if comparison == -1: + break + i2 = i2 + step2 + if i2 > med2_plus: + med2_plus = i2 + i1 = i1 + step1 + + tmp = med1_minus + med1_minus = med1_plus + med1_plus = tmp + else: + # No match; need to figure out the point of division in D and Q + med2_minus = med2 + med2_plus = med2 + if d_first: + med2_minus = med2_minus + step2 + if comparison == -1: + med1_minus = med1_plus + if comparison == 1: + med1_plus = med1_minus + else: + tmp = med1_minus + med1_minus = med1_plus + med1_plus = tmp + if comparison == 1: + med2_minus = med2_minus + step2 + med2_plus = med2_plus + step2 + + low_result_len = 0 + low_result = self.baezaYatesHelper(low1, med1_plus, arr1, step1, low2, med2_plus, arr2, step2, offset_by_one, len_last, num_subpatterns, &low_result_len) + high_result_len = 0 + high_result = self.baezaYatesHelper(med1_minus, high1, arr1, step1, med2_minus, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, &high_result_len) + + result = extend_arr(result, result_len, low_result, low_result_len) + result = extend_arr(result, result_len, med_result, med_result_len) + result = extend_arr(result, result_len, high_result, high_result_len) + free(low_result) + free(med_result) + free(high_result) + + return result + + + + cdef long compareMatchingsSet(self, int i1_minus, int i1_plus, int* arr1, int step1, + Matching* loc2, int offset_by_one, int len_last): +# '''Compares a *set* of bindings, all with the same first element, +# to a single binding. 
Returns -1 if all comparisons == -1, 1 if all +# comparisons == 1, and 0 otherwise.''' + cdef int i1, comparison, prev_comparison + cdef Matching l1_stack + cdef Matching* loc1 + + loc1 = &l1_stack + + i1 = i1_minus + while i1 < i1_plus: + assign_matching(loc1, arr1, i1, step1, self.fda.sent_id.arr) + comparison = self.compare_matchings(loc1, loc2, offset_by_one, len_last) + if comparison == 0: + prev_comparison = 0 + break + elif i1 == i1_minus: + prev_comparison = comparison + else: + if comparison != prev_comparison: + prev_comparison = 0 + break + i1 = i1 + step1 + return prev_comparison + + + cdef long compare_matchings(self, Matching* loc1, Matching* loc2, int offset_by_one, int len_last): + cdef int i + + if loc1.sent_id > loc2.sent_id: + return 1 + if loc2.sent_id > loc1.sent_id: + return -1 + + if loc1.size == 1 and loc2.size == 1: + if loc2.arr[loc2.start] - loc1.arr[loc1.start] <= self.train_min_gap_size: + return 1 + + elif offset_by_one: + for i from 1 <= i < loc1.size: + if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i-1]: + return 1 + if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i-1]: + return -1 + + else: + if loc1.arr[loc1.start]+1 > loc2.arr[loc2.start]: + return 1 + if loc1.arr[loc1.start]+1 < loc2.arr[loc2.start]: + return -1 + + for i from 1 <= i < loc1.size: + if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i]: + return 1 + if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i]: + return -1 + + if loc2.arr[loc2.end-1] + len_last - loc1.arr[loc1.start] > self.train_max_initial_size: + return -1 + return 0 + + + cdef int* mergeHelper(self, int low1, int high1, int* arr1, int step1, + int low2, int high2, int* arr2, int step2, + int offset_by_one, int len_last, int num_subpatterns, int* result_len): + cdef int i1, i2, j1, j2 + cdef long comparison + cdef int* result + cdef Matching loc1, loc2 +# cdef int i + +# pad = " " +# log.writeln("->mergeHelper", 5) + + result_len[0] = 0 + result = malloc(0*sizeof(int)) + + i1 = low1 + i2 = low2 +# if log.level==5: +# log.writeln("%sMERGE lists [%d,%d,%d] and [%d,%d,%d]" % (pad,low1,high1,step1,low2,high2,step2), 5) +# log.writeln("%soffset_by_one: %d, len_last: %d" % (pad, offset_by_one, len_last), 5) +# log.write("[") +# for i from low1 <= i < high1: +# log.write("%d, " % arr1.arr[i],5) +# log.writeln("]") +# log.write("[") +# for i from low2 <= i < high2: +# log.write("%d, " % arr2.arr[i],5) +# log.writeln("]") + while i1 < high1 and i2 < high2: + +# '''First, pop all unneeded loc2's off the stack''' + assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) +# if log.level==5: +# log.writeln("%s TOP1 %s" % (pad,matching2str(loc1)),5) + while i2 < high2: + assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: +# if log.level==5: +# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) +# log.writeln("%s POP2 %s" % (pad,matching2str(loc2)),5) + i2 = i2 + step2 + else: + break + +# '''Next: process all loc1's with the same starting val''' + j1 = i1 + while i1 < high1 and arr1[j1] == arr1[i1]: + assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) + j2 = i2 + while j2 < high2: + assign_matching(&loc2, arr2, j2, step2, self.fda.sent_id.arr) + comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) + if comparison == 0: +# if log.level==5: +# log.writeln("%s %s == %s" % (pad,matching2str(loc1),matching2str(loc2)),5) + result = append_combined_matching(result, &loc1, &loc2, offset_by_one, 
num_subpatterns, result_len) + if comparison == 1: +# if log.level==5: +# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) + pass + if comparison == -1: +# if log.level==5: +# log.writeln("%s %s < %s" % (pad,matching2str(loc1),matching2str(loc2)),5) + break + else: + j2 = j2 + step2 +# if log.level==5: +# log.writeln("%s POP1 %s" % (pad,matching2str(loc1)),5) + i1 = i1 + step1 + +# log.writeln("<-mergeHelper", 5) + return result + + + cdef void sortPhraseLoc(self, cintlist.CIntList arr, PhraseLocation loc, rule.Phrase phrase): + cdef int i, j + cdef cveb.VEB veb + cdef cintlist.CIntList result + + if phrase in self.precomputed_index: + loc.arr = self.precomputed_index[phrase] + else: + loc.arr = cintlist.CIntList(initial_len=loc.sa_high-loc.sa_low) + veb = cveb.VEB(arr.len) + for i from loc.sa_low <= i < loc.sa_high: + veb._insert(arr.arr[i]) + i = veb.veb.min_val + for j from 0 <= j < loc.sa_high-loc.sa_low: + loc.arr.arr[j] = i + i = veb._findsucc(i) + loc.arr_low = 0 + loc.arr_high = loc.arr.len + + + cdef intersectHelper(self, rule.Phrase prefix, rule.Phrase suffix, + PhraseLocation prefix_loc, PhraseLocation suffix_loc, int algorithm): + + cdef cintlist.CIntList arr1, arr2, result + cdef int low1, high1, step1, low2, high2, step2, offset_by_one, len_last, num_subpatterns, result_len + cdef int* result_ptr + cdef csuf.SuffixArray suf + + result_len = 0 + + if sym.isvar(suffix[0]): + offset_by_one = 1 + else: + offset_by_one = 0 + + len_last = len(suffix.getchunk(suffix.arity())) + + if prefix_loc.arr is None: + suf = self.context_manager.fsarray + self.sortPhraseLoc(suf.sa, prefix_loc, prefix) + arr1 = prefix_loc.arr + low1 = prefix_loc.arr_low + high1 = prefix_loc.arr_high + step1 = prefix_loc.num_subpatterns + + if suffix_loc.arr is None: + suf = self.context_manager.fsarray + self.sortPhraseLoc(suf.sa, suffix_loc, suffix) + arr2 = suffix_loc.arr + low2 = suffix_loc.arr_low + high2 = suffix_loc.arr_high + step2 = suffix_loc.num_subpatterns + + num_subpatterns = prefix.arity()+1 + + if algorithm == MERGE: + result_ptr = self.mergeHelper(low1, high1, arr1.arr, step1, + low2, high2, arr2.arr, step2, + offset_by_one, len_last, num_subpatterns, &result_len) + else: + result_ptr = self.baezaYatesHelper(low1, high1, arr1.arr, step1, + low2, high2, arr2.arr, step2, + offset_by_one, len_last, num_subpatterns, &result_len) + + if result_len == 0: + free(result_ptr) + return None + else: + result = cintlist.CIntList() + free(result.arr) + result.arr = result_ptr + result.len = result_len + result.size = result_len + return PhraseLocation(arr_low=0, arr_high=result_len, arr=result, num_subpatterns=num_subpatterns) + + cdef loc2str(self, PhraseLocation loc): + cdef int i, j + result = "{" + i = 0 + while i < loc.arr_high: + result = result + "(" + for j from i <= j < i + loc.num_subpatterns: + result = result + ("%d " %loc.arr[j]) + result = result + ")" + i = i + loc.num_subpatterns + result = result + "}" + return result + +# cdef compareResults(self, PhraseLocation loc1, PhraseLocation loc2, phrase, type1, type2): +# cdef i +# if loc1 is None and type1=="pre": +# return +# if loc1 is None: +# if loc2 is None or loc2.arr_high == 0: +# return +# if loc2 is None: +# if loc1.arr_high == 0: +# return +# if loc1.arr_high != loc2.arr_high: +# log.writeln("ERROR: %d vs %d (%s vs %s)" % (loc1.arr_high, loc2.arr_high, type1, type2)) +# #log.writeln(" %s" % self.loc2str(loc2)) +# if loc1.arr_high == 0: +# return +# elif loc1.num_subpatterns != loc2.num_subpatterns: +# 
log.writeln("ERROR 2: %d vs %d (%d v %d) %s" % (loc1.num_subpatterns, loc2.num_subpatterns, loc1.arr_high, loc2.arr_high, phrase)) +# for i from 0 <= i < loc1.arr_high: +# if loc1.arr[i] != loc2.arr[i]: +# log.writeln("ERROR 3") +# + cdef PhraseLocation intersect(self, prefix_node, suffix_node, rule.Phrase phrase): + cdef rule.Phrase prefix, suffix + cdef PhraseLocation prefix_loc, suffix_loc, result + + start_time = monitor.cpu() + prefix = prefix_node.phrase + suffix = suffix_node.phrase + prefix_loc = prefix_node.phrase_location + suffix_loc = suffix_node.phrase_location + + result = self.getPrecomputedCollocation(phrase) + if result is not None: + intersect_method = "precomputed" + + if result is None: + if self.useBaezaYates: + result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, BAEZA_YATES) + intersect_method="double binary" + else: + result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, MERGE) + intersect_method="merge" + stop_time = monitor.cpu() + intersect_time = stop_time - start_time + if self.log_int_stats: + if intersect_method == "precomputed": + sort1 = "none" + sort2 = "none" + else: + if prefix in self.precomputed_index: + sort1 = "index" + else: + sort1 = "veb" + if suffix in self.precomputed_index: + sort2 = "index" + else: + sort2 = "veb" + result_len=0 + if result is not None: + result_len = len(result.arr)/result.num_subpatterns + rank = 0 +# if phrase in self.max_rank: +# rank = self.max_rank[phrase] +# else: +# rank = self.precompute_rank + 10 + log.writeln("INT %d %d %d %d %d %f %d %s %s %s" % + (len(prefix)+1 - prefix.arity(), prefix.arity(), + nGramCount(prefix_node.phrase_location), + nGramCount(suffix_node.phrase_location), + result_len, intersect_time, rank, intersect_method, sort1, sort2)) + return result + + def advance(self, frontier, res, fwords): + nf = [] + for (toskip, (i, alt, pathlen)) in frontier: + spanlen = fwords[i][alt][2] + if (toskip == 0): + #log.writeln("RES: (%d %d %d)" % (i, alt, pathlen), 3) + res.append((i, alt, pathlen)) + ni = i + spanlen + #log.writeln("proc: %d (%d %d %d) sl=%d ni=%d len(fwords)=%d" % (toskip, i, alt, pathlen, spanlen, ni, len(fwords)), 3) + if (ni < len(fwords) and (pathlen + 1) < self.max_initial_size): + for na in xrange(len(fwords[ni])): + nf.append((toskip - 1, (ni, na, pathlen + 1))) + if (len(nf) > 0): + return self.advance(nf, res, fwords) + else: + return res + + def get_all_nodes_isteps_away(self, skip, i, spanlen, pathlen, fwords, next_states, reachable_buffer): + frontier = [] + if (i+spanlen+skip >= len(next_states)): + return frontier + #print "get_all_nodes_isteps_away from %i" % (i) + key = tuple([i,spanlen]) + reachable = [] + if (key in reachable_buffer): + reachable = reachable_buffer[key] + else: + reachable = self.reachable(fwords, i, spanlen) + reachable_buffer[key] = reachable + #print "reachable(from=%i,dist=%i) = " % (i,spanlen) + #print reachable + for nextreachable in reachable: + for next_id in next_states[nextreachable]: + jump = self.shortest(fwords,i,next_id) + #print "checking next_id = %i, pathlen[sofar] = %i, jump = %i" % (next_id,pathlen,jump) + #if (next_id - (i+spanlen)) < skip: + if jump < skip: + continue + #if next_id-(i-pathlen) < self.max_initial_size: + if pathlen+jump <= self.max_initial_size: + for alt_id in xrange(len(fwords[next_id])): + if (fwords[next_id][alt_id][0] != cn.epsilon): + #frontier.append((next_id,alt_id,next_id-(i-pathlen))); + #print "finding the shortest from %i to %i" % (i, next_id) + newel = 
(next_id,alt_id,pathlen+jump) + if newel not in frontier: + frontier.append((next_id,alt_id,pathlen+jump)) + #print "appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) + #else: + #print "NOT appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) + #else: + #print "next_id = %s is aborted\n" % next_id + #print "returning frontier" + #print frontier + return frontier + + def reachable(self, fwords, ifrom, dist): + #print "inside reachable(%i,%i)" % (ifrom,dist) + ret = [] + if (ifrom >= len(fwords)): + return ret + for alt_id in xrange(len(fwords[ifrom])): + if (fwords[ifrom][alt_id][0] == cn.epsilon): + ret.extend(self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist)) + else: + if (dist==0): + if (ifrom not in ret): + ret.append(ifrom) + else: + for ifromchild in self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist-1): + if (ifromchild not in ret): + ret.append(ifromchild) + + return ret + + def shortest(self, fwords, ifrom, ito): + min = 1000 + #print "shortest ifrom=%i, ito=%i" % (ifrom,ito) + if (ifrom > ito): + return min + if (ifrom == ito): + return 0 + for alt_id in xrange(len(fwords[ifrom])): + currmin = self.shortest(fwords,ifrom+fwords[ifrom][alt_id][2],ito) + if (fwords[ifrom][alt_id][0] != cn.epsilon): + currmin += 1 + if (currmin 0: + curr = candidate.pop() + if curr[0] >= len(_columns): + continue + if curr[0] not in result and min_dist <= curr[1] <= self.max_initial_size: + result.append(curr[0]); + curr_col = _columns[curr[0]] + for alt in curr_col: + next_id = curr[0]+alt[2] + jump = 1 + if (alt[0] == cn.epsilon): + jump = 0 + if next_id not in result and min_dist <= curr[1]+jump <= self.max_initial_size+1: + candidate.append([next_id,curr[1]+jump]) + return sorted(result); + + def input(self, fwords, meta): + '''When this function is called on the RuleFactory, + it looks up all of the rules that can be used to translate + the input sentence''' + cdef int i, j, k, flen, arity, num_subpatterns, num_samples + cdef float start_time + cdef PhraseLocation phrase_location + cdef cintlist.CIntList sample, chunklen + cdef Matching matching + cdef rule.Phrase hiero_phrase + + #fwords = [ ((1,0.0,1),), fwords1 ] #word id for = 1, cost = 0.0, next = 1 + #print fwords + flen = len(fwords) + #print "length = %i" % flen + start_time = monitor.cpu() + self.intersect_time = 0.0 + self.extract_time = 0.0 + nodes_isteps_away_buffer = {} + hit = 0 + reachable_buffer = {} + #print "id = ",meta + #print "rule_file = ",self.rule_file + dattrs = sgml.attrs_to_dict(meta) + id = dattrs.get('id', 'NOID') + if self.per_sentence_grammar: + self.rule_filehandler = open(self.rule_file+'.'+id, 'w') + self.excluded_sent_id = int(dattrs.get('exclude', '-1')) + + #print "max_initial_size = %i" % self.max_initial_size + + if not self.cacheBetweenSents: + self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) + self.grammar.root = [None, {}] + + frontier = [] + for i in xrange(len(fwords)): + for alt in xrange(0, len(fwords[i])): + if fwords[i][alt][0] != cn.epsilon: + frontier.append((i, i, alt, 0, self.rules.root, (), False)) + + xroot = None + x1 = sym.setindex(self.category, 1) + if x1 in self.rules.root.children: + xroot = self.rules.root.children[x1] + else: + xroot = ExtendedTrieNode(suffix_link=self.rules.root, phrase_location=PhraseLocation()) + self.rules.root.children[x1] = xroot + + for i in xrange(self.min_gap_size, len(fwords)): + for alt in xrange(0, len(fwords[i])): + if fwords[i][alt][0] != 
cn.epsilon: + frontier.append((i-self.min_gap_size, i, alt, self.min_gap_size, xroot, (x1,), True)) + '''for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: + if len(prefix)>0: + print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path + else: + print k, i, alt, pathlen, node, prefix, is_shadow_path''' + + #for wid in xrange(1000): + # print "%i = %s" % (wid, sym.tostring(wid)) + next_states = [] + for i in xrange(len(fwords)): + next_states.append(self.get_next_states(fwords,i,self.min_gap_size)) + #print "next state of %i" % i + #print next_states[i] + + while len(frontier) > 0: + #print "frontier = %i" % len(frontier) + this_iter_intersect_time = self.intersect_time + new_frontier = [] + for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: + #print "looking at: " + #if len(prefix)>0: + # print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path + #else: + # print k, i, alt, pathlen, node, prefix, is_shadow_path + word_id = fwords[i][alt][0] + spanlen = fwords[i][alt][2] + #print "word_id = %i, %s" % (word_id, sym.tostring(word_id)) + # to prevent .. [X] + #print "prefix = ",prefix + #if word_id == 2 and len(prefix)>=2: + #print "at the end: %s" % (prefix[len(prefix)-1]) + #if prefix[len(prefix)-1]<0: + #print "break" + #continue + #print "continuing" + #if pathlen + spanlen > self.max_initial_size: + #continue + # TODO get rid of k -- pathlen is replacing it + if word_id == cn.epsilon: + #print "skipping because word_id is epsilon" + if i+spanlen >= len(fwords): + continue + for nualt in xrange(0,len(fwords[i+spanlen])): + frontier.append((k, i+spanlen, nualt, pathlen, node, prefix, is_shadow_path)) + continue + + phrase = prefix + (word_id,) + str_phrase = map(sym.tostring, phrase) + hiero_phrase = rule.Phrase(phrase) + arity = hiero_phrase.arity() + + #print "pos %2d, node %5d, '%s'" % (i, node.id, hiero_phrase) + if self.search_file is not None: + self.search_file.write("%s\n" % hiero_phrase) + + lookup_required = False + if word_id in node.children: + if node.children[word_id] is None: + #print "Path dead-ends at this node\n" + continue + else: + #print "Path continues at this node\n" + node = node.children[word_id] + else: + if node.suffix_link is None: + #print "Current node is root; lookup required\n" + lookup_required = True + else: + if word_id in node.suffix_link.children: + if node.suffix_link.children[word_id] is None: + #print "Suffix link reports path is dead end\n" + node.children[word_id] = None + continue + else: + #print "Suffix link indicates lookup is reqired\n" + lookup_required = True + else: + #print "ERROR: We never get here\n" + raise Exception("Keyword trie error") + #new_frontier.append((k, i, alt, pathlen, node, prefix, is_shadow_path)) + #print "checking whether lookup_required\n" + if lookup_required: + new_node = None + if is_shadow_path: + #print "Extending shadow path for %s \n" + # on the shadow path we don't do any search, we just use info from suffix link + new_node = ExtendedTrieNode(phrase_location=node.suffix_link.children[word_id].phrase_location, + suffix_link=node.suffix_link.children[word_id], + phrase=hiero_phrase) + else: + if arity > 0: + #print "Intersecting for %s because of arity > 0\n" % hiero_phrase + phrase_location = self.intersect(node, node.suffix_link.children[word_id], hiero_phrase) + else: + #print "Suffix array search for %s" % hiero_phrase + phrase_location = node.phrase_location + sa_range = self.context_manager.fsarray.lookup(str_phrase[-1], len(str_phrase)-1, 
phrase_location.sa_low, phrase_location.sa_high) + if sa_range is not None: + phrase_location = PhraseLocation(sa_low=sa_range[0], sa_high=sa_range[1]) + else: + phrase_location = None + + if phrase_location is None: + node.children[word_id] = None + #print "Search failed\n" + continue + #print "Search succeeded\n" + suffix_link = self.rules.root + if node.suffix_link is not None: + suffix_link = node.suffix_link.children[word_id] + new_node = ExtendedTrieNode(phrase_location=phrase_location, + suffix_link=suffix_link, + phrase=hiero_phrase) + node.children[word_id] = new_node + node = new_node + #print "Added node %d with suffix link %d\n" % (node.id, node.suffix_link.id) + + '''Automatically add a trailing X node, if allowed -- + This should happen before we get to extraction (so that + the node will exist if needed)''' + if arity < self.max_nonterminals: + xcat_index = arity+1 + xcat = sym.setindex(self.category, xcat_index) + suffix_link_xcat_index = xcat_index + if is_shadow_path: + suffix_link_xcat_index = xcat_index-1 + suffix_link_xcat = sym.setindex(self.category, suffix_link_xcat_index) + node.children[xcat] = ExtendedTrieNode(phrase_location=node.phrase_location, + suffix_link=node.suffix_link.children[suffix_link_xcat], + phrase= rule.Phrase(phrase + (xcat,))) + #log.writeln("Added node %d with suffix link %d (for X)" % (node.children[xcat].id, node.children[xcat].suffix_link.id), 4) + + # sample from range + if not is_shadow_path: + #print "is_not_shadow_path" + sample = self.context_manager.sampler.sample(node.phrase_location) + #print "node.phrase_location %s" % str(node.phrase_location) + #print "sample.len = %i" % len(sample) + num_subpatterns = ( node.phrase_location).num_subpatterns + chunklen = cintlist.CIntList(initial_len=num_subpatterns) + for j from 0 <= j < num_subpatterns: + chunklen.arr[j] = hiero_phrase.chunklen(j) + extracts = [] + j = 0 + extract_start = monitor.cpu() + '''orig_tight_phrases = self.tight_phrases + orig_require_aligned_terminal = self.require_aligned_terminal + orig_require_aligned_chunks = self.require_aligned_chunks + if k==0 or i==len(fwords)-1: + self.tight_phrases = 0 + self.require_aligned_terminal = 0 + self.require_aligned_chunks = 0''' + while j < sample.len: + extract = [] + + assign_matching(&matching, sample.arr, j, num_subpatterns, self.fda.sent_id.arr) + '''print "tight_phrase = " + print self.tight_phrases + print "require_aligned_terminal = " + print self.require_aligned_terminal + print "require_aligned_chunks = " + print self.require_aligned_chunks''' + + extract = self.extract(hiero_phrase, &matching, chunklen.arr, num_subpatterns) + extracts.extend(extract) + j = j + num_subpatterns + '''self.tight_phrases = orig_tight_phrases + sttice+sa.nw.normelf.require_aligned_terminal = orig_require_aligned_terminal + self.require_aligned_chunks = orig_require_aligned_chunks''' + num_samples = sample.len/num_subpatterns + extract_stop = monitor.cpu() + self.extract_time = self.extract_time + extract_stop - extract_start + #print "extract.size = %i" % len(extracts) + if len(extracts) > 0: + fphrases = {} + fals = {} + fcount = {} + for f, e, count, als in extracts: + fcount.setdefault(f, 0.0) + fcount[f] = fcount[f] + count + fphrases.setdefault(f, {}) + fphrases[f].setdefault(e, {}) + #fphrases[f][e] = fphrases[f][e] + count + fphrases[f][e].setdefault(als,0.0) + fphrases[f][e][als] = fphrases[f][e][als] + count + #print "f,e,als ",f," : ",e," : ",als," count = ",fphrases[f][e][als] + #fals[str(f)+" ||| "+str(e)] = als + for f, elist 
in fphrases.iteritems(): + #print "f = '%s'" % f + #if (str(f) in ['','',' [X,1]','[X,1] ']): + # print "rejected" + # continue + f_margin = fcount[f] + for e, alslist in elist.iteritems(): + alignment = None + count = 0 + for als, currcount in alslist.iteritems(): + #print "als = ",als,", count = ",currcount + if currcount > count: + alignment = als + count = currcount + #alignment = fals[str(f)+" ||| "+str(e)] + #print "selected = ",alignment," with count = ",count + scores = [] + for m in self.context_manager.models: + scores.append(m.compute_contextless_score(f, e, count, fcount[f], num_samples)) + r = rule.Rule(self.category, f, e, scores=scores, owner="context", word_alignments = alignment) + self.grammar.add(r) + if self.rule_filehandler is not None: + self.rule_filehandler.write("%s\n" % r.to_line()) + #print "adding a rule = %s" % r + + #if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+spanlen < self.max_initial_size: + if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+1 <= self.max_initial_size: + #to prevent [X] + #print "lexicalized" + for alt_id in xrange(len(fwords[i+spanlen])): + #if (fwords[i+spanlen][alt_id][2]+pathlen+spanlen <= self.max_initial_size): + #new_frontier.append((k, i+spanlen, alt_id, pathlen + spanlen, node, phrase, is_shadow_path)) + #print "alt_id = %d\n" % alt_id + new_frontier.append((k, i+spanlen, alt_id, pathlen + 1, node, phrase, is_shadow_path)) + #print (k, i+spanlen, alt_id, pathlen + spanlen, node, map(sym.tostring,phrase), is_shadow_path) + #print "end lexicalized" + num_subpatterns = arity + if not is_shadow_path: + num_subpatterns = num_subpatterns + 1 + #to avoid X ... we want next to a lexicalized item + #if k>0 and i X_1 w X_2 / X_1 X_2. This is probably + # not worth the bother, though. 
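+ # (After the early exits here, find_fixpoint settles into a fixed-point
+ # iteration between the two sides of the alignment: project the current
+ # e span onto the f side, widen the f span to respect the size and
+ # tightness constraints, project it back onto the e side, and repeat
+ # until neither span changes (return 1) or some constraint is violated
+ # (return 0).  A simplified comment-only sketch; the real loop below also
+ # tracks how many new gaps the widening introduces and which sides are
+ # allowed to grow:
+ #
+ #   while True:
+ #       f_back = project(e_span)                      # e -> f
+ #       if f_back unchanged:                return 1
+ #       if f_back breaks length/gap limits: return 0
+ #       e_span = project(f_back)                      # f -> e
+ #       if e_span unchanged:                return 1
+ #       if e_span breaks length limits:     return 0)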
+ #print "find_fixpoint0" + return 0 + elif e_in_low != -1 and e_low[0] != e_in_low: + if e_in_low - e_low[0] < min_ex_size: + e_low[0] = e_in_low - min_ex_size + if e_low[0] < 0: + #print "find_fixpoint1" + return 0 + + if e_high[0] - e_low[0] > max_e_len: + #print "find_fixpoint2" + return 0 + elif e_in_high != -1 and e_high[0] != e_in_high: + if e_high[0] - e_in_high < min_ex_size: + e_high[0] = e_in_high + min_ex_size + if e_high[0] > e_sent_len: + #print "find_fixpoint3" + return 0 + + f_back_low[0] = -1 + f_back_high[0] = -1 + f_low_prev = f_low + f_high_prev = f_high + new_x = 0 + new_low_x = 0 + new_high_x = 0 + + while True: + + if f_back_low[0] == -1: + self.find_projection(e_low[0], e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) + else: + self.find_projection(e_low[0], e_low_prev, e_links_low, e_links_high, f_back_low, f_back_high) + self.find_projection(e_high_prev, e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) + + if f_back_low[0] > f_low: + f_back_low[0] = f_low + + if f_back_high[0] < f_high: + f_back_high[0] = f_high + + if f_back_low[0] == f_low_prev and f_back_high[0] == f_high_prev: + return 1 + + if allow_low_x == 0 and f_back_low[0] < f_low: +# log.writeln(" FAIL: f phrase is not tight") + #print " FAIL: f phrase is not tight" + return 0 + + if f_back_high[0] - f_back_low[0] > max_f_len: +# log.writeln(" FAIL: f back projection is too wide") + #print " FAIL: f back projection is too wide" + return 0 + + if allow_high_x == 0 and f_back_high[0] > f_high: +# log.writeln(" FAIL: extension on high side not allowed") + #print " FAIL: extension on high side not allowed" + return 0 + + if f_low != f_back_low[0]: + if new_low_x == 0: + if new_x >= max_new_x: +# log.writeln(" FAIL: extension required on low side violates max # of gaps") + #print " FAIL: extension required on low side violates max # of gaps" + return 0 + else: + new_x = new_x + 1 + new_low_x = 1 + if f_low - f_back_low[0] < min_fx_size: + f_back_low[0] = f_low - min_fx_size + if f_back_high[0] - f_back_low[0] > max_f_len: +# log.writeln(" FAIL: extension required on low side violates max initial length") + #print " FAIL: extension required on low side violates max initial length" + return 0 + if f_back_low[0] < 0: +# log.writeln(" FAIL: extension required on low side violates sentence boundary") + #print " FAIL: extension required on low side violates sentence boundary" + return 0 + + if f_high != f_back_high[0]: + if new_high_x == 0: + if new_x >= max_new_x: +# log.writeln(" FAIL: extension required on high side violates max # of gaps") + #print " FAIL: extension required on high side violates max # of gaps" + return 0 + else: + new_x = new_x + 1 + new_high_x = 1 + if f_back_high[0] - f_high < min_fx_size: + f_back_high[0] = f_high + min_fx_size + if f_back_high[0] - f_back_low[0] > max_f_len: +# log.writeln(" FAIL: extension required on high side violates max initial length") + #print " FAIL: extension required on high side violates max initial length" + return 0 + if f_back_high[0] > f_sent_len: +# log.writeln(" FAIL: extension required on high side violates sentence boundary") + #print " FAIL: extension required on high side violates sentence boundary" + return 0 + + e_low_prev = e_low[0] + e_high_prev = e_high[0] + + self.find_projection(f_back_low[0], f_low_prev, f_links_low, f_links_high, e_low, e_high) + self.find_projection(f_high_prev, f_back_high[0], f_links_low, f_links_high, e_low, e_high) + if e_low[0] == e_low_prev and e_high[0] == e_high_prev: + return 1 + if 
allow_arbitrary_x == 0: +# log.writeln(" FAIL: arbitrary expansion not permitted") + #print " FAIL: arbitrary expansion not permitted" + return 0 + if e_high[0] - e_low[0] > max_e_len: +# log.writeln(" FAIL: re-projection violates sentence max phrase length") + #print " FAIL: re-projection violates sentence max phrase length" + return 0 + f_low_prev = f_back_low[0] + f_high_prev = f_back_high[0] + + + cdef find_projection(self, int in_low, int in_high, int* in_links_low, int* in_links_high, + int* out_low, int* out_high): + cdef int i + for i from in_low <= i < in_high: + if in_links_low[i] != -1: + if out_low[0] == -1 or in_links_low[i] < out_low[0]: + out_low[0] = in_links_low[i] + if out_high[0] == -1 or in_links_high[i] > out_high[0]: + out_high[0] = in_links_high[i] + + + cdef int* int_arr_extend(self, int* arr, int* arr_len, int* data, int data_len): + cdef int new_len + new_len = arr_len[0] + data_len + arr = realloc(arr, new_len*sizeof(int)) + memcpy(arr+arr_len[0], data, data_len*sizeof(int)) + arr_len[0] = new_len + return arr + + + cdef extract_phrases(self, int e_low, int e_high, int* e_gap_low, int* e_gap_high, int* e_links_low, int num_gaps, + int f_low, int f_high, int* f_gap_low, int* f_gap_high, int* f_links_low, + int sent_id, int e_sent_len, int e_sent_start): + cdef int i, j, k, m, n, *e_gap_order, e_x_low, e_x_high, e_x_gap_low, e_x_gap_high + cdef int *e_gaps1, *e_gaps2, len1, len2, step, num_chunks + cdef cintlist.CIntList ephr_arr + cdef result + + #print "inside extract_phrases" + #print "f_low=%d, f_high=%d" % (f_low,f_high) + result = [] + len1 = 0 + e_gaps1 = malloc(0) + ephr_arr = cintlist.CIntList() + + e_gap_order = malloc(num_gaps*sizeof(int)) + if num_gaps > 0: + e_gap_order[0] = 0 + for i from 1 <= i < num_gaps: + for j from 0 <= j < i: + if e_gap_low[i] < e_gap_low[j]: + for k from j <= k < i: + e_gap_order[k+1] = e_gap_order[k] + e_gap_order[j] = i + break + else: + e_gap_order[i] = i + + e_x_low = e_low + e_x_high = e_high + if self.tight_phrases == 0: + while e_x_low > 0 and e_high - e_x_low < self.train_max_initial_size and e_links_low[e_x_low-1] == -1: + e_x_low = e_x_low - 1 + while e_x_high < e_sent_len and e_x_high - e_low < self.train_max_initial_size and e_links_low[e_x_high] == -1: + e_x_high = e_x_high + 1 + + for i from e_x_low <= i <= e_low: + e_gaps1 = self.int_arr_extend(e_gaps1, &len1, &i, 1) + + for i from 0 <= i < num_gaps: + e_gaps2 = malloc(0) + len2 = 0 + + j = e_gap_order[i] + e_x_gap_low = e_gap_low[j] + e_x_gap_high = e_gap_high[j] + if self.tight_phrases == 0: + while e_x_gap_low > e_x_low and e_links_low[e_x_gap_low-1] == -1: + e_x_gap_low = e_x_gap_low - 1 + while e_x_gap_high < e_x_high and e_links_low[e_x_gap_high] == -1: + e_x_gap_high = e_x_gap_high + 1 + + k = 0 + step = 1+(i*2) + while k < len1: + for m from e_x_gap_low <= m <= e_gap_low[j]: + if m >= e_gaps1[k+step-1]: + for n from e_gap_high[j] <= n <= e_x_gap_high: + if n-m >= 1: # extractor.py doesn't restrict target-side gap length + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+k, step) + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &m, 1) + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &n, 1) + k = k + step + free(e_gaps1) + e_gaps1 = e_gaps2 + len1 = len2 + + step = 1+(num_gaps*2) + e_gaps2 = malloc(0) + len2 = 0 + for i from e_high <= i <= e_x_high: + j = 0 + while j < len1: + if i - e_gaps1[j] <= self.train_max_initial_size and i >= e_gaps1[j+step-1]: + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+j, step) + e_gaps2 = 
self.int_arr_extend(e_gaps2, &len2, &i, 1) + j = j + step + free(e_gaps1) + e_gaps1 = e_gaps2 + len1 = len2 + + step = (num_gaps+1)*2 + i = 0 + + while i < len1: + ephr_arr._clear() + num_chunks = 0 + indexes = [] + for j from 0 <= j < num_gaps+1: + if e_gaps1[i+2*j] < e_gaps1[i+(2*j)+1]: + num_chunks = num_chunks + 1 + for k from e_gaps1[i+2*j] <= k < e_gaps1[i+(2*j)+1]: + indexes.append(k) + ephr_arr._append(self.eid2symid[self.eda.data.arr[e_sent_start+k]]) + if j < num_gaps: + indexes.append(sym.setindex(self.category, e_gap_order[j]+1)) + ephr_arr._append(sym.setindex(self.category, e_gap_order[j]+1)) + i = i + step + if ephr_arr.len <= self.max_target_length and num_chunks <= self.max_target_chunks: + result.append((rule.Phrase(ephr_arr),indexes)) + + free(e_gaps1) + free(e_gap_order) + return result + + cdef create_alignments(self, int* sent_links, int num_links, findexes, eindexes): + #print "create_alignments" + #s = "sent_links = " + #i = 0 + #while (i < num_links*2): + # s = s+"%d-%d " % (sent_links[i],sent_links[i+1]) + # i += 2 + #print s + #print findexes + #print eindexes + + ret = cintlist.CIntList() + for i in xrange(len(findexes)): + s = findexes[i] + if (s<0): + continue + idx = 0 + while (idx < num_links*2): + if (sent_links[idx] == s): + j = eindexes.index(sent_links[idx+1]) + ret.append(i*65536+j) + idx += 2 + return ret + + cdef extract(self, rule.Phrase phrase, Matching* matching, int* chunklen, int num_chunks): + cdef int* sent_links, *e_links_low, *e_links_high, *f_links_low, *f_links_high + cdef int *f_gap_low, *f_gap_high, *e_gap_low, *e_gap_high, num_gaps, gap_start + cdef int i, j, k, e_i, f_i, num_links, num_aligned_chunks, met_constraints + cdef int f_low, f_high, e_low, e_high, f_back_low, f_back_high + cdef int e_sent_start, e_sent_end, f_sent_start, f_sent_end, e_sent_len, f_sent_len + cdef int e_word_count, f_x_low, f_x_high, e_x_low, e_x_high, phrase_len + cdef float pair_count + cdef float available_mass + cdef extracts, phrase_list + cdef cintlist.CIntList fphr_arr + cdef rule.Phrase fphr + cdef reason_for_failure + + fphr_arr = cintlist.CIntList() + phrase_len = phrase.n + extracts = [] + sent_links = self.alignment._get_sent_links(matching.sent_id, &num_links) + + e_sent_start = self.eda.sent_index.arr[matching.sent_id] + e_sent_end = self.eda.sent_index.arr[matching.sent_id+1] + e_sent_len = e_sent_end - e_sent_start - 1 + f_sent_start = self.fda.sent_index.arr[matching.sent_id] + f_sent_end = self.fda.sent_index.arr[matching.sent_id+1] + f_sent_len = f_sent_end - f_sent_start - 1 + available_mass = 1.0 + if matching.sent_id == self.excluded_sent_id: + available_mass = 0.0 + + self.findexes1.reset() + sofar = 0 + for i in xrange(num_chunks): + for j in xrange(chunklen[i]): + self.findexes1.append(matching.arr[matching.start+i]+j-f_sent_start); + sofar += 1 + if (i+1 malloc(e_sent_len*sizeof(int)) + e_links_high = malloc(e_sent_len*sizeof(int)) + f_links_low = malloc(f_sent_len*sizeof(int)) + f_links_high = malloc(f_sent_len*sizeof(int)) + f_gap_low = malloc((num_chunks+1)*sizeof(int)) + f_gap_high = malloc((num_chunks+1)*sizeof(int)) + e_gap_low = malloc((num_chunks+1)*sizeof(int)) + e_gap_high = malloc((num_chunks+1)*sizeof(int)) + memset(f_gap_low, 0, (num_chunks+1)*sizeof(int)) + memset(f_gap_high, 0, (num_chunks+1)*sizeof(int)) + memset(e_gap_low, 0, (num_chunks+1)*sizeof(int)) + memset(e_gap_high, 0, (num_chunks+1)*sizeof(int)) + + reason_for_failure = "" + + for i from 0 <= i < e_sent_len: + e_links_low[i] = -1 + e_links_high[i] = -1 + for i 
from 0 <= i < f_sent_len: + f_links_low[i] = -1 + f_links_high[i] = -1 + + # this is really inefficient -- might be good to + # somehow replace with binary search just for the f + # links that we care about (but then how to look up + # when we want to check something on the e side?) + i = 0 + while i < num_links*2: + f_i = sent_links[i] + e_i = sent_links[i+1] + if f_links_low[f_i] == -1 or f_links_low[f_i] > e_i: + f_links_low[f_i] = e_i + if f_links_high[f_i] == -1 or f_links_high[f_i] < e_i + 1: + f_links_high[f_i] = e_i + 1 + if e_links_low[e_i] == -1 or e_links_low[e_i] > f_i: + e_links_low[e_i] = f_i + if e_links_high[e_i] == -1 or e_links_high[e_i] < f_i + 1: + e_links_high[e_i] = f_i + 1 + i = i + 2 + + als = [] + for x in xrange(matching.start,matching.end): + al = (matching.arr[x]-f_sent_start,f_links_low[matching.arr[x]-f_sent_start]) + als.append(al) + # check all source-side alignment constraints + met_constraints = 1 + if self.require_aligned_terminal: + num_aligned_chunks = 0 + for i from 0 <= i < num_chunks: + for j from 0 <= j < chunklen[i]: + if f_links_low[matching.arr[matching.start+i]+j-f_sent_start] > -1: + num_aligned_chunks = num_aligned_chunks + 1 + break + if num_aligned_chunks == 0: + reason_for_failure = "No aligned terminals" + met_constraints = 0 + if self.require_aligned_chunks and num_aligned_chunks < num_chunks: + reason_for_failure = "Unaligned chunk" + met_constraints = 0 + + if met_constraints and self.tight_phrases: + # outside edge constraints are checked later + for i from 0 <= i < num_chunks-1: + if f_links_low[matching.arr[matching.start+i]+chunklen[i]-f_sent_start] == -1: + reason_for_failure = "Gaps are not tight phrases" + met_constraints = 0 + break + if f_links_low[matching.arr[matching.start+i+1]-1-f_sent_start] == -1: + reason_for_failure = "Gaps are not tight phrases" + met_constraints = 0 + break + + f_low = matching.arr[matching.start] - f_sent_start + f_high = matching.arr[matching.start + matching.size - 1] + chunklen[num_chunks-1] - f_sent_start + if met_constraints: + + if self.find_fixpoint(f_low, f_high, f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, &e_low, &e_high, &f_back_low, &f_back_high, f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + self.train_min_gap_size, 0, + self.max_nonterminals - num_chunks + 1, 1, 1, 0, 0): + gap_error = 0 + num_gaps = 0 + + if f_back_low < f_low: + f_gap_low[0] = f_back_low + f_gap_high[0] = f_low + num_gaps = 1 + gap_start = 0 + phrase_len = phrase_len+1 + if phrase_len > self.max_length: + gap_error = 1 + if self.tight_phrases: + if f_links_low[f_back_low] == -1 or f_links_low[f_low-1] == -1: + gap_error = 1 + reason_for_failure = "Inside edges of preceding subphrase are not tight" + else: + gap_start = 1 + if self.tight_phrases and f_links_low[f_low] == -1: + # this is not a hard error. 
we can't extract this phrase + # but we still might be able to extract a superphrase + met_constraints = 0 + + for i from 0 <= i < matching.size - 1: + f_gap_low[1+i] = matching.arr[matching.start+i] + chunklen[i] - f_sent_start + f_gap_high[1+i] = matching.arr[matching.start+i+1] - f_sent_start + num_gaps = num_gaps + 1 + + if f_high < f_back_high: + f_gap_low[gap_start+num_gaps] = f_high + f_gap_high[gap_start+num_gaps] = f_back_high + num_gaps = num_gaps + 1 + phrase_len = phrase_len+1 + if phrase_len > self.max_length: + gap_error = 1 + if self.tight_phrases: + if f_links_low[f_back_high-1] == -1 or f_links_low[f_high] == -1: + gap_error = 1 + reason_for_failure = "Inside edges of following subphrase are not tight" + else: + if self.tight_phrases and f_links_low[f_high-1] == -1: + met_constraints = 0 + + if gap_error == 0: + e_word_count = e_high - e_low + for i from 0 <= i < num_gaps: # check integrity of subphrases + if self.find_fixpoint(f_gap_low[gap_start+i], f_gap_high[gap_start+i], + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low+gap_start+i, e_gap_high+gap_start+i, + f_gap_low+gap_start+i, f_gap_high+gap_start+i, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0) == 0: + gap_error = 1 + reason_for_failure = "Subphrase [%d, %d] failed integrity check" % (f_gap_low[gap_start+i], f_gap_high[gap_start+i]) + break + + if gap_error == 0: + i = 1 + self.findexes.reset() + if f_back_low < f_low: + fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.append(sym.setindex(self.category, i)) + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + fphr_arr._append(phrase.syms[j]) + if f_back_high > f_high: + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + + fphr = rule.Phrase(fphr_arr) + if met_constraints: + phrase_list = self.extract_phrases(e_low, e_high, e_gap_low + gap_start, e_gap_high + gap_start, e_links_low, num_gaps, + f_back_low, f_back_high, f_gap_low + gap_start, f_gap_high + gap_start, f_links_low, + matching.sent_id, e_sent_len, e_sent_start) + #print "e_low=%d, e_high=%d, gap_start=%d, num_gaps=%d, f_back_low=%d, f_back_high=%d" & (e_low, e_high, gap_start, num_gaps, f_back_low, f_back_high) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + reason_for_failure = "Didn't extract anything from [%d, %d] -> [%d, %d]" % (f_back_low, f_back_high, e_low, e_high) + for (phrase2,eindexes) in phrase_list: + als1 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als1))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high)) + #print "extract_phrases1: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high) + + if (num_gaps < self.max_nonterminals and + phrase_len < self.max_length and + f_back_high - f_back_low + self.train_min_gap_size <= self.train_max_initial_size): + if (f_back_low == f_low and + f_low >= self.train_min_gap_size and + ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_back_high-1] != -1))): + f_x_low = 
f_low-self.train_min_gap_size + met_constraints = 1 + if self.tight_phrases: + while f_x_low >= 0 and f_links_low[f_x_low] == -1: + f_x_low = f_x_low - 1 + if f_x_low < 0 or f_back_high - f_x_low > self.train_max_initial_size: + met_constraints = 0 + + if (met_constraints and + self.find_fixpoint(f_x_low, f_back_high, + f_links_low, f_links_high, e_links_low, e_links_high, + e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 1, 1, 1, 1, 0, 1, 0) and + ((not self.tight_phrases) or f_links_low[f_x_low] != -1) and + self.find_fixpoint(f_x_low, f_low, # check integrity of new subphrase + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0)): + fphr_arr._clear() + i = 1 + self.findexes.reset() + self.findexes.append(sym.setindex(self.category, i)) + fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + fphr_arr._append(phrase.syms[j]) + if f_back_high > f_high: + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + fphr = rule.Phrase(fphr_arr) + phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+1, + f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, matching.sent_id, + e_sent_len, e_sent_start) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + for phrase2,eindexes in phrase_list: + als2 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als2))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) + #print "extract_phrases2: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) + + if (f_back_high == f_high and + f_sent_len - f_high >= self.train_min_gap_size and + ((not self.tight_phrases) or (f_links_low[f_high] != -1 and f_links_low[f_back_low] != -1))): + f_x_high = f_high+self.train_min_gap_size + met_constraints = 1 + if self.tight_phrases: + while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: + f_x_high = f_x_high + 1 + if f_x_high > f_sent_len or f_x_high - f_back_low > self.train_max_initial_size: + met_constraints = 0 + + if (met_constraints and + self.find_fixpoint(f_back_low, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 1, 1, 1, 0, 1, 1, 0) and + ((not self.tight_phrases) or f_links_low[f_x_high-1] != -1) and + self.find_fixpoint(f_high, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low+gap_start+num_gaps, e_gap_high+gap_start+num_gaps, + f_gap_low+gap_start+num_gaps, f_gap_high+gap_start+num_gaps, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0)): + fphr_arr._clear() + i = 1 + self.findexes.reset() + if f_back_low < f_low: + 
fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.append(sym.setindex(self.category, i)) + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + fphr_arr._append(phrase.syms[j]) + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + fphr = rule.Phrase(fphr_arr) + phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low+gap_start, e_gap_high+gap_start, e_links_low, num_gaps+1, + f_x_low, f_x_high, f_gap_low+gap_start, f_gap_high+gap_start, f_links_low, + matching.sent_id, e_sent_len, e_sent_start) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + for phrase2, eindexes in phrase_list: + als3 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als3))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) + #print "extract_phrases3: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) + if (num_gaps < self.max_nonterminals - 1 and + phrase_len+1 < self.max_length and + f_back_high == f_high and + f_back_low == f_low and + f_back_high - f_back_low + (2*self.train_min_gap_size) <= self.train_max_initial_size and + f_low >= self.train_min_gap_size and + f_high <= f_sent_len - self.train_min_gap_size and + ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_high] != -1))): + + met_constraints = 1 + f_x_low = f_low-self.train_min_gap_size + if self.tight_phrases: + while f_x_low >= 0 and f_links_low[f_x_low] == -1: + f_x_low = f_x_low - 1 + if f_x_low < 0: + met_constraints = 0 + + f_x_high = f_high+self.train_min_gap_size + if self.tight_phrases: + while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: + f_x_high = f_x_high + 1 + if f_x_high > f_sent_len or f_x_high - f_x_low > self.train_max_initial_size: + met_constraints = 0 + + if (met_constraints and + self.find_fixpoint(f_x_low, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 1, 1, 2, 1, 1, 1, 1) and + ((not self.tight_phrases) or (f_links_low[f_x_low] != -1 and f_links_low[f_x_high-1] != -1)) and + self.find_fixpoint(f_x_low, f_low, + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0) and + self.find_fixpoint(f_high, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low+1+num_gaps, e_gap_high+1+num_gaps, + f_gap_low+1+num_gaps, f_gap_high+1+num_gaps, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0)): + fphr_arr._clear() + i = 1 + self.findexes.reset() + self.findexes.append(sym.setindex(self.category, i)) + fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + 
fphr_arr._append(phrase.syms[j]) + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + fphr = rule.Phrase(fphr_arr) + phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+2, + f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, + matching.sent_id, e_sent_len, e_sent_start) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + for phrase2, eindexes in phrase_list: + als4 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als4))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) + #print "extract_phrases4 %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) + else: + reason_for_failure = "Unable to extract basic phrase" + + free(sent_links) + free(f_links_low) + free(f_links_high) + free(e_links_low) + free(e_links_high) + free(f_gap_low) + free(f_gap_high) + free(e_gap_low) + free(e_gap_high) + + if self.sample_file is not None: + self.sample_file.write("%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure)) + + #print "%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure) + + + return extracts + diff --git a/sa-extract/sa-compile.pl b/sa-extract/sa-compile.pl new file mode 100755 index 00000000..1cae83a7 --- /dev/null +++ b/sa-extract/sa-compile.pl @@ -0,0 +1,322 @@ +#!/usr/bin/env perl + +use strict; +use Getopt::Long; + +my $cwd; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $cwd = cwd(); } + +my $rootdir = `dirname $0`; chomp $rootdir; +my $compile = "$rootdir/compile_bin.py"; +my $lcp = "$rootdir/lcp_ops.py"; +die "Can't find $compile" unless -f $compile; +die "Can't execute $compile" unless -x $compile; + +sub print_help; +sub cleanup; + +my $alignment; +my $bitext; +my $catalog; +my $dryrun = 0; +my $group; +my $help = 0; +my $ini = "$rootdir/extract.ini"; +my $lm; +my $precomp; +my $no_ini = 0; +my $remove; +my $type; +my $local_only = 1; +my $output; + +# Process command-line options +if (GetOptions( + "alignment=s" => \$alignment, + "bitext=s" => \$bitext, + "help" => \$help, + "ini=s" => \$ini, + "output=s" => \$output, + "precomp-options=s" => \$precomp, + "no-ini" => \$no_ini, +) == 0 || $help == 1 || @ARGV > 0){ + print_help; + die "\n"; +} + +open(INI, $ini) or die "Can't read $ini: $!"; + +$bitext || die "You must specify a bitext with -b\n"; +$alignment || die "You must specify an alignment with -a\n"; + +my $top_dir; +if (defined $output) { + $top_dir = $output; +} else { + $top_dir = "$cwd/sa-compiled"; +} + +my $type_dir = "$top_dir"; + +my $bitext_name; +my $bitext_f_file; +my $bitext_e_file; +my $bitext_dir; +if ($bitext){ + if ($bitext =~ /(.*)=(.*),(.*)/){ + $bitext_name = $1; + $bitext_f_file = $2; + $bitext_e_file = $3; + -e $bitext_f_file || die "Could not find file $bitext_f_file\n"; + -e $bitext_e_file || die "Could not find file $bitext_e_file\n"; + } else { + $bitext_name = $bitext; + } + + $bitext_dir = "$type_dir/bitext/$bitext_name"; + if ($bitext_f_file){ + if (-e $bitext_dir) { + die "Bitext $bitext_name already exists\n"; + } + } else { + unless (-e $bitext_dir){ + die "No bitext 
$bitext_name. You must specify bitext files with -b\n"; + } + } +} + +my $max_nt = 2; +my $max_len = 5; +my $max_size = 15; +my $min_gap = 1; +my $rank1 = 100; +my $rank2 = 10; +my $precomp_file; +if ($precomp){ + unless ($bitext_name){ + die "You must specify a bitext with -b if using -p\n"; + } + my @precomp_args = split(/,/, $precomp); + my $precomp_arg; + for $precomp_arg (@precomp_args){ + if ($precomp_arg =~ /(.*)=(.*)/){ + my $key = $1; + my $value = $2; + unless ($value =~ /^\d+$/){ + die "Value for -p option must be a positive integer, found $value\n"; + } + if ($key eq "max-len"){ $max_len = $value; } + elsif ($key eq "max-nt"){ $max_nt = $value; } + elsif ($key eq "max-size"){ $max_size = $value; } + elsif ($key eq "min-gap"){ $min_gap = $value; } + elsif ($key eq "rank1"){ $rank1 = $value; } + elsif ($key eq "rank2"){ $rank2 = $value; } + else{ + die "Unknown option $key given for -p\n"; + } + } else { + die "When using -p, you must specify key-value pairs using syntax: =,...,=\n"; + } + } +} +my $precomp_compile_needed = 0; +if ($bitext_name){ + $precomp_file = "$bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin"; + unless (-e $precomp_file){ + $precomp_compile_needed = 1; + } +} + +my $alignment_name; +my $alignment_file; +my $alignment_dir; +if ($alignment){ + $bitext || die "Specified alignment $alignment without specifying bitext using -b\n"; + if ($alignment =~ /(.*)=(.*)/){ + $alignment_name = $1; + $alignment_file = $2; + -e $alignment_file || die "Could not find file $alignment_file\n"; + } else { + $alignment_name = $alignment; + } + + $alignment_dir = "$bitext_dir/a/$alignment_name"; + if ($alignment_file){ + if (-e $alignment_dir){ + die "Alignment $alignment_name already exists for bitext $bitext_name\n"; + } + } else { + require_top_dirs(); + unless (-e $alignment_dir){ + die "No alignment $alignment_name for bitext $bitext_name\n"; + } + } +} + +if ($bitext_name){ + print STDERR " from files $bitext_f_file and $bitext_e_file\n"; +} else { + print " No bitext\n"; +} +if ($precomp_compile_needed){ + print STDERR " Precompilation needed: max-len=$max_len, max-nt=$max_nt, max-size=$max_size, min-gap=$min_gap, rank1=$rank1, rank2=$rank2\n"; +} +if ($alignment_name){ + print STDERR " Alignment = $alignment_name"; + if ($alignment_file){ + print STDERR " from file $alignment_file\n"; + } +} else { + print STDERR " No alignment\n"; +} + +my $script; +my $compile_dir; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + + if ($bitext_e_file || $precomp_compile_needed || $alignment_file){ + my $compiled_e_file; + my $compiled_f_file; + + $compile_dir = $top_dir; + my $compile_top_dir = "$compile_dir"; + + my $compile_bitext_dir = "$compile_top_dir/bitext/$bitext_name"; + if ($bitext_e_file){ + `mkdir -p $compile_bitext_dir`; + print STDERR "\nCompiling bitext (f side)...\n"; + `$compile -s $bitext_f_file $compile_bitext_dir/f.sa.bin`; + die "Command failed: $!" unless $? == 0; + print STDERR "\nCompiling bitext (e side)...\n"; + `$compile -d $bitext_e_file $compile_bitext_dir/e.bin`; + die "Command failed: $!" unless $? 
== 0; + + $compiled_f_file = "$compile_bitext_dir/f.sa.bin"; + $compiled_e_file = "$compile_bitext_dir/e.bin"; + } else { # bitext already compiled + $compiled_f_file = "$bitext_dir/f.sa.bin"; + $compiled_e_file = "$bitext_dir/e.bin"; + } + + if ($precomp_compile_needed){ + `mkdir -p $compile_bitext_dir`; + my $top_stats_file = "$compile_bitext_dir/f.top.$rank1"; + my $compiled_precomp_file = "$compile_bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin"; + my $cmd = "$lcp -t 4 $compiled_f_file | sort -nr | head -$rank1 > $top_stats_file"; + print STDERR "$cmd\n"; + `$cmd`; + die "Command failed: $cmd" unless $? == 0; + `$compile -r max-len=$max_len max-nt=$max_nt max-size=$max_size min-gap=$min_gap rank1=$rank1 rank2=$rank2 sa=$compiled_f_file $top_stats_file $compiled_precomp_file`; + die "Command failed: $!" unless $? == 0; + } + + if ($alignment_file){ + my $compile_alignment_dir = "$compile_top_dir/bitext/$bitext_name/a/$alignment_name"; + `mkdir -p $compile_alignment_dir`; + print STDERR "\nCompiling alignment...\n"; + my $cmd= "$compile -a $alignment_file $compile_alignment_dir/a.bin"; + print STDERR " $cmd\n"; + `$cmd`; + die "Command failed: $!" unless $? == 0; + + print STDERR "\nCompiling lexical weights file...\n"; + $cmd="$compile -x $compiled_f_file $compiled_e_file $compile_alignment_dir/a.bin $compile_alignment_dir/lex.bin"; + print STDERR " $cmd\n"; + `$cmd`; + die "Command failed: $!" unless $? == 0; + } + + chdir $compile_dir; + print STDERR "Compiling done: $compile_dir\n"; + } + + unless ($no_ini){ + my $line; + while($line=){ + $line =~ s/^([^#]*a_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/a.bin$3/; + $line =~ s/^([^#]*lex_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/lex.bin$3/; + $line =~ s/^([^#]*f_sa_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/f.sa.bin$3/; + $line =~ s/^([^#]*e_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/e.bin$3/; + $line =~ s/^([^#]*precompute_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin$3/; + + $line =~ s/^([^#]*max_len\s*=\s*)(.*)(\s*)$/$1$max_len$3/; + $line =~ s/^([^#]*max_nt\s*=\s*)(.*)(\s*)$/$1$max_nt$3/; + $line =~ s/^([^#]*max_size\s*=\s*)(.*)(\s*)$/$1$max_size$3/; + $line =~ s/^([^#]*min_gap\s*=\s*)(.*)(\s*)$/$1$min_gap$3/; + $line =~ s/^([^#]*rank1\s*=\s*)(.*)(\s*)$/$1$rank1$3/; + $line =~ s/^([^#]*rank2\s*=\s*)(.*)(\s*)$/$1$rank2$3/; + + print $line; + } + } + +exit(0); + +sub cleanup { + die "Cleanup.\n"; +} + +sub print_help +{ + my $name = `basename $0`; chomp $name; + print << "Help"; + +usage: $name [options] + + Manage compilation of SA-Hiero files and creation of ini files. + In the default usage, the command deploys a set of files needed + to create a system, and writes an ini for the system on stdout. + +options: + + -a, --alignment [=] + Name of an alignment of a bitext (which must be specified + with -b unless using the -c flag). If used with -r, the + alignment is removed from the deployment. If used with -c, + only alignments with this name are listed. If a filename is + given, then the file will be deployed using the name. + + -b, --bitext [=,] + Name of a bitext for a particular system type (which must be + specified with -t unless using the -c flag). If used with -r, + the bitext is removed from the deployment. If used with -c, + only bitexts with the name are listed. If a filename is given, + then the file will be deployed using the name. + + -h, --help + Prints this message. 
+ + -i, --ini + Use a specific ini file as the template for a system, rather than + the default ini file. + + -p, --precomp-options =[,=,...,=] + Set parameters of the grammar. This must be set by $name because + many parameters involve precomputation. There are six keys that can + be set: + max-len: maximum number of symbols (T and NT) in a grammar rule + max-nt: maximum number of nonterminals in a grammar rule + max-size: maximum span of a grammar rule extracted from training + min-gap: minimum gap spanned by a nonterminal in training + rank1: number of frequent words to precompute collocations for. + rank2: number of super-frequent words to precompute triple + collocations for. + All values must be positive integers. If not specified, defaults are: + max-len = 5 + max-nt = 2 (>2 not supported) + max-size = 10 + min-gap = 2 + rank1 = 100 (>300 not recommended) + rank2 = 10 (>10 not recommended) + + -n, --no-ini + Do not generate an ini file on stdout. If this option is used, then + the requirement to specify a full system is relaxed. Therefore, this + option can be used when the sole objective is deployment of files. + + -o, --output-dir + Write the compiled model to this directory. + +Help +} diff --git a/sa-extract/setup.cfg b/sa-extract/setup.cfg new file mode 100644 index 00000000..8f696136 --- /dev/null +++ b/sa-extract/setup.cfg @@ -0,0 +1,2 @@ +[build_ext] +inplace=1 diff --git a/sa-extract/setup.py b/sa-extract/setup.py new file mode 100644 index 00000000..cdcbfb54 --- /dev/null +++ b/sa-extract/setup.py @@ -0,0 +1,45 @@ +from distutils.core import setup, Extension +from distutils.util import get_platform +import os.path + +cstrmap_module = Extension('cstrmap', sources = ['cstrmap.c', 'strmap.cc']) +setup (name = 'CStringMap', version = '1.0', description = 'C string->int map', ext_modules = [cstrmap_module]) + +rule_module = Extension('rule', + sources = ['rule.c', 'strutil.c']) +setup (name = 'Rule', version = '1.0', description = 'rule class', ext_modules = [rule_module]) + +sym_module = Extension('sym', + sources = ['sym.c']) +setup (name = 'Sym', version = '1.0', description = 'symbol class', ext_modules = [sym_module]) + +cdat_module = Extension('cdat', sources = ['cdat.c']) +setup(name = "CDat", version = '1.0', description = 'C Data class', ext_modules = [cdat_module]) + +cintlist_module = Extension('cintlist', sources = ['cintlist.c']) +setup(name = "CIntList", version = '1.0', description = 'C int array/list class', ext_modules = [cintlist_module]) + +cfloatlist_module = Extension('cfloatlist', sources = ['cfloatlist.c']) +setup(name = "CFloatList", version = '1.0', description = 'C float array/list class', ext_modules = [cfloatlist_module]) + +calignment_module = Extension('calignment', sources = ['calignment.c']) +setup(name = "CAlignment", version = '1.0', description = 'C alignment class', ext_modules = [calignment_module]) + +csuf_module = Extension('csuf', sources = ['csuf.c']) +setup(name = "CSuffixArray", version = '1.0', description = 'C suffix array class', ext_modules = [csuf_module]) + +clex_module = Extension('clex', sources = ['clex.c']) +setup(name = "CLex", version = '1.0', description = 'C lexical class', ext_modules = [clex_module]) + +factory_module = Extension('rulefactory', sources = ['rulefactory.c']) +setup(name = "RuleFactory", version = '1.0', description = 'C rule factory classes', ext_modules = [factory_module]) + +cveb_module = Extension('cveb', sources = ['cveb.c']) +setup(name = "CVEB", version = '1.0', description = 'C impl. 
of van Emde Boas tree', ext_modules = [cveb_module]) + +lcp_module = Extension('lcp', sources = ['lcp.c']) +setup(name = "LCP", version = '1.0', description = 'C impl. of LCP', ext_modules = [lcp_module]) + +precomp_module = Extension('precomputation', sources = ['precomputation.c']) +setup(name = "Precomputation", version = '1.0', description = 'Precomputation Algorithm', ext_modules = [precomp_module]) + diff --git a/sa-extract/sgml.py b/sa-extract/sgml.py new file mode 100644 index 00000000..2db8b5dc --- /dev/null +++ b/sa-extract/sgml.py @@ -0,0 +1,194 @@ +import sys, sgmllib, xml.sax.saxutils, sym + +def attrs_to_str(d): + if len(d) == 0: + return "" + l = [""]+["%s=%s" % (name, xml.sax.saxutils.quoteattr(value)) for (name, value) in d] + return " ".join(l) + +def attrs_to_dict(a): + d = {} + for (name, value) in a: + if d.has_key(name.lower()): + raise ValueError, "duplicate attribute names" + d[name.lower()] = value + return d + +class Sentence(object): + def __init__(self, words=None, meta=None): + if words is not None: + self.words = list(words) + else: + self.words = [] + if meta is not None: + self.meta = meta + else: + self.meta = [] + + def copy(self): + return Sentence(self.words, list(self.meta)) + + def mark(self, tag, attrs): + self.meta.append((tag, attrs, 0, len(self.words))) + + def getmark(self): + if len(self.meta) > 0: + (tag, attrs, i, j) = self.meta[-1] + if i == 0 and j == len(self.words): + return (tag, attrs) + else: + return None + else: + return None + + def unmark(self): + mark = self.getmark() + if mark is not None: + self.meta = self.meta[:-1] + return mark + + def __cmp__(self, other): + return cmp((self.words, self.meta), (other.words, other.meta)) + + def __str__(self): + def cmp_spans((tag1,attr1,i1,j1),(tag2,attr2,i2,j2)): + if i1==i2<=j1==j2: + return 0 + elif i2<=i1<=j1<=j2: + return -1 + elif i1<=i2<=j2<=j1: + return 1 + else: + return cmp((i1,j1),(i2,j2)) # don't care + # this guarantees that equal spans will come out nested + # we want the later spans to be outer + # this relies on stable sort + open = [[] for i in xrange(len(self.words)+1)] + # there seems to be a bug still with empty spans + empty = [[] for i in xrange(len(self.words)+1)] + close = [[] for j in xrange(len(self.words)+1)] + for (tag,attrs,i,j) in sorted(self.meta, cmp=cmp_spans): + if i == j: + # do we want these to nest? 
+ empty[i].append("<%s%s></%s>\n" % (tag, attrs_to_str(attrs), tag)) + else: + open[i].append("<%s%s>\n" % (tag, attrs_to_str(attrs))) + close[j].append("</%s>\n" % tag) + + result = [] + if len(empty[0]) > 0: + result.extend(empty[0]) + for i in xrange(len(self.words)): + if i > 0: + result.append(" ") + result.extend(reversed(open[i])) + result.append(xml.sax.saxutils.escape(sym.tostring(self.words[i]))) + result.extend(close[i+1]) + if len(empty[i+1]) > 0: + result.extend(empty[i+1]) + + return "".join(result) + + def __add__(self, other): + if type(other) in (list, tuple): + return Sentence(self.words + list(other), self.meta) + else: + othermeta = [(tag, attrs, i+len(self.words), j+len(self.words)) for (tag, attrs, i, j) in other.meta] + return Sentence(self.words + other.words, self.meta+othermeta) + +def read_raw(f): + """Read a raw file into a list of Sentences.""" + if type(f) is str: + f = file(f, "r") + i = 0 + line = f.readline() + while line != "": + sent = process_sgml_line(line, i) + mark = sent.getmark() + if mark is None: + sent.mark('seg', [('id',str(i))]) + else: + (tag, attrs) = mark + if tag == "seg" and not attrs_to_dict(attrs).has_key('id'): + x = ('id',str(i)) + attrs.append(x) + sent.mark('seg', attrs) + if tag != "seg": + sent.mark('seg', [('id',str(i))]) + yield sent + i += 1 + line = f.readline() + +def process_sgml_line(line, id=None): + p = DatasetParser(None) + p.pos = 0 + p.words = [] + p.meta = [] + p.feed(line) + p.close() + sent = Sentence(p.words, p.meta) + return sent + +class DatasetParser(sgmllib.SGMLParser): + def __init__(self, set): + sgmllib.SGMLParser.__init__(self) + self.words = None + self.mystack = [] + self.string = None + self.convref = d = {"amp":"&", "lt":"<", "gt":">", "quot":'"', "squot":"'"} + def close(self): + self.convert() + sgmllib.SGMLParser.close(self) + + def handle_starttag(self, tag, method, attrs): + thing = method(attrs) + self.mystack.append(thing) + + def handle_endtag(self, tag, method): + thing = self.mystack.pop() + method(thing) + + def unknown_starttag(self, tag, attrs): + thing = self.start(tag, attrs) + self.mystack.append(thing) + + def unknown_endtag(self, tag): + thing = self.mystack.pop() + self.end(tag, thing) + + def start(self, tag, attrs): + self.convert() + if self.words is not None: + return (tag, attrs, self.pos, None) + else: + return None + + def convert(self): + if self.words is not None and self.string is not None: + words = self.string.split() + self.pos += len(words) + self.words.extend(words) + self.string = None + + def end(self, tag, thing): + self.convert() + if self.words is not None: + (tag, attrs, i, j) = thing + self.meta.append((tag, attrs, i, self.pos)) + + def handle_data(self, s): + if self.words is not None: + if (self.string is None): + self.string = s + else: + self.string += s + + def handle_entityref(self, ref): + # s=self.convert_entityref(ref) # if python 2.5 + s=self.convref[ref] + if self.words is not None: + if (self.string is None): + self.string = s + else: + self.string += s + diff --git a/sa-extract/strmap.cc b/sa-extract/strmap.cc new file mode 100644 index 00000000..5040477e --- /dev/null +++ b/sa-extract/strmap.cc @@ -0,0 +1,232 @@ +#include "strmap.h" + +#include <stdint.h> +#include <string> +#include <vector> +#include <tr1/unordered_map> + +using namespace std; +using namespace std::tr1; + +#undef HAVE_64_BITS + +#if INTPTR_MAX == INT32_MAX +# define HAVE_64_BITS 0 +#elif INTPTR_MAX >= INT64_MAX +# define HAVE_64_BITS 1 +#else +# error "couldn't tell if HAVE_64_BITS from INTPTR_MAX INT32_MAX INT64_MAX" +#endif + +typedef 
uintptr_t MurmurInt; + +// MurmurHash2, by Austin Appleby + +static const uint32_t DEFAULT_SEED=2654435769U; + +#if HAVE_64_BITS +//MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED); + +inline uint64_t MurmurHash64( const void * key, int len, unsigned int seed=DEFAULT_SEED ) +{ + const uint64_t m = 0xc6a4a7935bd1e995ULL; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= uint64_t(data2[6]) << 48; + case 6: h ^= uint64_t(data2[5]) << 40; + case 5: h ^= uint64_t(data2[4]) << 32; + case 4: h ^= uint64_t(data2[3]) << 24; + case 3: h ^= uint64_t(data2[2]) << 16; + case 2: h ^= uint64_t(data2[1]) << 8; + case 1: h ^= uint64_t(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +inline uint32_t MurmurHash32(void const *key, int len, uint32_t seed=DEFAULT_SEED) +{ + return (uint32_t) MurmurHash64(key,len,seed); +} + +inline MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED) +{ + return MurmurHash64(key,len,seed); +} + +#else +// 32-bit + +// Note - This code makes a few assumptions about how your machine behaves - +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +inline uint32_t MurmurHash32 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const uint32_t m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + uint32_t h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + uint32_t k = *(uint32_t *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. 
+ + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +inline MurmurInt MurmurHash ( const void * key, int len, uint32_t seed=DEFAULT_SEED) { + return MurmurHash32(key,len,seed); +} + +// 64-bit hash for 32-bit platforms + +inline uint64_t MurmurHash64 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) +{ + const uint32_t m = 0x5bd1e995; + const int r = 24; + + uint32_t h1 = seed ^ len; + uint32_t h2 = 0; + + const uint32_t * data = (const uint32_t *)key; + + while(len >= 8) + { + uint32_t k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + + uint32_t k2 = *data++; + k2 *= m; k2 ^= k2 >> r; k2 *= m; + h2 *= m; h2 ^= k2; + len -= 4; + } + + if(len >= 4) + { + uint32_t k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + } + + switch(len) + { + case 3: h2 ^= ((unsigned char*)data)[2] << 16; + case 2: h2 ^= ((unsigned char*)data)[1] << 8; + case 1: h2 ^= ((unsigned char*)data)[0]; + h2 *= m; + }; + + h1 ^= h2 >> 18; h1 *= m; + h2 ^= h1 >> 22; h2 *= m; + h1 ^= h2 >> 17; h1 *= m; + h2 ^= h1 >> 19; h2 *= m; + + uint64_t h = h1; + + h = (h << 32) | h2; + + return h; +} + +#endif +//32bit + +struct MurmurHasher { + size_t operator()(const string& s) const { + return MurmurHash(s.c_str(), s.size()); + } +}; + +struct StrMap { + StrMap() { keys_.reserve(10000); keys_.push_back(""); map_[keys_[0]] = 0; } + unordered_map<string, int, MurmurHasher> map_; + vector<string> keys_; +}; + +StrMap* stringmap_new() { + return new StrMap; +} + +void stringmap_delete(StrMap *vocab) { + delete vocab; +} + +int stringmap_index(StrMap *vocab, char *s) { + int& cell = vocab->map_[s]; + if (!cell) { + cell = vocab->keys_.size(); + vocab->keys_.push_back(s); + } + return cell; +} + +char* stringmap_word(StrMap *vocab, int i) { + return const_cast<char *>(vocab->keys_[i].c_str()); +} + diff --git a/sa-extract/strmap.h b/sa-extract/strmap.h new file mode 100644 index 00000000..a218a4c0 --- /dev/null +++ b/sa-extract/strmap.h @@ -0,0 +1,22 @@ +#ifndef _STRMAP_H_ +#define _STRMAP_H_ + +#ifdef __cplusplus + extern "C" { +#else + typedef struct StrMap StrMap; /* dummy type to stand in for class */ +#endif + +struct StrMap; + +StrMap* stringmap_new(); +void stringmap_delete(StrMap *vocab); +int stringmap_index(StrMap *vocab, char *s); +char* stringmap_word(StrMap *vocab, int i); + +#ifdef __cplusplus + } +#endif + + +#endif diff --git a/sa-extract/strutil.c b/sa-extract/strutil.c new file mode 100644 index 00000000..456de87a --- /dev/null +++ b/sa-extract/strutil.c @@ -0,0 +1,63 @@ +#include <string.h> +#include <stdlib.h> + +/* Like strsep(3) except that the delimiter is a string, not a set of characters. +*/ +char *strstrsep(char **stringp, const char *delim) { + char *match, *save; + save = *stringp; + if (*stringp == NULL) + return NULL; + match = strstr(*stringp, delim); + if (match == NULL) { + *stringp = NULL; + return save; + } + *match = '\0'; + *stringp = match + strlen(delim); + return save; +} + +static char **words = NULL; +static int max_words; +char **split(char *s, const char *delim, int *pn) { + int i; + char *tok, *rest; + + if (words == NULL) { + max_words = 10; + words = malloc(max_words*sizeof(char *)); + } + i = 0; + rest = s; + while ((tok = (delim ? 
strstrsep(&rest, delim) : strsep(&rest, " \t\n"))) != NULL) { + if (!delim && !*tok) // empty token + continue; + while (i+1 >= max_words) { + max_words *= 2; + words = realloc(words, max_words*sizeof(char *)); + } + words[i] = tok; + i++; + } + words[i] = NULL; + if (pn != NULL) + *pn = i; + return words; +} + +inline int isspace(char c) { + return (c == ' ' || c == '\t' || c == '\n'); +} + +char *strip(char *s) { + int n; + while (isspace(*s) && *s != '\0') + s++; + n = strlen(s); + while (n > 0 && isspace(s[n-1])) { + s[n-1] = '\0'; + n--; + } + return s; +} diff --git a/sa-extract/strutil.h b/sa-extract/strutil.h new file mode 100644 index 00000000..94a77033 --- /dev/null +++ b/sa-extract/strutil.h @@ -0,0 +1,8 @@ +#ifndef STRUTIL_H +#define STRUTIL_H + +char *strstrsep(char **stringp, const char *delim); +char *strip(char *s); +char **split(char *s, const char *delim, int *pn); + +#endif diff --git a/sa-extract/sym.pxd b/sa-extract/sym.pxd new file mode 100644 index 00000000..d0650f46 --- /dev/null +++ b/sa-extract/sym.pxd @@ -0,0 +1,17 @@ +cimport cstrmap + +cdef class Alphabet: + cdef readonly cstrmap.StringMap terminals, nonterminals + cdef int first_nonterminal, last_nonterminal + cdef int isvar(self, int sym) + cdef int isword(self, int sym) + cdef int getindex(self, int sym) + cdef int setindex(self, int sym, int ind) + cdef int clearindex(self, int sym) + cdef int match(self, int sym1, int sym2) + cdef char* tocat(self, int sym) + cdef int fromcat(self, char *s) + cdef char* tostring(self, int sym) + cdef int fromstring(self, char *s, int terminal) + + diff --git a/sa-extract/sym.pyx b/sa-extract/sym.pyx new file mode 100644 index 00000000..264cfcd9 --- /dev/null +++ b/sa-extract/sym.pyx @@ -0,0 +1,155 @@ +from libc.string cimport strrchr, strstr, strcpy, strlen +from libc.stdlib cimport malloc, realloc, strtol + +cdef int index_shift, index_mask, n_index +index_shift = 3 +n_index = 1<<index_shift +index_mask = (1<<index_shift)-1 + +id2sym = {} + +cdef class Alphabet: + def __cinit__(self): + self.terminals = cstrmap.StringMap() + self.nonterminals = cstrmap.StringMap() + self.first_nonterminal = -1 + self.last_nonterminal = -1 + + cdef int isvar(self, int sym): + return sym < 0 + + cdef int isword(self, int sym): + return sym >= 0 + + cdef int getindex(self, int sym): + return -sym & index_mask + + cdef int setindex(self, int sym, int ind): + return -(-sym & ~index_mask | ind) + + cdef int clearindex(self, int sym): + return -(-sym& ~index_mask) + + cdef int match(self, int sym1, int sym2): + return self.clearindex(sym1) == self.clearindex(sym2); + + cdef char* tocat(self, int sym): + return self.nonterminals.word((-sym >> index_shift)-1) + + cdef int fromcat(self, char *s): + cdef int i + i = self.nonterminals.index(s) + if self.first_nonterminal == -1: + self.first_nonterminal = i + if i > self.last_nonterminal: + self.last_nonterminal = i + return -(i+1 << index_shift) + + cdef char* tostring(self, int sym): + cdef int ind + if self.isvar(sym): + if sym in id2sym: + return id2sym[sym] + + ind = self.getindex(sym) + if ind > 0: + id2sym[sym] = "[%s,%d]" % (self.tocat(sym), ind) + else: + id2sym[sym] = "[%s]" % self.tocat(sym) + return id2sym[sym] + + else: + return self.terminals.word(sym) + + cdef int fromstring(self, char *s, int terminal): + """Warning: this method is allowed to alter s.""" + cdef char *comma + cdef int n + n = strlen(s) + cdef char *sep + sep = strstr(s,"_SEP_") + if n >= 3 and s[0] == c'[' and s[n-1] == c']' and sep == NULL: + if terminal: + s1 = "\\"+s + return self.terminals.index(s1) + s[n-1] = c'\0' + s = s + 1 + comma = strrchr(s, c',') + if comma != NULL: + comma[0] = c'\0' + return self.setindex(self.fromcat(s), strtol(comma+1, NULL, 10)) + else: + return self.fromcat(s) + else: + return self.terminals.index(s) + +# Expose Python functions as top-level functions for backward compatibility 
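+# Illustrative usage of the wrappers defined below (added documentation, not
+# original code; assumes "X" is the first nonterminal category interned):
+#   x = fromstring("[X,1]")    # nonterminal [X] carrying gap index 1 -> a negative int
+#   isvar(x)                   # nonzero for nonterminals, 0 for terminal word ids
+#   getindex(x)                # 1, the gap index packed into the low index_shift bits
+#   tostring(clearindex(x))    # "[X]", the bare category with the index cleared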
+ +alphabet = Alphabet() + +cdef Alphabet calphabet +calphabet = alphabet + +def isvar(int sym): + return calphabet.isvar(sym) + +def isword(int sym): + return calphabet.isword(sym) + +def getindex(int sym): + return calphabet.getindex(sym) + +def setindex(int sym, int ind): + return calphabet.setindex(sym, ind) + +def clearindex(int sym): + return calphabet.clearindex(sym) + +def match(int sym1, int sym2): + return calphabet.match(sym1, sym2) != 0 + +def totag(int sym): + return calphabet.tocat(sym) + +def fromtag(s): + s = s.upper() + return calphabet.fromcat(s) + +def tostring(sym): + if type(sym) is str: + return sym + else: + return calphabet.tostring(sym) + +cdef int bufsize +cdef char *buf +bufsize = 100 +buf = <char *>malloc(bufsize) +cdef ensurebufsize(int size): + global buf, bufsize + if size > bufsize: + buf = <char *>realloc(buf, size*sizeof(char)) + bufsize = size + +def fromstring(s, terminal=False): + cdef bytes bs + cdef char* cs + if terminal: + return calphabet.fromstring(s, 1) + else: + bs = s + cs = bs + ensurebufsize(len(s)+1) + strcpy(buf, cs) + return calphabet.fromstring(buf, 0) + +def nonterminals(): + cdef i + l = [] + for i from calphabet.first_nonterminal <= i <= calphabet.last_nonterminal: + l.append(-(i+1 << index_shift)) + return l -- cgit v1.2.3
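
The nonterminal encoding in sym.pyx above packs a 1-based category id and a gap
index into a single negative integer: the category sits in the bits above
index_shift and the gap index in the low index_shift bits. The following is a
minimal standalone sketch of that arithmetic in plain Python, with index_shift = 3
as in the module; the names here are illustrative re-implementations, not the
module's API:

    INDEX_SHIFT = 3
    INDEX_MASK = (1 << INDEX_SHIFT) - 1

    def setindex(sym, ind):
        # pack gap index ind into the low bits of nonterminal symbol sym (sym < 0)
        return -(-sym & ~INDEX_MASK | ind)

    def getindex(sym):
        return -sym & INDEX_MASK

    def clearindex(sym):
        return -(-sym & ~INDEX_MASK)

    x = -(1 << INDEX_SHIFT)      # first category, e.g. "[X]": -((0+1) << 3) == -8
    x1 = setindex(x, 1)          # -9, i.e. "[X,1]"
    assert getindex(x1) == 1 and clearindex(x1) == x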