diff options
Diffstat (limited to 'sa-extract')
54 files changed, 0 insertions, 7344 deletions
diff --git a/sa-extract/Makefile b/sa-extract/Makefile deleted file mode 100644 index 7b39ae4d..00000000 --- a/sa-extract/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -PYVER=python2.7 -PYDIR=/usr/local/Cellar/python/2.7.2 -PYINCLUDE=$(PYDIR)/include/$(PYVER) -CYTHON=/usr/local/share/python/cython -PYTHON=$(PYDIR)/bin/python - -%.c: %.pyx - $(CYTHON) $< -o $@ - -%.o: %.cc - g++ -O6 -g -fPIC -c $< - -all: cstrmap.c strmap.cc rule.c sym.c cdat.c cintlist.c cfloatlist.c calignment.c csuf.c clex.c rulefactory.c cveb.c lcp.c precomputation.c - $(PYTHON) setup.py build - -clean: - rm -f cdat.c cstrmap.c sym.c rule.c cintlist.c cfloatlist.c calignment.c csuf.c clex.c rulefactory.c cveb.c lcp.c precomputation.c *.so *.o *.cxx *~ *.pyc - rm -rf build diff --git a/sa-extract/README b/sa-extract/README deleted file mode 100644 index e4022c7e..00000000 --- a/sa-extract/README +++ /dev/null @@ -1,62 +0,0 @@ -SUFFIX-ARRAY-EXTRACT README - Feb 1, 2012 - -Written by Adam Lopez, repackaged by Chris Dyer. - -Originally based on parts of Hiero, by David Chiang, but these dependencies -have been removed or rewritten. - - -BUILD INSTRUCTIONS -============================================================================== - -Requirements: - Python 2.7 or later (http://www.python.org) - Cython 0.14.1 or later (http://cython.org/) - -- Edit Makefile to set the location of Python/Cython then do: - - make - - -COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT -============================================================================== -- Run sa-compile.pl to compile the training data and generate an extract.ini - file (which is written to STDOUT): - - sa-compile.pl -b bitext_name=source.fr,target.en \ - -a alignment_name=alignment.txt > extract.ini - - - The training data should be in two parallel text files (source.fr,source.en) - and the alignments are expected in "0-0 1-2 2-1 ..." format produced by - most alignment toolkits. The text files should NOT be escaped for non-XML - characters. - - -EXTRACTION OF PER-SENTENCE GRAMMARS -============================================================================== -The most common use-case we support is extraction of "per-sentence" grammars -for each segment in a testset. You may run the extractor on test set, but it -will try to interpret tags as SGML markup, so we provide a script that does -escaping: ./escape-testset.pl. - -- Example: - - cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini - - -EXTRACTION OF COMPLETE TEST-SET GRAMMARS -============================================================================== -Edit the generated extract.ini file a change per_sentence_grammar -to False. Then, run extraction as normal. - -Note: extracting a single grammar for an entire test set will consume more -memory during extraction and (probably) during decoding. - - -EXAMPLE -============================================================================== -- See example/ and the README therein. - - diff --git a/sa-extract/calignment.pxd b/sa-extract/calignment.pxd deleted file mode 100644 index a7d3001f..00000000 --- a/sa-extract/calignment.pxd +++ /dev/null @@ -1,10 +0,0 @@ -cimport cintlist -from libc.stdio cimport FILE - -cdef class Alignment: - - cdef cintlist.CIntList links - cdef cintlist.CIntList sent_index - cdef int link(self, int i, int j) - cdef _unlink(self, int link, int* f, int* e) - cdef int* _get_sent_links(self, int sent_id, int* num_links) diff --git a/sa-extract/calignment.pyx b/sa-extract/calignment.pyx deleted file mode 100644 index 976fcd66..00000000 --- a/sa-extract/calignment.pyx +++ /dev/null @@ -1,128 +0,0 @@ -import log -import gzip -import cintlist - -from libc.stdio cimport FILE, fopen, fread, fwrite, fclose -from libc.stdlib cimport malloc, realloc, free - -# Note: Callison-Burch uses short instead of int. -# We have the space for our corpus, so this is not a problem; -# May need to revisit if things get really tight, though. -cdef class Alignment: - - - cdef int link(self, int i, int j): - '''Integerizes an alignment link pair''' - return i*65536 + j - - - def unlink(self, link): - '''De-integerizes an alignment link pair''' - return (link/65536, link%65536) - - - cdef _unlink(self, int link, int* f, int* e): - f[0] = link/65536 - e[0] = link%65536 - - - def get_sent_links(self, int sent_id): - cdef cintlist.CIntList sent_links - cdef int* arr - cdef int arr_len - - sent_links = cintlist.CIntList() - arr = self._get_sent_links(sent_id, &arr_len) - sent_links._extend_arr(arr, arr_len*2) - free(arr) - return sent_links - - - cdef int* _get_sent_links(self, int sent_id, int* num_links): - cdef int* sent_links - cdef int i, start, end - - start = self.sent_index.arr[sent_id] - end = self.sent_index.arr[sent_id+1] - num_links[0] = end - start - sent_links = <int*> malloc(2*num_links[0]*sizeof(int)) - for i from 0 <= i < num_links[0]: - self._unlink(self.links.arr[start + i], sent_links + (2*i), sent_links + (2*i) + 1) - return sent_links - - - def __cinit__(self, filename, from_binary=False): - self.links = cintlist.CIntList(1000,1000) - self.sent_index = cintlist.CIntList(1000,1000) - log.writeln("Reading alignment from file %s" % filename) - if from_binary: - self.read_binary(filename) - else: - self.read_text(filename) - - - def read_text(self, filename): - if filename[-2:] == "gz": - f = gzip.GzipFile(filename) - else: - f = open(filename) - for line in f: - self.sent_index.append(len(self.links)) - pairs = line.split() - for pair in pairs: - (i, j) = map(int, pair.split('-')) - self.links.append(self.link(i, j)) - self.sent_index.append(len(self.links)) - - - def read_binary(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "r") - self.links.read_handle(f) - self.sent_index.read_handle(f) - fclose(f) - - - def write_text(self, filename): - f = open(filename, "w") - sent_num = 0 - for i, link in enumerate(self.links): - while i >= self.sent_index[sent_num]: - f.write("\n") - sent_num = sent_num + 1 - f.write("%d-%d " % self.unlink(link)) - f.write("\n") - - - def write_binary(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "w") - self.links.write_handle(f) - self.sent_index.write_handle(f) - fclose(f) - - - def write_enhanced(self, filename): - f = open(filename, "w") - sent_num = 1 - for link in self.links: - f.write("%d " % link) - f.write("\n") - for i in self.sent_index: - f.write("%d " % i) - f.write("\n") - - - def alignment(self, i): - '''Return all (e,f) pairs for sentence i''' - cdef int j, start, end - result = [] - start = self.sent_index.arr[i] - end = self.sent_index.arr[i+1] - for j from start <= j < end: - result.append(self.unlink(self.links.arr[j])) - return result diff --git a/sa-extract/cdat.pxd b/sa-extract/cdat.pxd deleted file mode 100644 index b686f611..00000000 --- a/sa-extract/cdat.pxd +++ /dev/null @@ -1,12 +0,0 @@ -cimport cintlist -from libc.stdio cimport FILE - -cdef class DataArray: - cdef word2id - cdef id2word - cdef cintlist.CIntList data - cdef cintlist.CIntList sent_id - cdef cintlist.CIntList sent_index - cdef use_sent_id - cdef void write_handle(self, FILE* f) - cdef void read_handle(self, FILE* f) diff --git a/sa-extract/cdat.pyx b/sa-extract/cdat.pyx deleted file mode 100644 index 57d3ad63..00000000 --- a/sa-extract/cdat.pyx +++ /dev/null @@ -1,178 +0,0 @@ -# cdat.pyx -# Defines "data arrays" that can be directly written to/read from disk in binary format -# In particular, the array itself is written/read directly as a glob of binary data -# Adam Lopez <alopez@cs.umd.edu> - -import sys -import gzip -import log -import cintlist - -from libc.stdio cimport FILE, fopen, fread, fwrite, fclose -from libc.stdlib cimport malloc, realloc, free -from libc.string cimport memset, strcpy, strlen - -cdef class DataArray: - - def __init__(self, filename=None, from_binary=False, use_sent_id=False): - self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1} - self.id2word = ["END_OF_FILE", "END_OF_LINE"] - self.data = cintlist.CIntList(1000,1000) - self.sent_id = cintlist.CIntList(1000,1000) - self.sent_index = cintlist.CIntList(1000,1000) - self.use_sent_id = use_sent_id - - if filename is not None: - if from_binary: - self.read_binary(filename) - else: - self.read_text(filename) - - - def __len__(self): - return len(self.data) - - - def getSentId(self, i): - return self.sent_id.arr[i] - - - def getSent(self, i): - cdef int j, start, stop - sent = [] - start = self.sent_index.arr[i] - stop = self.sent_index.arr[i+1] - for i from start <= i < stop: - sent.append(self.id2word[self.data.arr[i]]) - return sent - - - def getSentPos(self, loc): - return loc - self.sent_index.arr[self.sent_id.arr[loc]] - - - def get_id(self, word): - if not word in self.word2id: - self.word2id[word] = len(self.id2word) - self.id2word.append(word) - return self.word2id[word] - - - def get_word(self, id): - return self.id2word[id] - - - def write_text(self, filename): - f = open(filename, "w") - for w_id in self.data: - if w_id > 1: - f.write("%s " % self.get_word(w_id)) - if w_id == 1: - f.write("\n") - f.close() - - - def read_text(self, filename): - cdef int word_count - - if filename[-2:] == "gz": - file = gzip.GzipFile(filename) - else: - file = open(filename) - word_count = 0 - for line_num, line in enumerate(file): - self.sent_index.append(word_count) - for word in line.split(): - self.data.append(self.get_id(word)) - if self.use_sent_id: - self.sent_id.append(line_num) - word_count = word_count + 1 - self.data.append(1) - if self.use_sent_id: - self.sent_id.append(line_num) - word_count = word_count + 1 - self.data.append(0) - self.sent_index.append(word_count) - - def read_binary(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "r") - self.read_handle(f) - fclose(f) - - - cdef void read_handle(self, FILE* f): - cdef int num_words - cdef int word_len - cdef char* c_word - cdef bytes py_word - self.data.read_handle(f) - self.sent_index.read_handle(f) - self.sent_id.read_handle(f) - fread(&(num_words), sizeof(int), 1, f) - for i in xrange(num_words): - fread(&(word_len), sizeof(int), 1, f) - c_word = <char*> malloc (word_len * sizeof(char)) - fread(c_word, sizeof(char), word_len, f) - py_word = c_word - free(c_word) - self.word2id[py_word] = len(self.id2word) - self.id2word.append(py_word) - if len(self.sent_id) == 0: - self.use_sent_id = False - else: - self.use_sent_id = True - - - cdef void write_handle(self, FILE* f): - cdef int word_len - cdef int num_words - cdef char* c_word - - self.data.write_handle(f) - self.sent_index.write_handle(f) - self.sent_id.write_handle(f) - num_words = len(self.id2word) - 2 - fwrite(&(num_words), sizeof(int), 1, f) - for word in self.id2word[2:]: - c_word = word - word_len = strlen(c_word) + 1 - fwrite(&(word_len), sizeof(int), 1, f) - fwrite(c_word, sizeof(char), word_len, f) - - - def write_binary(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "w") - self.write_handle(f) - fclose(f) - - - def write_enhanced_handle(self, f): - for i in self.data: - f.write("%d " %i) - f.write("\n") - for i in self.sent_index: - f.write("%d " %i) - f.write("\n") - for i in self.sent_id: - f.write("%d " %i) - f.write("\n") - for word in self.id2word: - f.write("%s %d " % (word, self.word2id[word])) - f.write("\n") - - - - def write_enhanced(self, filename): - f = open(filename, "w") - self.write_enhanced_handle(self, f) - f.close() - - - - diff --git a/sa-extract/cfloatlist.pxd b/sa-extract/cfloatlist.pxd deleted file mode 100644 index 026f2739..00000000 --- a/sa-extract/cfloatlist.pxd +++ /dev/null @@ -1,10 +0,0 @@ -from libc.stdio cimport FILE - -cdef class CFloatList: - cdef int size - cdef int increment - cdef int len - cdef float* arr - cdef void write_handle(self, FILE* f) - cdef void read_handle(self, FILE* f) - cdef void set(self, int i, float v) diff --git a/sa-extract/cfloatlist.pyx b/sa-extract/cfloatlist.pyx deleted file mode 100644 index 18a0ef2a..00000000 --- a/sa-extract/cfloatlist.pyx +++ /dev/null @@ -1,93 +0,0 @@ -# conveniencelist.pyx -# defines int arrays in C, with some convenience methods -# for reading arrays directly as globs directly from disk. -# Adam Lopez <alopez@cs.umd.edu> - -from libc.stdio cimport FILE, fopen, fread, fwrite, fclose -from libc.stdlib cimport malloc, realloc, free -from libc.string cimport memset, strcpy, strlen - -cdef class CFloatList: - - def __cinit__(self, size=0, increment=1, initial_len=0): - if initial_len > size: - size = initial_len - self.arr = <float*> malloc(size*sizeof(float)) - memset(self.arr, 0, initial_len*sizeof(float)) - - - def __init__(self, size=0, increment=1, initial_len=0): - self.size = size - if initial_len > size: - self.size = initial_len - self.increment = increment - self.len = initial_len - - - def __dealloc__(self): - free(self.arr) - - - def __getitem__(self, i): - j = i - if i<0: - j = self.len + i - if j<0 or j>=self.len: - raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len)) - return self.arr[j] - - - cdef void set(self, int i, float v): - j = i - if i<0: - j = self.len + i - if j<0 or j>=self.len: - raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len)) - self.arr[j] = v - - def __setitem__(self, i, val): - self.set(i, val) - - def __len__(self): - return self.len - - - def append(self, float val): - if self.len == self.size: - self.size = self.size + self.increment - self.arr = <float*> realloc(self.arr, self.size*sizeof(float)) - self.arr[self.len] = val - self.len = self.len + 1 - - - cdef void write_handle(self, FILE* f): - fwrite(&(self.len), sizeof(float), 1, f) - fwrite(self.arr, sizeof(float), self.len, f) - - - def write(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "w") - self.write_handle(f) - fclose(f) - - - cdef void read_handle(self, FILE* f): - free(self.arr) - fread(&(self.len), sizeof(float), 1, f) - self.arr = <float*> malloc(self.len * sizeof(float)) - self.size = self.len - fread(self.arr, sizeof(float), self.len, f) - - - def read(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "r") - self.read_handle(f) - fclose(f) - - diff --git a/sa-extract/cintlist.pxd b/sa-extract/cintlist.pxd deleted file mode 100644 index 8a3a655c..00000000 --- a/sa-extract/cintlist.pxd +++ /dev/null @@ -1,15 +0,0 @@ -from libc.stdio cimport FILE - -cdef class CIntList: - cdef int size - cdef int increment - cdef int len - cdef int* arr - cdef void write_handle(self, FILE* f) - cdef void read_handle(self, FILE* f) - cdef void _append(self, int val) - cdef void _extend(self, CIntList other) - cdef void _extend_arr(self, int* other, int other_len) - cdef void _clear(self) - cdef void set(self, int i, int val) - diff --git a/sa-extract/cintlist.pyx b/sa-extract/cintlist.pyx deleted file mode 100644 index 9d0a058e..00000000 --- a/sa-extract/cintlist.pyx +++ /dev/null @@ -1,196 +0,0 @@ -# cintlist.pyx -# defines int arrays in C, with some convenience methods -# for reading arrays as globs directly from disk. -# Adam Lopez <alopez@cs.umd.edu> - -from libc.stdio cimport FILE, fopen, fread, fwrite, fclose -from libc.stdlib cimport malloc, realloc, free -from libc.string cimport memset, memcpy - -cdef class CIntList: - - def __cinit__(self, size=0, increment=1, initial_len=0): - if initial_len > size: - size = initial_len - self.arr = <int*> malloc(size*sizeof(int)) - memset(self.arr, 0, initial_len*sizeof(int)) - - def __str__(self): - ret = "CIntList[" - for idx in xrange(self.size): - if idx>0: - ret += "," - ret += str(self.arr[idx]) - ret += "]" - ret += "len=" - ret += self.len - return ret - - def index(self, val): - for i in xrange(self.len): - if self.arr[i] == val: - return i - - return IndexError - - def partition(self,start,end): - pivot = self.arr[end] - bottom = start-1 - top = end - done = 0 - while not done: - while not done: - bottom += 1 - if bottom == top: - done = 1 - break - if self.arr[bottom] > pivot: - self.arr[top] = self.arr[bottom] - break - while not done: - top -= 1 - if top == bottom: - done = 1 - break - if self.arr[top] < pivot: - self.arr[bottom] = self.arr[top] - break - self.arr[top] = pivot - return top - - def _doquicksort(self,start,end): - if start < end: - split = self.partition(start,end) - self._doquicksort(start,split-1) - self._doquicksort(split+1,end) - else: - return - - def sort(self): - self._doquicksort(0,self.len-1) - - def reset(self): - self.len = 0 - - def __init__(self, size=0, increment=1, initial_len=0): - self.size = size - if initial_len > size: - self.size = initial_len - self.increment = increment - self.len = initial_len - - - def __dealloc__(self): - free(self.arr) - - - def __getitem__(self, index): - cdef int i, j, k - - if type(index) == int: - j = index - if j < 0: - j = self.len + j - if j<0 or j>=self.len: - raise IndexError("Requested index %d of %d-length CIntList" % (index, self.len)) - return self.arr[j] - elif type(index) == slice: - i = index.start - j = index.stop - if i < 0: - i = self.len + i - if j < 0: - j = self.len + j - if i < 0 or i >= self.len or j < 0 or j > self.len: - raise IndexError("Requested index %d:%d of %d-length CIntList" % (index.start, index.stop, self.len)) - result = () - for k from i <= k < j: - result = result + (self.arr[k],) - return result - else: - raise IndexError("Illegal key type %s for CIntList" % (type(index))) - - cdef void set(self, int i, int val): - j = i - if i<0: - j = self.len + i - if j<0 or j>=self.len: - raise IndexError("Requested index %d of %d-length IntList" % (i, self.len)) - if type(val) != int: - raise TypeError - self.arr[j] = val - - - def __setitem__(self, i, val): - self.set(i, val) - - def __len__(self): - return self.len - - def getSize(self): - return self.size - - def append(self, int val): - self._append(val) - - cdef void _append(self, int val): - if self.len == self.size: - self.size = self.size + self.increment - self.arr = <int*> realloc(self.arr, self.size*sizeof(int)) - self.arr[self.len] = val - self.len = self.len + 1 - - def extend(self, other): - self._extend(other) - - - cdef void _extend(self, CIntList other): - self._extend_arr(other.arr, other.len) - - - cdef void _extend_arr(self, int* other, int other_len): - if self.size < self.len + other_len: - self.size = self.len + other_len - self.arr = <int*> realloc(self.arr, self.size*sizeof(int)) - memcpy(self.arr+self.len, other, other_len*sizeof(int)) - self.len = self.len + other_len - - - cdef void _clear(self): - free(self.arr) - self.len = 0 - self.size = 0 - self.arr = <int*> malloc(0) - - - cdef void write_handle(self, FILE* f): - fwrite(&(self.len), sizeof(int), 1, f) - fwrite(self.arr, sizeof(int), self.len, f) - - - def write(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "w") - self.write_handle(f) - fclose(f) - - - cdef void read_handle(self, FILE* f): - (self.arr) - fread(&(self.len), sizeof(int), 1, f) - self.arr = <int*> malloc(self.len * sizeof(int)) - self.size = self.len - fread(self.arr, sizeof(int), self.len, f) - - - def read(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "r") - self.read_handle(f) - fclose(f) - - diff --git a/sa-extract/clex.pyx b/sa-extract/clex.pyx deleted file mode 100644 index fa30caad..00000000 --- a/sa-extract/clex.pyx +++ /dev/null @@ -1,460 +0,0 @@ -# clex.pyx -# defines bilexical dictionaries in C, with some convenience methods -# for reading arrays directly as globs directly from disk. -# Adam Lopez <alopez@cs.umd.edu> - -import gzip -import sys -import context_model - -cimport cintlist -cimport cfloatlist -cimport calignment -cimport csuf -cimport cdat - -from libc.stdio cimport FILE, fopen, fread, fwrite, fclose -from libc.stdlib cimport malloc, realloc, free -from libc.string cimport memset, strcpy, strlen - -cdef struct _node: - _node* smaller - _node* bigger - int key - int val - -cdef _node* new_node(int key): - cdef _node* n - n = <_node*> malloc(sizeof(_node)) - n.smaller = NULL - n.bigger = NULL - n.key = key - n.val = 0 - return n - - -cdef del_node(_node* n): - if n.smaller != NULL: - del_node(n.smaller) - if n.bigger != NULL: - del_node(n.bigger) - free(n) - - -cdef int* get_val(_node* n, int key): - if key == n.key: - return &n.val - elif key < n.key: - if n.smaller == NULL: - n.smaller = new_node(key) - return &(n.smaller.val) - return get_val(n.smaller, key) - else: - if n.bigger == NULL: - n.bigger = new_node(key) - return &(n.bigger.val) - return get_val(n.bigger, key) - - -cdef class CLex: - - cdef cfloatlist.CFloatList col1 - cdef cfloatlist.CFloatList col2 - cdef cintlist.CIntList f_index - cdef cintlist.CIntList e_index - cdef id2eword, id2fword, eword2id, fword2id - - def __init__(self, filename, from_binary=False, - from_data=False, earray=None, fsarray=None): - self.id2eword = [] - self.id2fword = [] - self.eword2id = {} - self.fword2id = {} - self.e_index = cintlist.CIntList() - self.f_index = cintlist.CIntList() - self.col1 = cfloatlist.CFloatList() - self.col2 = cfloatlist.CFloatList() - if from_binary: - self.read_binary(filename) - else: - if from_data: - self.compute_from_data(fsarray, earray, filename) - else: - self.read_text(filename) - '''print "self.eword2id" - print "=============" - for x in self.eword2id: - print x - print "self.fword2id" - print "=============" - for x in self.fword2id: - print x - print "-------------"''' - - - cdef compute_from_data(self, csuf.SuffixArray fsa, cdat.DataArray eda, calignment.Alignment aa): - cdef int sent_id, num_links, l, i, j, f_i, e_j, I, J, V_E, V_F, num_pairs - cdef int *fsent, *esent, *alignment, *links, *ealigned, *faligned - cdef _node** dict - cdef int *fmargin, *emargin, *count - cdef bytes word - cdef int null_word - - null_word = 0 - for word in fsa.darray.id2word: # I miss list comprehensions - self.id2fword.append(word) - self.id2fword[null_word] = "NULL" - for id, word in enumerate(self.id2fword): - self.fword2id[word] = id - - for word in eda.id2word: - self.id2eword.append(word) - self.id2eword[null_word] = "NULL" - for id, word in enumerate(self.id2eword): - self.eword2id[word] = id - - num_pairs = 0 - - V_E = len(eda.id2word) - V_F = len(fsa.darray.id2word) - fmargin = <int*> malloc(V_F*sizeof(int)) - emargin = <int*> malloc(V_E*sizeof(int)) - memset(fmargin, 0, V_F*sizeof(int)) - memset(emargin, 0, V_E*sizeof(int)) - - dict = <_node**> malloc(V_F*sizeof(_node*)) - memset(dict, 0, V_F*sizeof(_node*)) - - num_sents = len(fsa.darray.sent_index) - for sent_id from 0 <= sent_id < num_sents-1: - - fsent = fsa.darray.data.arr + fsa.darray.sent_index.arr[sent_id] - I = fsa.darray.sent_index.arr[sent_id+1] - fsa.darray.sent_index.arr[sent_id] - 1 - faligned = <int*> malloc(I*sizeof(int)) - memset(faligned, 0, I*sizeof(int)) - - esent = eda.data.arr + eda.sent_index.arr[sent_id] - J = eda.sent_index.arr[sent_id+1] - eda.sent_index.arr[sent_id] - 1 - ealigned = <int*> malloc(J*sizeof(int)) - memset(ealigned, 0, J*sizeof(int)) - - links = aa._get_sent_links(sent_id, &num_links) - - for l from 0 <= l < num_links: - i = links[l*2] - j = links[l*2+1] - if i >= I or j >= J: - sys.stderr.write(" %d-%d out of bounds (I=%d,J=%d) in line %d\n" % (i,j,I,J,sent_id+1)) - assert i < I - assert j < J - f_i = fsent[i] - e_j = esent[j] - fmargin[f_i] = fmargin[f_i]+1 - emargin[e_j] = emargin[e_j]+1 - if dict[f_i] == NULL: - dict[f_i] = new_node(e_j) - dict[f_i].val = 1 - num_pairs = num_pairs + 1 - else: - count = get_val(dict[f_i], e_j) - if count[0] == 0: - num_pairs = num_pairs + 1 - count[0] = count[0] + 1 - # add count - faligned[i] = 1 - ealigned[j] = 1 - for i from 0 <= i < I: - if faligned[i] == 0: - f_i = fsent[i] - fmargin[f_i] = fmargin[f_i] + 1 - emargin[null_word] = emargin[null_word] + 1 - if dict[f_i] == NULL: - dict[f_i] = new_node(null_word) - dict[f_i].val = 1 - num_pairs = num_pairs + 1 - else: - count = get_val(dict[f_i], null_word) - if count[0] == 0: - num_pairs = num_pairs + 1 - count[0] = count[0] + 1 - for j from 0 <= j < J: - if ealigned[j] == 0: - e_j = esent[j] - fmargin[null_word] = fmargin[null_word] + 1 - emargin[e_j] = emargin[e_j] + 1 - if dict[null_word] == NULL: - dict[null_word] = new_node(e_j) - dict[null_word].val = 1 - num_pairs = num_pairs + 1 - else: - count = get_val(dict[null_word], e_j) - if count[0] == 0: - num_pairs = num_pairs + 1 - count[0] = count[0] + 1 - free(links) - free(faligned) - free(ealigned) - self.f_index = cintlist.CIntList(initial_len=V_F) - self.e_index = cintlist.CIntList(initial_len=num_pairs) - self.col1 = cfloatlist.CFloatList(initial_len=num_pairs) - self.col2 = cfloatlist.CFloatList(initial_len=num_pairs) - - num_pairs = 0 - for i from 0 <= i < V_F: - #self.f_index[i] = num_pairs - self.f_index.set(i, num_pairs) - if dict[i] != NULL: - self._add_node(dict[i], &num_pairs, float(fmargin[i]), emargin) - del_node(dict[i]) - free(fmargin) - free(emargin) - free(dict) - return - - - cdef _add_node(self, _node* n, int* num_pairs, float fmargin, int* emargin): - cdef int loc - if n.smaller != NULL: - self._add_node(n.smaller, num_pairs, fmargin, emargin) - loc = num_pairs[0] - self.e_index.set(loc, n.key) - self.col1.set(loc, float(n.val)/fmargin) - self.col2.set(loc, float(n.val)/float(emargin[n.key])) - num_pairs[0] = loc + 1 - if n.bigger != NULL: - self._add_node(n.bigger, num_pairs, fmargin, emargin) - - - def write_binary(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "w") - self.f_index.write_handle(f) - self.e_index.write_handle(f) - self.col1.write_handle(f) - self.col2.write_handle(f) - self.write_wordlist(self.id2fword, f) - self.write_wordlist(self.id2eword, f) - fclose(f) - - - cdef write_wordlist(self, wordlist, FILE* f): - cdef int word_len - cdef int num_words - cdef char* c_word - - num_words = len(wordlist) - fwrite(&(num_words), sizeof(int), 1, f) - for word in wordlist: - c_word = word - word_len = strlen(c_word) + 1 - fwrite(&(word_len), sizeof(int), 1, f) - fwrite(c_word, sizeof(char), word_len, f) - - - cdef read_wordlist(self, word2id, id2word, FILE* f): - cdef int num_words - cdef int word_len - cdef char* c_word - cdef bytes py_word - - fread(&(num_words), sizeof(int), 1, f) - for i from 0 <= i < num_words: - fread(&(word_len), sizeof(int), 1, f) - c_word = <char*> malloc (word_len * sizeof(char)) - fread(c_word, sizeof(char), word_len, f) - py_word = c_word - free(c_word) - word2id[py_word] = len(id2word) - id2word.append(py_word) - - def read_binary(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "r") - self.f_index.read_handle(f) - self.e_index.read_handle(f) - self.col1.read_handle(f) - self.col2.read_handle(f) - self.read_wordlist(self.fword2id, self.id2fword, f) - self.read_wordlist(self.eword2id, self.id2eword, f) - fclose(f) - - - def get_e_id(self, eword): - if eword not in self.eword2id: - e_id = len(self.id2eword) - self.id2eword.append(eword) - self.eword2id[eword] = e_id - return self.eword2id[eword] - - - def get_f_id(self, fword): - if fword not in self.fword2id: - f_id = len(self.id2fword) - self.id2fword.append(fword) - self.fword2id[fword] = f_id - return self.fword2id[fword] - - - def read_text(self, filename): - cdef i, j, w, e_id, f_id, n_f, n_e, N - cdef cintlist.CIntList fcount - - fcount = cintlist.CIntList() - if filename[-2:] == "gz": - f = gzip.GzipFile(filename) - else: - f = open(filename) - - # first loop merely establishes size of array objects - sys.stderr.write("Initial read...\n") - for line in f: - (fword, eword, score1, score2) = line.split() - f_id = self.get_f_id(fword) - e_id = self.get_e_id(eword) - while f_id >= len(fcount): - fcount.append(0) - fcount.arr[f_id] = fcount.arr[f_id] + 1 - - # Allocate space for dictionary in arrays - N = 0 - n_f = len(fcount) - self.f_index = cintlist.CIntList(initial_len=n_f+1) - for i from 0 <= i < n_f: - self.f_index.arr[i] = N - N = N + fcount.arr[i] - fcount.arr[i] = 0 - self.f_index.arr[n_f] = N - self.e_index = cintlist.CIntList(initial_len=N) - self.col1 = cfloatlist.CFloatList(initial_len=N) - self.col2 = cfloatlist.CFloatList(initial_len=N) - - # Re-read file, placing words into buckets - sys.stderr.write("Bucket sort...\n") - f.seek(0) - for line in f: - (fword, eword, score1, score2) = line.split() - f_id = self.get_f_id(fword) - e_id = self.get_e_id(eword) - index = self.f_index.arr[f_id] + fcount.arr[f_id] - fcount.arr[f_id] = fcount.arr[f_id] + 1 - self.e_index.arr[index] = int(e_id) - self.col1[index] = float(score1) - self.col2[index] = float(score2) - f.close() - - sys.stderr.write("Final sort...\n") - # Sort buckets by eword - for b from 0 <= b < n_f: - i = self.f_index.arr[b] - j = self.f_index.arr[b+1] - self.qsort(i,j, "") - - - cdef swap(self, int i, int j): - cdef int itmp - cdef float ftmp - - if i == j: - return - - itmp = self.e_index.arr[i] - self.e_index.arr[i] = self.e_index.arr[j] - self.e_index.arr[j] = itmp - - ftmp = self.col1.arr[i] - self.col1.arr[i] = self.col1.arr[j] - self.col1.arr[j] = ftmp - - ftmp = self.col2.arr[i] - self.col2.arr[i] = self.col2.arr[j] - self.col2.arr[j] = ftmp - - - cdef qsort(self, int i, int j, pad): - cdef int pval, p - - if i > j: - raise Exception("Sort error in CLex") - if i == j: #empty interval - return - if i == j-1: # singleton interval - return - - p = (i+j)/2 - pval = self.e_index.arr[p] - self.swap(i, p) - p = i - for k from i+1 <= k < j: - if pval >= self.e_index.arr[k]: - self.swap(p+1, k) - self.swap(p, p+1) - p = p + 1 - self.qsort(i,p, pad+" ") - self.qsort(p+1,j, pad+" ") - - - def write_enhanced(self, filename): - f = open(filename, "w") - for i in self.f_index: - f.write("%d " % i) - f.write("\n") - for i, s1, s2 in zip(self.e_index, self.col1, self.col2): - f.write("%d %f %f " % (i, s1, s2)) - f.write("\n") - for i, w in enumerate(self.id2fword): - f.write("%d %s " % (i, w)) - f.write("\n") - for i, w in enumerate(self.id2eword): - f.write("%d %s " % (i, w)) - f.write("\n") - f.close() - - - def get_score(self, fword, eword, col): - cdef e_id, f_id, low, high, midpoint, val - #print "get_score fword=",fword,", eword=",eword,", col=",col - - if eword not in self.eword2id: - return None - if fword not in self.fword2id: - return None - f_id = self.fword2id[fword] - e_id = self.eword2id[eword] - low = self.f_index.arr[f_id] - high = self.f_index.arr[f_id+1] - while high - low > 0: - midpoint = (low+high)/2 - val = self.e_index.arr[midpoint] - if val == e_id: - if col == 0: - return self.col1.arr[midpoint] - if col == 1: - return self.col2.arr[midpoint] - if val > e_id: - high = midpoint - if val < e_id: - low = midpoint + 1 - return None - - - def write_text(self, filename): - """Note: does not guarantee writing the dictionary in the original order""" - cdef i, N, e_id, f_id - - f = open(filename, "w") - N = len(self.e_index) - f_id = 0 - for i from 0 <= i < N: - while self.f_index.arr[f_id+1] == i: - f_id = f_id + 1 - e_id = self.e_index.arr[i] - score1 = self.col1.arr[i] - score2 = self.col2.arr[i] - f.write("%s %s %.6f %.6f\n" % (self.id2fword[f_id], self.id2eword[e_id], score1, score2)) - f.close() - - diff --git a/sa-extract/cmath.pxd b/sa-extract/cmath.pxd deleted file mode 100644 index 3aaaa2a3..00000000 --- a/sa-extract/cmath.pxd +++ /dev/null @@ -1,2 +0,0 @@ -cdef extern from "math.h": - double log(double) diff --git a/sa-extract/cn.py b/sa-extract/cn.py deleted file mode 100644 index 6e45bcf9..00000000 --- a/sa-extract/cn.py +++ /dev/null @@ -1,164 +0,0 @@ -# cn.py -# Chris Dyer <redpony@umd.edu> -# Copyright (c) 2006 University of Maryland. - -# vim:tabstop=4:autoindent:expandtab - -import sym -import log - -epsilon = sym.fromstring('*EPS*'); - -class CNStats(object): - def __init__(self): - self.read = 0 - self.colls = 0 - self.words = 0 - - def collect(self, cn): - self.read += 1 - self.colls += cn.get_length() - for col in cn.columns: - self.words += len(col) - - def __str__(self): - return "confusion net statistics:\n succ. read: %d\n columns: %d\n words: %d\n avg. words/column:\t%f\n avg. cols/sent:\t%f\n\n" % (self.read, self.colls, self.words, float(self.words)/float(self.colls), float(self.colls)/float(self.read)) - -class ConfusionNet(object): - def __init__(self, sent): - object.__init__(self) - if (len(sent.words) == 0): - self.columns = () - return # empty line, it happens - line = sent.words[0] - if (line.startswith("(((")): - if (len(sent.words) > 1): - log.write("Bad sentence: %s\n" % (line)) - assert(len(sent.words) == 1) # make sure there are no spaces in your confusion nets! - line = "((('<s>',1.0,1),),"+line[1:len(line)-1]+"(('</s>',1.0,1),))" - cols = eval(line) - res = [] - for col in cols: - x = [] - for alt in col: - costs = alt[1] - if (type(costs) != type((1,2))): - costs=(float(costs),) - j=[] - for c in costs: - j.append(float(c)) - cost = tuple(j) - spanlen = 1 - if (len(alt) == 3): - spanlen = alt[2] - x.append((sym.fromstring(alt[0],terminal=True), None, spanlen)) - res.append(tuple(x)) - self.columns = tuple(res) - else: # convert a string of input into a CN - res = []; - res.append(((sym.fromstring('<s>',terminal=True), None, 1), )) - for word in sent.words: - res.append(((sym.fromstring(word,terminal=True), None, 1), )); # (alt=word, cost=0.0) - res.append(((sym.fromstring('</s>',terminal=True), None, 1), )) - self.columns = tuple(res) - - def is_epsilon(self, position): - x = self.columns[position[0]][position[1]][0] - return x == epsilon - - def compute_epsilon_run_length(self, cn_path): - if (len(cn_path) == 0): - return 0 - x = len(cn_path) - 1 - res = 0 - ''' -1 denotes a non-terminal ''' - while (x >= 0 and cn_path[x][0] >= 0 and self.is_epsilon(cn_path[x])): - res += 1 - x -= 1 - return res - - def compute_cn_cost(self, cn_path): - c = None - for (col, row) in cn_path: - if (col >= 0): - if c is None: - c = self.columns[col][row][1].clone() - else: - c += self.columns[col][row][1] - return c - - def get_column(self, col): - return self.columns[col] - - def get_length(self): - return len(self.columns) - - def __str__(self): - r = "conf net: %d\n" % (len(self.columns),) - i = 0 - for col in self.columns: - r += "%d -- " % i - i += 1 - for alternative in col: - r += "(%s, %s, %s) " % (sym.tostring(alternative[0]), alternative[1], alternative[2]) - r += "\n" - return r - - def listdown(_columns, col = 0): - # output all the possible sentences out of the self lattice - # will be used by the "dumb" adaptation of lattice decoding with suffix array - result = [] - for entry in _columns[col]: - if col+entry[2]+1<=len(_columns) : - for suffix in self.listdown(_columns,col+entry[2]): - result.append(entry[0]+' '+suffix) - #result.append(entry[0]+' '+suffix) - else: - result.append(entry[0]) - #result.append(entry[0]) - return result - - def next(self,_columns,curr_idx, min_dist=1): - # can be used only when prev_id is defined - result = [] - #print "curr_idx=%i\n" % curr_idx - if curr_idx+min_dist >= len(_columns): - return result - for alt_idx in xrange(len(_columns[curr_idx])): - alt = _columns[curr_idx][alt_idx] - #print "checking %i alternative : " % alt_idx - #print "%s %f %i\n" % (alt[0],alt[1],alt[2]) - #print alt - if alt[2]<min_dist: - #print "recursive next(%i, %i, %i)\n" % (curr_idx,alt_idx,min_dist-alt[2]) - result.extend(self.next(_columns,curr_idx+alt[2],min_dist-alt[2])) - elif curr_idx+alt[2]<len(_columns): - #print "adding because the skip %i doesn't go beyong the length\n" % alt[2] - result.append(curr_idx+alt[2]) - return set(result) - - - -""" -import sys -import sgml -file = open(sys.argv[1], "rb") -sent = sgml.process_sgml_line(file.read()) -print sent -cn = ConfusionNet(sent) -print cn -results = cn.listdown() -for result in results: - print sym.tostring(result) -print cn.next(0); -print cn.next(1); -print cn.next(2); -print cn.next(3); -print cn -cn = ConfusionNet() -k = 0 -while (cn.read(file)): - print cn - -print cn.stats -""" diff --git a/sa-extract/compile_bin.py b/sa-extract/compile_bin.py deleted file mode 100755 index 0196e552..00000000 --- a/sa-extract/compile_bin.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python - -'''This program compiles/decompiles binary data objects used -by the decoder''' - -import sys -import cdat -import calignment -import csuf -import clex -import precomputation -#import parse -import monitor -import optparse - -def main(argv=None): - '''Call this from the command-line to create a - pre-computed binary data array for later use''' - if argv is None: - argv = sys.argv - - parser = optparse.OptionParser(usage="Usage: %prog [-s|-d|-a|-p] <input file> <output file>"+ - "\n\nNote: -d,-s,-a, and -p are mutually exclusive") - parser.add_option("-d", "--data-array", - action="store_true", default=False, - dest="da", help="Compile file into data array (default)") - parser.add_option("-s", "--suffix-array", - action="store_true", default=False, - dest="sa", help="Compile file into suffix array") - parser.add_option("-a", "--alignment", - action="store_true", default=False, - dest="a", help="Compile file into alignment") - parser.add_option("-l", "--lexical", - action="store_true", default=False, - dest="l", help="Compile file into lex file") - parser.add_option("-x", "--compute_lexical", action="store", nargs=2, - dest="lex_args", help="Compute lex file from data", - metavar="<f file> <e file>") - parser.add_option("-p", "--parse", - action="store_true", default=False, - dest="p", help="Compile file into parse") - parser.add_option("-b", "--binary-infile", - action="store_true", default=False, - dest="bin", help="Input file is binary (default: text)") - parser.add_option("-t", "--text-outfile", - action="store_true", default=False, - dest="text", help="Output file is text (default: binary)") - parser.add_option("-e", "--enhanced-outfile", - action="store_true", default=False, - dest="enhanced", help="Output file is enhanced text (default: binary)") - parser.add_option("-r", action="store", nargs=7, - dest="precomp_args", help="Precompute collocations (Hiero only)", - metavar="max-len=<INT> max-nt=<INT> max-size=<INT> min-gap=<INT> rank1=<INT> rank2=<INT> sa=<FILE>") - (options, args) = parser.parse_args() - - filetype_opts = [options.da, options.sa, options.a, options.p] - - if (len(filter(lambda x: x, filetype_opts))) > 1 or len(args) != 2: - parser.print_help() - sys.exit(1) - - (infilename, outfilename) = args - if options.bin: - bin = " binary" - else: - bin = "" - - start_time = monitor.cpu() - if options.precomp_args: - if options.bin: - obj = precomputation.Precomputation(infilename, from_binary=True) - else: - keys = set(["max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa"]) - precomp_opts = {} - sys.stderr.write("Precomputing statistics for list %s\n" % infilename) - for pair in options.precomp_args: - (key, val) = pair.split("=") - if key in keys: - keys.remove(key) - if key != "sa": - val = int(val) - precomp_opts[key] = val - else: - sys.stderr.write("Unknown keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2)\n" % key) - return 1 - sa = csuf.SuffixArray(precomp_opts["sa"], True) - obj = precomputation.Precomputation(infilename, sa, - precompute_rank=precomp_opts["rank1"], - precompute_secondary_rank=precomp_opts["rank2"], - max_length=precomp_opts["max-len"], - max_nonterminals=precomp_opts["max-nt"], - train_max_initial_size=precomp_opts["max-size"], - train_min_gap_size=precomp_opts["min-gap"]) - elif options.sa: - sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin)) - obj = csuf.SuffixArray(infilename, options.bin) - elif options.a: - sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, bin)) - obj = calignment.Alignment(infilename, options.bin) - elif options.p: - sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin)) - obj = parse.ParseArray(infilename, options.bin) - elif options.l: - sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin)) - obj = clex.CLex(infilename, options.bin) - elif options.lex_args: - ffile = options.lex_args[0] - efile = options.lex_args[1] - sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile)) - fsarray = csuf.SuffixArray(ffile, True) - earray = cdat.DataArray(efile, True) - aarray = calignment.Alignment(infilename, True) - obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray) - else: - sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin)) - obj = cdat.DataArray(infilename, options.bin) - - sys.stderr.write(" Total time for read: %f\n" % (monitor.cpu() - start_time)) - start_time = monitor.cpu() - if options.text: - sys.stderr.write("Writing text file %s...\n" % outfilename) - obj.write_text(outfilename) - elif options.enhanced: - sys.stderr.write("Writing enhanced text file %s...\n" % outfilename) - obj.write_enhanced(outfilename) - else: - sys.stderr.write("Writing binary file %s...\n" % outfilename) - obj.write_binary(outfilename) - sys.stderr.write("Finished.\n") - sys.stderr.write(" Total time for write: %f\n" % (monitor.cpu() - start_time)) - - mem_use = float(monitor.memory()) - metric = "B" - if mem_use / 1000 > 1: - mem_use /= 1000 - metric = "KB" - if mem_use / 1000 > 1: - mem_use /= 1000 - metric = "MB" - if mem_use / 1000 > 1: - mem_use /= 1000 - metric = "GB" - sys.stderr.write(" Memory usage: %.1f%s\n" % (mem_use, metric)) - - - -if __name__ == "__main__": - sys.exit(main(sys.argv)) diff --git a/sa-extract/context_model.py b/sa-extract/context_model.py deleted file mode 100644 index 8cb6c174..00000000 --- a/sa-extract/context_model.py +++ /dev/null @@ -1,234 +0,0 @@ -#!/usr/bin/env python -import sys -import model -import sym -import log -import math - -class ContextModel(model.Model): - '''A ContextModel is one that is computed using information - from the Context object''' - - def __init__(self, context_manager, default=0.0): - model.Model.__init__(self) - self.wordless = 0 - self.initial = None - self.default = default - self.context_manager = context_manager - self.id = self.context_manager.add_model(self) - - '''The next feature is true if the model depends in - some way on the entire input sentence; that is, if - it cannot be scored when created, but must be scored - no earlier than during the input method (note that - this is less strict than stateful)''' - self.contextual = True - ''' It may seem somewhat counterintuitive that a - ContextModel can be non-contextual, but a good - example is the rule probabilites; although these - are computed using the Context object, they don't - really depend in any way on context''' - - - '''inherited from model.Model, called once for each input sentence''' - def input(self, fwords, meta): - # all ContextModels must make this call - self.context_manager.input(self, fwords, meta) - - - '''This function will be called via the input method - only for contextual models''' - def compute_contextual_score(self, r): - return 0.0 - - '''This function is only called on rule creation for - contextless models''' - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - return 0.0 - - '''Stateless models should not need to - override this function, unless they define - something for model.TO_GOAL''' - def transition (self, r, antstates, i, j, j1=None): - return (None, 0.0) - - def estimate(self, r): - return r.getscore("context", self.id) - - def transition(self, r, antstates, i, j, j1=None): - return (None, r.getscore("context", self.id)) - - def finaltransition(self, state): - return 0.0 - - def rescore(self, ewords, score): - return score - - - -'''p(e|f)''' -class EgivenF(ContextModel): - - def __init__(self, context_manager, default=0.0): - ContextModel.__init__(self, context_manager) - self.contextual = False - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - prob = float(paircount)/float(fcount) - return -math.log10(prob) - -class CountEF(ContextModel): - - def __init__(self, context_manager, default=0.0): - ContextModel.__init__(self, context_manager) - self.contextual = False - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1.0 + float(paircount)) - -class SampleCountF(ContextModel): - - def __init__(self, context_manager, default=0.0): - ContextModel.__init__(self, context_manager) - self.contextual = False - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1.0 + float(fsample_count)) - - - -class EgivenFCoherent(ContextModel): - - def __init__(self, context_manager, default=0.0): - ContextModel.__init__(self, context_manager) - self.contextual = False - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - prob = float(paircount)/float(fsample_count) - #print "paircount=",paircount," , fsample_count=",fsample_count,", prob=",prob - if (prob == 0.0): return 99.0 - return -math.log10(prob) - - - -class CoherenceProb(ContextModel): - - def __init__(self, context_manager, default=0.0): - ContextModel.__init__(self, context_manager) - self.contextual = False - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - prob = float(fcount)/float(fsample_count) - return -math.log10(prob) - - - -class MaxLexEgivenF(ContextModel): - - def __init__(self, context_manager, ttable, col=0): - ContextModel.__init__(self, context_manager) - self.ttable = ttable - self.col = col - self.wordless = 0 - self.initial = None - self.contextual = False - - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - totalscore = 1.0 - fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase)) - fwords.append("NULL") - ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase)) - for e in ewords: - maxScore = 0.0 - for f in fwords: - score = self.ttable.get_score(f, e, self.col) - #print "score(MaxLexEgivenF) = ",score - if score > maxScore: - maxScore = score - totalscore *= maxScore - if totalscore == 0.0: - return 999 - else: - return -math.log10(totalscore) - - -class MaxLexFgivenE(ContextModel): - - def __init__(self, context_manager, ttable, col=1): - ContextModel.__init__(self, context_manager) - self.ttable = ttable - self.col = col - self.wordless = 0 - self.initial = None - self.contextual = False - - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - totalscore = 1.0 - fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase)) - ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase)) - ewords.append("NULL") - for f in fwords: - maxScore = 0.0 - for e in ewords: - score = self.ttable.get_score(f, e, self.col) - #print "score(MaxLexFgivenE) = ",score - if score > maxScore: - maxScore = score - totalscore *= maxScore - if totalscore == 0.0: - return 999 - else: - return -math.log10(totalscore) - - -class IsSingletonF(ContextModel): - - def __init__(self, context_manager, default=0.0): - ContextModel.__init__(self, context_manager) - self.contextual = False - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount==1) - - -class IsSingletonFE(ContextModel): - - def __init__(self, context_manager, default=0.0): - ContextModel.__init__(self, context_manager) - self.contextual = False - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount==1) - -class IsNotSingletonF(ContextModel): - - def __init__(self, context_manager, default=0.0): - ContextModel.__init__(self, context_manager) - self.contextual = False - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount>1) - - -class IsNotSingletonFE(ContextModel): - - def __init__(self, context_manager, default=0.0): - ContextModel.__init__(self, context_manager) - self.contextual = False - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount>1) - - -class IsFEGreaterThanZero(ContextModel): - - def __init__(self, context_manager, default=0.0): - ContextModel.__init__(self, context_manager) - self.contextual = False - - def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 0.01) - - diff --git a/sa-extract/cstrmap.pxd b/sa-extract/cstrmap.pxd deleted file mode 100644 index 53becbc6..00000000 --- a/sa-extract/cstrmap.pxd +++ /dev/null @@ -1,12 +0,0 @@ -cdef extern from "strmap.h": - ctypedef struct StrMap - StrMap* stringmap_new() - void stringmap_delete(StrMap *vocab) - int stringmap_index(StrMap *vocab, char *s) - char* stringmap_word(StrMap *vocab, int i) - -cdef class StringMap: - cdef StrMap *vocab - cdef char *word(self, int i) - cdef int index(self, char *s) - diff --git a/sa-extract/cstrmap.pyx b/sa-extract/cstrmap.pyx deleted file mode 100644 index d3883ea5..00000000 --- a/sa-extract/cstrmap.pyx +++ /dev/null @@ -1,14 +0,0 @@ - -cdef class StringMap: - def __cinit__(self): - self.vocab = stringmap_new() - - def __dealloc__(self): - stringmap_delete(self.vocab) - - cdef char *word(self, int i): - return stringmap_word(self.vocab, i) - - cdef int index(self, char *s): - return stringmap_index(self.vocab, s) - diff --git a/sa-extract/csuf.pxd b/sa-extract/csuf.pxd deleted file mode 100644 index f44167dd..00000000 --- a/sa-extract/csuf.pxd +++ /dev/null @@ -1,11 +0,0 @@ -cimport cdat -cimport cintlist - -cdef class SuffixArray: - cdef cdat.DataArray darray - cdef cintlist.CIntList sa - cdef cintlist.CIntList ha - cdef __lookup_helper(self, int word_id, int offset, int low, int high) - cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint) - cdef __search_low(self, int word_id, int offset, int low, int high) - cdef __search_high(self, word_id, offset, low, high) diff --git a/sa-extract/csuf.pyx b/sa-extract/csuf.pyx deleted file mode 100644 index 64c44788..00000000 --- a/sa-extract/csuf.pyx +++ /dev/null @@ -1,321 +0,0 @@ -# csuf.pyx -# Defines suffix arrays that can be directly written to/read from disk in binary format -# Adam Lopez <alopez@cs.umd.edu> - -import sys -import log -import cdat -import cintlist -import monitor - -from libc.stdio cimport FILE, fclose, fopen - -cdef class SuffixArray: - - def __init__(self, filename, from_binary=False): - self.darray = cdat.DataArray() - self.sa = cintlist.CIntList() - self.ha = cintlist.CIntList() - if from_binary: - self.read_binary(filename) - else: - self.read_text(filename) - - - def __getitem__(self, i): - return self.sa.arr[i] - - - def getSentId(self, i): - return self.darray.getSentId(i) - - - def getSent(self, i): - return self.darray.getSent(i) - - - def getSentPos(self, loc): - return self.darray.getSentPos(loc) - - def read_text(self, filename): - '''Constructs suffix array using the algorithm - of Larsson & Sadahkane (1999)''' - cdef int V, N, i, j, h, a_i, n, current_run, skip - cdef cintlist.CIntList isa, word_count - - self.darray = cdat.DataArray(filename, from_binary=False, use_sent_id=True) - N = len(self.darray) - V = len(self.darray.id2word) - - self.sa = cintlist.CIntList(initial_len=N) - self.ha = cintlist.CIntList(initial_len=V+1) - - isa = cintlist.CIntList(initial_len=N) - word_count = cintlist.CIntList(initial_len=V+1) - - '''Step 1: bucket sort data''' - sort_start_time = monitor.cpu() - start_time = sort_start_time - for i from 0 <= i < N: - a_i = self.darray.data.arr[i] - word_count.arr[a_i] = word_count.arr[a_i] + 1 - - n = 0 - for i from 0 <= i < V+1: - self.ha.arr[i] = n - n = n + word_count.arr[i] - word_count.arr[i] = 0 - - for i from 0 <= i < N: - a_i = self.darray.data.arr[i] - self.sa.arr[self.ha.arr[a_i] + word_count.arr[a_i]] = i - isa.arr[i] = self.ha.arr[a_i + 1] - 1 # bucket pointer is last index in bucket - word_count.arr[a_i] = word_count.arr[a_i] + 1 - - '''Determine size of initial runs''' - current_run = 0 - for i from 0 <= i < V+1: - if i < V and self.ha.arr[i+1] - self.ha.arr[i] == 1: - current_run = current_run + 1 - else: - if current_run > 0: - self.sa.arr[self.ha.arr[i] - current_run] = -current_run - current_run = 0 - - sys.stderr.write(" Bucket sort took %f seconds\n" % (monitor.cpu() - sort_start_time)) - - '''Step 2: prefix-doubling sort''' - h = 1 - while self.sa.arr[0] != -N: - sort_start_time = monitor.cpu() - sys.stderr.write(" Refining, sort depth = %d\n" % (h,)) - i = 0 - skip = 0 - while i < N: - if self.sa.arr[i] < 0: - #sys.stderr.write("Skip from %d to %d\n" % (i, i-self.sa.arr[i]-1)) - skip = skip + self.sa.arr[i] - i = i - self.sa.arr[i] - else: - if skip < 0: - self.sa.arr[i+skip] = skip - skip = 0 - j = isa.arr[self.sa.arr[i]] - #sys.stderr.write("Process from %d to %d (%d, %d, %d)\n" % (i, j, self.sa.arr[i], self.darray.data.arr[self.sa.arr[i]], isa.arr[self.sa.arr[i]])) - self.q3sort(i, j, h, isa) - i = j+1 - if skip < 0: - self.sa.arr[i+skip] = skip - h = h * 2 - sys.stderr.write(" Refinement took %f seconds\n" % (monitor.cpu() - sort_start_time)) - - '''Step 3: read off suffix array from inverse suffix array''' - sys.stderr.write(" Finalizing sort...\n") - for i from 0 <= i < N: - j = isa.arr[i] - self.sa.arr[j] = i - sys.stderr.write("Suffix array construction took %f seconds\n" % (monitor.cpu() - start_time)) - - def q3sort(self, int i, int j, int h, cintlist.CIntList isa, pad=""): - '''This is a ternary quicksort. It divides the array into - three partitions: items less than the pivot, items equal - to pivot, and items greater than pivot. The first and last - of these partitions are then recursively sorted''' - cdef int k, midpoint, pval, phead, ptail, tmp - - if j-i < -1: - raise Exception("Unexpected condition found in q3sort: sort from %d to %d" % (i,j)) - if j-i == -1: # recursive base case -- empty interval - return - if (j-i == 0): # recursive base case -- singleton interval - isa.arr[self.sa.arr[i]] = i - self.sa.arr[i] = -1 - return - - # NOTE: choosing the first item as a pivot value resulted in - # stack overflow for some very large buckets. I think there - # is a natural bias towards order due the way the word ids are - # assigned; thus this resulted in the range to the left of the - # pivot being nearly empty. Therefore, choose the middle item. - # If the method of assigning word_id's is changed, this method - # may need to be reconsidered as well. - midpoint = (i+j)/2 - pval = isa.arr[self.sa.arr[midpoint] + h] - if i != midpoint: - tmp = self.sa.arr[midpoint] - self.sa.arr[midpoint] = self.sa.arr[i] - self.sa.arr[i] = tmp - phead = i - ptail = i - - # find the three partitions. phead marks the first element - # of the middle partition, and ptail marks the last element - for k from i+1 <= k < j+1: - if isa.arr[self.sa.arr[k] + h] < pval: - if k > ptail+1: - tmp = self.sa.arr[phead] - self.sa.arr[phead] = self.sa.arr[k] - self.sa.arr[k] = self.sa.arr[ptail+1] - self.sa.arr[ptail+1] = tmp - else: # k == ptail+1 - tmp = self.sa.arr[phead] - self.sa.arr[phead] = self.sa.arr[k] - self.sa.arr[k] = tmp - phead = phead + 1 - ptail = ptail + 1 - else: - if isa.arr[self.sa.arr[k] + h] == pval: - if k > ptail+1: - tmp = self.sa.arr[ptail+1] - self.sa.arr[ptail+1] = self.sa.arr[k] - self.sa.arr[k] = tmp - ptail = ptail + 1 - - # recursively sort smaller suffixes - self.q3sort(i, phead-1, h, isa, pad+" ") - - # update suffixes with pivot value - # corresponds to update_group function in Larsson & Sadakane - for k from phead <= k < ptail+1: - isa.arr[self.sa.arr[k]] = ptail - if phead == ptail: - self.sa.arr[phead] = -1 - - # recursively sort larger suffixes - self.q3sort(ptail+1, j, h, isa, pad+" ") - - - def write_text(self, filename): - self.darray.write_text(filename) - - - def read_binary(self, filename): - cdef FILE *f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "r") - self.darray.read_handle(f) - self.sa.read_handle(f) - self.ha.read_handle(f) - fclose(f) - - - def write_binary(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "w") - self.darray.write_handle(f) - self.sa.write_handle(f) - self.ha.write_handle(f) - fclose(f) - - - def write_enhanced(self, filename): - f = open(filename, "w") - self.darray.write_enhanced_handle(f) - for a_i in self.sa: - f.write("%d " % a_i) - f.write("\n") - for w_i in self.ha: - f.write("%d " % w_i) - f.write("\n") - f.close() - - - cdef __search_high(self, word_id, offset, low, high): - cdef int midpoint - - if low >= high: - return high - midpoint = (high + low) / 2 - if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: - return self.__search_high(word_id, offset, midpoint+1, high) - else: - return self.__search_high(word_id, offset, low, midpoint) - - - cdef __search_low(self, int word_id, int offset, int low, int high): - cdef int midpoint - - if low >= high: - return high - midpoint = (high + low) / 2 - if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: - return self.__search_low(word_id, offset, low, midpoint) - else: - return self.__search_low(word_id, offset, midpoint+1, high) - - - cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint): - return (self.__search_low(word_id, offset, low, midpoint), - self.__search_high(word_id, offset, midpoint, high)) - - - cdef __lookup_helper(self, int word_id, int offset, int low, int high): - cdef int midpoint - - if offset == 0: - return (self.ha.arr[word_id], self.ha.arr[word_id+1]) - if low >= high: - return None - - midpoint = (high + low) / 2 - if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: - return self.__get_range(word_id, offset, low, high, midpoint) - if self.darray.data.arr[self.sa.arr[midpoint] + offset] > word_id: - return self.__lookup_helper(word_id, offset, low, midpoint) - else: - return self.__lookup_helper(word_id, offset, midpoint+1, high) - - - def lookup(self, word, offset, int low, int high): - if low == -1: - low = 0 - if high == -1: - high = len(self.sa) - if word in self.darray.word2id: - word_id = self.darray.word2id[word] - return self.__lookup_helper(word_id, offset, low, high) - else: - return None - - - - def print_sa(self, isa): - '''Slow; Use only in case of emergency''' - cdef int i, j, k, N - cdef cintlist.CIntList tmp_sa - - N = len(self.sa) - for i from 0 <= i < N: - sys.stderr.write("%2d " % i) - sys.stderr.write("\n") - for i from 0 <= i < N: - sys.stderr.write("%2d " % self.darray.data.arr[i]) - sys.stderr.write("\n") - for i from 0 <= i < N: - sys.stderr.write("%2d " % isa.arr[i]) - sys.stderr.write("\n\n\n") - - # Recover partially sorted array - tmp_sa = cintlist.CIntList(initial_len=N) - for i from 0 <= i < N: - j = isa.arr[i] - tmp_sa.arr[j] = i - for i from 0 <= i < N: - if self.sa.arr[i] > 0: - tmp_sa.arr[i] = self.sa.arr[i] - - for i from 0 <= i < N: - j = tmp_sa.arr[i] - sys.stderr.write("%2d %2d | " % (i, self.sa.arr[i])) - for k from j <= k < N: - sys.stderr.write("%2d " % self.darray.data.arr[k]) - sys.stderr.write("\n") - sys.stderr.write("\n") - - - - - diff --git a/sa-extract/cveb.pxd b/sa-extract/cveb.pxd deleted file mode 100644 index 8967f8e3..00000000 --- a/sa-extract/cveb.pxd +++ /dev/null @@ -1,15 +0,0 @@ -cdef struct _VEB: - int top_universe_size - int num_bottom_bits - int max_val - int min_val - int size - void* top - void** bottom - - -cdef class VEB: - cdef _VEB* veb - cdef int _findsucc(self, int i) - cdef int _insert(self, int i) - cdef int _first(self) diff --git a/sa-extract/cveb.pyx b/sa-extract/cveb.pyx deleted file mode 100644 index ca87becc..00000000 --- a/sa-extract/cveb.pyx +++ /dev/null @@ -1,390 +0,0 @@ -#!/usr/bin/env python2.4 -'''This module implements a partial stratified tree (van Emde Boas, 1977). -Only insert findsucc, __iter__, and __contains__ are implemented. -Delete is currently not supported. -There is very little error-checking in this code -- it is designed -to be used in the limited situation described in Lopez (EMNLP-CoNLL 2007), -which doesn't cover all of the possible ways that you could misuse it -(e.g. trying to insert a key larger than the universe size) -Other notes -- this code is really rather ugly C code masquerading as -Pyrex/Python. Virtual function calls are bypassed by hand in several -places for the sake of efficiency, and other Python niceties are -removed for the same reason.''' - -from libc.stdlib cimport malloc, free -from libc.math cimport log, ceil -from libc.string cimport memset - -cdef int MIN_BOTTOM_SIZE -cdef int MIN_BOTTOM_BITS - -MIN_BOTTOM_SIZE = 32 -MIN_BOTTOM_BITS = 5 - -cdef int lower_mask[32] -cdef int i, mask - -for i from 0 <= i < MIN_BOTTOM_SIZE: - mask = (mask << 1) + 1 - lower_mask[i] = mask - - -cdef struct _BitSet: - long bitset - int min_val - int max_val - int size - - -cdef _BitSet* new_BitSet(): - cdef _BitSet* b - - b = <_BitSet*> malloc(sizeof(_BitSet)) - b.bitset = 0 - b.min_val = -1 - b.max_val = -1 - b.size = 0 - return b - - -cdef int bitset_findsucc(_BitSet* b, int i): - cdef int bitset, mask - cdef int low, high, mid - - if b.max_val == -1 or i >= b.max_val: - return -1 - if i < b.min_val: - return b.min_val - - bitset = b.bitset & ~lower_mask[i] - low = i+1 - high = b.max_val+1 - while low < high-1: - mid = (high + low)/2 - mask = ~(lower_mask[high-1] ^ lower_mask[mid-1]) - if bitset & mask == 0: - low = mid - else: - bitset = bitset & mask - high = mid - return low - - -cdef int bitset_insert(_BitSet* b, int i): - cdef int val - - val = 1 << i - if b.bitset & val == 0: - b.bitset = b.bitset | val - if b.size == 0: - b.min_val = i - b.max_val = i - else: - if i < b.min_val: - b.min_val = i - if i > b.max_val: - b.max_val = i - b.size = b.size + 1 - return 1 - return 0 - - -cdef int bitset_contains(_BitSet* b, int i): - cdef int val - - val = 1 << i - if b.bitset & val == 0: - return 0 - else: - return 1 - - -cdef class BitSetIterator: - cdef _BitSet* b - cdef int next_val - - def __next__(self): - cdef int ret_val - - if self.next_val == -1: - raise StopIteration() - ret_val = self.next_val - self.next_val = bitset_findsucc(self.b, ret_val) - return ret_val - - - -# This is a Python wrapper class to give access to the -# (entirely C-implemented) _BitSet struct. -# Very slow; use only for debugging -cdef class BitSet: - - cdef _BitSet* b - - def __cinit__(self): - self.b = new_BitSet() - - def __dealloc__(self): - free(self.b) - - def __iter__(self): - cdef BitSetIterator it - it = BitSetIterator() - it.b = self.b - it.next_val = self.b.min_val - return it - - def insert(self, i): - return bitset_insert(self.b, i) - - def findsucc(self, i): - return bitset_findsucc(self.b, i) - - def __str__(self): - return dec2bin(self.b.bitset)+" ("+str(self.b.size)+","+str(self.b.min_val)+","+str(self.b.max_val)+")" - - def min(self): - return self.b.min_val - - def max(self): - return self.b.max_val - - def __len__(self): - return self.b.size - - def __contains__(self, i): - return bool(bitset_contains(self.b, i)) - - -def dec2bin(i): - cdef d - result = "" - for d from 0 <= d < MIN_BOTTOM_SIZE: - if i & lower_mask[0] == 0: - result = "0"+result - else: - result = "1"+result - i = i >> 1 - return result - - -cdef _VEB* new_VEB(int n): - cdef _VEB* veb - cdef int num_bits, num_top_bits, i - - veb = <_VEB*> malloc(sizeof(_VEB)) - - num_bits = int(ceil(log(n) / log(2))) - veb.num_bottom_bits = num_bits/2 - if veb.num_bottom_bits < MIN_BOTTOM_BITS: - veb.num_bottom_bits = MIN_BOTTOM_BITS - veb.top_universe_size = (n >> veb.num_bottom_bits) + 1 - - veb.bottom = <void**> malloc(veb.top_universe_size * sizeof(void*)) - memset(veb.bottom, 0, veb.top_universe_size * sizeof(void*)) - - if veb.top_universe_size > MIN_BOTTOM_SIZE: - veb.top = new_VEB(veb.top_universe_size) - else: - veb.top = new_BitSet() - - veb.max_val = -1 - veb.min_val = -1 - veb.size = 0 - return veb - - -cdef int VEB_insert(_VEB* veb, int i): - cdef _VEB* subv - cdef _BitSet* subb - cdef int a, b, tmp - - if veb.size == 0: - veb.min_val = i - veb.max_val = i - elif i == veb.min_val or i == veb.max_val: - return 0 - else: - if i < veb.min_val: - tmp = i - i = veb.min_val - veb.min_val = tmp - a = i >> veb.num_bottom_bits - b = i & lower_mask[veb.num_bottom_bits-1] - if veb.bottom[a] == NULL: - if veb.top_universe_size > MIN_BOTTOM_SIZE: - subv = <_VEB*> veb.top - VEB_insert(subv, a) - else: - subb = <_BitSet*> veb.top - bitset_insert(subb, a) - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - veb.bottom[a] = new_VEB(1 << veb.num_bottom_bits) - else: - veb.bottom[a] = new_BitSet() - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - subv = <_VEB*> veb.bottom[a] - if VEB_insert(subv, b) == 0: - return 0 - else: - subb = <_BitSet*> veb.bottom[a] - if bitset_insert(subb, b) == 0: - return 0 - - if i > veb.max_val: - veb.max_val = i - veb.size = veb.size + 1 - return 1 - - -cdef del_VEB(_VEB* veb): - cdef int i - - if veb.top_universe_size > MIN_BOTTOM_SIZE: - i = (<_VEB*> veb.top).min_val - else: - i = (<_BitSet*> veb.top).min_val - - while i != -1: - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - del_VEB(<_VEB*> veb.bottom[i]) - else: - free(<_BitSet*> veb.bottom[i]) - - if veb.top_universe_size > MIN_BOTTOM_SIZE: - i = VEB_findsucc(<_VEB*> veb.top, i) - else: - i = bitset_findsucc(<_BitSet*> veb.top, i) - - if veb.top_universe_size > MIN_BOTTOM_SIZE: - del_VEB(<_VEB*> veb.top) - else: - free(<_BitSet*> veb.top) - free(veb.bottom) - free(veb) - - -cdef int VEB_findsucc(_VEB* veb, int i): - cdef _VEB* subv - cdef _BitSet* subb - cdef int a, b, j, c, found - - if veb.max_val == -1 or i>=veb.max_val: - return -1 - if i < veb.min_val: - return veb.min_val - - a = i >> veb.num_bottom_bits - b = i & lower_mask[veb.num_bottom_bits-1] - found = 0 - if veb.bottom[a] != NULL: - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - subv = <_VEB*> veb.bottom[a] - if subv.max_val > b: - j = (a << veb.num_bottom_bits) + VEB_findsucc(subv, b) - found = 1 - else: - subb = <_BitSet*> veb.bottom[a] - if subb.max_val > b: - j = (a << veb.num_bottom_bits) + bitset_findsucc(subb, b) - found = 1 - if found==0: - if veb.top_universe_size > MIN_BOTTOM_SIZE: - subv = <_VEB*> veb.top - c = VEB_findsucc(subv, a) - else: - subb = <_BitSet*> veb.top - c = bitset_findsucc(subb, a) - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - subv = <_VEB*> veb.bottom[c] - j = (c << veb.num_bottom_bits) + subv.min_val - else: - subb = <_BitSet*> veb.bottom[c] - j = (c << veb.num_bottom_bits) + subb.min_val - return j - - -cdef int VEB_contains(_VEB* veb, int i): - cdef _VEB* subv - cdef _BitSet* subb - cdef int a, b - - if veb.size == 0 or i < veb.min_val or i > veb.max_val: - return 0 - - if veb.min_val == i: - return 1 - else: - if veb.size == 1: - return 0 - - a = i >> veb.num_bottom_bits - b = i & lower_mask[veb.num_bottom_bits-1] - if veb.bottom[a] == NULL: - return 0 - else: - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - subv = <_VEB*> veb.bottom[a] - return VEB_contains(subv, b) - else: - subb = <_BitSet*> veb.bottom[a] - return bitset_contains(subb, b) - - -cdef class VEBIterator: - cdef _VEB* v - cdef int next_val - - def __next__(self): - cdef int ret_val - - if self.next_val == -1: - raise StopIteration() - ret_val = self.next_val - self.next_val = VEB_findsucc(self.v, ret_val) - return ret_val - - -cdef class VEB: - - def __init__(self, size): - pass - - def __cinit__(self, int size): - self.veb = new_VEB(size) - - def __dealloc__(self): - del_VEB(self.veb) - - def __iter__(self): - cdef VEBIterator it - it = VEBIterator() - it.v = self.veb - it.next_val = self.veb.min_val - return it - - def insert(self, i): - return VEB_insert(self.veb, i) - - cdef int _insert(self, int i): - return VEB_insert(self.veb, i) - - def findsucc(self, i): - return VEB_findsucc(self.veb, i) - - cdef int _first(self): - return self.veb.min_val - - cdef int _findsucc(self, int i): - return VEB_findsucc(self.veb, i) - - def __len__(self): - return self.veb.size - - def __contains__(self, i): - return VEB_contains(self.veb, i) - - - - - diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl deleted file mode 100755 index 02fd7445..00000000 --- a/sa-extract/escape-testset.pl +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/perl -w - -use utf8; -use strict; - -binmode(STDIN,":utf8"); -binmode(STDOUT,":utf8"); - -my @fh = (); -if (scalar @ARGV == 0) { - push @fh, \*STDIN; -} else { - for my $file (@ARGV) { - my $f; - open $f, "<$file" or die "Can't read $file: $!\n"; - binmode $f, ":utf8"; - push @fh, $f; - } -} - -my $id = -1; -for my $f (@fh) { - while(<$f>) { - chomp; - die "Empty line in test set" if /^\s*$/; - die "Please remove <seg> tags from input:\n$_" if /^\s*<seg/i; - $id++; - s/&/\&/g; - s/</\</g; - s/>/\>/g; - print "<seg id=\"$id\"> $_ </seg>\n"; - } -} - - diff --git a/sa-extract/example/README b/sa-extract/example/README deleted file mode 100644 index f6eac52b..00000000 --- a/sa-extract/example/README +++ /dev/null @@ -1,8 +0,0 @@ -Commands to compile a corpus and extract some grammars -====================================================== - -# compile -../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini -# extract -cat test.de | ../escape-testset.pl | ../extractor.py -c extract.ini - diff --git a/sa-extract/example/corpus.align.gz b/sa-extract/example/corpus.align.gz Binary files differdeleted file mode 100644 index 741de7e4..00000000 --- a/sa-extract/example/corpus.align.gz +++ /dev/null diff --git a/sa-extract/example/corpus.de.gz b/sa-extract/example/corpus.de.gz Binary files differdeleted file mode 100644 index 0d66470a..00000000 --- a/sa-extract/example/corpus.de.gz +++ /dev/null diff --git a/sa-extract/example/corpus.en.gz b/sa-extract/example/corpus.en.gz Binary files differdeleted file mode 100644 index 28cb5c58..00000000 --- a/sa-extract/example/corpus.en.gz +++ /dev/null diff --git a/sa-extract/example/test.de b/sa-extract/example/test.de deleted file mode 100644 index 8923329f..00000000 --- a/sa-extract/example/test.de +++ /dev/null @@ -1,10 +0,0 @@ -dies ist der richtige ansatz für diejenigen in chinas politischer führung , die aus der who den maximalen nutzen für die unterstützung der inneren reform ziehen wollen . -taiwan hat sich auch vorgenommen , konstruktiv zu sein - wenn china mitspielt . -die stadt staaten hongkong und singapur verfolgen eine klarsichtige who - politik und konzentrieren sich auf markt zugänge und starke regeln . -malaysia und thailand sind auch recht aktiv innerhalb der who , mit verschiedenen positionen , die vom frei handel bis zum protektionismus reichen . -indonesien und die philippinen sind schwächer , überwältigt von politischer zusammen hanglosigkeit und ganz in anspruch genommen von den anstrengungen , das schlimmste zu hause zu verhüten , so dass nur geringe kräfte übrig bleiben , mit der stets anschwellenden und immer komplizierteren agenda der who fertig zu werden . -die who steht vor einer wichtigen entscheidung . -sie muss dringend den handel progressiv liberalisieren . -eine starke führung seitens der usa ist erforderlich , damit die who in diese richtung gebracht werden kann und man gleichzeitig vermeidet , die zukunft nach dem muster der eu zu gestalten ( regel wucherung ) oder nach dem muster der uno ( macht lose gespräch runde ) . -dies geschieht sicher besser unter bush , mit einem klaren bekenntnis zum offenen markt und einer aktiveren außen politik , als es unter irgendeiner demokratischen alternative geschehen könnte . -robert zoellick , präsident bushs handel beauftragter , braucht aber verbündete . diff --git a/sa-extract/example/test.ref.en b/sa-extract/example/test.ref.en deleted file mode 100644 index e50edcac..00000000 --- a/sa-extract/example/test.ref.en +++ /dev/null @@ -1,10 +0,0 @@ -this is the right approach for those in china 's leadership who wish to extract maximum benefits from the wto to bolster domestic reform . -taiwan is also set to play a constructive role -- if mainland china plays along . -the city states , hong kong and singapore , have clear - sighted wto policies , focusing on market access and strong rules . -malaysia and thailand are also fairly active in the wto , with a mix of free - market and protectionist positions . -indonesia and the philippines are weaker , overwhelmed by policy incoherence and fire - fighting at home , and with insufficient capacity to deal with the wto 's burgeoning and increasingly complicated agenda . -the wto is at a crossroads . -it sorely needs to liberalize trade progressively . -strong us leadership is required to push the wto in this direction while avoiding an eu - style future ( regulatory overload ) or a un - style future ( an irrelevant talking shop ) . -this is more likely under a bush administration with better open - market credentials and a more assertive foreign policy than any democratic alternative . -however , robert zoellick , president bush 's trade representative , needs allies . diff --git a/sa-extract/extract.ini b/sa-extract/extract.ini deleted file mode 100644 index 56913245..00000000 --- a/sa-extract/extract.ini +++ /dev/null @@ -1,116 +0,0 @@ -# This .ini file extracts grammars to a file using -# the pattern matching infrastructure. -# -# Does not do any decoding. -# -# Variables can be set using sa-system.pl -# -# Usage: decoder.py -c <this ini file> [-x <grammar file>] -# -# If the -x option is used, grammar will be written to the -# specified file, otherwise it is written to $PWD/grammar.out -# -# NOTE: all information about rules is cached, so use generous -# memory limits (rules themselves are not cached.) - -import os -import manager -import clex -import context_model -import rulefactory -import calignment -import sys - -out_grammar_file = "grammar.out" -if opts.extra: - out_grammar_file = opts.extra - -# *** these variables written by sa-system.pl. Do not modify *** -lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz" -f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin" -e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin" -a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin" -lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin" -max_len = 5 -max_nt = 2 -max_size=10 -min_gap=1 -rank1 = 100 -rank2 = 10 -precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin" - -# check for path errors -if not os.path.exists(f_sa_file): - raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file) -if not os.path.exists(e_file): - raise Exception("Cannot find compiled target language array file %s" % e_file) -if not os.path.exists(a_file): - raise Exception("Cannot find compiled alignment file %s" % a_file) -if not os.path.exists(lex_file): - raise Exception("Cannot find compiled lexical weights file %s" % lex_file) -if not os.path.exists(precompute_file): - log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file) - precompute_file = None - -### Output options -mark_phrases = False # show derivation as SGML markup in output -mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!) - -# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose -log.level = 1 -log.file = sys.stderr - -# pattern-matching stuff -class PhonyGrammar: # saves us the cost of keeping the rules around - def add(self, thing): - pass - -local_grammar = PhonyGrammar() -xcat="X" - -cm = manager.ContextManager( - f_sa_file, - e_file, - sampler=rulefactory.Sampler(300), # lower=faster, higher=better; improvements level off above 200-300 range, -1 = don't sample, use all data (VERY SLOW!) - rulefactory=rulefactory.HieroCachingRuleFactory( - alignment=calignment.Alignment( # compiled alignment object (REQUIRED) - a_file, - from_binary=True - ), - category="["+xcat+"]", # name of generic nonterminal used by Hiero - grammar=local_grammar, # do not change for extraction - max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 - max_initial_size=15, # maximum span of a grammar rule in TEST DATA - max_length=max_len, # maximum number of symbols (both T and NT) allowed in a rule - max_nonterminals=max_nt, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) - max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1 - max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size - min_gap_size=1, # minimum span of a nonterminal in the RHS of a rule in TEST DATA - precompute_file=precompute_file, # filename of file containing precomputed collocations - precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). - precompute_rank=rank1, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) - require_aligned_terminal=True, # require extracted rules to have at least one aligned word - require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word - per_sentence_grammar=True, # generate a complete grammar for each input sentence - rule_file=out_grammar_file, # grammar is written to this file (sentence id is added to file name for per sentence grammars) - train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA - train_min_gap_size=min_gap, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA - tight_phrases=True, # True if phrases should be tight, False otherwise (False seems to give better results but is slower) - ), - from_binary=True - ) - -# lexical weighting tables -tt = clex.CLex(lex_file, from_binary=True) - -# Only include features that depend on rule identity here -add_model(context_model.EgivenFCoherent(cm), 0.125) -add_model(context_model.SampleCountF(cm), 0.125) -add_model(context_model.CountEF(cm), 0.125) -add_model(context_model.MaxLexFgivenE(cm, tt), 0.125) -add_model(context_model.MaxLexEgivenF(cm, tt), 0.125) -add_model(context_model.IsSingletonF(cm), 0.125) -add_model(context_model.IsSingletonFE(cm), 0.125) - -# grammars, search parameters and all that other stuff are irrelevant - diff --git a/sa-extract/extractor.py b/sa-extract/extractor.py deleted file mode 100755 index 9d66ebf0..00000000 --- a/sa-extract/extractor.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python - -# vim:expandtab:shiftwidth=4 - -import sys, gc, monitor, sgml -import optparse -import model -import log -import cn - -models = [] - -def add_model(m,w=0.0): - models.append(m) - -def extract_grammar(input): - confnet = cn.ConfusionNet(input) - meta = input.meta - for m in models: - m.input(confnet.columns, meta) - -if __name__ == "__main__": - optparser = optparse.OptionParser() - optparser.add_option("-c", "--config", dest="config", help="configuration module") - optparser.add_option("-x", "--extra", dest="extra", help="output grammar name override") - (opts,args) = optparser.parse_args() - - if opts.config is None: - raise ValueError, "You must specify a configuration file." - else: - if log.level >= 1: - log.write("Reading configuration from %s\n" % opts.config) - execfile(opts.config) - - if len(args) >= 1 and args[0] != "-": - input_file = file(args[0], "r") - else: - input_file = sys.stdin - - if len(args) >= 2 and args[1] != "-": - output_file = file(args[1], "w") - else: - output_file = sys.stdout - - gc.collect() - if log.level >= 1: - log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu())) - log.write("models: %s\n" % (" ".join(str(x.name) for x in models))) - - sents = sgml.read_raw(input_file) - for sent in sents: - mark = sent.getmark() - if mark is not None: - (tag, attrs) = mark - if tag == "seg": - sent.unmark() - dattrs = sgml.attrs_to_dict(attrs) - sent.meta = attrs - extract_grammar(sent) - diff --git a/sa-extract/lcp.pyx b/sa-extract/lcp.pyx deleted file mode 100644 index a992d3ee..00000000 --- a/sa-extract/lcp.pyx +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python2.4 - -'''Compute LCP array for a suffix array using the Kasai et al. algorithm''' -'''Can also be used to compute statistics such -as k most frequent n-grams''' - -import sys - -cimport cintlist -cimport csuf -cimport cdat -cimport cveb - -cdef class LCP: - - cdef csuf.SuffixArray sa - cdef cintlist.CIntList lcp - - def __init__(self, sa): - self._construct(sa) - - cdef _construct(self, csuf.SuffixArray sa): - cdef int i, k, j, h, n - cdef cintlist.CIntList rank - - sys.stderr.write("Constructing LCP array\n") - self.sa = sa - n = self.sa.sa.len - self.lcp = cintlist.CIntList(initial_len=n) - - rank = cintlist.CIntList(initial_len=n) - for i from 0 <= i < n: - rank.arr[sa.sa.arr[i]] = i - - h = 0 - for i from 0 <= i < n: - k = rank.arr[i] - if k == 0: - self.lcp.arr[k] = -1 - else: - j = sa.sa.arr[k-1] - while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]: - h = h+1 - self.lcp.arr[k] = h - if h > 0: - h = h-1 - sys.stderr.write("LCP array completed\n") - - - def compute_stats(self, max_n): - self._compute_stats(max_n) - - cdef _compute_stats(self, int max_n): - '''Note: the output of this function is not exact. In - particular, the frequency associated with each word is - not guaranteed to be correct. This is due to a bit of - laxness in the design; the function is intended only to - obtain a list of the most frequent words; for this - purpose it is perfectly fine''' - cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid - cdef cintlist.CIntList run_start - cdef cintlist.CIntList ngram_start - cdef cveb.VEB veb - - N = self.sa.sa.len - - ngram_starts = [] - for n from 0 <= n < max_n: - ngram_starts.append(cintlist.CIntList(initial_len=N)) - - run_start = cintlist.CIntList(initial_len=max_n) - veb = cveb.VEB(N) - - for i from 0 <= i < N: - h = self.lcp.arr[i] - if h < 0: - h = 0 - for n from h <= n < max_n: - rs = run_start.arr[n] - run_start.arr[n] = i - freq = i - rs - if freq > 1000: # arbitrary, but see note below - veb._insert(freq) - ngram_start = ngram_starts[n] - while ngram_start.arr[freq] > 0: - freq = freq + 1 # cheating a bit, should be ok for sparse histogram - ngram_start.arr[freq] = rs - i = veb.veb.min_val - while i != -1: - ii = veb._findsucc(i) - for n from 0 <= n < max_n: - ngram_start = ngram_starts[n] - iii = i - rs = ngram_start.arr[iii] - while (ii==-1 or iii < ii) and rs != 0: - j = self.sa.sa.arr[rs] - valid = 1 - for k from 0 <= k < n+1: - if self.sa.darray.data.arr[j+k] < 2: - valid = 0 - if valid: - ngram = "" - for k from 0 <= k < n+1: - ngram= ngram+ self.sa.darray.id2word[self.sa.darray.data.arr[j+k]] + " " - print i, n+1, ngram - iii = iii + 1 - rs = ngram_start.arr[iii] - i = ii - - - - - diff --git a/sa-extract/lcp_ops.py b/sa-extract/lcp_ops.py deleted file mode 100755 index 9df6e82a..00000000 --- a/sa-extract/lcp_ops.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python - -import lcp -import sys -import optparse -import csuf - -'''Run various computations using the LCP array''' -def main(): - - optparser = optparse.OptionParser() -# optparser.add_option("-c", "--config", dest="config", help="configuration module") - optparser.add_option("-s", "--sa-check", dest="sa_check", default=False, action="store_true") - optparser.add_option("-l", "--lcp-check", dest="lcp_check", default=False, action="store_true") - optparser.add_option("-t", "--stats", dest="stats", default=0, type="int", action="store") - optparser.add_option("-u", "--unigram", dest="uni_check", default=False, action="store_true") - optparser.add_option("-r", "--report-long-lcps", dest="long_lcp", type="int", default=0, action="store") - (opts,args) = optparser.parse_args() - - if len(args) < 1: - print >>sys.stderr, "Usage: lcp.py [opts] <sa file>" - sys.exit(1) - - safile = args[0] - sa = csuf.SuffixArray(safile, from_binary=True) - -# if opts.sa_check: -# check_sufarray(sa) - - l = lcp.LCP(sa) - - if opts.lcp_check: - print >>sys.stderr, "Checking LCP Array..." - l.check() - print >>sys.stderr, "Check finished" - - if opts.stats > 0: - l.compute_stats(opts.stats) - -# if opts.uni_check: -# if lcp is None: -# lcp = LCP(sa) -# unigram_stats(sa, lcp) -# -# if opts.long_lcp: -# if lcp is None: -# lcp = LCP(sa, opts.long_lcp) - -if __name__ == "__main__": - sys.exit(main()) - - diff --git a/sa-extract/log.py b/sa-extract/log.py deleted file mode 100644 index d4f96cb4..00000000 --- a/sa-extract/log.py +++ /dev/null @@ -1,18 +0,0 @@ -import sys - -level = 1 -file = sys.stderr - -def writeln(s="", l=0): - if level >= l: - file.write("%s\n" % s) - file.flush() - -def write(s, l=0): - if level >= l: - file.write(s) - file.flush() - - - - diff --git a/sa-extract/manager.py b/sa-extract/manager.py deleted file mode 100644 index 3a079c2a..00000000 --- a/sa-extract/manager.py +++ /dev/null @@ -1,100 +0,0 @@ -import csuf -import cdat -import cintlist - -class Sampler(object): - '''A Sampler implements a logic for choosing - samples from a population range''' - - def __init__(self): - pass - - def registerContext(self, context_manager): - self.context_manager = context_manager - - def sample(self, phrase_location): - return cintlist.CIntList() - - -class Extractor(object): - '''Extractor is responsible for extracting rules - from a given context; once a sentence id/location - is found for the source fwords, extractor is - responsible for producing any matching rule(s). - Optionally, extractor may return an empty list''' - - def __init__(self): - pass - - def registerContext(self, context_manager): - self.context_manager = context_manager - - def extract(self, fwords, loc): - return [] - - - -class RuleFactory(object): - '''RuleFactory is a class that manages the - generation of translation rules, using the Context - and (optionally) any of its contained classes or - data. The RuleFactory is responsible for handling - any caching (i.e. when presented with an input - sentence, it may lookup a rule from its cache - rather than extracting a new rule)''' - - def __init__(self): - self.num_lookups = 0 - self.num_extractions = 0 - self.num_rules = 0 - self.time = 0.0 - - - def registerContext(self, context_manager): - self.context_manager = context_manager - - - def input(self, fwords): - '''Manages the process of enumerating - rules for a given input sentence, and - looking them with calls to Context, - Sampler, and Extractor''' - return [] - - -class ContextManager(object): - - def __init__(self, ffile, efile, extractor=None, sampler=None, rulefactory=None, from_binary=False): - # NOTE: Extractor does not have a default value because - # the only nontrivial extractor right now depends on an - # alignment file - - self.fsarray = csuf.SuffixArray(ffile, from_binary) - self.edarray = cdat.DataArray(efile, from_binary) - - self.factory = rulefactory - self.factory.registerContext(self) - - self.sampler = sampler - self.sampler.registerContext(self) - - self.models = [] - self.owner = None - - - def add_model(self, model): - if self.owner is None: - self.owner = model - model_id = len(self.models) - self.models.append(model) - return model_id - - - def input(self, model, fwords, meta): - if model != self.owner: - return - self.fwords = fwords - self.factory.input(self.fwords, meta) - - - diff --git a/sa-extract/model.py b/sa-extract/model.py deleted file mode 100644 index bcdf129a..00000000 --- a/sa-extract/model.py +++ /dev/null @@ -1,7 +0,0 @@ -class Model(object): - def __init__(self, name=None): - object.__init__(self) - if name is None: - self.name = self.__class__.__name__ - else: - self.name = name diff --git a/sa-extract/monitor.py b/sa-extract/monitor.py deleted file mode 100644 index eb0bed57..00000000 --- a/sa-extract/monitor.py +++ /dev/null @@ -1,48 +0,0 @@ -import os, resource - -def cpu(): - return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+ - resource.getrusage(resource.RUSAGE_SELF).ru_stime) - -# from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286222 - - -_proc_status = '/proc/%d/status' % os.getpid() - -_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0, - 'KB': 1024.0, 'MB': 1024.0*1024.0} - -def _VmB(VmKey): - '''Private. - ''' - global _proc_status, _scale - # get pseudo file /proc/<pid>/status - try: - t = open(_proc_status) - v = t.read() - t.close() - except: - return 0.0 # non-Linux? - # get VmKey line e.g. 'VmRSS: 9999 kB\n ...' - i = v.index(VmKey) - v = v[i:].split(None, 3) # whitespace - if len(v) < 3: - return 0.0 # invalid format? - # convert Vm value to bytes - return float(v[1]) * _scale[v[2]] - -def memory(since=0.0): - '''Return memory usage in bytes. - ''' - return _VmB('VmSize:') - since - -def resident(since=0.0): - '''Return resident memory usage in bytes. - ''' - return _VmB('VmRSS:') - since - - -def stacksize(since=0.0): - '''Return stack size in bytes. - ''' - return _VmB('VmStk:') - since diff --git a/sa-extract/precomputation.pxd b/sa-extract/precomputation.pxd deleted file mode 100644 index c75d5eef..00000000 --- a/sa-extract/precomputation.pxd +++ /dev/null @@ -1,13 +0,0 @@ -from libc.stdio cimport FILE - -cdef class Precomputation: - cdef int precompute_rank - cdef int precompute_secondary_rank - cdef int max_length - cdef int max_nonterminals - cdef int train_max_initial_size - cdef int train_min_gap_size - cdef precomputed_index - cdef precomputed_collocations - cdef read_map(self, FILE* f) - cdef write_map(self, m, FILE* f) diff --git a/sa-extract/precomputation.pyx b/sa-extract/precomputation.pyx deleted file mode 100644 index ce4c21aa..00000000 --- a/sa-extract/precomputation.pyx +++ /dev/null @@ -1,478 +0,0 @@ -# precomputes a set of collocations by advancing over the text. -# warning: nasty C code - -import log -import monitor - -cimport csuf -cimport cdat -cimport cintlist - -from libc.stdio cimport FILE, fopen, fread, fwrite, fclose -from libc.stdlib cimport malloc, realloc, free -from libc.string cimport memset, memcpy - -cdef struct _Trie_Node # forward decl - -cdef struct _Trie_Edge: - int val - _Trie_Node* node - _Trie_Edge* bigger - _Trie_Edge* smaller - -cdef struct _Trie_Node: - _Trie_Edge* root - int* arr - int arr_len - -cdef _Trie_Node* new_trie_node(): - cdef _Trie_Node* node - node = <_Trie_Node*> malloc(sizeof(_Trie_Node)) - node.root = NULL - node.arr_len = 0 - node.arr = <int*> malloc(sizeof(0*sizeof(int))) - return node - -cdef _Trie_Edge* new_trie_edge(int val): - cdef _Trie_Edge* edge - edge = <_Trie_Edge*> malloc(sizeof(_Trie_Edge)) - edge.node = new_trie_node() - edge.bigger = NULL - edge.smaller = NULL - edge.val = val - return edge - -cdef free_trie_node(_Trie_Node* node): - if node != NULL: - free_trie_edge(node.root) - free(node.arr) - -cdef free_trie_edge(_Trie_Edge* edge): - if edge != NULL: - free_trie_node(edge.node) - free_trie_edge(edge.bigger) - free_trie_edge(edge.smaller) - -cdef _Trie_Node* trie_find(_Trie_Node* node, int val): - cdef _Trie_Edge* cur - cur = node.root - while cur != NULL and cur.val != val: - if val > cur.val: - cur = cur.bigger - elif val < cur.val: - cur = cur.smaller - if cur == NULL: - return NULL - else: - return cur.node - -cdef trie_node_data_append(_Trie_Node* node, int val): - cdef int new_len - new_len = node.arr_len + 1 - node.arr = <int*> realloc(node.arr, new_len*sizeof(int)) - node.arr[node.arr_len] = val - node.arr_len = new_len - -cdef trie_node_data_extend(_Trie_Node* node, int* vals, int num_vals): - cdef int new_len - new_len = node.arr_len + num_vals - node.arr = <int*> realloc(node.arr, new_len*sizeof(int)) - memcpy(node.arr + node.arr_len, vals, num_vals*sizeof(int)) - node.arr_len = new_len - - -cdef _Trie_Node* trie_insert(_Trie_Node* node, int val): - cdef _Trie_Edge** cur - cur = &node.root - while cur[0] != NULL and cur[0].val != val: - if val > cur[0].val: - cur = &cur[0].bigger - elif val < cur[0].val: - cur = &cur[0].smaller - if cur[0] == NULL: - cur[0] = new_trie_edge(val) - return cur[0].node - -cdef trie_node_to_map(_Trie_Node* node, result, prefix, int include_zeros): - cdef cintlist.CIntList arr - - if include_zeros or node.arr_len > 0: - arr = cintlist.CIntList() - free(arr.arr) - arr.arr = <int*> malloc(node.arr_len * sizeof(int)) - memcpy(arr.arr, node.arr, node.arr_len * sizeof(int)) - arr.len = node.arr_len - arr.size = node.arr_len - result[prefix] = arr - trie_edge_to_map(node.root, result, prefix, include_zeros) - -cdef trie_edge_to_map(_Trie_Edge* edge, result, prefix, int include_zeros): - if edge != NULL: - trie_edge_to_map(edge.smaller, result, prefix, include_zeros) - trie_edge_to_map(edge.bigger, result, prefix, include_zeros) - prefix = prefix + (edge.val,) - trie_node_to_map(edge.node, result, prefix, include_zeros) - -cdef class TrieMap: - - cdef _Trie_Node** root - cdef int V - - def __init__(self, alphabet_size): - self.V = alphabet_size - self.root = <_Trie_Node**> malloc(self.V * sizeof(_Trie_Node*)) - memset(self.root, 0, self.V * sizeof(_Trie_Node*)) - - - def __dealloc__(self): - cdef int i - for i from 0 <= i < self.V: - if self.root[i] != NULL: - free_trie_node(self.root[i]) - free(self.root) - - - def insert(self, pattern): - cdef int* p - cdef int i, l - l = len(pattern) - p = <int*> malloc(l*sizeof(int)) - for i from 0 <= i < l: - p[i] = pattern[i] - self._insert(p,l) - free(p) - - - cdef _Trie_Node* _insert(self, int* pattern, int pattern_len): - cdef int i - cdef _Trie_Node* node - if self.root[pattern[0]] == NULL: - self.root[pattern[0]] = new_trie_node() - node = self.root[pattern[0]] - for i from 1 <= i < pattern_len: - node = trie_insert(node, pattern[i]) - return node - - def contains(self, pattern): - cdef int* p - cdef int i, l - cdef _Trie_Node* node - l = len(pattern) - p = <int*> malloc(l*sizeof(int)) - for i from 0 <= i < l: - p[i] = pattern[i] - node = self._contains(p,l) - free(p) - if node == NULL: - return False - else: - return True - - cdef _Trie_Node* _contains(self, int* pattern, int pattern_len): - cdef int i - cdef _Trie_Node* node - node = self.root[pattern[0]] - i = 1 - while node != NULL and i < pattern_len: - node = trie_find(node, pattern[i]) - i = i+1 - return node - - def toMap(self, flag): - cdef int i, include_zeros - - if flag: - include_zeros=1 - else: - include_zeros=0 - result = {} - for i from 0 <= i < self.V: - if self.root[i] != NULL: - trie_node_to_map(self.root[i], result, (i,), include_zeros) - return result - - -cdef class Precomputation: - -# Defined in .pxd file, here for reference: -# cdef int precompute_rank -# cdef int precompute_secondary_rank -# cdef int max_length -# cdef int max_nonterminals -# cdef int train_max_initial_size -# cdef int train_min_gap_size -# cdef precomputed_index -# cdef precomputed_collocations - - def __init__(self, filename, sa=None, precompute_rank=1000, precompute_secondary_rank=20, max_length=5, - max_nonterminals=2, train_max_initial_size=10, train_min_gap_size=2, from_binary=False): - self.precompute_rank = precompute_rank - self.precompute_secondary_rank = precompute_secondary_rank - self.max_length = max_length - self.max_nonterminals = max_nonterminals - self.train_max_initial_size = train_max_initial_size - self.train_min_gap_size = train_min_gap_size - if from_binary: - self.read_binary(filename) - else: - self.precompute(filename, sa) - - - def read_binary(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - f = fopen(cfilename, "r") - fread(&(self.precompute_rank), sizeof(int), 1, f) - fread(&(self.precompute_secondary_rank), sizeof(int), 1, f) - fread(&(self.max_length), sizeof(int), 1, f) - fread(&(self.max_nonterminals), sizeof(int), 1, f) - fread(&(self.train_max_initial_size), sizeof(int), 1, f) - fread(&(self.train_min_gap_size), sizeof(int), 1, f) - self.precomputed_index = self.read_map(f) - self.precomputed_collocations = self.read_map(f) - fclose(f) - - - def write_binary(self, filename): - cdef FILE* f - cdef bytes bfilename = filename - cdef char* cfilename = bfilename - - f = fopen(cfilename, "w") - fwrite(&(self.precompute_rank), sizeof(int), 1, f) - fwrite(&(self.precompute_secondary_rank), sizeof(int), 1, f) - fwrite(&(self.max_length), sizeof(int), 1, f) - fwrite(&(self.max_nonterminals), sizeof(int), 1, f) - fwrite(&(self.train_max_initial_size), sizeof(int), 1, f) - fwrite(&(self.train_min_gap_size), sizeof(int), 1, f) - self.write_map(self.precomputed_index, f) - self.write_map(self.precomputed_collocations, f) - fclose(f) - - - cdef write_map(self, m, FILE* f): - cdef int i, N - cdef cintlist.CIntList arr - - N = len(m) - fwrite(&(N), sizeof(int), 1, f) - for pattern, val in m.iteritems(): - N = len(pattern) - fwrite(&(N), sizeof(int), 1, f) - for word_id in pattern: - i = word_id - fwrite(&(i), sizeof(int), 1, f) - arr = val - arr.write_handle(f) - - - cdef read_map(self, FILE* f): - cdef int i, j, k, word_id, N - cdef cintlist.CIntList arr - - m = {} - fread(&(N), sizeof(int), 1, f) - for j from 0 <= j < N: - fread(&(i), sizeof(int), 1, f) - key = () - for k from 0 <= k < i: - fread(&(word_id), sizeof(int), 1, f) - key = key + (word_id,) - arr = cintlist.CIntList() - arr.read_handle(f) - m[key] = arr - return m - - - def precompute(self, filename, sa): - cdef int i, l, N, max_pattern_len, i1, l1, i2, l2, i3, l3, ptr1, ptr2, ptr3, is_super, sent_count, max_rank - cdef csuf.SuffixArray sarray - cdef cdat.DataArray darray - cdef cintlist.CIntList data, queue, cost_by_rank, count_by_rank - cdef TrieMap frequent_patterns, super_frequent_patterns, collocations - cdef _Trie_Node* node - - sarray = sa - darray = sarray.darray - data = darray.data - - frequent_patterns = TrieMap(len(darray.id2word)) - super_frequent_patterns = TrieMap(len(darray.id2word)) - collocations = TrieMap(len(darray.id2word)) - - I_set = set() - J_set = set() - J2_set = set() - IJ_set = set() - pattern_rank = {} - - log.writeln("Precomputing frequent intersections\n", 1) - start_time = monitor.cpu() - - max_pattern_len = 0 - if filename is not None: - precompute_file = open(filename) - for rank, line in enumerate(precompute_file): - if rank >= self.precompute_rank: - break - phrase_words = line.split()[2:] - phrase = () - for word in phrase_words: - phrase = phrase + (darray.word2id[word],) - max_pattern_len = max(max_pattern_len, len(phrase)) - frequent_patterns.insert(phrase) - I_set.add(phrase) - pattern_rank[phrase] = rank - if rank < self.precompute_secondary_rank: - super_frequent_patterns.insert(phrase) - J_set.add(phrase) - precompute_file.close() - - queue = cintlist.CIntList(increment=1000) - - log.writeln(" Computing inverted indexes...", 1) - N = len(data) - for i from 0 <= i < N: - sa_word_id = data.arr[i] - if sa_word_id == 1: - queue._append(-1) - else: - for l from 1 <= l <= max_pattern_len: - node = frequent_patterns._contains(data.arr+i, l) - if node == NULL: - break - queue._append(i) - queue._append(l) - trie_node_data_append(node, i) - - log.writeln(" Computing collocations...", 1) - N = len(queue) - ptr1 = 0 - sent_count = 0 - while ptr1 < N: # main loop - i1 = queue.arr[ptr1] - if i1 > -1: - l1 = queue.arr[ptr1+1] - ptr2 = ptr1 + 2 - while ptr2 < N: - i2 = queue.arr[ptr2] - if i2 == -1 or i2 - i1 >= self.train_max_initial_size: - break - l2 = queue.arr[ptr2+1] - if i2 - i1 - l1 >= self.train_min_gap_size and i2 + l2 - i1 <= self.train_max_initial_size and l1+l2+1 <= self.max_length: - node = collocations._insert(data.arr+i1, l1) - node = trie_insert(node, -1) - for i from i2 <= i < i2+l2: - node = trie_insert(node, data.arr[i]) - trie_node_data_append(node, i1) - trie_node_data_append(node, i2) - if super_frequent_patterns._contains(data.arr+i2, l2) != NULL: - if super_frequent_patterns._contains(data.arr+i1, l1) != NULL: - is_super = 1 - else: - is_super = 0 - ptr3 = ptr2 + 2 - while ptr3 < N: - i3 = queue.arr[ptr3] - if i3 == -1 or i3 - i1 >= self.train_max_initial_size: - break - l3 = queue.arr[ptr3+1] - if i3 - i2 - l2 >= self.train_min_gap_size and i3 + l3 - i1 <= self.train_max_initial_size and l1+l2+l3+2 <= self.max_length: - if is_super or super_frequent_patterns._contains(data.arr+i3, l3) != NULL: - node = collocations._insert(data.arr+i1, l1) - node = trie_insert(node, -1) - for i from i2 <= i < i2+l2: - node = trie_insert(node, data.arr[i]) - node = trie_insert(node, -1) - for i from i3 <= i < i3+l3: - node = trie_insert(node, data.arr[i]) - trie_node_data_append(node, i1) - trie_node_data_append(node, i2) - trie_node_data_append(node, i3) - ptr3 = ptr3 + 2 - ptr2 = ptr2 + 2 - ptr1 = ptr1 + 2 - else: - sent_count = sent_count + 1 - if sent_count % 10000 == 0: - log.writeln(" %d sentences" % sent_count) - ptr1 = ptr1 + 1 - - self.precomputed_collocations = collocations.toMap(False) - self.precomputed_index = frequent_patterns.toMap(True) - - x = 0 - for pattern1 in J_set: - for pattern2 in J_set: - if len(pattern1) + len(pattern2) + 1 < self.max_length: - combined_pattern = pattern1 + (-1,) + pattern2 - J2_set.add(combined_pattern) - - for pattern1 in I_set: - for pattern2 in I_set: - x = x+1 - if len(pattern1) + len(pattern2) + 1 <= self.max_length: - combined_pattern = pattern1 + (-1,) + pattern2 - IJ_set.add(combined_pattern) - - for pattern1 in I_set: - for pattern2 in J2_set: - x = x+2 - if len(pattern1) + len(pattern2) + 1<= self.max_length: - combined_pattern = pattern1 + (-1,) + pattern2 - IJ_set.add(combined_pattern) - combined_pattern = pattern2 + (-1,) + pattern1 - IJ_set.add(combined_pattern) - - N = len(pattern_rank) - cost_by_rank = cintlist.CIntList(initial_len=N) - count_by_rank = cintlist.CIntList(initial_len=N) - for pattern, arr in self.precomputed_collocations.iteritems(): - if pattern not in IJ_set: - s = "" - for word_id in pattern: - if word_id == -1: - s = s + "X " - else: - s = s + darray.id2word[word_id] + " " - log.writeln("ERROR: unexpected pattern %s in set of precomputed collocations" % (s), 1) - else: - chunk = () - max_rank = 0 - arity = 0 - for word_id in pattern: - if word_id == -1: - max_rank = max(max_rank, pattern_rank[chunk]) - arity = arity + 1 - chunk = () - else: - chunk = chunk + (word_id,) - max_rank = max(max_rank, pattern_rank[chunk]) - cost_by_rank.arr[max_rank] = cost_by_rank.arr[max_rank] + (4*len(arr)) - count_by_rank.arr[max_rank] = count_by_rank.arr[max_rank] + (len(arr)/(arity+1)) - - cumul_cost = 0 - cumul_count = 0 - for i from 0 <= i < N: - cumul_cost = cumul_cost + cost_by_rank.arr[i] - cumul_count = cumul_count + count_by_rank.arr[i] - log.writeln("RANK %d\tCOUNT, COST: %d %d\tCUMUL: %d, %d" % (i, count_by_rank.arr[i], cost_by_rank.arr[i], cumul_count, cumul_cost)) - - num_found_patterns = len(self.precomputed_collocations) - for pattern in IJ_set: - if pattern not in self.precomputed_collocations: - self.precomputed_collocations[pattern] = cintlist.CIntList() - - stop_time = monitor.cpu() - log.writeln("Precomputed collocations for %d patterns out of %d possible (upper bound %d)" % (num_found_patterns,len(self.precomputed_collocations),x)) - log.writeln("Precomputed inverted index for %d patterns " % len(self.precomputed_index)) - log.writeln("Precomputation took %f seconds" % (stop_time - start_time)) - log.writeln("Detailed statistics:") - - - - - - - diff --git a/sa-extract/rule.pxd b/sa-extract/rule.pxd deleted file mode 100644 index c9c84e5c..00000000 --- a/sa-extract/rule.pxd +++ /dev/null @@ -1,13 +0,0 @@ -cdef class Phrase: - cdef int *syms - cdef int n, *varpos, n_vars - cdef public int chunkpos(self, int k) - cdef public int chunklen(self, int k) - -cdef class Rule: - cdef public int lhs - cdef readonly Phrase f, e - cdef float *cscores - cdef int n_scores - cdef public owner, word_alignments - diff --git a/sa-extract/rule.pyx b/sa-extract/rule.pyx deleted file mode 100644 index 7cd3efda..00000000 --- a/sa-extract/rule.pyx +++ /dev/null @@ -1,286 +0,0 @@ -from libc.stdlib cimport malloc, calloc, realloc, free, strtof, strtol -from libc.string cimport strsep, strcpy, strlen - -cdef extern from "strutil.h": - char *strstrsep(char **stringp, char *delim) - char *strip(char *s) - char **split(char *s, char *delim, int *pn) - -import sys - -import sym -cimport sym -cdef sym.Alphabet alphabet -alphabet = sym.alphabet - -global span_limit -span_limit = None - -cdef int bufsize -cdef char *buf -bufsize = 100 -buf = <char *>malloc(bufsize) -cdef ensurebufsize(int size): - global buf, bufsize - if size > bufsize: - buf = <char *>realloc(buf, size*sizeof(char)) - bufsize = size - -cdef class Phrase: - def __cinit__(self, words): - cdef int i, j, n, n_vars - cdef char **toks - cdef bytes bwords - cdef char* cwords - - n_vars = 0 - if type(words) is str: - ensurebufsize(len(words)+1) - bwords = words - cwords = bwords - strcpy(buf, cwords) - toks = split(buf, NULL, &n) - self.syms = <int *>malloc(n*sizeof(int)) - for i from 0 <= i < n: - self.syms[i] = alphabet.fromstring(toks[i], 0) - if alphabet.isvar(self.syms[i]): - n_vars = n_vars + 1 - - else: - n = len(words) - self.syms = <int *>malloc(n*sizeof(int)) - for i from 0 <= i < n: - self.syms[i] = words[i] - if alphabet.isvar(self.syms[i]): - n_vars = n_vars + 1 - self.n = n - self.n_vars = n_vars - self.varpos = <int *>malloc(n_vars*sizeof(int)) - j = 0 - for i from 0 <= i < n: - if alphabet.isvar(self.syms[i]): - self.varpos[j] = i - j = j + 1 - - def __dealloc__(self): - free(self.syms) - free(self.varpos) - - def __str__(self): - strs = [] - cdef int i, s - for i from 0 <= i < self.n: - s = self.syms[i] - strs.append(alphabet.tostring(s)) - return " ".join(strs) - - def instantiable(self, i, j, n): - return span_limit is None or (j-i) <= span_limit - - def handle(self): - """return a hashable representation that normalizes the ordering - of the nonterminal indices""" - norm = [] - cdef int i, j, s - i = 1 - j = 0 - for j from 0 <= j < self.n: - s = self.syms[j] - if alphabet.isvar(s): - s = alphabet.setindex(s,i) - i = i + 1 - norm.append(s) - return tuple(norm) - - def strhandle(self): - strs = [] - norm = [] - cdef int i, j, s - i = 1 - j = 0 - for j from 0 <= j < self.n: - s = self.syms[j] - if alphabet.isvar(s): - s = alphabet.setindex(s,i) - i = i + 1 - norm.append(alphabet.tostring(s)) - return " ".join(norm) - - def arity(self): - return self.n_vars - - def getvarpos(self, i): - if 0 <= i < self.n_vars: - return self.varpos[i] - else: - raise IndexError - - def getvar(self, i): - if 0 <= i < self.n_vars: - return self.syms[self.varpos[i]] - else: - raise IndexError - - cdef int chunkpos(self, int k): - if k == 0: - return 0 - else: - return self.varpos[k-1]+1 - - cdef int chunklen(self, int k): - if self.n_vars == 0: - return self.n - elif k == 0: - return self.varpos[0] - elif k == self.n_vars: - return self.n-self.varpos[k-1]-1 - else: - return self.varpos[k]-self.varpos[k-1]-1 - - def clen(self, k): - return self.chunklen(k) - - def getchunk(self, ci): - cdef int start, stop - start = self.chunkpos(ci) - stop = start+self.chunklen(ci) - chunk = [] - for i from start <= i < stop: - chunk.append(self.syms[i]) - return chunk - - def __cmp__(self, other): - cdef Phrase otherp - cdef int i - otherp = other - for i from 0 <= i < min(self.n, otherp.n): - if self.syms[i] < otherp.syms[i]: - return -1 - elif self.syms[i] > otherp.syms[i]: - return 1 - if self.n < otherp.n: - return -1 - elif self.n > otherp.n: - return 1 - else: - return 0 - - def __hash__(self): - cdef int i - cdef unsigned h - h = 0 - for i from 0 <= i < self.n: - if self.syms[i] > 0: - h = (h << 1) + self.syms[i] - else: - h = (h << 1) + -self.syms[i] - return h - - def __len__(self): - return self.n - - def __getitem__(self, i): - return self.syms[i] - - def __iter__(self): - cdef int i - l = [] - for i from 0 <= i < self.n: - l.append(self.syms[i]) - return iter(l) - - def subst(self, start, children): - cdef int i - for i from 0 <= i < self.n: - if alphabet.isvar(self.syms[i]): - start = start + children[alphabet.getindex(self.syms[i])-1] - else: - start = start + (self.syms[i],) - return start - -cdef class Rule: - def __cinit__(self, lhs, f, e, owner=None, scores=None, word_alignments=None): - cdef int i, n - cdef char *rest - - self.word_alignments = word_alignments - if scores is None: - self.cscores = NULL - self.n_scores = 0 - else: - n = len(scores) - self.cscores = <float *>malloc(n*sizeof(float)) - self.n_scores = n - for i from 0 <= i < n: - self.cscores[i] = scores[i] - - def __init__(self, lhs, f, e, owner=None, scores=None, word_alignments=None): - if not sym.isvar(lhs): - sys.stderr.write("error: lhs=%d\n" % lhs) - self.lhs = lhs - self.f = f - self.e = e - self.word_alignments = word_alignments - - def __dealloc__(self): - if self.cscores != NULL: - free(self.cscores) - - def __str__(self): - return self.to_line() - - def __hash__(self): - return hash((self.lhs, self.f, self.e)) - - def __cmp__(self, Rule other): - return cmp((self.lhs, self.f, self.e, self.word_alignments), (other.lhs, other.f, other.e, self.word_alignments)) - - def __iadd__(self, Rule other): - if self.n_scores != other.n_scores: - raise ValueError - for i from 0 <= i < self.n_scores: - self.cscores[i] = self.cscores[i] + other.cscores[i] - return self - - def fmerge(self, Phrase f): - if self.f == f: - self.f = f - - def arity(self): - return self.f.arity() - - def to_line(self): - scorestrs = [] - for i from 0 <= i < self.n_scores: - scorestrs.append(str(self.cscores[i])) - fields = [alphabet.tostring(self.lhs), str(self.f), str(self.e), " ".join(scorestrs)] - if self.word_alignments is not None: - alignstr = [] - for i from 0 <= i < len(self.word_alignments): - alignstr.append("%d-%d" % (self.word_alignments[i]/65536, self.word_alignments[i]%65536)) - #for s,t in self.word_alignments: - #alignstr.append("%d-%d" % (s,t)) - fields.append(" ".join(alignstr)) - - return " ||| ".join(fields) - - property scores: - def __get__(self): - s = [None]*self.n_scores - for i from 0 <= i < self.n_scores: - s[i] = self.cscores[i] - return s - - def __set__(self, s): - if self.cscores != NULL: - free(self.cscores) - self.cscores = <float *>malloc(len(s)*sizeof(float)) - self.n_scores = len(s) - for i from 0 <= i < self.n_scores: - self.cscores[i] = s[i] - -def rule_copy(r): - r1 = Rule(r.lhs, r.f, r.e, r.owner, r.scores) - r1.word_alignments = r.word_alignments - return r1 - diff --git a/sa-extract/rulefactory.pyx b/sa-extract/rulefactory.pyx deleted file mode 100644 index 792489c4..00000000 --- a/sa-extract/rulefactory.pyx +++ /dev/null @@ -1,2361 +0,0 @@ -# Pyrex implementation of the algorithms described in -# Lopez, EMNLP-CoNLL 2007 -# Much faster than the Python numbers reported there. -# Note to reader: this code is closer to C than Python -import sys -import sym -import log -import rule -import monitor -import cintlist -import csuf -import cdat -import cveb -import precomputation -import gc -import cn -import sgml - -cimport cmath -cimport csuf -cimport cdat -cimport cintlist -cimport rule -cimport cveb -cimport precomputation -cimport calignment - -from libc.stdlib cimport malloc, realloc, free -from libc.string cimport memset, memcpy -from libc.math cimport fmod, ceil, floor - -cdef int PRECOMPUTE -cdef int MERGE -cdef int BAEZA_YATES - -PRECOMPUTE = 0 -MERGE = 1 -BAEZA_YATES = 2 - -#cdef int node_count -#node_count = 0 - -cdef class TrieNode: - cdef public children - #cdef int id - - def __init__(self): - self.children = {} - #self.id = node_count - #node_count += 1 - - -cdef class ExtendedTrieNode(TrieNode): - cdef public phrase - cdef public phrase_location - cdef public suffix_link - - def __init__(self, phrase=None, phrase_location=None, suffix_link=None): - TrieNode.__init__(self) - self.phrase = phrase - self.phrase_location = phrase_location - self.suffix_link = suffix_link - - -cdef class TrieTable: - cdef public int extended - cdef public int count - cdef public root - def __cinit__(self, extended=False): - self.count = 0 - self.extended = extended - if extended: - self.root = ExtendedTrieNode() - else: - self.root = TrieNode() - -# linked list structure for storing matches in BaselineRuleFactory -cdef struct match_node: - int* match - match_node* next - -cdef class BaselineRuleFactory: - - cdef grammar, context_manager - cdef int max_terminals, max_nonterminals - cdef int max_initial_size, train_max_initial_size - cdef int min_gap_size, train_min_gap_size - cdef int category - cdef int visit - cdef float intersect_time, extract_time - cdef ruleFile, timingFile - cdef int* last_visit1 - cdef int* last_visit2 - cdef match_node** intersector1 - cdef match_node** intersector2 - cdef csuf.SuffixArray sa - cdef cintlist.CIntList sent_id - - def input(self, fwords): - flen = len(fwords) - start_time = monitor.cpu() - self.intersect_time = 0.0 - self.extract_time = 0.0 - - pyro_phrase_count = 0 - hiero_phrase_count = 0 - - frontier = [] - for i in xrange(len(fwords)): - frontier.append((i, (), False)) - - while len(frontier) > 0: - this_iter_intersect_time = self.intersect_time - new_frontier = [] - for i, prefix, is_shadow_path in frontier: - - word_id = fwords[i][0][0] - #print "word_id = %i" % word_id - phrase = prefix + (word_id,) - str_phrase = map(sym.tostring, phrase) - hiero_phrase = rule.Phrase(phrase) - - #log.writeln("pos %2d, '%s'" % (i, hiero_phrase)) - self.lookup(hiero_phrase) - if hiero_phrase.arity() == 0: - pyro_phrase_count = pyro_phrase_count + 1 - else: - hiero_phrase_count = hiero_phrase_count + 1 - - if len(phrase) - hiero_phrase.arity() < self.max_terminals and i+1 < len(fwords): - new_frontier.append((i+1, phrase, is_shadow_path)) - if hiero_phrase.arity() < self.max_nonterminals: - xcat = sym.setindex(self.category, hiero_phrase.arity()+1) - for j in xrange(i+1+self.min_gap_size, min(i+self.max_initial_size, len(fwords))): - new_frontier.append((j, phrase+(xcat,), is_shadow_path)) - log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time)) - frontier = new_frontier - stop_time = monitor.cpu() - log.writeln("COUNT %d %d" % (pyro_phrase_count, hiero_phrase_count)) - - - def lookup(self, phrase): - cdef int j, g, start, stop, sent_id, num_ranges, arity - cdef match_node** cur_intersector - cdef match_node** next_intersector - cdef match_node** tmp_intersector - cdef match_node* node - cdef match_node* cur_node - cdef match_node* prev_node - cdef match_node** node_ptr - cdef int* cur_visit - cdef int* next_visit - cdef int* tmp_visit - cdef int* chunklen - - #print "\n\nLOOKUP\n\n" - ranges = [] - sizes = [] - arity = phrase.arity() - chunklen = <int *> malloc(arity*sizeof(int)) - for i from 0 <= i < arity+1: - chunk = phrase.getchunk(i) - chunklen[i] = len(chunk) - sa_range = None - phr = () - for offset, word_id in enumerate(chunk): - word = sym.tostring(word_id) - sa_range = self.context_manager.fsarray.lookup(word, offset, sa_range[0], sa_range[1]) - if sa_range is None: - #log.writeln("Returned for phrase %s" % rule.Phrase(phr)) - return - #log.writeln("Found range %s for phrase %s" % (sa_range, rule.Phrase(phr))) - ranges.append(sa_range) - sizes.append(sa_range[1]-sa_range[0]) - if phrase.arity() == 0: - return - - cur_intersector = self.intersector1 - next_intersector = self.intersector2 - cur_visit = self.last_visit1 - next_visit = self.last_visit2 - - num_ranges = len(ranges) - for i from 0 <= i < num_ranges: - sa_range = ranges[i] - start_time = monitor.cpu() - self.visit = self.visit + 1 - intersect_count = 0 - - start = sa_range[0] - stop = sa_range[1] - for j from start <= j < stop: - g = self.sa.sa.arr[j] - sent_id = self.sent_id.arr[g] - if i==0: - if next_visit[sent_id] != self.visit: - # clear intersector - node = next_intersector[sent_id] - next_intersector[sent_id] = NULL - while node != NULL: - prev_node = node - node = node.next - free(prev_node.match) - free(prev_node) - next_visit[sent_id] = self.visit - node_ptr = &(next_intersector[sent_id]) - while node_ptr[0] != NULL: - node_ptr = &(node_ptr[0].next) - node_ptr[0] = <match_node*> malloc(sizeof(match_node)) - node_ptr[0].match = <int *> malloc(sizeof(int)) - node_ptr[0].match[0] = g - node_ptr[0].next = NULL - intersect_count = intersect_count + 1 - else: - if cur_visit[sent_id] == self.visit-1: - cur_node = cur_intersector[sent_id] - while cur_node != NULL: - if g - cur_node.match[0] + chunklen[i] <= self.train_max_initial_size and g - cur_node.match[i-1] - chunklen[i-1] >= self.train_min_gap_size: - if next_visit[sent_id] != self.visit: - # clear intersector -- note that we only do this if we've got something to put there - node = next_intersector[sent_id] - next_intersector[sent_id] = NULL - while node != NULL: - prev_node = node - node = node.next - free(prev_node.match) - free(prev_node) - next_visit[sent_id] = self.visit - node_ptr = &(next_intersector[sent_id]) - while node_ptr[0] != NULL: - node_ptr = &(node_ptr[0].next) - node_ptr[0] = <match_node*> malloc(sizeof(match_node)) - node_ptr[0].match = <int *> malloc((i+1) * sizeof(int)) - memcpy(node_ptr[0].match, cur_node.match, i*sizeof(int)) - node_ptr[0].match[i] = g - node_ptr[0].next = NULL - intersect_count = intersect_count + 1 - cur_node = cur_node.next - tmp_intersector = cur_intersector - cur_intersector = next_intersector - next_intersector = tmp_intersector - - tmp_visit = cur_visit - cur_visit = next_visit - next_visit = tmp_visit - - intersect_time = monitor.cpu() - start_time - if i > 0: - log.writeln("INT %d %d %d %d %f baseline" % - (arity, prev_intersect_count, sa_range[1]-sa_range[0], - intersect_count, intersect_time)) - if intersect_count == 0: - return None - prev_intersect_count = intersect_count - free(chunklen) - - - - def __init__(self, max_terminals=5, max_nonterminals=2, - max_initial_size=10, train_max_initial_size=10, - min_gap_size=1, train_min_gap_size=2, - category='[PHRASE]', grammar=None, - ruleFile=None, timingFile=None): - self.grammar = grammar - self.max_terminals = max_terminals - self.max_nonterminals = max_nonterminals - self.max_initial_size = max_initial_size - self.train_max_initial_size = train_max_initial_size - self.min_gap_size = min_gap_size - self.train_min_gap_size = train_min_gap_size - self.category = sym.fromstring(category) - self.ruleFile = ruleFile - self.timingFile = timingFile - self.visit = 0 - - - def registerContext(self, context_manager): - cdef int num_sents - self.context_manager = context_manager - self.sa = context_manager.fsarray - self.sent_id = self.sa.darray.sent_id - - num_sents = len(self.sa.darray.sent_index) - self.last_visit1 = <int *> malloc(num_sents * sizeof(int)) - memset(self.last_visit1, 0, num_sents * sizeof(int)) - - self.last_visit2 = <int *> malloc(num_sents * sizeof(int)) - memset(self.last_visit2, 0, num_sents * sizeof(int)) - - self.intersector1 = <match_node **> malloc(num_sents * sizeof(match_node*)) - memset(self.intersector1, 0, num_sents * sizeof(match_node*)) - - self.intersector2 = <match_node **> malloc(num_sents * sizeof(match_node*)) - memset(self.intersector2, 0, num_sents * sizeof(match_node*)) - - -# encodes information needed to find a (hierarchical) phrase -# in the text. If phrase is contiguous, that's just a range -# in the suffix array; if discontiguous, it is the set of -# actual locations (packed into an array) -cdef class PhraseLocation: - cdef int sa_low - cdef int sa_high - cdef int arr_low - cdef int arr_high - cdef cintlist.CIntList arr - cdef int num_subpatterns - - # returns true if sent_id is contained - cdef int contains(self, int sent_id): - return 1 - - def __init__(self, sa_low=-1, sa_high=-1, arr_low=-1, arr_high=-1, arr=None, num_subpatterns=1): - self.sa_low = sa_low - self.sa_high = sa_high - self.arr_low = arr_low - self.arr_high = arr_high - self.arr = arr - self.num_subpatterns = num_subpatterns - - - -cdef class Sampler: - '''A Sampler implements a logic for choosing - samples from a population range''' - - cdef int sampleSize - cdef context_manager - cdef cintlist.CIntList sa - - def __init__(self, sampleSize=0): - self.sampleSize = sampleSize - if sampleSize > 0: - log.writeln("Sampling strategy: uniform, max sample size = %d" % sampleSize, 1) - else: - log.writeln("Sampling strategy: no sampling", 1) - - def registerContext(self, context_manager): - self.context_manager = context_manager - self.sa = (<csuf.SuffixArray> context_manager.fsarray).sa - - - def sample(self, PhraseLocation phrase_location): - '''Returns a sample of the locations for - the phrase. If there are less than self.sampleSize - locations, return all of them; otherwise, return - up to self.sampleSize locations. In the latter case, - we choose to sample UNIFORMLY -- that is, the locations - are chosen at uniform intervals over the entire set, rather - than randomly. This makes the algorithm deterministic, which - is good for things like MERT''' - cdef cintlist.CIntList sample - cdef double i, stepsize - cdef int num_locations, val, j - - sample = cintlist.CIntList() - if phrase_location.arr is None: - num_locations = phrase_location.sa_high - phrase_location.sa_low - if self.sampleSize == -1 or num_locations <= self.sampleSize: - sample._extend_arr(self.sa.arr + phrase_location.sa_low, num_locations) - else: - stepsize = float(num_locations)/float(self.sampleSize) - i = phrase_location.sa_low - while i < phrase_location.sa_high and sample.len < self.sampleSize: - '''Note: int(i) not guaranteed to have the desired - effect, according to the python documentation''' - if fmod(i,1.0) > 0.5: - val = int(ceil(i)) - else: - val = int(floor(i)) - sample._append(self.sa.arr[val]) - i = i + stepsize - else: - num_locations = (phrase_location.arr_high - phrase_location.arr_low) / phrase_location.num_subpatterns - if self.sampleSize == -1 or num_locations <= self.sampleSize: - sample = phrase_location.arr - else: - stepsize = float(num_locations)/float(self.sampleSize) - i = phrase_location.arr_low - while i < num_locations and sample.len < self.sampleSize * phrase_location.num_subpatterns: - '''Note: int(i) not guaranteed to have the desired - effect, according to the python documentation''' - if fmod(i,1.0) > 0.5: - val = int(ceil(i)) - else: - val = int(floor(i)) - j = phrase_location.arr_low + (val*phrase_location.num_subpatterns) - sample._extend_arr(phrase_location.arr.arr + j, phrase_location.num_subpatterns) - i = i + stepsize - return sample - - -cdef long nGramCount(PhraseLocation loc): - return (loc.arr_high - loc.arr_low)/ loc.num_subpatterns - - -# struct used to encapsulate a single matching -cdef struct Matching: - int* arr - int start - int end - int sent_id - int size - - -cdef void assign_matching(Matching* m, int* arr, int start, int step, int* sent_id_arr): - m.arr = arr - m.start = start - m.end = start + step - m.sent_id = sent_id_arr[arr[start]] - m.size = step - - -cdef int* append_combined_matching(int* arr, Matching* loc1, Matching* loc2, - int offset_by_one, int num_subpatterns, int* result_len): - cdef int i, new_len - - new_len = result_len[0] + num_subpatterns - arr = <int*> realloc(arr, new_len*sizeof(int)) - - for i from 0 <= i < loc1.size: - arr[result_len[0]+i] = loc1.arr[loc1.start+i] - if num_subpatterns > loc1.size: - arr[new_len-1] = loc2.arr[loc2.end-1] - result_len[0] = new_len - return arr - - -cdef int* extend_arr(int* arr, int* arr_len, int* appendix, int appendix_len): - cdef int new_len - - new_len = arr_len[0] + appendix_len - arr = <int*> realloc(arr, new_len*sizeof(int)) - memcpy(arr+arr_len[0], appendix, appendix_len*sizeof(int)) - arr_len[0] = new_len - return arr - - -#cdef matching2str(Matching* m): -# cdef int i -# cdef result - -# result = "(" -# for i from m.start <= i < m.end: -# result = result + str(m.arr[i]) + " " -# result = result + ")" -# return result - - -cdef int median(int low, int high, int step): - return low + (((high - low)/step)/2)*step - - -cdef void findComparableMatchings(int low, int high, int* arr, int step, int loc, int* loc_minus, int* loc_plus): - # Returns (minus, plus) indices for the portion of the array - # in which all matchings have the same first index as the one - # starting at loc - loc_plus[0] = loc + step - while loc_plus[0] < high and arr[loc_plus[0]] == arr[loc]: - loc_plus[0] = loc_plus[0] + step - loc_minus[0] = loc - while loc_minus[0]-step >= low and arr[loc_minus[0]-step] == arr[loc]: - loc_minus[0] = loc_minus[0] - step - - -cdef class HieroCachingRuleFactory: - '''This RuleFactory implements a caching - method using TrieTable, which makes phrase - generation somewhat speedier -- phrases only - need to be extracted once (however, it is - quite possible they need to be scored - for each input sentence, for contextual models)''' - - cdef rules, grammar, context_manager - - cdef int max_chunks - cdef int max_target_chunks - cdef int max_length - cdef int max_target_length - cdef int max_nonterminals - cdef int max_initial_size - cdef int train_max_initial_size - cdef int min_gap_size - cdef int train_min_gap_size - cdef int category - - cdef cacheBetweenSents - cdef precomputed_index - cdef precomputed_collocations - cdef precompute_file - cdef max_rank - cdef int precompute_rank, precompute_secondary_rank - cdef useBaezaYates - cdef use_index - cdef use_collocations - cdef float by_slack_factor - - cdef per_sentence_grammar - cdef rule_filehandler - cdef rule_file - cdef pruned_rule_file - cdef extract_file - cdef sample_file - cdef search_file - cdef timingFile - cdef log_int_stats - cdef prev_norm_prefix - cdef float intersect_time, extract_time - cdef csuf.SuffixArray fsa - cdef cdat.DataArray fda - cdef cdat.DataArray eda - - cdef calignment.Alignment alignment - cdef cintlist.CIntList eid2symid - cdef cintlist.CIntList fid2symid - cdef int tight_phrases - cdef int require_aligned_terminal - cdef int require_aligned_chunks - - cdef cintlist.CIntList findexes - cdef cintlist.CIntList findexes1 - - cdef int excluded_sent_id # exclude a sentence id - - def __init__(self, - alignment=None, # compiled alignment object (REQUIRED) - by_slack_factor=1.0, # parameter for double-binary search; doesn't seem to matter much - category="[PHRASE]", # name of generic nonterminal used by Hiero - cacheBetweenSents=False, # prevent flushing of tree between sents; use carefully or you'll run out of memory - extract_file=None, # print raw extracted rules to this file - grammar=None, # empty grammar object -- must be supplied from outside (REQUIRED) - log_int_stats=False, # prints timing data on intersections to stderr - max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 - max_initial_size=10, # maximum span of a grammar rule in TEST DATA - max_length=5, # maximum number of symbols (both T and NT) allowed in a rule - max_nonterminals=2, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) - max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1 - max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size - min_gap_size=2, # minimum span of a nonterminal in the RHS of a rule in TEST DATA - precompute_file=None, # filename of file containing precomputed collocations - precompute_secondary_rank=20, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). - precompute_rank=100, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) - pruned_rule_file=None, # if specified, pruned grammars will be written to this filename - require_aligned_terminal=True, # require extracted rules to have at least one aligned word - require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word - per_sentence_grammar=True, # generate grammar files for each input segment - rule_file=None, # UNpruned grammars will be written to this filename - sample_file=None, # Sampling statistics will be written to this filename - search_file=None, # lookup statistics will be written to this filename - train_max_initial_size=10, # maximum span of a grammar rule extracted from TRAINING DATA - train_min_gap_size=2, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA - tight_phrases=False, # True if phrases should be tight, False otherwise (False == slower but better results) - timingFile=None, # timing statistics will be written to this filename - useBaezaYates=True, # True to require use of double-binary alg, false otherwise - use_collocations=True, # True to enable used of precomputed collocations - use_index=True # True to enable use of precomputed inverted indices - ): - '''Note: we make a distinction between the min_gap_size - and max_initial_size used in test and train. The latter - are represented by train_min_gap_size and train_max_initial_size, - respectively. This is because Chiang's model does not require - them to be the same, therefore we don't either.''' - self.rules = TrieTable(True) # cache - self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) - self.grammar = grammar - if alignment is None: - raise Exception("Must specify an alignment object") - self.alignment = alignment - - self.excluded_sent_id = -1 - - # grammar parameters and settings - # NOTE: setting max_nonterminals > 2 is not currently supported in Hiero - self.max_length = max_length - self.max_nonterminals = max_nonterminals - self.max_initial_size = max_initial_size - self.train_max_initial_size = train_max_initial_size - self.min_gap_size = min_gap_size - self.train_min_gap_size = train_min_gap_size - self.category = sym.fromstring(category) - - if max_chunks is None: - self.max_chunks = self.max_nonterminals + 1 - else: - self.max_chunks = max_chunks - - if max_target_chunks is None: - self.max_target_chunks = self.max_nonterminals + 1 - else: - self.max_target_chunks = max_target_chunks - - if max_target_length is None: - self.max_target_length = max_initial_size - else: - self.max_target_length = max_target_length - - # algorithmic parameters and settings - self.cacheBetweenSents = not per_sentence_grammar - self.precomputed_collocations = {} - self.precomputed_index = {} - self.use_index = use_index - self.use_collocations = use_collocations - self.max_rank = {} - self.precompute_file = precompute_file - self.precompute_rank = precompute_rank - self.precompute_secondary_rank = precompute_secondary_rank - self.useBaezaYates = useBaezaYates - self.by_slack_factor = by_slack_factor - if tight_phrases: - self.tight_phrases = 1 - else: - self.tight_phrases = 0 - - if require_aligned_chunks: - # one condition is a stronger version of the other. - self.require_aligned_chunks = 1 - self.require_aligned_terminal = 1 - elif require_aligned_terminal: - self.require_aligned_chunks = 0 - self.require_aligned_terminal = 1 - else: - self.require_aligned_chunks = 0 - self.require_aligned_terminal = 0 - - - self.per_sentence_grammar = per_sentence_grammar - if not self.per_sentence_grammar: - self.rule_filehandler = open(rule_file, "w") - # diagnostics - #if rule_file is None: - # self.rule_file = None - self.rule_file = rule_file - if extract_file is None: - self.extract_file = None - else: - self.extract_file = open(extract_file, "w") - if sample_file is None: - self.sample_file = None - else: - self.sample_file = open(sample_file, "w") - if search_file is None: - self.search_file = None - else: - self.search_file = open(search_file, "w") - self.pruned_rule_file = pruned_rule_file - self.timingFile = timingFile - self.log_int_stats = log_int_stats - self.prev_norm_prefix = () - - self.findexes = cintlist.CIntList(initial_len=10) - self.findexes1 = cintlist.CIntList(initial_len=10) - - def registerContext(self, context_manager): - '''This gives the RuleFactory access to the Context object. - Here we also use it to precompute the most expensive intersections - in the corpus quickly.''' - self.context_manager = context_manager - self.fsa = context_manager.fsarray - self.fda = self.fsa.darray - self.eda = context_manager.edarray - self.fid2symid = self.set_idmap(self.fda) - self.eid2symid = self.set_idmap(self.eda) - self.precompute() - - cdef set_idmap(self, cdat.DataArray darray): - cdef int word_id, new_word_id, N - cdef cintlist.CIntList idmap - - N = len(darray.id2word) - idmap = cintlist.CIntList(initial_len=N) - for word_id from 0 <= word_id < N: - new_word_id = sym.fromstring(darray.id2word[word_id], terminal=True) - idmap.arr[word_id] = new_word_id - return idmap - - - def pattern2phrase(self, pattern): - # pattern is a tuple, which we must convert to a hiero rule.Phrase - result = () - arity = 0 - for word_id in pattern: - if word_id == -1: - arity = arity + 1 - new_id = sym.setindex(self.category, arity) - else: - new_id = sym.fromstring(self.fda.id2word[word_id]) - result = result + (new_id,) - return rule.Phrase(result) - - def pattern2phrase_plus(self, pattern): - # returns a list containing both the pattern, and pattern - # suffixed/prefixed with the NT category. - patterns = [] - result = () - arity = 0 - for word_id in pattern: - if word_id == -1: - arity = arity + 1 - new_id = sym.setindex(self.category, arity) - else: - new_id = sym.fromstring(self.fda.id2word[word_id]) - result = result + (new_id,) - patterns.append(rule.Phrase(result)) - patterns.append(rule.Phrase(result + (sym.setindex(self.category, 1),))) - patterns.append(rule.Phrase((sym.setindex(self.category, 1),) + result)) - return patterns - - def precompute(self): - cdef precomputation.Precomputation pre - - if self.precompute_file is not None: - start_time = monitor.cpu() - log.write("Reading precomputed data from file %s... " % self.precompute_file, 1) - pre = precomputation.Precomputation(self.precompute_file, from_binary=True) - # check parameters of precomputation -- some are critical and some are not - if pre.max_nonterminals != self.max_nonterminals: - log.writeln("\nWARNING: Precomputation done with max nonterminals %d, decoder uses %d" % (pre.max_nonterminals, self.max_nonterminals)) - if pre.max_length != self.max_length: - log.writeln("\nWARNING: Precomputation done with max terminals %d, decoder uses %d" % (pre.max_length, self.max_length)) - if pre.train_max_initial_size != self.train_max_initial_size: - log.writeln("\nERROR: Precomputation done with max initial size %d, decoder uses %d" % (pre.train_max_initial_size, self.train_max_initial_size)) - raise Exception("Parameter mismatch with precomputed data") - if pre.train_min_gap_size != self.train_min_gap_size: - log.writeln("\nERROR: Precomputation done with min gap size %d, decoder uses %d" % (pre.train_min_gap_size, self.train_min_gap_size)) - raise Exception("Parameter mismatch with precomputed data") - log.writeln("done.", 1) - if self.use_index: - log.write("Converting %d hash keys on precomputed inverted index... " % (len(pre.precomputed_index)), 1) - for pattern, arr in pre.precomputed_index.iteritems(): - phrases = self.pattern2phrase_plus(pattern) - for phrase in phrases: - self.precomputed_index[phrase] = arr - log.writeln("done.", 1) - if self.use_collocations: - log.write("Converting %d hash keys on precomputed collocations... " % (len(pre.precomputed_collocations)), 1) - for pattern, arr in pre.precomputed_collocations.iteritems(): - phrase = self.pattern2phrase(pattern) - self.precomputed_collocations[phrase] = arr - log.writeln("done.", 1) - stop_time = monitor.cpu() - log.writeln("Processing precomputations took %f seconds" % (stop_time - start_time), 1) - - - def getPrecomputedCollocation(self, phrase): - if phrase in self.precomputed_collocations: - arr = self.precomputed_collocations[phrase] - return PhraseLocation(arr=arr, arr_low=0, arr_high=len(arr), num_subpatterns=phrase.arity()+1) - return None - - - cdef int* baezaYatesHelper(self, int low1, int high1, int* arr1, int step1, - int low2, int high2, int* arr2, int step2, - int offset_by_one, int len_last, int num_subpatterns, int* result_len): - cdef int i1, i2, j1, j2, med1, med2, med1_plus, med1_minus, med2_minus, med2_plus - cdef int d_first, qsetsize, dsetsize, tmp, search_low, search_high - cdef int med_result_len, low_result_len, high_result_len - cdef long comparison - cdef int* result - cdef int* low_result - cdef int* med_result - cdef int* high_result - cdef Matching loc1, loc2 - - result = <int*> malloc(0*sizeof(int*)) -# log.writeln("%sBY: [%d, %d, %d] [%d, %d, %d]" % (pad, low1, high1, step1, low2, high2, step2,), 5) - - d_first = 0 - if high1 - low1 > high2 - low2: -# log.writeln("%sD first" % (pad), 5) - d_first = 1 -# else: -# log.writeln("%sQ first" % (pad), 5) - -# '''First, check to see if we are at any of the -# recursive base cases''' -# -# '''Case 1: one of the sets is empty''' - if low1 >= high1 or low2 >= high2: -# log.writeln("%sRETURN: set is empty" % (pad), 5) - return result - -# '''Case 2: sets are non-overlapping''' - assign_matching(&loc1, arr1, high1-step1, step1, self.fda.sent_id.arr) - assign_matching(&loc2, arr2, low2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == -1: -# log.writeln("%s %s < %s" % (pad, tuple(arr1[high1-step1:high1]), tuple(arr2[low2:low2+step2])),5) -# log.writeln("%sRETURN: non-overlapping sets" % (pad), 5) - return result - - assign_matching(&loc1, arr1, low1, step1, self.fda.sent_id.arr) - assign_matching(&loc2, arr2, high2-step2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: -# log.writeln("%s %s > %s" % (pad, tuple(arr1[low1:low1+step1]), tuple(arr2[high2-step2:high2])),5) -# log.writeln("%sRETURN: non-overlapping sets" % (pad), 5) - return result - - # Case 3: query set and data set do not meet size mismatch constraints; - # We use mergesort instead in this case - qsetsize = (high1-low1) / step1 - dsetsize = (high2-low2) / step2 - if d_first: - tmp = qsetsize - qsetsize = dsetsize - dsetsize = tmp - - if self.by_slack_factor * qsetsize * cmath.log(dsetsize) / cmath.log(2) > dsetsize: - free(result) - return self.mergeHelper(low1, high1, arr1, step1, low2, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, result_len) - - # binary search. There are two flavors, depending on - # whether the queryset or dataset is first - if d_first: - med2 = median(low2, high2, step2) - assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) - - search_low = low1 - search_high = high1 - while search_low < search_high: - med1 = median(search_low, search_high, step1) - findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) - comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) - if comparison == -1: - search_low = med1_plus - elif comparison == 1: - search_high = med1_minus - else: - break - else: - med1 = median(low1, high1, step1) - findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) - - search_low = low2 - search_high = high2 - while search_low < search_high: - med2 = median(search_low, search_high, step2) - assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) - comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) - if comparison == -1: - search_high = med2 - elif comparison == 1: - search_low = med2 + step2 - else: - break - - med_result_len = 0 - med_result = <int*> malloc(0*sizeof(int*)) - if search_high > search_low: -# '''Then there is a match for the median element of Q''' -# -# '''What we want to find is the group of all bindings in the first set -# s.t. their first element == the first element of med1. Then we -# want to store the bindings for all of those elements. We can -# subsequently throw all of them away.''' - med2_minus = med2 - med2_plus = med2 + step2 - i1 = med1_minus - while i1 < med1_plus: - assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) - while med2_minus-step2 >= low2: - assign_matching(&loc2, arr2, med2_minus-step2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) < 1: - med2_minus = med2_minus - step2 - else: - break - i2 = med2_minus - while i2 < high2: - assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) - comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) - if comparison == 0: - pass - med_result = append_combined_matching(med_result, &loc1, &loc2, offset_by_one, num_subpatterns, &med_result_len) - if comparison == -1: - break - i2 = i2 + step2 - if i2 > med2_plus: - med2_plus = i2 - i1 = i1 + step1 - - tmp = med1_minus - med1_minus = med1_plus - med1_plus = tmp - else: - # No match; need to figure out the point of division in D and Q - med2_minus = med2 - med2_plus = med2 - if d_first: - med2_minus = med2_minus + step2 - if comparison == -1: - med1_minus = med1_plus - if comparison == 1: - med1_plus = med1_minus - else: - tmp = med1_minus - med1_minus = med1_plus - med1_plus = tmp - if comparison == 1: - med2_minus = med2_minus + step2 - med2_plus = med2_plus + step2 - - low_result_len = 0 - low_result = self.baezaYatesHelper(low1, med1_plus, arr1, step1, low2, med2_plus, arr2, step2, offset_by_one, len_last, num_subpatterns, &low_result_len) - high_result_len = 0 - high_result = self.baezaYatesHelper(med1_minus, high1, arr1, step1, med2_minus, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, &high_result_len) - - result = extend_arr(result, result_len, low_result, low_result_len) - result = extend_arr(result, result_len, med_result, med_result_len) - result = extend_arr(result, result_len, high_result, high_result_len) - free(low_result) - free(med_result) - free(high_result) - - return result - - - - cdef long compareMatchingsSet(self, int i1_minus, int i1_plus, int* arr1, int step1, - Matching* loc2, int offset_by_one, int len_last): -# '''Compares a *set* of bindings, all with the same first element, -# to a single binding. Returns -1 if all comparisons == -1, 1 if all -# comparisons == 1, and 0 otherwise.''' - cdef int i1, comparison, prev_comparison - cdef Matching l1_stack - cdef Matching* loc1 - - loc1 = &l1_stack - - i1 = i1_minus - while i1 < i1_plus: - assign_matching(loc1, arr1, i1, step1, self.fda.sent_id.arr) - comparison = self.compare_matchings(loc1, loc2, offset_by_one, len_last) - if comparison == 0: - prev_comparison = 0 - break - elif i1 == i1_minus: - prev_comparison = comparison - else: - if comparison != prev_comparison: - prev_comparison = 0 - break - i1 = i1 + step1 - return prev_comparison - - - cdef long compare_matchings(self, Matching* loc1, Matching* loc2, int offset_by_one, int len_last): - cdef int i - - if loc1.sent_id > loc2.sent_id: - return 1 - if loc2.sent_id > loc1.sent_id: - return -1 - - if loc1.size == 1 and loc2.size == 1: - if loc2.arr[loc2.start] - loc1.arr[loc1.start] <= self.train_min_gap_size: - return 1 - - elif offset_by_one: - for i from 1 <= i < loc1.size: - if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i-1]: - return 1 - if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i-1]: - return -1 - - else: - if loc1.arr[loc1.start]+1 > loc2.arr[loc2.start]: - return 1 - if loc1.arr[loc1.start]+1 < loc2.arr[loc2.start]: - return -1 - - for i from 1 <= i < loc1.size: - if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i]: - return 1 - if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i]: - return -1 - - if loc2.arr[loc2.end-1] + len_last - loc1.arr[loc1.start] > self.train_max_initial_size: - return -1 - return 0 - - - cdef int* mergeHelper(self, int low1, int high1, int* arr1, int step1, - int low2, int high2, int* arr2, int step2, - int offset_by_one, int len_last, int num_subpatterns, int* result_len): - cdef int i1, i2, j1, j2 - cdef long comparison - cdef int* result - cdef Matching loc1, loc2 -# cdef int i - -# pad = " " -# log.writeln("->mergeHelper", 5) - - result_len[0] = 0 - result = <int*> malloc(0*sizeof(int)) - - i1 = low1 - i2 = low2 -# if log.level==5: -# log.writeln("%sMERGE lists [%d,%d,%d] and [%d,%d,%d]" % (pad,low1,high1,step1,low2,high2,step2), 5) -# log.writeln("%soffset_by_one: %d, len_last: %d" % (pad, offset_by_one, len_last), 5) -# log.write("[") -# for i from low1 <= i < high1: -# log.write("%d, " % arr1.arr[i],5) -# log.writeln("]") -# log.write("[") -# for i from low2 <= i < high2: -# log.write("%d, " % arr2.arr[i],5) -# log.writeln("]") - while i1 < high1 and i2 < high2: - -# '''First, pop all unneeded loc2's off the stack''' - assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) -# if log.level==5: -# log.writeln("%s TOP1 %s" % (pad,matching2str(loc1)),5) - while i2 < high2: - assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) - if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: -# if log.level==5: -# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) -# log.writeln("%s POP2 %s" % (pad,matching2str(loc2)),5) - i2 = i2 + step2 - else: - break - -# '''Next: process all loc1's with the same starting val''' - j1 = i1 - while i1 < high1 and arr1[j1] == arr1[i1]: - assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) - j2 = i2 - while j2 < high2: - assign_matching(&loc2, arr2, j2, step2, self.fda.sent_id.arr) - comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) - if comparison == 0: -# if log.level==5: -# log.writeln("%s %s == %s" % (pad,matching2str(loc1),matching2str(loc2)),5) - result = append_combined_matching(result, &loc1, &loc2, offset_by_one, num_subpatterns, result_len) - if comparison == 1: -# if log.level==5: -# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) - pass - if comparison == -1: -# if log.level==5: -# log.writeln("%s %s < %s" % (pad,matching2str(loc1),matching2str(loc2)),5) - break - else: - j2 = j2 + step2 -# if log.level==5: -# log.writeln("%s POP1 %s" % (pad,matching2str(loc1)),5) - i1 = i1 + step1 - -# log.writeln("<-mergeHelper", 5) - return result - - - cdef void sortPhraseLoc(self, cintlist.CIntList arr, PhraseLocation loc, rule.Phrase phrase): - cdef int i, j - cdef cveb.VEB veb - cdef cintlist.CIntList result - - if phrase in self.precomputed_index: - loc.arr = self.precomputed_index[phrase] - else: - loc.arr = cintlist.CIntList(initial_len=loc.sa_high-loc.sa_low) - veb = cveb.VEB(arr.len) - for i from loc.sa_low <= i < loc.sa_high: - veb._insert(arr.arr[i]) - i = veb.veb.min_val - for j from 0 <= j < loc.sa_high-loc.sa_low: - loc.arr.arr[j] = i - i = veb._findsucc(i) - loc.arr_low = 0 - loc.arr_high = loc.arr.len - - - cdef intersectHelper(self, rule.Phrase prefix, rule.Phrase suffix, - PhraseLocation prefix_loc, PhraseLocation suffix_loc, int algorithm): - - cdef cintlist.CIntList arr1, arr2, result - cdef int low1, high1, step1, low2, high2, step2, offset_by_one, len_last, num_subpatterns, result_len - cdef int* result_ptr - cdef csuf.SuffixArray suf - - result_len = 0 - - if sym.isvar(suffix[0]): - offset_by_one = 1 - else: - offset_by_one = 0 - - len_last = len(suffix.getchunk(suffix.arity())) - - if prefix_loc.arr is None: - suf = self.context_manager.fsarray - self.sortPhraseLoc(suf.sa, prefix_loc, prefix) - arr1 = prefix_loc.arr - low1 = prefix_loc.arr_low - high1 = prefix_loc.arr_high - step1 = prefix_loc.num_subpatterns - - if suffix_loc.arr is None: - suf = self.context_manager.fsarray - self.sortPhraseLoc(suf.sa, suffix_loc, suffix) - arr2 = suffix_loc.arr - low2 = suffix_loc.arr_low - high2 = suffix_loc.arr_high - step2 = suffix_loc.num_subpatterns - - num_subpatterns = prefix.arity()+1 - - if algorithm == MERGE: - result_ptr = self.mergeHelper(low1, high1, arr1.arr, step1, - low2, high2, arr2.arr, step2, - offset_by_one, len_last, num_subpatterns, &result_len) - else: - result_ptr = self.baezaYatesHelper(low1, high1, arr1.arr, step1, - low2, high2, arr2.arr, step2, - offset_by_one, len_last, num_subpatterns, &result_len) - - if result_len == 0: - free(result_ptr) - return None - else: - result = cintlist.CIntList() - free(result.arr) - result.arr = result_ptr - result.len = result_len - result.size = result_len - return PhraseLocation(arr_low=0, arr_high=result_len, arr=result, num_subpatterns=num_subpatterns) - - cdef loc2str(self, PhraseLocation loc): - cdef int i, j - result = "{" - i = 0 - while i < loc.arr_high: - result = result + "(" - for j from i <= j < i + loc.num_subpatterns: - result = result + ("%d " %loc.arr[j]) - result = result + ")" - i = i + loc.num_subpatterns - result = result + "}" - return result - -# cdef compareResults(self, PhraseLocation loc1, PhraseLocation loc2, phrase, type1, type2): -# cdef i -# if loc1 is None and type1=="pre": -# return -# if loc1 is None: -# if loc2 is None or loc2.arr_high == 0: -# return -# if loc2 is None: -# if loc1.arr_high == 0: -# return -# if loc1.arr_high != loc2.arr_high: -# log.writeln("ERROR: %d vs %d (%s vs %s)" % (loc1.arr_high, loc2.arr_high, type1, type2)) -# #log.writeln(" %s" % self.loc2str(loc2)) -# if loc1.arr_high == 0: -# return -# elif loc1.num_subpatterns != loc2.num_subpatterns: -# log.writeln("ERROR 2: %d vs %d (%d v %d) %s" % (loc1.num_subpatterns, loc2.num_subpatterns, loc1.arr_high, loc2.arr_high, phrase)) -# for i from 0 <= i < loc1.arr_high: -# if loc1.arr[i] != loc2.arr[i]: -# log.writeln("ERROR 3") -# - cdef PhraseLocation intersect(self, prefix_node, suffix_node, rule.Phrase phrase): - cdef rule.Phrase prefix, suffix - cdef PhraseLocation prefix_loc, suffix_loc, result - - start_time = monitor.cpu() - prefix = prefix_node.phrase - suffix = suffix_node.phrase - prefix_loc = prefix_node.phrase_location - suffix_loc = suffix_node.phrase_location - - result = self.getPrecomputedCollocation(phrase) - if result is not None: - intersect_method = "precomputed" - - if result is None: - if self.useBaezaYates: - result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, BAEZA_YATES) - intersect_method="double binary" - else: - result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, MERGE) - intersect_method="merge" - stop_time = monitor.cpu() - intersect_time = stop_time - start_time - if self.log_int_stats: - if intersect_method == "precomputed": - sort1 = "none" - sort2 = "none" - else: - if prefix in self.precomputed_index: - sort1 = "index" - else: - sort1 = "veb" - if suffix in self.precomputed_index: - sort2 = "index" - else: - sort2 = "veb" - result_len=0 - if result is not None: - result_len = len(result.arr)/result.num_subpatterns - rank = 0 -# if phrase in self.max_rank: -# rank = self.max_rank[phrase] -# else: -# rank = self.precompute_rank + 10 - log.writeln("INT %d %d %d %d %d %f %d %s %s %s" % - (len(prefix)+1 - prefix.arity(), prefix.arity(), - nGramCount(prefix_node.phrase_location), - nGramCount(suffix_node.phrase_location), - result_len, intersect_time, rank, intersect_method, sort1, sort2)) - return result - - def advance(self, frontier, res, fwords): - nf = [] - for (toskip, (i, alt, pathlen)) in frontier: - spanlen = fwords[i][alt][2] - if (toskip == 0): - #log.writeln("RES: (%d %d %d)" % (i, alt, pathlen), 3) - res.append((i, alt, pathlen)) - ni = i + spanlen - #log.writeln("proc: %d (%d %d %d) sl=%d ni=%d len(fwords)=%d" % (toskip, i, alt, pathlen, spanlen, ni, len(fwords)), 3) - if (ni < len(fwords) and (pathlen + 1) < self.max_initial_size): - for na in xrange(len(fwords[ni])): - nf.append((toskip - 1, (ni, na, pathlen + 1))) - if (len(nf) > 0): - return self.advance(nf, res, fwords) - else: - return res - - def get_all_nodes_isteps_away(self, skip, i, spanlen, pathlen, fwords, next_states, reachable_buffer): - frontier = [] - if (i+spanlen+skip >= len(next_states)): - return frontier - #print "get_all_nodes_isteps_away from %i" % (i) - key = tuple([i,spanlen]) - reachable = [] - if (key in reachable_buffer): - reachable = reachable_buffer[key] - else: - reachable = self.reachable(fwords, i, spanlen) - reachable_buffer[key] = reachable - #print "reachable(from=%i,dist=%i) = " % (i,spanlen) - #print reachable - for nextreachable in reachable: - for next_id in next_states[nextreachable]: - jump = self.shortest(fwords,i,next_id) - #print "checking next_id = %i, pathlen[sofar] = %i, jump = %i" % (next_id,pathlen,jump) - #if (next_id - (i+spanlen)) < skip: - if jump < skip: - continue - #if next_id-(i-pathlen) < self.max_initial_size: - if pathlen+jump <= self.max_initial_size: - for alt_id in xrange(len(fwords[next_id])): - if (fwords[next_id][alt_id][0] != cn.epsilon): - #frontier.append((next_id,alt_id,next_id-(i-pathlen))); - #print "finding the shortest from %i to %i" % (i, next_id) - newel = (next_id,alt_id,pathlen+jump) - if newel not in frontier: - frontier.append((next_id,alt_id,pathlen+jump)) - #print "appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) - #else: - #print "NOT appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) - #else: - #print "next_id = %s is aborted\n" % next_id - #print "returning frontier" - #print frontier - return frontier - - def reachable(self, fwords, ifrom, dist): - #print "inside reachable(%i,%i)" % (ifrom,dist) - ret = [] - if (ifrom >= len(fwords)): - return ret - for alt_id in xrange(len(fwords[ifrom])): - if (fwords[ifrom][alt_id][0] == cn.epsilon): - ret.extend(self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist)) - else: - if (dist==0): - if (ifrom not in ret): - ret.append(ifrom) - else: - for ifromchild in self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist-1): - if (ifromchild not in ret): - ret.append(ifromchild) - - return ret - - def shortest(self, fwords, ifrom, ito): - min = 1000 - #print "shortest ifrom=%i, ito=%i" % (ifrom,ito) - if (ifrom > ito): - return min - if (ifrom == ito): - return 0 - for alt_id in xrange(len(fwords[ifrom])): - currmin = self.shortest(fwords,ifrom+fwords[ifrom][alt_id][2],ito) - if (fwords[ifrom][alt_id][0] != cn.epsilon): - currmin += 1 - if (currmin<min): - min = currmin - return min - - def get_next_states(self, _columns, curr_idx, min_dist=2): - result = [] - candidate = [[curr_idx,0]] - - while len(candidate) > 0: - curr = candidate.pop() - if curr[0] >= len(_columns): - continue - if curr[0] not in result and min_dist <= curr[1] <= self.max_initial_size: - result.append(curr[0]); - curr_col = _columns[curr[0]] - for alt in curr_col: - next_id = curr[0]+alt[2] - jump = 1 - if (alt[0] == cn.epsilon): - jump = 0 - if next_id not in result and min_dist <= curr[1]+jump <= self.max_initial_size+1: - candidate.append([next_id,curr[1]+jump]) - return sorted(result); - - def input(self, fwords, meta=None, output=None): - '''When this function is called on the RuleFactory, - it looks up all of the rules that can be used to translate - the input sentence''' - cdef int i, j, k, flen, arity, num_subpatterns, num_samples - cdef float start_time - cdef PhraseLocation phrase_location - cdef cintlist.CIntList sample, chunklen - cdef Matching matching - cdef rule.Phrase hiero_phrase - - #fwords = [ ((1,0.0,1),), fwords1 ] #word id for <s> = 1, cost = 0.0, next = 1 - #print fwords - flen = len(fwords) - #print "length = %i" % flen - start_time = monitor.cpu() - self.intersect_time = 0.0 - self.extract_time = 0.0 - nodes_isteps_away_buffer = {} - hit = 0 - reachable_buffer = {} - if meta: - dattrs = sgml.attrs_to_dict(meta) - id = dattrs.get('id', 'NOID') - self.excluded_sent_id = int(dattrs.get('exclude', '-1')) - if output: - self.rule_filehandler = output - elif self.per_sentence_grammar: - self.rule_filehandler = open(self.rule_file+'.'+id, 'w') - - #print "max_initial_size = %i" % self.max_initial_size - - if not self.cacheBetweenSents: - self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) - self.grammar.root = [None, {}] - - frontier = [] - for i in xrange(len(fwords)): - for alt in xrange(0, len(fwords[i])): - if fwords[i][alt][0] != cn.epsilon: - frontier.append((i, i, alt, 0, self.rules.root, (), False)) - - xroot = None - x1 = sym.setindex(self.category, 1) - if x1 in self.rules.root.children: - xroot = self.rules.root.children[x1] - else: - xroot = ExtendedTrieNode(suffix_link=self.rules.root, phrase_location=PhraseLocation()) - self.rules.root.children[x1] = xroot - - for i in xrange(self.min_gap_size, len(fwords)): - for alt in xrange(0, len(fwords[i])): - if fwords[i][alt][0] != cn.epsilon: - frontier.append((i-self.min_gap_size, i, alt, self.min_gap_size, xroot, (x1,), True)) - '''for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: - if len(prefix)>0: - print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path - else: - print k, i, alt, pathlen, node, prefix, is_shadow_path''' - - #for wid in xrange(1000): - # print "%i = %s" % (wid, sym.tostring(wid)) - next_states = [] - for i in xrange(len(fwords)): - next_states.append(self.get_next_states(fwords,i,self.min_gap_size)) - #print "next state of %i" % i - #print next_states[i] - - while len(frontier) > 0: - #print "frontier = %i" % len(frontier) - this_iter_intersect_time = self.intersect_time - new_frontier = [] - for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: - #print "looking at: " - #if len(prefix)>0: - # print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path - #else: - # print k, i, alt, pathlen, node, prefix, is_shadow_path - word_id = fwords[i][alt][0] - spanlen = fwords[i][alt][2] - #print "word_id = %i, %s" % (word_id, sym.tostring(word_id)) - # to prevent .. [X] </S> - #print "prefix = ",prefix - #if word_id == 2 and len(prefix)>=2: - #print "at the end: %s" % (prefix[len(prefix)-1]) - #if prefix[len(prefix)-1]<0: - #print "break" - #continue - #print "continuing" - #if pathlen + spanlen > self.max_initial_size: - #continue - # TODO get rid of k -- pathlen is replacing it - if word_id == cn.epsilon: - #print "skipping because word_id is epsilon" - if i+spanlen >= len(fwords): - continue - for nualt in xrange(0,len(fwords[i+spanlen])): - frontier.append((k, i+spanlen, nualt, pathlen, node, prefix, is_shadow_path)) - continue - - phrase = prefix + (word_id,) - str_phrase = map(sym.tostring, phrase) - hiero_phrase = rule.Phrase(phrase) - arity = hiero_phrase.arity() - - #print "pos %2d, node %5d, '%s'" % (i, node.id, hiero_phrase) - if self.search_file is not None: - self.search_file.write("%s\n" % hiero_phrase) - - lookup_required = False - if word_id in node.children: - if node.children[word_id] is None: - #print "Path dead-ends at this node\n" - continue - else: - #print "Path continues at this node\n" - node = node.children[word_id] - else: - if node.suffix_link is None: - #print "Current node is root; lookup required\n" - lookup_required = True - else: - if word_id in node.suffix_link.children: - if node.suffix_link.children[word_id] is None: - #print "Suffix link reports path is dead end\n" - node.children[word_id] = None - continue - else: - #print "Suffix link indicates lookup is reqired\n" - lookup_required = True - else: - #print "ERROR: We never get here\n" - raise Exception("Keyword trie error") - #new_frontier.append((k, i, alt, pathlen, node, prefix, is_shadow_path)) - #print "checking whether lookup_required\n" - if lookup_required: - new_node = None - if is_shadow_path: - #print "Extending shadow path for %s \n" - # on the shadow path we don't do any search, we just use info from suffix link - new_node = ExtendedTrieNode(phrase_location=node.suffix_link.children[word_id].phrase_location, - suffix_link=node.suffix_link.children[word_id], - phrase=hiero_phrase) - else: - if arity > 0: - #print "Intersecting for %s because of arity > 0\n" % hiero_phrase - phrase_location = self.intersect(node, node.suffix_link.children[word_id], hiero_phrase) - else: - #print "Suffix array search for %s" % hiero_phrase - phrase_location = node.phrase_location - sa_range = self.context_manager.fsarray.lookup(str_phrase[-1], len(str_phrase)-1, phrase_location.sa_low, phrase_location.sa_high) - if sa_range is not None: - phrase_location = PhraseLocation(sa_low=sa_range[0], sa_high=sa_range[1]) - else: - phrase_location = None - - if phrase_location is None: - node.children[word_id] = None - #print "Search failed\n" - continue - #print "Search succeeded\n" - suffix_link = self.rules.root - if node.suffix_link is not None: - suffix_link = node.suffix_link.children[word_id] - new_node = ExtendedTrieNode(phrase_location=phrase_location, - suffix_link=suffix_link, - phrase=hiero_phrase) - node.children[word_id] = new_node - node = new_node - #print "Added node %d with suffix link %d\n" % (node.id, node.suffix_link.id) - - '''Automatically add a trailing X node, if allowed -- - This should happen before we get to extraction (so that - the node will exist if needed)''' - if arity < self.max_nonterminals: - xcat_index = arity+1 - xcat = sym.setindex(self.category, xcat_index) - suffix_link_xcat_index = xcat_index - if is_shadow_path: - suffix_link_xcat_index = xcat_index-1 - suffix_link_xcat = sym.setindex(self.category, suffix_link_xcat_index) - node.children[xcat] = ExtendedTrieNode(phrase_location=node.phrase_location, - suffix_link=node.suffix_link.children[suffix_link_xcat], - phrase= rule.Phrase(phrase + (xcat,))) - #log.writeln("Added node %d with suffix link %d (for X)" % (node.children[xcat].id, node.children[xcat].suffix_link.id), 4) - - # sample from range - if not is_shadow_path: - #print "is_not_shadow_path" - sample = self.context_manager.sampler.sample(node.phrase_location) - #print "node.phrase_location %s" % str(node.phrase_location) - #print "sample.len = %i" % len(sample) - num_subpatterns = (<PhraseLocation> node.phrase_location).num_subpatterns - chunklen = cintlist.CIntList(initial_len=num_subpatterns) - for j from 0 <= j < num_subpatterns: - chunklen.arr[j] = hiero_phrase.chunklen(j) - extracts = [] - j = 0 - extract_start = monitor.cpu() - '''orig_tight_phrases = self.tight_phrases - orig_require_aligned_terminal = self.require_aligned_terminal - orig_require_aligned_chunks = self.require_aligned_chunks - if k==0 or i==len(fwords)-1: - self.tight_phrases = 0 - self.require_aligned_terminal = 0 - self.require_aligned_chunks = 0''' - while j < sample.len: - extract = [] - - assign_matching(&matching, sample.arr, j, num_subpatterns, self.fda.sent_id.arr) - '''print "tight_phrase = " - print self.tight_phrases - print "require_aligned_terminal = " - print self.require_aligned_terminal - print "require_aligned_chunks = " - print self.require_aligned_chunks''' - - extract = self.extract(hiero_phrase, &matching, chunklen.arr, num_subpatterns) - extracts.extend(extract) - j = j + num_subpatterns - '''self.tight_phrases = orig_tight_phrases - sttice+sa.nw.normelf.require_aligned_terminal = orig_require_aligned_terminal - self.require_aligned_chunks = orig_require_aligned_chunks''' - num_samples = sample.len/num_subpatterns - extract_stop = monitor.cpu() - self.extract_time = self.extract_time + extract_stop - extract_start - #print "extract.size = %i" % len(extracts) - if len(extracts) > 0: - fphrases = {} - fals = {} - fcount = {} - for f, e, count, als in extracts: - fcount.setdefault(f, 0.0) - fcount[f] = fcount[f] + count - fphrases.setdefault(f, {}) - fphrases[f].setdefault(e, {}) - #fphrases[f][e] = fphrases[f][e] + count - fphrases[f][e].setdefault(als,0.0) - fphrases[f][e][als] = fphrases[f][e][als] + count - #print "f,e,als ",f," : ",e," : ",als," count = ",fphrases[f][e][als] - #fals[str(f)+" ||| "+str(e)] = als - for f, elist in fphrases.iteritems(): - #print "f = '%s'" % f - #if (str(f) in ['<s>','</s>','<s> [X,1]','[X,1] </s>']): - # print "rejected" - # continue - f_margin = fcount[f] - for e, alslist in elist.iteritems(): - alignment = None - count = 0 - for als, currcount in alslist.iteritems(): - #print "als = ",als,", count = ",currcount - if currcount > count: - alignment = als - count = currcount - #alignment = fals[str(f)+" ||| "+str(e)] - #print "selected = ",alignment," with count = ",count - scores = [] - for m in self.context_manager.models: - scores.append(m.compute_contextless_score(f, e, count, fcount[f], num_samples)) - r = rule.Rule(self.category, f, e, scores=scores, owner="context", word_alignments = alignment) - self.grammar.add(r) - if self.rule_filehandler is not None: - self.rule_filehandler.write("%s\n" % r.to_line()) - #print "adding a rule = %s" % r - - #if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+spanlen < self.max_initial_size: - if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+1 <= self.max_initial_size: - #to prevent [X] </s> - #print "lexicalized" - for alt_id in xrange(len(fwords[i+spanlen])): - #if (fwords[i+spanlen][alt_id][2]+pathlen+spanlen <= self.max_initial_size): - #new_frontier.append((k, i+spanlen, alt_id, pathlen + spanlen, node, phrase, is_shadow_path)) - #print "alt_id = %d\n" % alt_id - new_frontier.append((k, i+spanlen, alt_id, pathlen + 1, node, phrase, is_shadow_path)) - #print (k, i+spanlen, alt_id, pathlen + spanlen, node, map(sym.tostring,phrase), is_shadow_path) - #print "end lexicalized" - num_subpatterns = arity - if not is_shadow_path: - num_subpatterns = num_subpatterns + 1 - #to avoid <s> X ... we want <s> next to a lexicalized item - #if k>0 and i<len(fwords)-1 and len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks: - if len(phrase)+1 < self.max_length and arity < self.max_nonterminals and num_subpatterns < self.max_chunks: - #print "masuk kondisi" - xcat = sym.setindex(self.category, arity+1) - xnode = node.children[xcat] - #frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, spanlen, pathlen, fwords, next_states) - # I put spanlen=1 below - key = tuple([self.min_gap_size, i, 1, pathlen]) - frontier_nodes = [] - if (key in nodes_isteps_away_buffer): - frontier_nodes = nodes_isteps_away_buffer[key] - else: - frontier_nodes = self.get_all_nodes_isteps_away(self.min_gap_size, i, 1, pathlen, fwords, next_states, reachable_buffer) - nodes_isteps_away_buffer[key] = frontier_nodes - - #print "new frontier:\n" - for (i, alt, pathlen) in frontier_nodes: - #if (pathlen+fwords[i][alt][2] <= self.max_initial_size): - new_frontier.append((k, i, alt, pathlen, xnode, phrase +(xcat,), is_shadow_path)) - #print k, i, alt, pathlen, node, map(sym.tostring,phrase +(xcat,)), is_shadow_path - #print "all end\n"; - #else: - #print "no new frontier1\n"; - #else : - #print "no new frontier2\n" - if self.log_int_stats: - log.writeln("This iteration intersect time = %f" % (self.intersect_time - this_iter_intersect_time)) - frontier = new_frontier - - stop_time = monitor.cpu() - log.writeln("Total time for rule lookup, extraction, and scoring = %f seconds" % (stop_time - start_time)) - #log.writeln(" Intersect time = %f seconds" % self.intersect_time) - gc.collect() - log.writeln(" Extract time = %f seconds" % self.extract_time) - if self.pruned_rule_file: - self.grammar.dump(self.pruned_rule_file) - if self.per_sentence_grammar: - self.rule_filehandler.close() - -# else: -# self.rule_filehandler.write("###EOS_"+ id +"\n") - - - cdef int find_fixpoint(self, - int f_low, f_high, - int* f_links_low, int* f_links_high, - int* e_links_low, int* e_links_high, - int e_in_low, int e_in_high, - int* e_low, int* e_high, - int* f_back_low, int* f_back_high, - int f_sent_len, int e_sent_len, - int max_f_len, int max_e_len, - int min_fx_size, int min_ex_size, - int max_new_x, - int allow_low_x, int allow_high_x, - int allow_arbitrary_x, int write_log): - cdef int e_low_prev, e_high_prev, f_low_prev, f_high_prev, new_x, new_low_x, new_high_x - - e_low[0] = e_in_low - e_high[0] = e_in_high - self.find_projection(f_low, f_high, f_links_low, f_links_high, e_low, e_high) - if e_low[0] == -1: - # low-priority corner case: if phrase w is unaligned, - # but we don't require aligned terminals, then returning - # an error here might prevent extraction of allowed - # rule X -> X_1 w X_2 / X_1 X_2. This is probably - # not worth the bother, though. - #print "find_fixpoint0" - return 0 - elif e_in_low != -1 and e_low[0] != e_in_low: - if e_in_low - e_low[0] < min_ex_size: - e_low[0] = e_in_low - min_ex_size - if e_low[0] < 0: - #print "find_fixpoint1" - return 0 - - if e_high[0] - e_low[0] > max_e_len: - #print "find_fixpoint2" - return 0 - elif e_in_high != -1 and e_high[0] != e_in_high: - if e_high[0] - e_in_high < min_ex_size: - e_high[0] = e_in_high + min_ex_size - if e_high[0] > e_sent_len: - #print "find_fixpoint3" - return 0 - - f_back_low[0] = -1 - f_back_high[0] = -1 - f_low_prev = f_low - f_high_prev = f_high - new_x = 0 - new_low_x = 0 - new_high_x = 0 - - while True: - - if f_back_low[0] == -1: - self.find_projection(e_low[0], e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) - else: - self.find_projection(e_low[0], e_low_prev, e_links_low, e_links_high, f_back_low, f_back_high) - self.find_projection(e_high_prev, e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) - - if f_back_low[0] > f_low: - f_back_low[0] = f_low - - if f_back_high[0] < f_high: - f_back_high[0] = f_high - - if f_back_low[0] == f_low_prev and f_back_high[0] == f_high_prev: - return 1 - - if allow_low_x == 0 and f_back_low[0] < f_low: -# log.writeln(" FAIL: f phrase is not tight") - #print " FAIL: f phrase is not tight" - return 0 - - if f_back_high[0] - f_back_low[0] > max_f_len: -# log.writeln(" FAIL: f back projection is too wide") - #print " FAIL: f back projection is too wide" - return 0 - - if allow_high_x == 0 and f_back_high[0] > f_high: -# log.writeln(" FAIL: extension on high side not allowed") - #print " FAIL: extension on high side not allowed" - return 0 - - if f_low != f_back_low[0]: - if new_low_x == 0: - if new_x >= max_new_x: -# log.writeln(" FAIL: extension required on low side violates max # of gaps") - #print " FAIL: extension required on low side violates max # of gaps" - return 0 - else: - new_x = new_x + 1 - new_low_x = 1 - if f_low - f_back_low[0] < min_fx_size: - f_back_low[0] = f_low - min_fx_size - if f_back_high[0] - f_back_low[0] > max_f_len: -# log.writeln(" FAIL: extension required on low side violates max initial length") - #print " FAIL: extension required on low side violates max initial length" - return 0 - if f_back_low[0] < 0: -# log.writeln(" FAIL: extension required on low side violates sentence boundary") - #print " FAIL: extension required on low side violates sentence boundary" - return 0 - - if f_high != f_back_high[0]: - if new_high_x == 0: - if new_x >= max_new_x: -# log.writeln(" FAIL: extension required on high side violates max # of gaps") - #print " FAIL: extension required on high side violates max # of gaps" - return 0 - else: - new_x = new_x + 1 - new_high_x = 1 - if f_back_high[0] - f_high < min_fx_size: - f_back_high[0] = f_high + min_fx_size - if f_back_high[0] - f_back_low[0] > max_f_len: -# log.writeln(" FAIL: extension required on high side violates max initial length") - #print " FAIL: extension required on high side violates max initial length" - return 0 - if f_back_high[0] > f_sent_len: -# log.writeln(" FAIL: extension required on high side violates sentence boundary") - #print " FAIL: extension required on high side violates sentence boundary" - return 0 - - e_low_prev = e_low[0] - e_high_prev = e_high[0] - - self.find_projection(f_back_low[0], f_low_prev, f_links_low, f_links_high, e_low, e_high) - self.find_projection(f_high_prev, f_back_high[0], f_links_low, f_links_high, e_low, e_high) - if e_low[0] == e_low_prev and e_high[0] == e_high_prev: - return 1 - if allow_arbitrary_x == 0: -# log.writeln(" FAIL: arbitrary expansion not permitted") - #print " FAIL: arbitrary expansion not permitted" - return 0 - if e_high[0] - e_low[0] > max_e_len: -# log.writeln(" FAIL: re-projection violates sentence max phrase length") - #print " FAIL: re-projection violates sentence max phrase length" - return 0 - f_low_prev = f_back_low[0] - f_high_prev = f_back_high[0] - - - cdef find_projection(self, int in_low, int in_high, int* in_links_low, int* in_links_high, - int* out_low, int* out_high): - cdef int i - for i from in_low <= i < in_high: - if in_links_low[i] != -1: - if out_low[0] == -1 or in_links_low[i] < out_low[0]: - out_low[0] = in_links_low[i] - if out_high[0] == -1 or in_links_high[i] > out_high[0]: - out_high[0] = in_links_high[i] - - - cdef int* int_arr_extend(self, int* arr, int* arr_len, int* data, int data_len): - cdef int new_len - new_len = arr_len[0] + data_len - arr = <int*> realloc(arr, new_len*sizeof(int)) - memcpy(arr+arr_len[0], data, data_len*sizeof(int)) - arr_len[0] = new_len - return arr - - - cdef extract_phrases(self, int e_low, int e_high, int* e_gap_low, int* e_gap_high, int* e_links_low, int num_gaps, - int f_low, int f_high, int* f_gap_low, int* f_gap_high, int* f_links_low, - int sent_id, int e_sent_len, int e_sent_start): - cdef int i, j, k, m, n, *e_gap_order, e_x_low, e_x_high, e_x_gap_low, e_x_gap_high - cdef int *e_gaps1, *e_gaps2, len1, len2, step, num_chunks - cdef cintlist.CIntList ephr_arr - cdef result - - #print "inside extract_phrases" - #print "f_low=%d, f_high=%d" % (f_low,f_high) - result = [] - len1 = 0 - e_gaps1 = <int*> malloc(0) - ephr_arr = cintlist.CIntList() - - e_gap_order = <int*> malloc(num_gaps*sizeof(int)) - if num_gaps > 0: - e_gap_order[0] = 0 - for i from 1 <= i < num_gaps: - for j from 0 <= j < i: - if e_gap_low[i] < e_gap_low[j]: - for k from j <= k < i: - e_gap_order[k+1] = e_gap_order[k] - e_gap_order[j] = i - break - else: - e_gap_order[i] = i - - e_x_low = e_low - e_x_high = e_high - if self.tight_phrases == 0: - while e_x_low > 0 and e_high - e_x_low < self.train_max_initial_size and e_links_low[e_x_low-1] == -1: - e_x_low = e_x_low - 1 - while e_x_high < e_sent_len and e_x_high - e_low < self.train_max_initial_size and e_links_low[e_x_high] == -1: - e_x_high = e_x_high + 1 - - for i from e_x_low <= i <= e_low: - e_gaps1 = self.int_arr_extend(e_gaps1, &len1, &i, 1) - - for i from 0 <= i < num_gaps: - e_gaps2 = <int*> malloc(0) - len2 = 0 - - j = e_gap_order[i] - e_x_gap_low = e_gap_low[j] - e_x_gap_high = e_gap_high[j] - if self.tight_phrases == 0: - while e_x_gap_low > e_x_low and e_links_low[e_x_gap_low-1] == -1: - e_x_gap_low = e_x_gap_low - 1 - while e_x_gap_high < e_x_high and e_links_low[e_x_gap_high] == -1: - e_x_gap_high = e_x_gap_high + 1 - - k = 0 - step = 1+(i*2) - while k < len1: - for m from e_x_gap_low <= m <= e_gap_low[j]: - if m >= e_gaps1[k+step-1]: - for n from e_gap_high[j] <= n <= e_x_gap_high: - if n-m >= 1: # extractor.py doesn't restrict target-side gap length - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+k, step) - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &m, 1) - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &n, 1) - k = k + step - free(e_gaps1) - e_gaps1 = e_gaps2 - len1 = len2 - - step = 1+(num_gaps*2) - e_gaps2 = <int*> malloc(0) - len2 = 0 - for i from e_high <= i <= e_x_high: - j = 0 - while j < len1: - if i - e_gaps1[j] <= self.train_max_initial_size and i >= e_gaps1[j+step-1]: - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+j, step) - e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &i, 1) - j = j + step - free(e_gaps1) - e_gaps1 = e_gaps2 - len1 = len2 - - step = (num_gaps+1)*2 - i = 0 - - while i < len1: - ephr_arr._clear() - num_chunks = 0 - indexes = [] - for j from 0 <= j < num_gaps+1: - if e_gaps1[i+2*j] < e_gaps1[i+(2*j)+1]: - num_chunks = num_chunks + 1 - for k from e_gaps1[i+2*j] <= k < e_gaps1[i+(2*j)+1]: - indexes.append(k) - ephr_arr._append(self.eid2symid[self.eda.data.arr[e_sent_start+k]]) - if j < num_gaps: - indexes.append(sym.setindex(self.category, e_gap_order[j]+1)) - ephr_arr._append(sym.setindex(self.category, e_gap_order[j]+1)) - i = i + step - if ephr_arr.len <= self.max_target_length and num_chunks <= self.max_target_chunks: - result.append((rule.Phrase(ephr_arr),indexes)) - - free(e_gaps1) - free(e_gap_order) - return result - - cdef create_alignments(self, int* sent_links, int num_links, findexes, eindexes): - #print "create_alignments" - #s = "sent_links = " - #i = 0 - #while (i < num_links*2): - # s = s+"%d-%d " % (sent_links[i],sent_links[i+1]) - # i += 2 - #print s - #print findexes - #print eindexes - - ret = cintlist.CIntList() - for i in xrange(len(findexes)): - s = findexes[i] - if (s<0): - continue - idx = 0 - while (idx < num_links*2): - if (sent_links[idx] == s): - j = eindexes.index(sent_links[idx+1]) - ret.append(i*65536+j) - idx += 2 - return ret - - cdef extract(self, rule.Phrase phrase, Matching* matching, int* chunklen, int num_chunks): - cdef int* sent_links, *e_links_low, *e_links_high, *f_links_low, *f_links_high - cdef int *f_gap_low, *f_gap_high, *e_gap_low, *e_gap_high, num_gaps, gap_start - cdef int i, j, k, e_i, f_i, num_links, num_aligned_chunks, met_constraints - cdef int f_low, f_high, e_low, e_high, f_back_low, f_back_high - cdef int e_sent_start, e_sent_end, f_sent_start, f_sent_end, e_sent_len, f_sent_len - cdef int e_word_count, f_x_low, f_x_high, e_x_low, e_x_high, phrase_len - cdef float pair_count - cdef float available_mass - cdef extracts, phrase_list - cdef cintlist.CIntList fphr_arr - cdef rule.Phrase fphr - cdef reason_for_failure - - fphr_arr = cintlist.CIntList() - phrase_len = phrase.n - extracts = [] - sent_links = self.alignment._get_sent_links(matching.sent_id, &num_links) - - e_sent_start = self.eda.sent_index.arr[matching.sent_id] - e_sent_end = self.eda.sent_index.arr[matching.sent_id+1] - e_sent_len = e_sent_end - e_sent_start - 1 - f_sent_start = self.fda.sent_index.arr[matching.sent_id] - f_sent_end = self.fda.sent_index.arr[matching.sent_id+1] - f_sent_len = f_sent_end - f_sent_start - 1 - available_mass = 1.0 - if matching.sent_id == self.excluded_sent_id: - available_mass = 0.0 - - self.findexes1.reset() - sofar = 0 - for i in xrange(num_chunks): - for j in xrange(chunklen[i]): - self.findexes1.append(matching.arr[matching.start+i]+j-f_sent_start); - sofar += 1 - if (i+1<num_chunks): - self.findexes1.append(phrase[sofar]) - sofar += 1 - - - e_links_low = <int*> malloc(e_sent_len*sizeof(int)) - e_links_high = <int*> malloc(e_sent_len*sizeof(int)) - f_links_low = <int*> malloc(f_sent_len*sizeof(int)) - f_links_high = <int*> malloc(f_sent_len*sizeof(int)) - f_gap_low = <int*> malloc((num_chunks+1)*sizeof(int)) - f_gap_high = <int*> malloc((num_chunks+1)*sizeof(int)) - e_gap_low = <int*> malloc((num_chunks+1)*sizeof(int)) - e_gap_high = <int*> malloc((num_chunks+1)*sizeof(int)) - memset(f_gap_low, 0, (num_chunks+1)*sizeof(int)) - memset(f_gap_high, 0, (num_chunks+1)*sizeof(int)) - memset(e_gap_low, 0, (num_chunks+1)*sizeof(int)) - memset(e_gap_high, 0, (num_chunks+1)*sizeof(int)) - - reason_for_failure = "" - - for i from 0 <= i < e_sent_len: - e_links_low[i] = -1 - e_links_high[i] = -1 - for i from 0 <= i < f_sent_len: - f_links_low[i] = -1 - f_links_high[i] = -1 - - # this is really inefficient -- might be good to - # somehow replace with binary search just for the f - # links that we care about (but then how to look up - # when we want to check something on the e side?) - i = 0 - while i < num_links*2: - f_i = sent_links[i] - e_i = sent_links[i+1] - if f_links_low[f_i] == -1 or f_links_low[f_i] > e_i: - f_links_low[f_i] = e_i - if f_links_high[f_i] == -1 or f_links_high[f_i] < e_i + 1: - f_links_high[f_i] = e_i + 1 - if e_links_low[e_i] == -1 or e_links_low[e_i] > f_i: - e_links_low[e_i] = f_i - if e_links_high[e_i] == -1 or e_links_high[e_i] < f_i + 1: - e_links_high[e_i] = f_i + 1 - i = i + 2 - - als = [] - for x in xrange(matching.start,matching.end): - al = (matching.arr[x]-f_sent_start,f_links_low[matching.arr[x]-f_sent_start]) - als.append(al) - # check all source-side alignment constraints - met_constraints = 1 - if self.require_aligned_terminal: - num_aligned_chunks = 0 - for i from 0 <= i < num_chunks: - for j from 0 <= j < chunklen[i]: - if f_links_low[matching.arr[matching.start+i]+j-f_sent_start] > -1: - num_aligned_chunks = num_aligned_chunks + 1 - break - if num_aligned_chunks == 0: - reason_for_failure = "No aligned terminals" - met_constraints = 0 - if self.require_aligned_chunks and num_aligned_chunks < num_chunks: - reason_for_failure = "Unaligned chunk" - met_constraints = 0 - - if met_constraints and self.tight_phrases: - # outside edge constraints are checked later - for i from 0 <= i < num_chunks-1: - if f_links_low[matching.arr[matching.start+i]+chunklen[i]-f_sent_start] == -1: - reason_for_failure = "Gaps are not tight phrases" - met_constraints = 0 - break - if f_links_low[matching.arr[matching.start+i+1]-1-f_sent_start] == -1: - reason_for_failure = "Gaps are not tight phrases" - met_constraints = 0 - break - - f_low = matching.arr[matching.start] - f_sent_start - f_high = matching.arr[matching.start + matching.size - 1] + chunklen[num_chunks-1] - f_sent_start - if met_constraints: - - if self.find_fixpoint(f_low, f_high, f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, &e_low, &e_high, &f_back_low, &f_back_high, f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - self.train_min_gap_size, 0, - self.max_nonterminals - num_chunks + 1, 1, 1, 0, 0): - gap_error = 0 - num_gaps = 0 - - if f_back_low < f_low: - f_gap_low[0] = f_back_low - f_gap_high[0] = f_low - num_gaps = 1 - gap_start = 0 - phrase_len = phrase_len+1 - if phrase_len > self.max_length: - gap_error = 1 - if self.tight_phrases: - if f_links_low[f_back_low] == -1 or f_links_low[f_low-1] == -1: - gap_error = 1 - reason_for_failure = "Inside edges of preceding subphrase are not tight" - else: - gap_start = 1 - if self.tight_phrases and f_links_low[f_low] == -1: - # this is not a hard error. we can't extract this phrase - # but we still might be able to extract a superphrase - met_constraints = 0 - - for i from 0 <= i < matching.size - 1: - f_gap_low[1+i] = matching.arr[matching.start+i] + chunklen[i] - f_sent_start - f_gap_high[1+i] = matching.arr[matching.start+i+1] - f_sent_start - num_gaps = num_gaps + 1 - - if f_high < f_back_high: - f_gap_low[gap_start+num_gaps] = f_high - f_gap_high[gap_start+num_gaps] = f_back_high - num_gaps = num_gaps + 1 - phrase_len = phrase_len+1 - if phrase_len > self.max_length: - gap_error = 1 - if self.tight_phrases: - if f_links_low[f_back_high-1] == -1 or f_links_low[f_high] == -1: - gap_error = 1 - reason_for_failure = "Inside edges of following subphrase are not tight" - else: - if self.tight_phrases and f_links_low[f_high-1] == -1: - met_constraints = 0 - - if gap_error == 0: - e_word_count = e_high - e_low - for i from 0 <= i < num_gaps: # check integrity of subphrases - if self.find_fixpoint(f_gap_low[gap_start+i], f_gap_high[gap_start+i], - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low+gap_start+i, e_gap_high+gap_start+i, - f_gap_low+gap_start+i, f_gap_high+gap_start+i, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0) == 0: - gap_error = 1 - reason_for_failure = "Subphrase [%d, %d] failed integrity check" % (f_gap_low[gap_start+i], f_gap_high[gap_start+i]) - break - - if gap_error == 0: - i = 1 - self.findexes.reset() - if f_back_low < f_low: - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.append(sym.setindex(self.category, i)) - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - if f_back_high > f_high: - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - - fphr = rule.Phrase(fphr_arr) - if met_constraints: - phrase_list = self.extract_phrases(e_low, e_high, e_gap_low + gap_start, e_gap_high + gap_start, e_links_low, num_gaps, - f_back_low, f_back_high, f_gap_low + gap_start, f_gap_high + gap_start, f_links_low, - matching.sent_id, e_sent_len, e_sent_start) - #print "e_low=%d, e_high=%d, gap_start=%d, num_gaps=%d, f_back_low=%d, f_back_high=%d" & (e_low, e_high, gap_start, num_gaps, f_back_low, f_back_high) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - reason_for_failure = "Didn't extract anything from [%d, %d] -> [%d, %d]" % (f_back_low, f_back_high, e_low, e_high) - for (phrase2,eindexes) in phrase_list: - als1 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als1))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high)) - #print "extract_phrases1: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high) - - if (num_gaps < self.max_nonterminals and - phrase_len < self.max_length and - f_back_high - f_back_low + self.train_min_gap_size <= self.train_max_initial_size): - if (f_back_low == f_low and - f_low >= self.train_min_gap_size and - ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_back_high-1] != -1))): - f_x_low = f_low-self.train_min_gap_size - met_constraints = 1 - if self.tight_phrases: - while f_x_low >= 0 and f_links_low[f_x_low] == -1: - f_x_low = f_x_low - 1 - if f_x_low < 0 or f_back_high - f_x_low > self.train_max_initial_size: - met_constraints = 0 - - if (met_constraints and - self.find_fixpoint(f_x_low, f_back_high, - f_links_low, f_links_high, e_links_low, e_links_high, - e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 1, 1, 1, 1, 0, 1, 0) and - ((not self.tight_phrases) or f_links_low[f_x_low] != -1) and - self.find_fixpoint(f_x_low, f_low, # check integrity of new subphrase - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0)): - fphr_arr._clear() - i = 1 - self.findexes.reset() - self.findexes.append(sym.setindex(self.category, i)) - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - if f_back_high > f_high: - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - fphr = rule.Phrase(fphr_arr) - phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+1, - f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, matching.sent_id, - e_sent_len, e_sent_start) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - for phrase2,eindexes in phrase_list: - als2 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als2))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) - #print "extract_phrases2: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) - - if (f_back_high == f_high and - f_sent_len - f_high >= self.train_min_gap_size and - ((not self.tight_phrases) or (f_links_low[f_high] != -1 and f_links_low[f_back_low] != -1))): - f_x_high = f_high+self.train_min_gap_size - met_constraints = 1 - if self.tight_phrases: - while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: - f_x_high = f_x_high + 1 - if f_x_high > f_sent_len or f_x_high - f_back_low > self.train_max_initial_size: - met_constraints = 0 - - if (met_constraints and - self.find_fixpoint(f_back_low, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 1, 1, 1, 0, 1, 1, 0) and - ((not self.tight_phrases) or f_links_low[f_x_high-1] != -1) and - self.find_fixpoint(f_high, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low+gap_start+num_gaps, e_gap_high+gap_start+num_gaps, - f_gap_low+gap_start+num_gaps, f_gap_high+gap_start+num_gaps, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0)): - fphr_arr._clear() - i = 1 - self.findexes.reset() - if f_back_low < f_low: - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.append(sym.setindex(self.category, i)) - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - fphr = rule.Phrase(fphr_arr) - phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low+gap_start, e_gap_high+gap_start, e_links_low, num_gaps+1, - f_x_low, f_x_high, f_gap_low+gap_start, f_gap_high+gap_start, f_links_low, - matching.sent_id, e_sent_len, e_sent_start) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - for phrase2, eindexes in phrase_list: - als3 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als3))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) - #print "extract_phrases3: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) - if (num_gaps < self.max_nonterminals - 1 and - phrase_len+1 < self.max_length and - f_back_high == f_high and - f_back_low == f_low and - f_back_high - f_back_low + (2*self.train_min_gap_size) <= self.train_max_initial_size and - f_low >= self.train_min_gap_size and - f_high <= f_sent_len - self.train_min_gap_size and - ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_high] != -1))): - - met_constraints = 1 - f_x_low = f_low-self.train_min_gap_size - if self.tight_phrases: - while f_x_low >= 0 and f_links_low[f_x_low] == -1: - f_x_low = f_x_low - 1 - if f_x_low < 0: - met_constraints = 0 - - f_x_high = f_high+self.train_min_gap_size - if self.tight_phrases: - while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: - f_x_high = f_x_high + 1 - if f_x_high > f_sent_len or f_x_high - f_x_low > self.train_max_initial_size: - met_constraints = 0 - - if (met_constraints and - self.find_fixpoint(f_x_low, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 1, 1, 2, 1, 1, 1, 1) and - ((not self.tight_phrases) or (f_links_low[f_x_low] != -1 and f_links_low[f_x_high-1] != -1)) and - self.find_fixpoint(f_x_low, f_low, - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0) and - self.find_fixpoint(f_high, f_x_high, - f_links_low, f_links_high, e_links_low, e_links_high, - -1, -1, e_gap_low+1+num_gaps, e_gap_high+1+num_gaps, - f_gap_low+1+num_gaps, f_gap_high+1+num_gaps, - f_sent_len, e_sent_len, - self.train_max_initial_size, self.train_max_initial_size, - 0, 0, 0, 0, 0, 0, 0)): - fphr_arr._clear() - i = 1 - self.findexes.reset() - self.findexes.append(sym.setindex(self.category, i)) - fphr_arr._append(sym.setindex(self.category, i)) - i = i+1 - self.findexes.extend(self.findexes1) - for j from 0 <= j < phrase.n: - if sym.isvar(phrase.syms[j]): - fphr_arr._append(sym.setindex(self.category, i)) - i = i + 1 - else: - fphr_arr._append(phrase.syms[j]) - fphr_arr._append(sym.setindex(self.category, i)) - self.findexes.append(sym.setindex(self.category, i)) - fphr = rule.Phrase(fphr_arr) - phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+2, - f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, - matching.sent_id, e_sent_len, e_sent_start) - if len(phrase_list) > 0: - pair_count = available_mass / len(phrase_list) - else: - pair_count = 0 - for phrase2, eindexes in phrase_list: - als4 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) - extracts.append((fphr, phrase2, pair_count, tuple(als4))) - if self.extract_file: - self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) - #print "extract_phrases4 %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) - else: - reason_for_failure = "Unable to extract basic phrase" - - free(sent_links) - free(f_links_low) - free(f_links_high) - free(e_links_low) - free(e_links_high) - free(f_gap_low) - free(f_gap_high) - free(e_gap_low) - free(e_gap_high) - - if self.sample_file is not None: - self.sample_file.write("%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure)) - - #print "%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure) - - - return extracts - diff --git a/sa-extract/sa-compile.pl b/sa-extract/sa-compile.pl deleted file mode 100755 index 1cae83a7..00000000 --- a/sa-extract/sa-compile.pl +++ /dev/null @@ -1,322 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use Getopt::Long; - -my $cwd; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $cwd = cwd(); } - -my $rootdir = `dirname $0`; chomp $rootdir; -my $compile = "$rootdir/compile_bin.py"; -my $lcp = "$rootdir/lcp_ops.py"; -die "Can't find $compile" unless -f $compile; -die "Can't execute $compile" unless -x $compile; - -sub print_help; -sub cleanup; - -my $alignment; -my $bitext; -my $catalog; -my $dryrun = 0; -my $group; -my $help = 0; -my $ini = "$rootdir/extract.ini"; -my $lm; -my $precomp; -my $no_ini = 0; -my $remove; -my $type; -my $local_only = 1; -my $output; - -# Process command-line options -if (GetOptions( - "alignment=s" => \$alignment, - "bitext=s" => \$bitext, - "help" => \$help, - "ini=s" => \$ini, - "output=s" => \$output, - "precomp-options=s" => \$precomp, - "no-ini" => \$no_ini, -) == 0 || $help == 1 || @ARGV > 0){ - print_help; - die "\n"; -} - -open(INI, $ini) or die "Can't read $ini: $!"; - -$bitext || die "You must specify a bitext with -b\n"; -$alignment || die "You must specify an alignment with -a\n"; - -my $top_dir; -if (defined $output) { - $top_dir = $output; -} else { - $top_dir = "$cwd/sa-compiled"; -} - -my $type_dir = "$top_dir"; - -my $bitext_name; -my $bitext_f_file; -my $bitext_e_file; -my $bitext_dir; -if ($bitext){ - if ($bitext =~ /(.*)=(.*),(.*)/){ - $bitext_name = $1; - $bitext_f_file = $2; - $bitext_e_file = $3; - -e $bitext_f_file || die "Could not find file $bitext_f_file\n"; - -e $bitext_e_file || die "Could not find file $bitext_e_file\n"; - } else { - $bitext_name = $bitext; - } - - $bitext_dir = "$type_dir/bitext/$bitext_name"; - if ($bitext_f_file){ - if (-e $bitext_dir) { - die "Bitext $bitext_name already exists\n"; - } - } else { - unless (-e $bitext_dir){ - die "No bitext $bitext_name. You must specify bitext files with -b\n"; - } - } -} - -my $max_nt = 2; -my $max_len = 5; -my $max_size = 15; -my $min_gap = 1; -my $rank1 = 100; -my $rank2 = 10; -my $precomp_file; -if ($precomp){ - unless ($bitext_name){ - die "You must specify a bitext with -b if using -p\n"; - } - my @precomp_args = split(/,/, $precomp); - my $precomp_arg; - for $precomp_arg (@precomp_args){ - if ($precomp_arg =~ /(.*)=(.*)/){ - my $key = $1; - my $value = $2; - unless ($value =~ /^\d+$/){ - die "Value for -p option must be a positive integer, found $value\n"; - } - if ($key eq "max-len"){ $max_len = $value; } - elsif ($key eq "max-nt"){ $max_nt = $value; } - elsif ($key eq "max-size"){ $max_size = $value; } - elsif ($key eq "min-gap"){ $min_gap = $value; } - elsif ($key eq "rank1"){ $rank1 = $value; } - elsif ($key eq "rank2"){ $rank2 = $value; } - else{ - die "Unknown option $key given for -p\n"; - } - } else { - die "When using -p, you must specify key-value pairs using syntax: <key1>=<value1>,...,<keyN>=<valueN>\n"; - } - } -} -my $precomp_compile_needed = 0; -if ($bitext_name){ - $precomp_file = "$bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin"; - unless (-e $precomp_file){ - $precomp_compile_needed = 1; - } -} - -my $alignment_name; -my $alignment_file; -my $alignment_dir; -if ($alignment){ - $bitext || die "Specified alignment $alignment without specifying bitext using -b\n"; - if ($alignment =~ /(.*)=(.*)/){ - $alignment_name = $1; - $alignment_file = $2; - -e $alignment_file || die "Could not find file $alignment_file\n"; - } else { - $alignment_name = $alignment; - } - - $alignment_dir = "$bitext_dir/a/$alignment_name"; - if ($alignment_file){ - if (-e $alignment_dir){ - die "Alignment $alignment_name already exists for bitext $bitext_name\n"; - } - } else { - require_top_dirs(); - unless (-e $alignment_dir){ - die "No alignment $alignment_name for bitext $bitext_name\n"; - } - } -} - -if ($bitext_name){ - print STDERR " from files $bitext_f_file and $bitext_e_file\n"; -} else { - print " No bitext\n"; -} -if ($precomp_compile_needed){ - print STDERR " Precompilation needed: max-len=$max_len, max-nt=$max_nt, max-size=$max_size, min-gap=$min_gap, rank1=$rank1, rank2=$rank2\n"; -} -if ($alignment_name){ - print STDERR " Alignment = $alignment_name"; - if ($alignment_file){ - print STDERR " from file $alignment_file\n"; - } -} else { - print STDERR " No alignment\n"; -} - -my $script; -my $compile_dir; -$SIG{INT} = "cleanup"; -$SIG{TERM} = "cleanup"; -$SIG{HUP} = "cleanup"; - - if ($bitext_e_file || $precomp_compile_needed || $alignment_file){ - my $compiled_e_file; - my $compiled_f_file; - - $compile_dir = $top_dir; - my $compile_top_dir = "$compile_dir"; - - my $compile_bitext_dir = "$compile_top_dir/bitext/$bitext_name"; - if ($bitext_e_file){ - `mkdir -p $compile_bitext_dir`; - print STDERR "\nCompiling bitext (f side)...\n"; - `$compile -s $bitext_f_file $compile_bitext_dir/f.sa.bin`; - die "Command failed: $!" unless $? == 0; - print STDERR "\nCompiling bitext (e side)...\n"; - `$compile -d $bitext_e_file $compile_bitext_dir/e.bin`; - die "Command failed: $!" unless $? == 0; - - $compiled_f_file = "$compile_bitext_dir/f.sa.bin"; - $compiled_e_file = "$compile_bitext_dir/e.bin"; - } else { # bitext already compiled - $compiled_f_file = "$bitext_dir/f.sa.bin"; - $compiled_e_file = "$bitext_dir/e.bin"; - } - - if ($precomp_compile_needed){ - `mkdir -p $compile_bitext_dir`; - my $top_stats_file = "$compile_bitext_dir/f.top.$rank1"; - my $compiled_precomp_file = "$compile_bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin"; - my $cmd = "$lcp -t 4 $compiled_f_file | sort -nr | head -$rank1 > $top_stats_file"; - print STDERR "$cmd\n"; - `$cmd`; - die "Command failed: $cmd" unless $? == 0; - `$compile -r max-len=$max_len max-nt=$max_nt max-size=$max_size min-gap=$min_gap rank1=$rank1 rank2=$rank2 sa=$compiled_f_file $top_stats_file $compiled_precomp_file`; - die "Command failed: $!" unless $? == 0; - } - - if ($alignment_file){ - my $compile_alignment_dir = "$compile_top_dir/bitext/$bitext_name/a/$alignment_name"; - `mkdir -p $compile_alignment_dir`; - print STDERR "\nCompiling alignment...\n"; - my $cmd= "$compile -a $alignment_file $compile_alignment_dir/a.bin"; - print STDERR " $cmd\n"; - `$cmd`; - die "Command failed: $!" unless $? == 0; - - print STDERR "\nCompiling lexical weights file...\n"; - $cmd="$compile -x $compiled_f_file $compiled_e_file $compile_alignment_dir/a.bin $compile_alignment_dir/lex.bin"; - print STDERR " $cmd\n"; - `$cmd`; - die "Command failed: $!" unless $? == 0; - } - - chdir $compile_dir; - print STDERR "Compiling done: $compile_dir\n"; - } - - unless ($no_ini){ - my $line; - while($line=<INI>){ - $line =~ s/^([^#]*a_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/a.bin$3/; - $line =~ s/^([^#]*lex_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/lex.bin$3/; - $line =~ s/^([^#]*f_sa_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/f.sa.bin$3/; - $line =~ s/^([^#]*e_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/e.bin$3/; - $line =~ s/^([^#]*precompute_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin$3/; - - $line =~ s/^([^#]*max_len\s*=\s*)(.*)(\s*)$/$1$max_len$3/; - $line =~ s/^([^#]*max_nt\s*=\s*)(.*)(\s*)$/$1$max_nt$3/; - $line =~ s/^([^#]*max_size\s*=\s*)(.*)(\s*)$/$1$max_size$3/; - $line =~ s/^([^#]*min_gap\s*=\s*)(.*)(\s*)$/$1$min_gap$3/; - $line =~ s/^([^#]*rank1\s*=\s*)(.*)(\s*)$/$1$rank1$3/; - $line =~ s/^([^#]*rank2\s*=\s*)(.*)(\s*)$/$1$rank2$3/; - - print $line; - } - } - -exit(0); - -sub cleanup { - die "Cleanup.\n"; -} - -sub print_help -{ - my $name = `basename $0`; chomp $name; - print << "Help"; - -usage: $name [options] - - Manage compilation of SA-Hiero files and creation of ini files. - In the default usage, the command deploys a set of files needed - to create a system, and writes an ini for the system on stdout. - -options: - - -a, --alignment <name>[=<filename>] - Name of an alignment of a bitext (which must be specified - with -b unless using the -c flag). If used with -r, the - alignment is removed from the deployment. If used with -c, - only alignments with this name are listed. If a filename is - given, then the file will be deployed using the name. - - -b, --bitext <name>[=<f file>,<e file>] - Name of a bitext for a particular system type (which must be - specified with -t unless using the -c flag). If used with -r, - the bitext is removed from the deployment. If used with -c, - only bitexts with the name are listed. If a filename is given, - then the file will be deployed using the name. - - -h, --help - Prints this message. - - -i, --ini <filename> - Use a specific ini file as the template for a system, rather than - the default ini file. - - -p, --precomp-options <key1>=<value1>[,<key2>=<value2>,...,<keyN>=<valueN>] - Set parameters of the grammar. This must be set by $name because - many parameters involve precomputation. There are six keys that can - be set: - max-len: maximum number of symbols (T and NT) in a grammar rule - max-nt: maximum number of nonterminals in a grammar rule - max-size: maximum span of a grammar rule extracted from training - min-gap: minimum gap spanned by a nonterminal in training - rank1: number of frequent words to precompute collocations for. - rank2: number of super-frequent words to precompute triple - collocations for. - All values must be positive integers. If not specified, defaults are: - max-len = 5 - max-nt = 2 (>2 not supported) - max-size = 10 - min-gap = 2 - rank1 = 100 (>300 not recommended) - rank2 = 10 (>10 not recommended) - - -n, --no-ini - Do not generate an ini file on stdout. If this option is used, then - the requirement to specify a full system is relaxed. Therefore, this - option can be used when the sole objective is deployment of files. - - -o, --output-dir - Write the compiled model to this directory. - -Help -} diff --git a/sa-extract/sa2cdec.py b/sa-extract/sa2cdec.py deleted file mode 100755 index 55fb19f3..00000000 --- a/sa-extract/sa2cdec.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python -import sys - -featNames = [ line.strip() for line in open(sys.argv[1]) if not line.startswith('#') ] - -for line in sys.stdin: - try: - (lhs, src, tgt, feats, align) = line.strip("\n").split(' ||| ') - except: - print >>sys.stderr, 'WARNING: No alignments:', line - try: - (lhs, src, tgt, feats) = line.strip().split(' ||| ') - align = '' - except: - print >>sys.stderr, "ERROR: Malformed line:", line - raise - featValues = feats.split(' ') - namedFeats = ' '.join( name+"="+value for (name, value) in zip(featNames, featValues) ) - print " ||| ".join( (lhs, src, tgt, namedFeats, align) ) diff --git a/sa-extract/sa_feat_names.txt b/sa-extract/sa_feat_names.txt deleted file mode 100644 index 02c137d7..00000000 --- a/sa-extract/sa_feat_names.txt +++ /dev/null @@ -1,7 +0,0 @@ -EGivenFCoherent -SampleCountF -CountEF -MaxLexFGivenE -MaxLexEGivenF -IsSingletonF -IsSingletonFE diff --git a/sa-extract/setup.cfg b/sa-extract/setup.cfg deleted file mode 100644 index 8f696136..00000000 --- a/sa-extract/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[build_ext] -inplace=1 diff --git a/sa-extract/setup.py b/sa-extract/setup.py deleted file mode 100644 index cdcbfb54..00000000 --- a/sa-extract/setup.py +++ /dev/null @@ -1,45 +0,0 @@ -from distutils.core import setup, Extension -from distutils.util import get_platform -import os.path - -cstrmap_module = Extension('cstrmap', sources = ['cstrmap.c', 'strmap.cc']) -setup (name = 'CStringMap', version = '1.0', description = 'C string->int map', ext_modules = [cstrmap_module]) - -rule_module = Extension('rule', - sources = ['rule.c', 'strutil.c']) -setup (name = 'Rule', version = '1.0', description = 'rule class', ext_modules = [rule_module]) - -sym_module = Extension('sym', - sources = ['sym.c']) -setup (name = 'Sym', version = '1.0', description = 'symbol class', ext_modules = [sym_module]) - -cdat_module = Extension('cdat', sources = ['cdat.c']) -setup(name = "CDat", version = '1.0', description = 'C Data class', ext_modules = [cdat_module]) - -cintlist_module = Extension('cintlist', sources = ['cintlist.c']) -setup(name = "CIntList", version = '1.0', description = 'C int array/list class', ext_modules = [cintlist_module]) - -cfloatlist_module = Extension('cfloatlist', sources = ['cfloatlist.c']) -setup(name = "CFloatList", version = '1.0', description = 'C float array/list class', ext_modules = [cfloatlist_module]) - -calignment_module = Extension('calignment', sources = ['calignment.c']) -setup(name = "CAlignment", version = '1.0', description = 'C alignment class', ext_modules = [calignment_module]) - -csuf_module = Extension('csuf', sources = ['csuf.c']) -setup(name = "CSuffixArray", version = '1.0', description = 'C suffix array class', ext_modules = [csuf_module]) - -clex_module = Extension('clex', sources = ['clex.c']) -setup(name = "CLex", version = '1.0', description = 'C lexical class', ext_modules = [clex_module]) - -factory_module = Extension('rulefactory', sources = ['rulefactory.c']) -setup(name = "RuleFactory", version = '1.0', description = 'C rule factory classes', ext_modules = [factory_module]) - -cveb_module = Extension('cveb', sources = ['cveb.c']) -setup(name = "CVEB", version = '1.0', description = 'C impl. of van Emde Boas tree', ext_modules = [cveb_module]) - -lcp_module = Extension('lcp', sources = ['lcp.c']) -setup(name = "LCP", version = '1.0', description = 'C impl. of LCP', ext_modules = [lcp_module]) - -precomp_module = Extension('precomputation', sources = ['precomputation.c']) -setup(name = "Precomputation", version = '1.0', description = 'Precomputation Algorithm', ext_modules = [precomp_module]) - diff --git a/sa-extract/sgml.py b/sa-extract/sgml.py deleted file mode 100644 index 2db8b5dc..00000000 --- a/sa-extract/sgml.py +++ /dev/null @@ -1,194 +0,0 @@ -import sys, sgmllib, xml.sax.saxutils, sym - -def attrs_to_str(d): - if len(d) == 0: - return "" - l = [""]+["%s=%s" % (name, xml.sax.saxutils.quoteattr(value)) for (name, value) in d] - return " ".join(l) - -def attrs_to_dict(a): - d = {} - for (name, value) in a: - if d.has_key(name.lower()): - raise ValueError, "duplicate attribute names" - d[name.lower()] = value - return d - -class Sentence(object): - def __init__(self, words=None, meta=None): - if words is not None: - self.words = list(words) - else: - self.words = [] - if meta is not None: - self.meta = meta - else: - self.meta = [] - - def copy(self): - return Sentence(self.words, list(self.meta)) - - def mark(self, tag, attrs): - self.meta.append((tag, attrs, 0, len(self.words))) - - def getmark(self): - if len(self.meta) > 0: - (tag, attrs, i, j) = self.meta[-1] - if i == 0 and j == len(self.words): - return (tag, attrs) - else: - return None - else: - return None - - def unmark(self): - mark = self.getmark() - if mark is not None: - self.meta = self.meta[:-1] - return mark - - def __cmp__(self, other): - return cmp((self.words, self.meta), (other.words, other.meta)) - - def __str__(self): - def cmp_spans((tag1,attr1,i1,j1),(tag2,attr2,i2,j2)): - if i1==i2<=j1==j2: - return 0 - elif i2<=i1<=j1<=j2: - return -1 - elif i1<=i2<=j2<=j1: - return 1 - else: - return cmp((i1,j1),(i2,j2)) # don't care - # this guarantees that equal spans will come out nested - # we want the later spans to be outer - # this relies on stable sort - open = [[] for i in xrange(len(self.words)+1)] - # there seems to be a bug still with empty spans - empty = [[] for i in xrange(len(self.words)+1)] - close = [[] for j in xrange(len(self.words)+1)] - for (tag,attrs,i,j) in sorted(self.meta, cmp=cmp_spans): - if i == j: - # do we want these to nest? - empty[i].append("<%s%s></%s>\n" % (tag, attrs_to_str(attrs), tag)) - else: - open[i].append("<%s%s>\n" % (tag, attrs_to_str(attrs))) - close[j].append("</%s>\n" % tag) - - result = [] - if len(empty[0]) > 0: - result.extend(empty[0]) - for i in xrange(len(self.words)): - if i > 0: - result.append(" ") - result.extend(reversed(open[i])) - result.append(xml.sax.saxutils.escape(sym.tostring(self.words[i]))) - result.extend(close[i+1]) - if len(empty[i+1]) > 0: - result.extend(empty[i+1]) - - return "".join(result) - - def __add__(self, other): - if type(other) in (list, tuple): - return Sentence(self.words + list(other), self.meta) - else: - othermeta = [(tag, attrs, i+len(self.words), j+len(self.words)) for (tag, attrs, i, j) in other.meta] - return Sentence(self.words + other.words, self.meta+othermeta) - -def read_raw(f): - """Read a raw file into a list of Sentences.""" - if type(f) is str: - f = file(f, "r") - i = 0 - line = f.readline() - while line != "": - sent = process_sgml_line(line, i) - mark = sent.getmark() - if mark is None: - sent.mark('seg', [('id',str(i))]) - else: - (tag, attrs) = mark - if tag == "seg" and not attrs_to_dict(attrs).has_key('id'): - x = ('id',str(i)) - attrs.append(x) - sent.mark('seg', attrs) - if tag != "seg": - sent.mark('seg', [('id',str(i))]) - yield sent - i += 1 - line = f.readline() - -def process_sgml_line(line, id=None): - p = DatasetParser(None) - p.pos = 0 - p.words = [] - p.meta = [] - p.feed(line) - p.close() - sent = Sentence(p.words, p.meta) - return sent - -class DatasetParser(sgmllib.SGMLParser): - def __init__(self, set): - sgmllib.SGMLParser.__init__(self) - self.words = None - self.mystack = [] - self.string = None - self.convref = d = {"amp":"&", "lt":"<", "gt":">", "quot":'"', "squot":"'"} - def close(self): - self.convert() - sgmllib.SGMLParser.close(self) - - def handle_starttag(self, tag, method, attrs): - thing = method(attrs) - self.mystack.append(thing) - - def handle_endtag(self, tag, method): - thing = self.mystack.pop() - method(thing) - - def unknown_starttag(self, tag, attrs): - thing = self.start(tag, attrs) - self.mystack.append(thing) - - def unknown_endtag(self, tag): - thing = self.mystack.pop() - self.end(tag, thing) - - def start(self, tag, attrs): - self.convert() - if self.words is not None: - return (tag, attrs, self.pos, None) - else: - return None - - def convert(self): - if self.words is not None and self.string is not None: - words = self.string.split() - self.pos += len(words) - self.words.extend(words) - self.string = None - - def end(self, tag, thing): - self.convert() - if self.words is not None: - (tag, attrs, i, j) = thing - self.meta.append((tag, attrs, i, self.pos)) - - def handle_data(self, s): - if self.words is not None: - if (self.string is None): - self.string = s - else: - self.string += s - - def handle_entityref(self, ref): - # s=self.convert_entityref(ref) # if python 2.5 - s=self.convref[ref] - if self.words is not None: - if (self.string is None): - self.string = s - else: - self.string += s - diff --git a/sa-extract/strmap.cc b/sa-extract/strmap.cc deleted file mode 100644 index 5040477e..00000000 --- a/sa-extract/strmap.cc +++ /dev/null @@ -1,232 +0,0 @@ -#include "strmap.h" - -#include <vector> -#include <string> -#include <tr1/unordered_map> -#include <stdint.h> - -using namespace std; -using namespace std::tr1; - -#undef HAVE_64_BITS - -#if INTPTR_MAX == INT32_MAX -# define HAVE_64_BITS 0 -#elif INTPTR_MAX >= INT64_MAX -# define HAVE_64_BITS 1 -#else -# error "couldn't tell if HAVE_64_BITS from INTPTR_MAX INT32_MAX INT64_MAX" -#endif - -typedef uintptr_t MurmurInt; - -// MurmurHash2, by Austin Appleby - -static const uint32_t DEFAULT_SEED=2654435769U; - -#if HAVE_64_BITS -//MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED); - -inline uint64_t MurmurHash64( const void * key, int len, unsigned int seed=DEFAULT_SEED ) -{ - const uint64_t m = 0xc6a4a7935bd1e995ULL; - const int r = 47; - - uint64_t h = seed ^ (len * m); - - const uint64_t * data = (const uint64_t *)key; - const uint64_t * end = data + (len/8); - - while(data != end) - { - uint64_t k = *data++; - - k *= m; - k ^= k >> r; - k *= m; - - h ^= k; - h *= m; - } - - const unsigned char * data2 = (const unsigned char*)data; - - switch(len & 7) - { - case 7: h ^= uint64_t(data2[6]) << 48; - case 6: h ^= uint64_t(data2[5]) << 40; - case 5: h ^= uint64_t(data2[4]) << 32; - case 4: h ^= uint64_t(data2[3]) << 24; - case 3: h ^= uint64_t(data2[2]) << 16; - case 2: h ^= uint64_t(data2[1]) << 8; - case 1: h ^= uint64_t(data2[0]); - h *= m; - }; - - h ^= h >> r; - h *= m; - h ^= h >> r; - - return h; -} - -inline uint32_t MurmurHash32(void const *key, int len, uint32_t seed=DEFAULT_SEED) -{ - return (uint32_t) MurmurHash64(key,len,seed); -} - -inline MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED) -{ - return MurmurHash64(key,len,seed); -} - -#else -// 32-bit - -// Note - This code makes a few assumptions about how your machine behaves - -// 1. We can read a 4-byte value from any address without crashing -// 2. sizeof(int) == 4 -inline uint32_t MurmurHash32 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) -{ - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - - const uint32_t m = 0x5bd1e995; - const int r = 24; - - // Initialize the hash to a 'random' value - - uint32_t h = seed ^ len; - - // Mix 4 bytes at a time into the hash - - const unsigned char * data = (const unsigned char *)key; - - while(len >= 4) - { - uint32_t k = *(uint32_t *)data; - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - // Handle the last few bytes of the input array - - switch(len) - { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; - h *= m; - }; - - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; -} - -inline MurmurInt MurmurHash ( const void * key, int len, uint32_t seed=DEFAULT_SEED) { - return MurmurHash32(key,len,seed); -} - -// 64-bit hash for 32-bit platforms - -inline uint64_t MurmurHash64 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) -{ - const uint32_t m = 0x5bd1e995; - const int r = 24; - - uint32_t h1 = seed ^ len; - uint32_t h2 = 0; - - const uint32_t * data = (const uint32_t *)key; - - while(len >= 8) - { - uint32_t k1 = *data++; - k1 *= m; k1 ^= k1 >> r; k1 *= m; - h1 *= m; h1 ^= k1; - len -= 4; - - uint32_t k2 = *data++; - k2 *= m; k2 ^= k2 >> r; k2 *= m; - h2 *= m; h2 ^= k2; - len -= 4; - } - - if(len >= 4) - { - uint32_t k1 = *data++; - k1 *= m; k1 ^= k1 >> r; k1 *= m; - h1 *= m; h1 ^= k1; - len -= 4; - } - - switch(len) - { - case 3: h2 ^= ((unsigned char*)data)[2] << 16; - case 2: h2 ^= ((unsigned char*)data)[1] << 8; - case 1: h2 ^= ((unsigned char*)data)[0]; - h2 *= m; - }; - - h1 ^= h2 >> 18; h1 *= m; - h2 ^= h1 >> 22; h2 *= m; - h1 ^= h2 >> 17; h1 *= m; - h2 ^= h1 >> 19; h2 *= m; - - uint64_t h = h1; - - h = (h << 32) | h2; - - return h; -} - -#endif -//32bit - -struct MurmurHasher { - size_t operator()(const string& s) const { - return MurmurHash(s.c_str(), s.size()); - } -}; - -struct StrMap { - StrMap() { keys_.reserve(10000); keys_.push_back("<bad0>"); map_[keys_[0]] = 0; } - unordered_map<string, int, MurmurHasher> map_; - vector<string> keys_; -}; - -StrMap* stringmap_new() { - return new StrMap; -} - -void stringmap_delete(StrMap *vocab) { - delete vocab; -} - -int stringmap_index(StrMap *vocab, char *s) { - int& cell = vocab->map_[s]; - if (!cell) { - cell = vocab->keys_.size(); - vocab->keys_.push_back(s); - } - return cell; -} - -char* stringmap_word(StrMap *vocab, int i) { - return const_cast<char *>(vocab->keys_[i].c_str()); -} - diff --git a/sa-extract/strmap.h b/sa-extract/strmap.h deleted file mode 100644 index a218a4c0..00000000 --- a/sa-extract/strmap.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _STRMAP_H_ -#define _STRMAP_H_ - -#ifdef __cplusplus - extern "C" { -#else - typedef struct StrMap StrMap; /* dummy type to stand in for class */ -#endif - -struct StrMap; - -StrMap* stringmap_new(); -void stringmap_delete(StrMap *vocab); -int stringmap_index(StrMap *vocab, char *s); -char* stringmap_word(StrMap *vocab, int i); - -#ifdef __cplusplus - } -#endif - - -#endif diff --git a/sa-extract/strutil.c b/sa-extract/strutil.c deleted file mode 100644 index 456de87a..00000000 --- a/sa-extract/strutil.c +++ /dev/null @@ -1,63 +0,0 @@ -#include <string.h> -#include <stdlib.h> - -/* Like strsep(3) except that the delimiter is a string, not a set of characters. -*/ -char *strstrsep(char **stringp, const char *delim) { - char *match, *save; - save = *stringp; - if (*stringp == NULL) - return NULL; - match = strstr(*stringp, delim); - if (match == NULL) { - *stringp = NULL; - return save; - } - *match = '\0'; - *stringp = match + strlen(delim); - return save; -} - -static char **words = NULL; -static int max_words; -char **split(char *s, const char *delim, int *pn) { - int i; - char *tok, *rest; - - if (words == NULL) { - max_words = 10; - words = malloc(max_words*sizeof(char *)); - } - i = 0; - rest = s; - while ((tok = (delim ? strstrsep(&rest, delim) : strsep(&rest, " \t\n"))) != NULL) { - if (!delim && !*tok) // empty token - continue; - while (i+1 >= max_words) { - max_words *= 2; - words = realloc(words, max_words*sizeof(char *)); - } - words[i] = tok; - i++; - } - words[i] = NULL; - if (pn != NULL) - *pn = i; - return words; -} - -inline int isspace(char c) { - return (c == ' ' || c == '\t' || c == '\n'); -} - -char *strip(char *s) { - int n; - while (isspace(*s) && *s != '\0') - s++; - n = strlen(s); - while (n > 0 && isspace(s[n-1])) { - s[n-1] = '\0'; - n--; - } - return s; -} diff --git a/sa-extract/strutil.h b/sa-extract/strutil.h deleted file mode 100644 index 94a77033..00000000 --- a/sa-extract/strutil.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef STRUTIL_H -#define STRUTIL_H - -char *strstrsep(char **stringp, const char *delim); -char *strip(char *s); -char **split(char *s, const char *delim, int *pn); - -#endif diff --git a/sa-extract/sym.pxd b/sa-extract/sym.pxd deleted file mode 100644 index d0650f46..00000000 --- a/sa-extract/sym.pxd +++ /dev/null @@ -1,17 +0,0 @@ -cimport cstrmap - -cdef class Alphabet: - cdef readonly cstrmap.StringMap terminals, nonterminals - cdef int first_nonterminal, last_nonterminal - cdef int isvar(self, int sym) - cdef int isword(self, int sym) - cdef int getindex(self, int sym) - cdef int setindex(self, int sym, int ind) - cdef int clearindex(self, int sym) - cdef int match(self, int sym1, int sym2) - cdef char* tocat(self, int sym) - cdef int fromcat(self, char *s) - cdef char* tostring(self, int sym) - cdef int fromstring(self, char *s, int terminal) - - diff --git a/sa-extract/sym.pyx b/sa-extract/sym.pyx deleted file mode 100644 index 264cfcd9..00000000 --- a/sa-extract/sym.pyx +++ /dev/null @@ -1,155 +0,0 @@ -from libc.string cimport strrchr, strstr, strcpy, strlen -from libc.stdlib cimport malloc, realloc, strtol - -cdef int index_shift, index_mask, n_index -index_shift = 3 -n_index = 1<<index_shift -index_mask = (1<<index_shift)-1 -cdef id2sym -id2sym = {} - -cdef class Alphabet: - def __cinit__(self): - self.terminals = cstrmap.StringMap() - self.nonterminals = cstrmap.StringMap() - - def __init__(self): - self.first_nonterminal = -1 - - def __dealloc__(self): - pass - - cdef int isvar(self, int sym): - return sym < 0 - - cdef int isword(self, int sym): - return sym >= 0 - - cdef int getindex(self, int sym): - return -sym & index_mask - - cdef int setindex(self, int sym, int ind): - return -(-sym & ~index_mask | ind) - - cdef int clearindex(self, int sym): - return -(-sym& ~index_mask) - - cdef int match(self, int sym1, int sym2): - return self.clearindex(sym1) == self.clearindex(sym2); - - cdef char* tocat(self, int sym): - return self.nonterminals.word((-sym >> index_shift)-1) - - cdef int fromcat(self, char *s): - cdef int i - i = self.nonterminals.index(s) - if self.first_nonterminal == -1: - self.first_nonterminal = i - if i > self.last_nonterminal: - self.last_nonterminal = i - return -(i+1 << index_shift) - - cdef char* tostring(self, int sym): - cdef int ind - if self.isvar(sym): - if sym in id2sym: - return id2sym[sym] - - ind = self.getindex(sym) - if ind > 0: - id2sym[sym] = "[%s,%d]" % (self.tocat(sym), ind) - else: - id2sym[sym] = "[%s]" % self.tocat(sym) - return id2sym[sym] - - else: - return self.terminals.word(sym) - - cdef int fromstring(self, char *s, int terminal): - """Warning: this method is allowed to alter s.""" - cdef char *comma - cdef int n - n = strlen(s) - cdef char *sep - sep = strstr(s,"_SEP_") - if n >= 3 and s[0] == c'[' and s[n-1] == c']' and sep == NULL: - if terminal: - s1 = "\\"+s - return self.terminals.index(s1) - s[n-1] = c'\0' - s = s + 1 - comma = strrchr(s, c',') - if comma != NULL: - comma[0] = c'\0' - return self.setindex(self.fromcat(s), strtol(comma+1, NULL, 10)) - else: - return self.fromcat(s) - else: - return self.terminals.index(s) - -# Expose Python functions as top-level functions for backward compatibility - -alphabet = Alphabet() - -cdef Alphabet calphabet -calphabet = alphabet - -def isvar(int sym): - return calphabet.isvar(sym) - -def isword(int sym): - return calphabet.isword(sym) - -def getindex(int sym): - return calphabet.getindex(sym) - -def setindex(int sym, int ind): - return calphabet.setindex(sym, ind) - -def clearindex(int sym): - return calphabet.clearindex(sym) - -def match(int sym1, int sym2): - return calphabet.match(sym1, sym2) != 0 - -def totag(int sym): - return calphabet.tocat(sym) - -def fromtag(s): - s = s.upper() - return calphabet.fromcat(s) - -def tostring(sym): - if type(sym) is str: - return sym - else: - return calphabet.tostring(sym) - -cdef int bufsize -cdef char *buf -bufsize = 100 -buf = <char *>malloc(bufsize) -cdef ensurebufsize(int size): - global buf, bufsize - if size > bufsize: - buf = <char *>realloc(buf, size*sizeof(char)) - bufsize = size - -def fromstring(s, terminal=False): - cdef bytes bs - cdef char* cs - if terminal: - return calphabet.fromstring(s, 1) - else: - bs = s - cs = bs - ensurebufsize(len(s)+1) - strcpy(buf, cs) - return calphabet.fromstring(buf, 0) - -def nonterminals(): - cdef i - l = [] - for i from calphabet.first_nonterminal <= i <= calphabet.last_nonterminal: - l.append(-(i+1 << index_shift)) - return l diff --git a/sa-extract/wrap_input.py b/sa-extract/wrap_input.py deleted file mode 100755 index e859a4fd..00000000 --- a/sa-extract/wrap_input.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python -import sys -import codecs -import os -import os.path -from xml.sax.saxutils import escape - -graPrefix = sys.argv[1] - -# Second argument can be a file with observable sentence-level features, -# one set of features per line (parallel with source sentences). Features are space-delimited indicator features. -obsFeatsFile = None -if len(sys.argv) == 3: - obsFeatsFilename = sys.argv[2] - obsFeatsFile = open(obsFeatsFilename) - -sys.stdin = codecs.getreader("utf-8")(sys.stdin) -sys.stdout = codecs.getwriter("utf-8")(sys.stdout) - -i = 0 -for line in sys.stdin: - filename = "%s%d"%(graPrefix,i) - if not os.path.exists(filename): - filenameGz = filename + ".gz" - if not os.path.exists(filenameGz): - print >>sys.stderr, "Grammar file not found: ", filename, filenameGz - sys.exit(1) - else: - filename = filenameGz - - if obsFeatsFile: - obsFeats = obsFeatsFile.next().strip() - print '<seg id="%d" features="%s" grammar="%s"> '%(i,obsFeats,filename) + escape(line.strip()) + " </seg>" - else: - print '<seg id="%d" grammar="%s"> '%(i,filename) + escape(line.strip()) + " </seg>" - i+=1 - |