diff options
author | Chris Dyer <prguest11@taipan.cs> | 2012-02-02 06:29:50 +0000 |
---|---|---|
committer | Chris Dyer <prguest11@taipan.cs> | 2012-02-02 06:29:50 +0000 |
commit | 7d37102fde8d8b46ed5218245a11496e870fd09f (patch) | |
tree | 97bb3de03d634ffb754a6381c961b6d5cf8d3a7d /sa-extract/clex.pyx | |
parent | a95fe83969d15b074892c8c06fc2a948b75910ff (diff) |
lopez suffix array extractor with copyrighted david chiang code excised
Diffstat (limited to 'sa-extract/clex.pyx')
-rw-r--r-- | sa-extract/clex.pyx | 460 |
1 files changed, 460 insertions, 0 deletions
diff --git a/sa-extract/clex.pyx b/sa-extract/clex.pyx new file mode 100644 index 00000000..fa30caad --- /dev/null +++ b/sa-extract/clex.pyx @@ -0,0 +1,460 @@ +# clex.pyx +# defines bilexical dictionaries in C, with some convenience methods +# for reading arrays directly as globs directly from disk. +# Adam Lopez <alopez@cs.umd.edu> + +import gzip +import sys +import context_model + +cimport cintlist +cimport cfloatlist +cimport calignment +cimport csuf +cimport cdat + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, strcpy, strlen + +cdef struct _node: + _node* smaller + _node* bigger + int key + int val + +cdef _node* new_node(int key): + cdef _node* n + n = <_node*> malloc(sizeof(_node)) + n.smaller = NULL + n.bigger = NULL + n.key = key + n.val = 0 + return n + + +cdef del_node(_node* n): + if n.smaller != NULL: + del_node(n.smaller) + if n.bigger != NULL: + del_node(n.bigger) + free(n) + + +cdef int* get_val(_node* n, int key): + if key == n.key: + return &n.val + elif key < n.key: + if n.smaller == NULL: + n.smaller = new_node(key) + return &(n.smaller.val) + return get_val(n.smaller, key) + else: + if n.bigger == NULL: + n.bigger = new_node(key) + return &(n.bigger.val) + return get_val(n.bigger, key) + + +cdef class CLex: + + cdef cfloatlist.CFloatList col1 + cdef cfloatlist.CFloatList col2 + cdef cintlist.CIntList f_index + cdef cintlist.CIntList e_index + cdef id2eword, id2fword, eword2id, fword2id + + def __init__(self, filename, from_binary=False, + from_data=False, earray=None, fsarray=None): + self.id2eword = [] + self.id2fword = [] + self.eword2id = {} + self.fword2id = {} + self.e_index = cintlist.CIntList() + self.f_index = cintlist.CIntList() + self.col1 = cfloatlist.CFloatList() + self.col2 = cfloatlist.CFloatList() + if from_binary: + self.read_binary(filename) + else: + if from_data: + self.compute_from_data(fsarray, earray, filename) + else: + self.read_text(filename) + '''print "self.eword2id" + print "=============" + for x in self.eword2id: + print x + print "self.fword2id" + print "=============" + for x in self.fword2id: + print x + print "-------------"''' + + + cdef compute_from_data(self, csuf.SuffixArray fsa, cdat.DataArray eda, calignment.Alignment aa): + cdef int sent_id, num_links, l, i, j, f_i, e_j, I, J, V_E, V_F, num_pairs + cdef int *fsent, *esent, *alignment, *links, *ealigned, *faligned + cdef _node** dict + cdef int *fmargin, *emargin, *count + cdef bytes word + cdef int null_word + + null_word = 0 + for word in fsa.darray.id2word: # I miss list comprehensions + self.id2fword.append(word) + self.id2fword[null_word] = "NULL" + for id, word in enumerate(self.id2fword): + self.fword2id[word] = id + + for word in eda.id2word: + self.id2eword.append(word) + self.id2eword[null_word] = "NULL" + for id, word in enumerate(self.id2eword): + self.eword2id[word] = id + + num_pairs = 0 + + V_E = len(eda.id2word) + V_F = len(fsa.darray.id2word) + fmargin = <int*> malloc(V_F*sizeof(int)) + emargin = <int*> malloc(V_E*sizeof(int)) + memset(fmargin, 0, V_F*sizeof(int)) + memset(emargin, 0, V_E*sizeof(int)) + + dict = <_node**> malloc(V_F*sizeof(_node*)) + memset(dict, 0, V_F*sizeof(_node*)) + + num_sents = len(fsa.darray.sent_index) + for sent_id from 0 <= sent_id < num_sents-1: + + fsent = fsa.darray.data.arr + fsa.darray.sent_index.arr[sent_id] + I = fsa.darray.sent_index.arr[sent_id+1] - fsa.darray.sent_index.arr[sent_id] - 1 + faligned = <int*> malloc(I*sizeof(int)) + memset(faligned, 0, I*sizeof(int)) + + esent = eda.data.arr + eda.sent_index.arr[sent_id] + J = eda.sent_index.arr[sent_id+1] - eda.sent_index.arr[sent_id] - 1 + ealigned = <int*> malloc(J*sizeof(int)) + memset(ealigned, 0, J*sizeof(int)) + + links = aa._get_sent_links(sent_id, &num_links) + + for l from 0 <= l < num_links: + i = links[l*2] + j = links[l*2+1] + if i >= I or j >= J: + sys.stderr.write(" %d-%d out of bounds (I=%d,J=%d) in line %d\n" % (i,j,I,J,sent_id+1)) + assert i < I + assert j < J + f_i = fsent[i] + e_j = esent[j] + fmargin[f_i] = fmargin[f_i]+1 + emargin[e_j] = emargin[e_j]+1 + if dict[f_i] == NULL: + dict[f_i] = new_node(e_j) + dict[f_i].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[f_i], e_j) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + # add count + faligned[i] = 1 + ealigned[j] = 1 + for i from 0 <= i < I: + if faligned[i] == 0: + f_i = fsent[i] + fmargin[f_i] = fmargin[f_i] + 1 + emargin[null_word] = emargin[null_word] + 1 + if dict[f_i] == NULL: + dict[f_i] = new_node(null_word) + dict[f_i].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[f_i], null_word) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + for j from 0 <= j < J: + if ealigned[j] == 0: + e_j = esent[j] + fmargin[null_word] = fmargin[null_word] + 1 + emargin[e_j] = emargin[e_j] + 1 + if dict[null_word] == NULL: + dict[null_word] = new_node(e_j) + dict[null_word].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[null_word], e_j) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + free(links) + free(faligned) + free(ealigned) + self.f_index = cintlist.CIntList(initial_len=V_F) + self.e_index = cintlist.CIntList(initial_len=num_pairs) + self.col1 = cfloatlist.CFloatList(initial_len=num_pairs) + self.col2 = cfloatlist.CFloatList(initial_len=num_pairs) + + num_pairs = 0 + for i from 0 <= i < V_F: + #self.f_index[i] = num_pairs + self.f_index.set(i, num_pairs) + if dict[i] != NULL: + self._add_node(dict[i], &num_pairs, float(fmargin[i]), emargin) + del_node(dict[i]) + free(fmargin) + free(emargin) + free(dict) + return + + + cdef _add_node(self, _node* n, int* num_pairs, float fmargin, int* emargin): + cdef int loc + if n.smaller != NULL: + self._add_node(n.smaller, num_pairs, fmargin, emargin) + loc = num_pairs[0] + self.e_index.set(loc, n.key) + self.col1.set(loc, float(n.val)/fmargin) + self.col2.set(loc, float(n.val)/float(emargin[n.key])) + num_pairs[0] = loc + 1 + if n.bigger != NULL: + self._add_node(n.bigger, num_pairs, fmargin, emargin) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.f_index.write_handle(f) + self.e_index.write_handle(f) + self.col1.write_handle(f) + self.col2.write_handle(f) + self.write_wordlist(self.id2fword, f) + self.write_wordlist(self.id2eword, f) + fclose(f) + + + cdef write_wordlist(self, wordlist, FILE* f): + cdef int word_len + cdef int num_words + cdef char* c_word + + num_words = len(wordlist) + fwrite(&(num_words), sizeof(int), 1, f) + for word in wordlist: + c_word = word + word_len = strlen(c_word) + 1 + fwrite(&(word_len), sizeof(int), 1, f) + fwrite(c_word, sizeof(char), word_len, f) + + + cdef read_wordlist(self, word2id, id2word, FILE* f): + cdef int num_words + cdef int word_len + cdef char* c_word + cdef bytes py_word + + fread(&(num_words), sizeof(int), 1, f) + for i from 0 <= i < num_words: + fread(&(word_len), sizeof(int), 1, f) + c_word = <char*> malloc (word_len * sizeof(char)) + fread(c_word, sizeof(char), word_len, f) + py_word = c_word + free(c_word) + word2id[py_word] = len(id2word) + id2word.append(py_word) + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.f_index.read_handle(f) + self.e_index.read_handle(f) + self.col1.read_handle(f) + self.col2.read_handle(f) + self.read_wordlist(self.fword2id, self.id2fword, f) + self.read_wordlist(self.eword2id, self.id2eword, f) + fclose(f) + + + def get_e_id(self, eword): + if eword not in self.eword2id: + e_id = len(self.id2eword) + self.id2eword.append(eword) + self.eword2id[eword] = e_id + return self.eword2id[eword] + + + def get_f_id(self, fword): + if fword not in self.fword2id: + f_id = len(self.id2fword) + self.id2fword.append(fword) + self.fword2id[fword] = f_id + return self.fword2id[fword] + + + def read_text(self, filename): + cdef i, j, w, e_id, f_id, n_f, n_e, N + cdef cintlist.CIntList fcount + + fcount = cintlist.CIntList() + if filename[-2:] == "gz": + f = gzip.GzipFile(filename) + else: + f = open(filename) + + # first loop merely establishes size of array objects + sys.stderr.write("Initial read...\n") + for line in f: + (fword, eword, score1, score2) = line.split() + f_id = self.get_f_id(fword) + e_id = self.get_e_id(eword) + while f_id >= len(fcount): + fcount.append(0) + fcount.arr[f_id] = fcount.arr[f_id] + 1 + + # Allocate space for dictionary in arrays + N = 0 + n_f = len(fcount) + self.f_index = cintlist.CIntList(initial_len=n_f+1) + for i from 0 <= i < n_f: + self.f_index.arr[i] = N + N = N + fcount.arr[i] + fcount.arr[i] = 0 + self.f_index.arr[n_f] = N + self.e_index = cintlist.CIntList(initial_len=N) + self.col1 = cfloatlist.CFloatList(initial_len=N) + self.col2 = cfloatlist.CFloatList(initial_len=N) + + # Re-read file, placing words into buckets + sys.stderr.write("Bucket sort...\n") + f.seek(0) + for line in f: + (fword, eword, score1, score2) = line.split() + f_id = self.get_f_id(fword) + e_id = self.get_e_id(eword) + index = self.f_index.arr[f_id] + fcount.arr[f_id] + fcount.arr[f_id] = fcount.arr[f_id] + 1 + self.e_index.arr[index] = int(e_id) + self.col1[index] = float(score1) + self.col2[index] = float(score2) + f.close() + + sys.stderr.write("Final sort...\n") + # Sort buckets by eword + for b from 0 <= b < n_f: + i = self.f_index.arr[b] + j = self.f_index.arr[b+1] + self.qsort(i,j, "") + + + cdef swap(self, int i, int j): + cdef int itmp + cdef float ftmp + + if i == j: + return + + itmp = self.e_index.arr[i] + self.e_index.arr[i] = self.e_index.arr[j] + self.e_index.arr[j] = itmp + + ftmp = self.col1.arr[i] + self.col1.arr[i] = self.col1.arr[j] + self.col1.arr[j] = ftmp + + ftmp = self.col2.arr[i] + self.col2.arr[i] = self.col2.arr[j] + self.col2.arr[j] = ftmp + + + cdef qsort(self, int i, int j, pad): + cdef int pval, p + + if i > j: + raise Exception("Sort error in CLex") + if i == j: #empty interval + return + if i == j-1: # singleton interval + return + + p = (i+j)/2 + pval = self.e_index.arr[p] + self.swap(i, p) + p = i + for k from i+1 <= k < j: + if pval >= self.e_index.arr[k]: + self.swap(p+1, k) + self.swap(p, p+1) + p = p + 1 + self.qsort(i,p, pad+" ") + self.qsort(p+1,j, pad+" ") + + + def write_enhanced(self, filename): + f = open(filename, "w") + for i in self.f_index: + f.write("%d " % i) + f.write("\n") + for i, s1, s2 in zip(self.e_index, self.col1, self.col2): + f.write("%d %f %f " % (i, s1, s2)) + f.write("\n") + for i, w in enumerate(self.id2fword): + f.write("%d %s " % (i, w)) + f.write("\n") + for i, w in enumerate(self.id2eword): + f.write("%d %s " % (i, w)) + f.write("\n") + f.close() + + + def get_score(self, fword, eword, col): + cdef e_id, f_id, low, high, midpoint, val + #print "get_score fword=",fword,", eword=",eword,", col=",col + + if eword not in self.eword2id: + return None + if fword not in self.fword2id: + return None + f_id = self.fword2id[fword] + e_id = self.eword2id[eword] + low = self.f_index.arr[f_id] + high = self.f_index.arr[f_id+1] + while high - low > 0: + midpoint = (low+high)/2 + val = self.e_index.arr[midpoint] + if val == e_id: + if col == 0: + return self.col1.arr[midpoint] + if col == 1: + return self.col2.arr[midpoint] + if val > e_id: + high = midpoint + if val < e_id: + low = midpoint + 1 + return None + + + def write_text(self, filename): + """Note: does not guarantee writing the dictionary in the original order""" + cdef i, N, e_id, f_id + + f = open(filename, "w") + N = len(self.e_index) + f_id = 0 + for i from 0 <= i < N: + while self.f_index.arr[f_id+1] == i: + f_id = f_id + 1 + e_id = self.e_index.arr[i] + score1 = self.col1.arr[i] + score2 = self.col2.arr[i] + f.write("%s %s %.6f %.6f\n" % (self.id2fword[f_id], self.id2eword[e_id], score1, score2)) + f.close() + + |