diff options
author | Victor Chahuneau <vchahune@cs.cmu.edu> | 2013-08-26 20:12:32 -0400 |
---|---|---|
committer | Victor Chahuneau <vchahune@cs.cmu.edu> | 2013-08-26 20:12:32 -0400 |
commit | ca9b58716214148eeaeaa3076e1a1dc8f8bb5892 (patch) | |
tree | bfa2fd84c86e0fdd499110e86fd464b391379df1 /python/cdec/sa/alignment.pxi | |
parent | 9d5071692ceab8d09c2bfdba24f6b927ec84b7f9 (diff) |
Improve the package structure of pycdec
This change should not break anything, but now you can run:
python setup.py build_ext --inplace
and use the cleaner:
PYTHONPATH=/path/to/cdec/python python -m ...
Diffstat (limited to 'python/cdec/sa/alignment.pxi')
-rw-r--r-- | python/cdec/sa/alignment.pxi | 106 |
1 files changed, 106 insertions, 0 deletions
diff --git a/python/cdec/sa/alignment.pxi b/python/cdec/sa/alignment.pxi new file mode 100644 index 00000000..295697f9 --- /dev/null +++ b/python/cdec/sa/alignment.pxi @@ -0,0 +1,106 @@ +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free + +# Note: Callison-Burch uses short instead of int. +# We have the space for our corpus, so this is not a problem; +# May need to revisit if things get really tight, though. + +cdef int ALIGNMENT_CODE = 1 << 16 + +cdef class Alignment: + cdef IntList links + cdef IntList sent_index + + cdef int link(self, int i, int j): + """Integerizes an alignment link pair""" + return i * ALIGNMENT_CODE + j + + def unlink(self, link): + """De-integerizes an alignment link pair""" + return (link / ALIGNMENT_CODE, link % ALIGNMENT_CODE) + + cdef _unlink(self, int link, int* f, int* e): + f[0] = link / ALIGNMENT_CODE + e[0] = link % ALIGNMENT_CODE + + def get_sent_links(self, int sent_id): + cdef IntList sent_links + cdef int* arr + cdef int arr_len + sent_links = IntList() + arr = self._get_sent_links(sent_id, &arr_len) + sent_links._extend_arr(arr, arr_len*2) + free(arr) + return sent_links + + cdef int* _get_sent_links(self, int sent_id, int* num_links): + cdef int* sent_links + cdef int i, start, end + start = self.sent_index.arr[sent_id] + end = self.sent_index.arr[sent_id+1] + num_links[0] = end - start + sent_links = <int*> malloc(2*num_links[0]*sizeof(int)) + for i from 0 <= i < num_links[0]: + self._unlink(self.links.arr[start + i], sent_links + (2*i), sent_links + (2*i) + 1) + return sent_links + + def __cinit__(self, from_binary=None, from_text=None): + self.links = IntList(1000,1000) + self.sent_index = IntList(1000,1000) + if from_binary: + self.read_binary(from_binary) + elif from_text: + self.read_text(from_text) + + def read_text(self, char* filename): + with gzip_or_text(filename) as f: + for line in f: + self.sent_index.append(len(self.links)) + pairs = line.split() + for pair in pairs: + (i, j) = map(int, pair.split('-')) + self.links.append(self.link(i, j)) + self.sent_index.append(len(self.links)) + + def read_binary(self, char* filename): + cdef FILE* f + f = fopen(filename, "r") + self.links.read_handle(f) + self.sent_index.read_handle(f) + fclose(f) + + def write_text(self, char* filename): + with open(filename, "w") as f: + sent_num = 0 + for i, link in enumerate(self.links): + while i >= self.sent_index[sent_num]: + f.write("\n") + sent_num = sent_num + 1 + f.write("%d-%d " % self.unlink(link)) + f.write("\n") + + def write_binary(self, char* filename): + cdef FILE* f + f = fopen(filename, "w") + self.links.write_handle(f) + self.sent_index.write_handle(f) + fclose(f) + + def write_enhanced(self, char* filename): + with open(filename, "w") as f: + for link in self.links: + f.write("%d " % link) + f.write("\n") + for i in self.sent_index: + f.write("%d " % i) + f.write("\n") + + def alignment(self, i): + """Return all (e,f) pairs for sentence i""" + cdef int j, start, end + result = [] + start = self.sent_index.arr[i] + end = self.sent_index.arr[i+1] + for j from start <= j < end: + result.append(self.unlink(self.links.arr[j])) + return result |