Improve the package structure of pycdec

This change should not break anything, but you can now run `python setup.py build_ext --inplace` and use the cleaner invocation `PYTHONPATH=/path/to/cdec/python python -m ...`
author: Victor Chahuneau <vchahune@cs.cmu.edu> 2013-08-26 20:12:32 -0400
committer: Victor Chahuneau <vchahune@cs.cmu.edu> 2013-08-26 20:12:32 -0400
commit: ca9b58716214148eeaeaa3076e1a1dc8f8bb5892 (patch)
tree: bfa2fd84c86e0fdd499110e86fd464b391379df1 /python/cdec/sa/alignment.pxi
parent: 9d5071692ceab8d09c2bfdba24f6b927ec84b7f9 (diff)
1 files changed, 106 insertions, 0 deletions
diff --git a/python/cdec/sa/alignment.pxi b/python/cdec/sa/alignment.pxi
new file mode 100644
index 00000000..295697f9
--- /dev/null
+++ b/python/cdec/sa/alignment.pxi
@@ -0,0 +1,106 @@
+from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
+from libc.stdlib cimport malloc, realloc, free
+
+# Note: Callison-Burch uses short instead of int.
+# We have enough space for our corpus, so this is not a problem;
+# we may need to revisit this if things get really tight, though.
+
+cdef int ALIGNMENT_CODE = 1 << 16  # base used to pack a link (i, j) into one int as i * ALIGNMENT_CODE + j
+
+cdef class Alignment:  # alignment links for a set of sentences, stored as integerized (i, j) pairs
+    cdef IntList links  # integerized links of all sentences, concatenated in sentence order
+    cdef IntList sent_index  # sent_index[s] = offset in links where sentence s starts; a final extra entry marks the end
+
+    cdef int link(self, int i, int j):
+        """Integerizes an alignment link pair as i * ALIGNMENT_CODE + j"""
+        return i * ALIGNMENT_CODE + j
+
+    def unlink(self, link):
+        """De-integerizes an alignment link, returning the pair as a 2-tuple"""
+        return (link / ALIGNMENT_CODE, link % ALIGNMENT_CODE)  # NOTE(review): assumes '/' is integer division - confirm under Python 3
+
+    cdef _unlink(self, int link, int* f, int* e):  # C-level unlink: writes the two halves through the f and e pointers
+        f[0] = link / ALIGNMENT_CODE  # NOTE(review): assumes integer division on C ints - confirm for the Cython language_level in use
+        e[0] = link % ALIGNMENT_CODE
+
+    def get_sent_links(self, int sent_id):  # returns the links of sentence sent_id as a flat IntList (f0, e0, f1, e1, ...)
+        cdef IntList sent_links
+        cdef int* arr
+        cdef int arr_len
+        sent_links = IntList()
+        arr = self._get_sent_links(sent_id, &arr_len)  # arr_len receives the number of links, not the number of ints
+        sent_links._extend_arr(arr, arr_len*2)  # two ints per link; presumably copied by _extend_arr, since arr is freed next
+        free(arr)  # the buffer was malloc'ed by _get_sent_links
+        return sent_links
+
+    cdef int* _get_sent_links(self, int sent_id, int* num_links):  # returns a malloc'ed array of 2*num_links[0] ints; the caller must free it
+        cdef int* sent_links
+        cdef int i, start, end
+        start = self.sent_index.arr[sent_id]  # NOTE(review): sent_id is not range-checked before this raw array access
+        end = self.sent_index.arr[sent_id+1]
+        num_links[0] = end - start
+        sent_links = <int*> malloc(2*num_links[0]*sizeof(int))  # NOTE(review): the malloc result is not checked for NULL
+        for i from 0 <= i < num_links[0]:
+            self._unlink(self.links.arr[start + i], sent_links + (2*i), sent_links + (2*i) + 1)  # link i fills slots 2*i (f) and 2*i+1 (e)
+        return sent_links
+
+    def __cinit__(self, from_binary=None, from_text=None):  # optionally loads from a binary file or, failing that, from a text file
+        self.links = IntList(1000,1000)
+        self.sent_index = IntList(1000,1000)
+        if from_binary:  # from_binary takes precedence when both arguments are given
+            self.read_binary(from_binary)
+        elif from_text:
+            self.read_text(from_text)
+
+    def read_text(self, char* filename):  # parses one sentence per line, each a whitespace-separated list of "i-j" pairs
+        with gzip_or_text(filename) as f:  # gzip_or_text is defined elsewhere; presumably opens plain or gzipped text
+            for line in f:
+                self.sent_index.append(len(self.links))  # offset where this sentence's links start
+                pairs = line.split()
+                for pair in pairs:
+                    (i, j) = map(int, pair.split('-'))
+                    self.links.append(self.link(i, j))
+            self.sent_index.append(len(self.links))  # final entry: total number of links, closing the last sentence
+
+    def read_binary(self, char* filename):  # loads links, then sent_index, from a file via IntList.read_handle
+        cdef FILE* f
+        f = fopen(filename, "r")  # NOTE(review): no NULL check on f; mode "r" is text mode - "rb" may be intended
+        self.links.read_handle(f)
+        self.sent_index.read_handle(f)
+        fclose(f)
+
+    def write_text(self, char* filename):  # writes links as "i-j " text with newlines at sentence boundaries
+        with open(filename, "w") as f:
+            sent_num = 0
+            for i, link in enumerate(self.links):
+                while i >= self.sent_index[sent_num]:  # emit one newline per sentence boundary at or before link i
+                    f.write("\n")  # NOTE(review): if sent_index[0] is 0 (as read_text produces), this writes a newline before the first link
+                    sent_num = sent_num + 1
+                f.write("%d-%d " % self.unlink(link))
+            f.write("\n")
+
+    def write_binary(self, char* filename):  # writes links, then sent_index, to a file via IntList.write_handle
+        cdef FILE* f
+        f = fopen(filename, "w")  # NOTE(review): no NULL check on f; mode "w" is text mode - "wb" may be intended
+        self.links.write_handle(f)
+        self.sent_index.write_handle(f)
+        fclose(f)
+
+    def write_enhanced(self, char* filename):  # dumps the raw integerized links on one line and sent_index on the next
+        with open(filename, "w") as f:
+            for link in self.links:
+                f.write("%d " % link)
+            f.write("\n")
+            for i in self.sent_index:
+                f.write("%d " % i)
+            f.write("\n")
+
+    def alignment(self, i):
+        """Return all link pairs for sentence i, as unlink() tuples ((f, e) order per _unlink)"""
+        cdef int j, start, end
+        result = []
+        start = self.sent_index.arr[i]  # NOTE(review): i is not range-checked before this raw array access
+        end = self.sent_index.arr[i+1]
+        for j from start <= j < end:
+            result.append(self.unlink(self.links.arr[j]))
+        return result
author	Victor Chahuneau <vchahune@cs.cmu.edu>	2013-08-26 20:12:32 -0400
committer	Victor Chahuneau <vchahune@cs.cmu.edu>	2013-08-26 20:12:32 -0400
commit	ca9b58716214148eeaeaa3076e1a1dc8f8bb5892 (patch)
tree	bfa2fd84c86e0fdd499110e86fd464b391379df1 /python/cdec/sa/alignment.pxi
parent	9d5071692ceab8d09c2bfdba24f6b927ec84b7f9 (diff)