summaryrefslogtreecommitdiff
path: root/python/src/sa/data_array.pxi
diff options
context:
space:
mode:
author Kenneth Heafield <github@kheafield.com> 2012-08-03 07:46:54 -0400
committer Kenneth Heafield <github@kheafield.com> 2012-08-03 07:46:54 -0400
commit 122f46c31102b683eaab3ad81a3a98accbc694bb (patch)
tree 8d499d789b159ebed25bb23b6983813d064a6296 /python/src/sa/data_array.pxi
parent ac664bdb0e481539cf77098a7dd0e1ec8d937ba0 (diff)
parent 193d137056c3c4f73d66f8db84691d63307de894 (diff)
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'python/src/sa/data_array.pxi')
-rw-r--r--python/src/sa/data_array.pxi158
1 files changed, 158 insertions, 0 deletions
diff --git a/python/src/sa/data_array.pxi b/python/src/sa/data_array.pxi
new file mode 100644
index 00000000..7a102a7e
--- /dev/null
+++ b/python/src/sa/data_array.pxi
@@ -0,0 +1,158 @@
+# Defines "data arrays" that can be written directly to, and read directly from, disk in binary format.
+# In particular, the array itself is written/read as a single blob of binary data.
+# Adam Lopez <alopez@cs.umd.edu>
+
+from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
+from libc.stdlib cimport malloc, realloc, free
+from libc.string cimport memset, strcpy, strlen
+
+cdef class DataArray:
+    """Corpus stored as one flat array of word ids, loadable from text or binary files.
+
+    Word ids 0 and 1 are reserved for the END_OF_FILE and END_OF_LINE markers;
+    every other word receives the next free id the first time it is seen.
+    """
+    cdef word2id             # dict: word -> integer id
+    cdef id2word             # list: integer id -> word
+    cdef IntList data        # word ids; a 1 (END_OF_LINE) closes each sentence, a 0 (END_OF_FILE) ends the array
+    cdef IntList sent_id     # sentence number of each token (filled from text only when use_sent_id is set)
+    cdef IntList sent_index  # offset in `data` where each sentence starts, plus one closing offset
+    cdef bint use_sent_id
+
+    def __cinit__(self, from_binary=None, from_text=None, side=None, bint use_sent_id=False):
+        """Create an empty array and optionally load it.
+
+        from_binary -- file previously produced by write_binary (takes precedence)
+        from_text   -- plain or gzipped text file, one sentence per line
+        side        -- when given together with from_text, the file is a bitext
+                       with ' ||| '-separated fields; 'source' selects field 0,
+                       any other value selects field 1
+        use_sent_id -- also record the sentence number of every token
+        """
+        self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1}
+        self.id2word = ["END_OF_FILE", "END_OF_LINE"]
+        self.data = IntList(1000,1000)
+        self.sent_id = IntList(1000,1000)
+        self.sent_index = IntList(1000,1000)
+        self.use_sent_id = use_sent_id
+        if from_binary:
+            self.read_binary(from_binary)
+        elif from_text:
+            if side:
+                self.read_bitext(from_text, (0 if side == 'source' else 1))
+            else:
+                self.read_text(from_text)
+
+    def __len__(self):
+        """Return the number of entries in the data array (markers included)."""
+        return len(self.data)
+
+    def getSentId(self, i):
+        """Return the sentence number stored for position i (no bounds check)."""
+        return self.sent_id.arr[i]
+
+    def getSent(self, i):
+        """Return sentence i as a list of words.
+
+        The slice runs from the start offset of sentence i up to the start
+        offset of the next one, so for arrays built by read_text_data the
+        closing END_OF_LINE marker is part of the result.
+        """
+        cdef int j, start, stop
+        sent = []
+        start = self.sent_index.arr[i]
+        stop = self.sent_index.arr[i+1]
+        # Iterate with the typed local j rather than overwriting the argument i.
+        for j in range(start, stop):
+            sent.append(self.id2word[self.data.arr[j]])
+        return sent
+
+    def getSentPos(self, loc):
+        """Return the offset of corpus position loc within its own sentence."""
+        return loc - self.sent_index.arr[self.sent_id.arr[loc]]
+
+    def get_id(self, word):
+        """Return the id of word, registering it with the next free id if unseen."""
+        if word not in self.word2id:
+            self.word2id[word] = len(self.id2word)
+            self.id2word.append(word)
+        return self.word2id[word]
+
+    def get_word(self, id):
+        """Return the word registered under the given id."""
+        return self.id2word[id]
+
+    def write_text(self, char* filename):
+        """Write the corpus as text: words followed by a space, one sentence per line.
+
+        END_OF_LINE ids become newlines; the END_OF_FILE id produces no output.
+        """
+        with open(filename, "w") as f:
+            for w_id in self.data:
+                if w_id > 1:
+                    f.write("%s " % self.get_word(w_id))
+                if w_id == 1:
+                    f.write("\n")
+
+    def read_text(self, char* filename):
+        """Load a monolingual text file (opened through gzip_or_text)."""
+        with gzip_or_text(filename) as fp:
+            self.read_text_data(fp)
+
+    def read_bitext(self, char* filename, int side):
+        """Load field number `side` of every ' ||| '-separated line of a bitext file."""
+        with gzip_or_text(filename) as fp:
+            # The generator is lazy, so it has to be consumed while fp is still open.
+            data = (line.split(' ||| ')[side] for line in fp)
+            self.read_text_data(data)
+
+    def read_text_data(self, data):
+        """Append every line of the iterable `data` to the array as one sentence.
+
+        Each line is split on whitespace; every token and the END_OF_LINE
+        marker that follows the sentence are appended to `data` (and, when
+        use_sent_id is set, tagged with the line number in `sent_id`).
+        A final END_OF_FILE marker and closing sentence offset are appended.
+        """
+        cdef int word_count = 0
+        for line_num, line in enumerate(data):
+            self.sent_index.append(word_count)
+            for word in line.split():
+                self.data.append(self.get_id(word))
+                if self.use_sent_id:
+                    self.sent_id.append(line_num)
+                word_count = word_count + 1
+            self.data.append(1)
+            if self.use_sent_id:
+                self.sent_id.append(line_num)
+            word_count = word_count + 1
+        self.data.append(0)
+        self.sent_index.append(word_count)
+
+
+    def read_binary(self, char* filename):
+        """Load the array from a file written by write_binary.
+
+        Raises IOError when the file cannot be opened.
+        """
+        cdef FILE* f
+        # "rb": the payload is raw binary, so avoid text-mode translation.
+        f = fopen(filename, "rb")
+        if f == NULL:
+            raise IOError("cannot open %s for reading" % filename)
+        try:
+            self.read_handle(f)
+        finally:
+            fclose(f)
+
+    cdef void read_handle(self, FILE* f):
+        """Read the three integer lists and the vocabulary from an open handle.
+
+        The vocabulary is stored as a word count followed, for each word, by
+        its byte length (terminating NUL included) and the bytes themselves.
+        The two reserved markers are not stored in the file.
+        """
+        # NOTE(review): this is a `cdef void` function without an exception
+        # clause, so depending on the Cython version the errors raised below
+        # may be reported rather than propagated -- consider `except *`.
+        cdef int num_words = 0
+        cdef int word_len = 0
+        cdef int i
+        cdef char* c_word
+        cdef bytes py_word
+        self.data.read_handle(f)
+        self.sent_index.read_handle(f)
+        self.sent_id.read_handle(f)
+        if fread(&(num_words), sizeof(int), 1, f) != 1:
+            raise IOError("truncated data array: missing vocabulary size")
+        for i in range(num_words):
+            if fread(&(word_len), sizeof(int), 1, f) != 1 or word_len < 1:
+                raise IOError("truncated or corrupt data array: bad word length")
+            c_word = <char*> malloc (word_len * sizeof(char))
+            if c_word == NULL:
+                raise MemoryError()
+            if fread(c_word, sizeof(char), word_len, f) != <size_t> word_len:
+                free(c_word)
+                raise IOError("truncated data array: incomplete vocabulary entry")
+            # The writer stores the NUL terminator as the last byte; enforce it
+            # so the char* -> bytes conversion can never run past the buffer.
+            c_word[word_len - 1] = 0
+            py_word = c_word
+            free(c_word)
+            self.word2id[py_word] = len(self.id2word)
+            self.id2word.append(py_word)
+        # Sentence ids are in use exactly when the file contained some.
+        self.use_sent_id = len(self.sent_id) > 0
+
+    cdef void write_handle(self, FILE* f):
+        """Write the three integer lists and the vocabulary to an open handle.
+
+        The two reserved markers are skipped; every other word is written as
+        its byte length (terminating NUL included) followed by its bytes.
+        """
+        cdef int word_len
+        cdef int num_words
+        cdef char* c_word
+        self.data.write_handle(f)
+        self.sent_index.write_handle(f)
+        self.sent_id.write_handle(f)
+        num_words = len(self.id2word) - 2
+        fwrite(&(num_words), sizeof(int), 1, f)
+        for word in self.id2word[2:]:
+            c_word = word
+            word_len = strlen(c_word) + 1
+            fwrite(&(word_len), sizeof(int), 1, f)
+            fwrite(c_word, sizeof(char), word_len, f)
+
+    def write_binary(self, char* filename):
+        """Write the array to filename in the binary format read by read_binary.
+
+        Raises IOError when the file cannot be opened.
+        """
+        cdef FILE* f
+        # "wb": the payload is raw binary, so avoid text-mode translation.
+        f = fopen(filename, "wb")
+        if f == NULL:
+            raise IOError("cannot open %s for writing" % filename)
+        try:
+            self.write_handle(f)
+        finally:
+            fclose(f)
+
+    def write_enhanced_handle(self, f):
+        """Dump a human-readable form of the array to the file object f.
+
+        Four lines are written: the data ids, the sentence offsets, the
+        sentence ids, and the vocabulary as "word id" pairs.
+        """
+        for i in self.data:
+            f.write("%d " %i)
+        f.write("\n")
+        for i in self.sent_index:
+            f.write("%d " %i)
+        f.write("\n")
+        for i in self.sent_id:
+            f.write("%d " %i)
+        f.write("\n")
+        for word in self.id2word:
+            f.write("%s %d " % (word, self.word2id[word]))
+        f.write("\n")
+
+    def write_enhanced(self, char* filename):
+        """Write the human-readable dump produced by write_enhanced_handle to filename."""
+        with open(filename, "w") as f:
+            # self is bound already; passing it again raised a TypeError.
+            self.write_enhanced_handle(f)