diff options
author | Kenneth Heafield <github@kheafield.com> | 2012-08-03 07:46:54 -0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2012-08-03 07:46:54 -0400 |
commit | 122f46c31102b683eaab3ad81a3a98accbc694bb (patch) | |
tree | 8d499d789b159ebed25bb23b6983813d064a6296 /python/src/sa/data_array.pxi | |
parent | ac664bdb0e481539cf77098a7dd0e1ec8d937ba0 (diff) | |
parent | 193d137056c3c4f73d66f8db84691d63307de894 (diff) |
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'python/src/sa/data_array.pxi')
-rw-r--r-- | python/src/sa/data_array.pxi | 158 |
1 files changed, 158 insertions, 0 deletions
# Defines "data arrays" that can be directly written to/read from disk in binary format
# In particular, the array itself is written/read directly as a glob of binary data
# Adam Lopez <alopez@cs.umd.edu>

from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
from libc.stdlib cimport malloc, realloc, free
from libc.string cimport memset, strcpy, strlen

cdef class DataArray:
    """Corpus stored as a flat array of word ids plus sentence offsets.

    Ids 0 and 1 are reserved for the "END_OF_FILE" and "END_OF_LINE"
    markers; every other word receives the next free id the first time it
    is seen by get_id().
    """
    # word -> id mapping (dict) and its inverse (list indexed by id)
    cdef word2id
    cdef id2word
    # word ids of the whole corpus, with a 1 after each line and a final 0
    cdef IntList data
    # line number of each entry in `data` (filled only when use_sent_id is set)
    cdef IntList sent_id
    # offset into `data` at which each line starts, plus one final end offset
    cdef IntList sent_index
    cdef bint use_sent_id

    def __cinit__(self, from_binary=None, from_text=None, side=None, bint use_sent_id=False):
        """Create an empty array, optionally loading it from a file.

        from_binary -- path of a file produced by write_binary(); takes
                       precedence over from_text
        from_text   -- path of a text corpus, one sentence per line
        side        -- when given together with from_text, the file is read
                       as a ' ||| '-separated bitext; 'source' selects
                       field 0, any other truthy value selects field 1
        use_sent_id -- also record the line number of every token
        """
        self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1}
        self.id2word = ["END_OF_FILE", "END_OF_LINE"]
        self.data = IntList(1000,1000)
        self.sent_id = IntList(1000,1000)
        self.sent_index = IntList(1000,1000)
        self.use_sent_id = use_sent_id
        if from_binary:
            self.read_binary(from_binary)
        elif from_text:
            if side:
                self.read_bitext(from_text, (0 if side == 'source' else 1))
            else:
                self.read_text(from_text)

    def __len__(self):
        """Return the number of entries in the id array (markers included)."""
        return len(self.data)

    def getSentId(self, i):
        """Return the line number recorded for position i of the id array."""
        return self.sent_id.arr[i]

    def getSent(self, i):
        """Return the words of line i as a list.

        The slice runs from the start offset of line i to the start offset
        of the following line, so it ends with the "END_OF_LINE" marker.
        """
        cdef int pos, start, stop
        sent = []
        start = self.sent_index.arr[i]
        stop = self.sent_index.arr[i+1]
        for pos in range(start, stop):
            sent.append(self.id2word[self.data.arr[pos]])
        return sent

    def getSentPos(self, loc):
        """Return the offset of position loc within its own line.

        Relies on sent_id, i.e. on the array having been built or loaded
        with sentence ids.
        """
        return loc - self.sent_index.arr[self.sent_id.arr[loc]]

    def get_id(self, word):
        """Return the id of word, assigning the next free id if it is new."""
        if word not in self.word2id:
            self.word2id[word] = len(self.id2word)
            self.id2word.append(word)
        return self.word2id[word]

    def get_word(self, id):
        """Return the word stored under the given id."""
        return self.id2word[id]

    def write_text(self, char* filename):
        """Write the corpus back out as text, one line per sentence.

        Every word is followed by a single space; the END_OF_LINE marker
        becomes a newline and the END_OF_FILE marker is not written.
        """
        with open(filename, "w") as f:
            for w_id in self.data:
                if w_id > 1:
                    f.write("%s " % self.get_word(w_id))
                if w_id == 1:
                    f.write("\n")

    def read_text(self, char* filename):
        """Load a corpus from filename, one sentence per line.

        The file is opened with gzip_or_text(), a helper defined outside
        this file.
        """
        with gzip_or_text(filename) as fp:
            self.read_text_data(fp)

    def read_bitext(self, char* filename, int side):
        """Load one side of a bitext whose fields are separated by ' ||| '.

        side is the index of the field to keep on every line.
        """
        with gzip_or_text(filename) as fp:
            data = (line.split(' ||| ')[side] for line in fp)
            self.read_text_data(data)

    def read_text_data(self, data):
        """Append the lines yielded by data to the array.

        Each line is split on whitespace; its word ids are appended,
        followed by the END_OF_LINE id (1).  After the last line the
        END_OF_FILE id (0) and the final end offset are appended.
        """
        cdef int word_count = 0
        for line_num, line in enumerate(data):
            self.sent_index.append(word_count)
            for word in line.split():
                self.data.append(self.get_id(word))
                if self.use_sent_id:
                    self.sent_id.append(line_num)
                word_count = word_count + 1
            self.data.append(1)
            if self.use_sent_id:
                self.sent_id.append(line_num)
            word_count = word_count + 1
        self.data.append(0)
        self.sent_index.append(word_count)

    def read_binary(self, char* filename):
        """Load the array from a file written by write_binary().

        Raises IOError if the file cannot be opened.
        """
        cdef FILE* f
        # "rb": the payload is raw binary, so text-mode translation must be off.
        f = fopen(filename, "rb")
        if f == NULL:
            raise IOError("Could not open %s for reading" % filename)
        try:
            self.read_handle(f)
        finally:
            fclose(f)

    cdef void read_handle(self, FILE* f):
        """Read the three IntLists and the vocabulary from an open handle.

        The vocabulary is stored as an int word count followed, per word,
        by an int byte length and that many bytes.  write_handle() includes
        the terminating NUL in those bytes, which the char* -> bytes
        conversion below depends on.  Words are appended after the entries
        already present in id2word.
        """
        cdef int num_words, word_len
        cdef unsigned i
        cdef char* c_word
        cdef bytes py_word
        self.data.read_handle(f)
        self.sent_index.read_handle(f)
        self.sent_id.read_handle(f)
        fread(&(num_words), sizeof(int), 1, f)
        for i in range(num_words):
            fread(&(word_len), sizeof(int), 1, f)
            c_word = <char*> malloc (word_len * sizeof(char))
            fread(c_word, sizeof(char), word_len, f)
            py_word = c_word
            free(c_word)
            self.word2id[py_word] = len(self.id2word)
            self.id2word.append(py_word)
        # Sentence ids are considered available iff some were stored.
        self.use_sent_id = len(self.sent_id) != 0

    cdef void write_handle(self, FILE* f):
        """Write the three IntLists and the vocabulary to an open handle.

        The two reserved entries of id2word are skipped; each remaining
        word is written as an int byte length (including the terminating
        NUL) followed by the bytes themselves.
        """
        cdef int word_len
        cdef int num_words
        cdef char* c_word
        self.data.write_handle(f)
        self.sent_index.write_handle(f)
        self.sent_id.write_handle(f)
        num_words = len(self.id2word) - 2
        fwrite(&(num_words), sizeof(int), 1, f)
        for word in self.id2word[2:]:
            c_word = word
            word_len = strlen(c_word) + 1
            fwrite(&(word_len), sizeof(int), 1, f)
            fwrite(c_word, sizeof(char), word_len, f)

    def write_binary(self, char* filename):
        """Write the array to filename in the format read by read_binary().

        Raises IOError if the file cannot be opened.
        """
        cdef FILE* f
        # "wb": the payload is raw binary, so text-mode translation must be off.
        f = fopen(filename, "wb")
        if f == NULL:
            raise IOError("Could not open %s for writing" % filename)
        try:
            self.write_handle(f)
        finally:
            fclose(f)

    def write_enhanced_handle(self, f):
        """Write a human-readable dump to the file-like object f.

        Four lines are written: the word ids, the sentence start offsets,
        the sentence ids, and "word id" pairs for the whole vocabulary.
        Every item is followed by a single space.
        """
        for i in self.data:
            f.write("%d " %i)
        f.write("\n")
        for i in self.sent_index:
            f.write("%d " %i)
        f.write("\n")
        for i in self.sent_id:
            f.write("%d " %i)
        f.write("\n")
        for word in self.id2word:
            f.write("%s %d " % (word, self.word2id[word]))
        f.write("\n")

    def write_enhanced(self, char* filename):
        """Write the human-readable dump of write_enhanced_handle() to filename."""
        with open(filename, "w") as f:
            # The bound method already supplies self; passing it again
            # raised TypeError on every call.
            self.write_enhanced_handle(f)