diff options
author | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-07-27 01:16:03 -0400 |
---|---|---|
committer | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-07-27 01:16:03 -0400 |
commit | b2a8bccb2bd713d9ec081cf3dad0162c2cb492d8 (patch) | |
tree | c661044fd2a3943cf2ad12109b916fd7b56a519e /python/src/sa/data_array.pxi | |
parent | 148b1168c2b07abf0c7757a31141377c28ec3d91 (diff) |
[python] Fork of the suffix-array extractor with surface improvements
Available as the cdec.sa module, with commande-line helpers:
python -m cdec.sa.compile -f ... -e ... -a ... -o sa-out/ -c extract.ini
python -m cdec.sa.extract -c extract.ini -g grammars-out/ < input.txt > input.sgml
+ renamed cdec.scfg -> cdec.sa
+ Python README
Diffstat (limited to 'python/src/sa/data_array.pxi')
-rw-r--r-- | python/src/sa/data_array.pxi | 146 |
1 files changed, 146 insertions, 0 deletions
diff --git a/python/src/sa/data_array.pxi b/python/src/sa/data_array.pxi new file mode 100644 index 00000000..1c044694 --- /dev/null +++ b/python/src/sa/data_array.pxi @@ -0,0 +1,146 @@ +# Defines "data arrays" that can be directly written to/read from disk in binary format +# In particular, the array itself is written/read directly as a glob of binary data +# Adam Lopez <alopez@cs.umd.edu> + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, strcpy, strlen + +cdef class DataArray: + cdef word2id + cdef id2word + cdef IntList data + cdef IntList sent_id + cdef IntList sent_index + cdef bint use_sent_id + + def __cinit__(self, from_binary=None, from_text=None, bint use_sent_id=False): + self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1} + self.id2word = ["END_OF_FILE", "END_OF_LINE"] + self.data = IntList(1000,1000) + self.sent_id = IntList(1000,1000) + self.sent_index = IntList(1000,1000) + self.use_sent_id = use_sent_id + if from_binary: + self.read_binary(from_binary) + elif from_text: + self.read_text(from_text) + + def __len__(self): + return len(self.data) + + def getSentId(self, i): + return self.sent_id.arr[i] + + def getSent(self, i): + cdef int j, start, stop + sent = [] + start = self.sent_index.arr[i] + stop = self.sent_index.arr[i+1] + for i from start <= i < stop: + sent.append(self.id2word[self.data.arr[i]]) + return sent + + def getSentPos(self, loc): + return loc - self.sent_index.arr[self.sent_id.arr[loc]] + + def get_id(self, word): + if not word in self.word2id: + self.word2id[word] = len(self.id2word) + self.id2word.append(word) + return self.word2id[word] + + def get_word(self, id): + return self.id2word[id] + + def write_text(self, char* filename): + with open(filename, "w") as f: + for w_id in self.data: + if w_id > 1: + f.write("%s " % self.get_word(w_id)) + if w_id == 1: + f.write("\n") + + def read_text(self, char* filename): + cdef int word_count = 0 + with gzip_or_text(filename) as fp: + for line_num, line in enumerate(fp): + self.sent_index.append(word_count) + for word in line.split(): + self.data.append(self.get_id(word)) + if self.use_sent_id: + self.sent_id.append(line_num) + word_count = word_count + 1 + self.data.append(1) + if self.use_sent_id: + self.sent_id.append(line_num) + word_count = word_count + 1 + self.data.append(0) + self.sent_index.append(word_count) + + def read_binary(self, char* filename): + cdef FILE* f + f = fopen(filename, "r") + self.read_handle(f) + fclose(f) + + cdef void read_handle(self, FILE* f): + cdef int num_words, word_len + cdef unsigned i + cdef char* c_word + cdef bytes py_word + self.data.read_handle(f) + self.sent_index.read_handle(f) + self.sent_id.read_handle(f) + fread(&(num_words), sizeof(int), 1, f) + for i in range(num_words): + fread(&(word_len), sizeof(int), 1, f) + c_word = <char*> malloc (word_len * sizeof(char)) + fread(c_word, sizeof(char), word_len, f) + py_word = c_word + free(c_word) + self.word2id[py_word] = len(self.id2word) + self.id2word.append(py_word) + if len(self.sent_id) == 0: + self.use_sent_id = False + else: + self.use_sent_id = True + + cdef void write_handle(self, FILE* f): + cdef int word_len + cdef int num_words + cdef char* c_word + self.data.write_handle(f) + self.sent_index.write_handle(f) + self.sent_id.write_handle(f) + num_words = len(self.id2word) - 2 + fwrite(&(num_words), sizeof(int), 1, f) + for word in self.id2word[2:]: + c_word = word + word_len = strlen(c_word) + 1 + fwrite(&(word_len), sizeof(int), 1, f) + fwrite(c_word, sizeof(char), word_len, f) + + def write_binary(self, char* filename): + cdef FILE* f + f = fopen(filename, "w") + self.write_handle(f) + fclose(f) + + def write_enhanced_handle(self, f): + for i in self.data: + f.write("%d " %i) + f.write("\n") + for i in self.sent_index: + f.write("%d " %i) + f.write("\n") + for i in self.sent_id: + f.write("%d " %i) + f.write("\n") + for word in self.id2word: + f.write("%s %d " % (word, self.word2id[word])) + f.write("\n") + + def write_enhanced(self, char* filename): + with open(filename, "w") as f: + self.write_enhanced_handle(self, f) |