diff options
author | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-11-05 15:29:46 +0100 |
---|---|---|
committer | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-11-05 15:29:46 +0100 |
commit | 6f29f345dc06c1a1033475eac1d1340781d1d603 (patch) | |
tree | 6fa4cdd7aefd7d54c9585c2c6274db61bb8b159a /python/src/sa/data_array.pxi | |
parent | b510da2e562c695c90d565eb295c749569c59be8 (diff) | |
parent | c615c37501fa8576584a510a9d2bfe2fdd5bace7 (diff) |
merge upstream/master
Diffstat (limited to 'python/src/sa/data_array.pxi')
-rw-r--r-- | python/src/sa/data_array.pxi | 49 |
1 files changed, 24 insertions, 25 deletions
diff --git a/python/src/sa/data_array.pxi b/python/src/sa/data_array.pxi index 7a102a7e..2a8ea921 100644 --- a/python/src/sa/data_array.pxi +++ b/python/src/sa/data_array.pxi @@ -4,14 +4,14 @@ from libc.stdio cimport FILE, fopen, fread, fwrite, fclose from libc.stdlib cimport malloc, realloc, free -from libc.string cimport memset, strcpy, strlen +from libc.string cimport memset, strcpy cdef class DataArray: - cdef word2id - cdef id2word - cdef IntList data - cdef IntList sent_id - cdef IntList sent_index + cdef public word2id + cdef public id2word + cdef public IntList data + cdef public IntList sent_id + cdef public IntList sent_index cdef bint use_sent_id def __cinit__(self, from_binary=None, from_text=None, side=None, bint use_sent_id=False): @@ -32,10 +32,10 @@ cdef class DataArray: def __len__(self): return len(self.data) - def getSentId(self, i): + def get_sentence_id(self, i): return self.sent_id.arr[i] - def getSent(self, i): + def get_sentence(self, i): cdef int j, start, stop sent = [] start = self.sent_index.arr[i] @@ -44,17 +44,18 @@ cdef class DataArray: sent.append(self.id2word[self.data.arr[i]]) return sent - def getSentPos(self, loc): - return loc - self.sent_index.arr[self.sent_id.arr[loc]] - def get_id(self, word): if not word in self.word2id: self.word2id[word] = len(self.id2word) self.id2word.append(word) return self.word2id[word] - def get_word(self, id): - return self.id2word[id] + def __getitem__(self, loc): + return self.id2word[self.data.arr[loc]] + + def get_sentence_bounds(self, loc): + cdef int sid = self.sent_id.arr[loc] + return (self.sent_index.arr[sid], self.sent_index.arr[sid+1]) def write_text(self, char* filename): with open(filename, "w") as f: @@ -99,20 +100,19 @@ cdef class DataArray: cdef void read_handle(self, FILE* f): cdef int num_words, word_len cdef unsigned i - cdef char* c_word - cdef bytes py_word + cdef char* word + self.data.read_handle(f) self.sent_index.read_handle(f) self.sent_id.read_handle(f) fread(&(num_words), sizeof(int), 1, f) for i in range(num_words): fread(&(word_len), sizeof(int), 1, f) - c_word = <char*> malloc (word_len * sizeof(char)) - fread(c_word, sizeof(char), word_len, f) - py_word = c_word - free(c_word) - self.word2id[py_word] = len(self.id2word) - self.id2word.append(py_word) + word = <char*> malloc (word_len * sizeof(char)) + fread(word, sizeof(char), word_len, f) + self.word2id[word] = len(self.id2word) + self.id2word.append(word) + free(word) if len(self.sent_id) == 0: self.use_sent_id = False else: @@ -121,17 +121,16 @@ cdef class DataArray: cdef void write_handle(self, FILE* f): cdef int word_len cdef int num_words - cdef char* c_word + self.data.write_handle(f) self.sent_index.write_handle(f) self.sent_id.write_handle(f) num_words = len(self.id2word) - 2 fwrite(&(num_words), sizeof(int), 1, f) for word in self.id2word[2:]: - c_word = word - word_len = strlen(c_word) + 1 + word_len = len(word) + 1 fwrite(&(word_len), sizeof(int), 1, f) - fwrite(c_word, sizeof(char), word_len, f) + fwrite(<char *>word, sizeof(char), word_len, f) def write_binary(self, char* filename): cdef FILE* f |