summaryrefslogtreecommitdiff
path: root/python/src/sa/data_array.pxi
diff options
context:
space:
mode:
Diffstat (limited to 'python/src/sa/data_array.pxi')
-rw-r--r--python/src/sa/data_array.pxi84
1 files changed, 34 insertions, 50 deletions
diff --git a/python/src/sa/data_array.pxi b/python/src/sa/data_array.pxi
index 2a8ea921..3a54d784 100644
--- a/python/src/sa/data_array.pxi
+++ b/python/src/sa/data_array.pxi
@@ -6,23 +6,29 @@ from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
from libc.stdlib cimport malloc, realloc, free
from libc.string cimport memset, strcpy
+# kept for compatibility
+INIT_VOCABULARY = ('NULL', 'END_OF_LINE')
+
cdef class DataArray:
- cdef public word2id
- cdef public id2word
+ cdef public Vocabulary voc
cdef public IntList data
cdef public IntList sent_id
cdef public IntList sent_index
cdef bint use_sent_id
- def __cinit__(self, from_binary=None, from_text=None, side=None, bint use_sent_id=False):
- self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1}
- self.id2word = ["END_OF_FILE", "END_OF_LINE"]
+ def __cinit__(self, from_binary=None, from_text=None, side=None,
+ bint use_sent_id=False, mmaped=False):
+ self.voc = Vocabulary()
+ self.voc.extend(INIT_VOCABULARY)
self.data = IntList(1000,1000)
self.sent_id = IntList(1000,1000)
self.sent_index = IntList(1000,1000)
self.use_sent_id = use_sent_id
if from_binary:
- self.read_binary(from_binary)
+ if mmaped:
+ self.read_mmaped(MemoryMap(from_binary))
+ else:
+ self.read_binary(from_binary)
elif from_text:
if side:
self.read_bitext(from_text, (0 if side == 'source' else 1))
@@ -37,27 +43,21 @@ cdef class DataArray:
def get_sentence(self, i):
cdef int j, start, stop
- sent = []
start = self.sent_index.arr[i]
stop = self.sent_index.arr[i+1]
- for i from start <= i < stop:
- sent.append(self.id2word[self.data.arr[i]])
- return sent
+ sent = [self.voc.id2word[self.data.arr[j]] for j in range(start, stop)]
def get_id(self, word):
- if not word in self.word2id:
- self.word2id[word] = len(self.id2word)
- self.id2word.append(word)
- return self.word2id[word]
+ return self.voc[word]
def __getitem__(self, loc):
- return self.id2word[self.data.arr[loc]]
+ return self.voc.id2word[self.data.arr[loc]]
def get_sentence_bounds(self, loc):
cdef int sid = self.sent_id.arr[loc]
return (self.sent_index.arr[sid], self.sent_index.arr[sid+1])
- def write_text(self, char* filename):
+ def write_text(self, bytes filename):
with open(filename, "w") as f:
for w_id in self.data:
if w_id > 1:
@@ -65,11 +65,11 @@ cdef class DataArray:
if w_id == 1:
f.write("\n")
- def read_text(self, char* filename):
+ def read_text(self, bytes filename):
with gzip_or_text(filename) as fp:
self.read_text_data(fp)
- def read_bitext(self, char* filename, int side):
+ def read_bitext(self, bytes filename, int side):
with gzip_or_text(filename) as fp:
data = (line.split(' ||| ')[side] for line in fp)
self.read_text_data(data)
@@ -90,49 +90,33 @@ cdef class DataArray:
self.data.append(0)
self.sent_index.append(word_count)
-
- def read_binary(self, char* filename):
+ def read_binary(self, bytes filename):
cdef FILE* f
f = fopen(filename, "r")
self.read_handle(f)
fclose(f)
- cdef void read_handle(self, FILE* f):
- cdef int num_words, word_len
- cdef unsigned i
- cdef char* word
+ cdef void read_mmaped(self, MemoryMap buf):
+ self.data.read_mmaped(buf)
+ self.sent_index.read_mmaped(buf)
+ self.sent_id.read_mmaped(buf)
+ self.voc.read_mmaped(buf)
+ self.use_sent_id = (len(self.sent_id) > 0)
+ cdef void read_handle(self, FILE* f):
self.data.read_handle(f)
self.sent_index.read_handle(f)
self.sent_id.read_handle(f)
- fread(&(num_words), sizeof(int), 1, f)
- for i in range(num_words):
- fread(&(word_len), sizeof(int), 1, f)
- word = <char*> malloc (word_len * sizeof(char))
- fread(word, sizeof(char), word_len, f)
- self.word2id[word] = len(self.id2word)
- self.id2word.append(word)
- free(word)
- if len(self.sent_id) == 0:
- self.use_sent_id = False
- else:
- self.use_sent_id = True
+ self.voc.read_handle(f)
+ self.use_sent_id = (len(self.sent_id) > 0)
cdef void write_handle(self, FILE* f):
- cdef int word_len
- cdef int num_words
-
self.data.write_handle(f)
self.sent_index.write_handle(f)
self.sent_id.write_handle(f)
- num_words = len(self.id2word) - 2
- fwrite(&(num_words), sizeof(int), 1, f)
- for word in self.id2word[2:]:
- word_len = len(word) + 1
- fwrite(&(word_len), sizeof(int), 1, f)
- fwrite(<char *>word, sizeof(char), word_len, f)
-
- def write_binary(self, char* filename):
+ self.voc.write_handle(f, len(INIT_VOCABULARY))
+
+ def write_binary(self, bytes filename):
cdef FILE* f
f = fopen(filename, "w")
self.write_handle(f)
@@ -148,10 +132,10 @@ cdef class DataArray:
for i in self.sent_id:
f.write("%d " %i)
f.write("\n")
- for word in self.id2word:
- f.write("%s %d " % (word, self.word2id[word]))
+ for w, word in enumerate(self.voc.id2word):
+ f.write("%s %d " % (word, w))
f.write("\n")
- def write_enhanced(self, char* filename):
+ def write_enhanced(self, bytes filename):
with open(filename, "w") as f:
self.write_enhanced_handle(self, f)