From e66ce4f37f98af2e23c800c7dc6fd8fc83d07353 Mon Sep 17 00:00:00 2001 From: Victor Chahuneau Date: Sun, 23 Dec 2012 23:07:31 +0100 Subject: Memory mapping for IntList/FloatList + vocabulary class for DataArray & BiLex --- python/src/sa/vocabulary.pxi | 64 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 python/src/sa/vocabulary.pxi (limited to 'python/src/sa/vocabulary.pxi') diff --git a/python/src/sa/vocabulary.pxi b/python/src/sa/vocabulary.pxi new file mode 100644 index 00000000..9e816db5 --- /dev/null +++ b/python/src/sa/vocabulary.pxi @@ -0,0 +1,64 @@ +cdef class Vocabulary: + cdef object id2word, word2id + + def __init__(self, from_vocabulary=None): + self.id2word = [] + self.word2id = {} + + def extend(self, vocabulary): + for word in vocabulary: + self[word] + + def __iter__(self): + return iter(self.id2word) + + def __getitem__(self, word): + v = self.word2id.get(word, -1) + if v == -1: + v = len(self.id2word) + self.id2word.append(word) + self.word2id[word] = v + return v + + def get(self, word, default): + return self.word2id.get(word, default) + + def __len__(self): + return len(self.id2word) + + cdef void write_handle(self, FILE* f, int offset=0): + cdef int word_len + cdef int num_words + + num_words = len(self.id2word) - offset + fwrite(&(num_words), sizeof(int), 1, f) + for word in self.id2word[offset:]: + word_len = len(word) + 1 + fwrite(&(word_len), sizeof(int), 1, f) + fwrite(word, sizeof(char), word_len, f) + + cdef void read_handle(self, FILE* f): + cdef int num_words, word_len + cdef char* word + cdef unsigned i + + fread(&(num_words), sizeof(int), 1, f) + for i in range(num_words): + fread(&(word_len), sizeof(int), 1, f) + word = malloc (word_len * sizeof(char)) + fread(word, sizeof(char), word_len, f) + self.word2id[word] = len(self.id2word) + self.id2word.append(word) + free(word) + + cdef void read_mmaped(self, MemoryMap buf): + cdef int num_words, word_len + cdef char* word + cdef unsigned i + + num_words = buf.read_int() + for i in range(num_words): + word_len = buf.read_int() + word = buf.read_char_array(word_len) + self.word2id[word] = len(self.id2word) + self.id2word.append(word) -- cgit v1.2.3