diff options
author | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-12-23 23:07:31 +0100 |
---|---|---|
committer | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-12-23 23:07:31 +0100 |
commit | e66ce4f37f98af2e23c800c7dc6fd8fc83d07353 (patch) | |
tree | 9972e8ed1adeb56ede19b2c6020e92a5116860e4 /python/src/sa/vocabulary.pxi | |
parent | 597d89c11db53e91bc011eab70fd613bbe6453e8 (diff) |
Memory mapping for IntList/FloatList
+ vocabulary class for DataArray & BiLex
Diffstat (limited to 'python/src/sa/vocabulary.pxi')
-rw-r--r-- | python/src/sa/vocabulary.pxi | 64 |
1 files changed, 64 insertions, 0 deletions
# python/src/sa/vocabulary.pxi
cdef class Vocabulary:
    """Two-way mapping between words and integer ids.

    Ids are assigned in insertion order starting at 0: ``id2word`` is the
    list of words indexed by id and ``word2id`` is the reverse dictionary.
    """
    cdef object id2word, word2id

    def __init__(self, from_vocabulary=None):
        """Create a vocabulary, optionally seeded from an iterable of words.

        from_vocabulary: iterable of words (another Vocabulary works, since
            iterating one yields its words in id order), or None for an
            empty vocabulary.
        """
        self.id2word = []
        self.word2id = {}
        # This argument used to be accepted and silently ignored.
        if from_vocabulary is not None:
            self.extend(from_vocabulary)

    def extend(self, vocabulary):
        """Add every word of ``vocabulary``; words already known keep their id."""
        for word in vocabulary:
            # Indexing has the side effect of registering unknown words.
            self[word]

    def __iter__(self):
        """Iterate over the words in id order."""
        return iter(self.id2word)

    def __getitem__(self, word):
        """Return the id of ``word``, assigning the next free id if it is new.

        Note that a lookup therefore mutates the vocabulary; use ``get`` for
        a read-only query.
        """
        v = self.word2id.get(word, -1)
        if v == -1:
            v = len(self.id2word)
            self.id2word.append(word)
            self.word2id[word] = v
        return v

    def get(self, word, default):
        """Return the id of ``word``, or ``default`` if it is unknown.

        Unlike ``self[word]`` this never adds the word.
        """
        return self.word2id.get(word, default)

    def __len__(self):
        """Return the number of words in the vocabulary."""
        return len(self.id2word)

    cdef void write_handle(self, FILE* f, int offset=0):
        """Serialize the words with id >= ``offset`` to the open file ``f``.

        Layout: an int word count, then for each word an int length followed
        by that many bytes. The length counts the trailing NUL, which is
        written as well so that ``read_handle`` gets C strings back.
        """
        cdef int word_len
        cdef int num_words

        num_words = len(self.id2word) - offset
        fwrite(&(num_words), sizeof(int), 1, f)
        for word in self.id2word[offset:]:
            # +1 so the NUL terminator of the byte string is stored too.
            word_len = len(word) + 1
            fwrite(&(word_len), sizeof(int), 1, f)
            fwrite(<char *>word, sizeof(char), word_len, f)

    cdef void read_handle(self, FILE* f):
        """Append the words stored by ``write_handle`` from the open file ``f``.

        Words are appended after the existing ones and receive consecutive
        ids. Raises IOError on a truncated or malformed record and
        MemoryError if the temporary buffer cannot be allocated.
        NOTE(review): how an exception raised in a ``cdef void`` function
        reaches the caller depends on the Cython version's error-propagation
        rules -- confirm for the version used to build this module.
        """
        cdef int num_words, word_len
        # Signed on purpose: comparing an unsigned index against a negative
        # (corrupt) count could turn into a near-endless loop.
        cdef int i
        cdef char* word
        cdef bytes entry

        if fread(&(num_words), sizeof(int), 1, f) != 1:
            raise IOError('vocabulary: cannot read the word count')
        for i in range(num_words):
            if fread(&(word_len), sizeof(int), 1, f) != 1:
                raise IOError('vocabulary: cannot read a word length')
            if word_len < 1:
                # A valid record holds at least the NUL terminator.
                raise IOError('vocabulary: invalid word length')
            word = <char*> malloc(word_len * sizeof(char))
            if word == NULL:
                raise MemoryError()
            try:
                if fread(word, sizeof(char), word_len, f) != <size_t> word_len:
                    raise IOError('vocabulary: truncated word')
                # Guarantee termination even if the file is corrupt, so the
                # char* -> bytes conversion cannot run past the buffer.
                word[word_len - 1] = 0
                entry = word  # copy once; the C buffer is freed below
                self.word2id[entry] = len(self.id2word)
                self.id2word.append(entry)
            finally:
                free(word)

    cdef void read_mmaped(self, MemoryMap buf):
        """Append words read from the memory-mapped buffer ``buf``.

        Reads a word count with ``buf.read_int()``, then for each word a
        length and a ``char*`` obtained from ``buf.read_char_array``.
        NOTE(review): presumably the same layout as ``write_handle`` with
        NUL-terminated words -- confirm against MemoryMap.
        """
        cdef int num_words, word_len
        cdef int i  # signed, to match num_words
        cdef char* word
        cdef bytes entry

        num_words = buf.read_int()
        for i in range(num_words):
            word_len = buf.read_int()
            word = buf.read_char_array(word_len)
            entry = word  # convert to a Python object once and share it
            self.word2id[entry] = len(self.id2word)
            self.id2word.append(entry)