summaryrefslogtreecommitdiff
path: root/python/src/sa/vocabulary.pxi
diff options
context:
space:
mode:
author Victor Chahuneau <vchahune@cs.cmu.edu> 2012-12-23 23:07:31 +0100
committer Victor Chahuneau <vchahune@cs.cmu.edu> 2012-12-23 23:07:31 +0100
commit e66ce4f37f98af2e23c800c7dc6fd8fc83d07353 (patch)
tree 9972e8ed1adeb56ede19b2c6020e92a5116860e4 /python/src/sa/vocabulary.pxi
parent 597d89c11db53e91bc011eab70fd613bbe6453e8 (diff)
Memory mapping for IntList/FloatList
+ vocabulary class for DataArray & BiLex
Diffstat (limited to 'python/src/sa/vocabulary.pxi')
-rw-r--r--python/src/sa/vocabulary.pxi64
1 files changed, 64 insertions, 0 deletions
diff --git a/python/src/sa/vocabulary.pxi b/python/src/sa/vocabulary.pxi
new file mode 100644
index 00000000..9e816db5
--- /dev/null
+++ b/python/src/sa/vocabulary.pxi
@@ -0,0 +1,64 @@
cdef class Vocabulary:
    """Two-way mapping between words and dense integer ids.

    Ids are handed out in insertion order starting at 0, so ``id2word[i]``
    is the word whose id is ``i`` and ``word2id[w]`` is the id of word ``w``.
    """
    cdef object id2word, word2id

    def __init__(self, from_vocabulary=None):
        """Create a vocabulary, optionally seeded with words.

        from_vocabulary -- optional iterable of words (for example another
        Vocabulary); each word is registered in iteration order.
        """
        self.id2word = []
        self.word2id = {}
        if from_vocabulary is not None:
            self.extend(from_vocabulary)

    def extend(self, vocabulary):
        """Register every word of the iterable *vocabulary*.

        Words that are already known keep their existing id.
        """
        for word in vocabulary:
            # Indexing has the side effect of assigning an id to new words.
            self[word]

    def __iter__(self):
        """Iterate over the words in id order."""
        return iter(self.id2word)

    def __getitem__(self, word):
        """Return the id of *word*, assigning the next free id if it is new."""
        word_id = self.word2id.get(word, -1)
        if word_id == -1:
            word_id = len(self.id2word)
            self.id2word.append(word)
            self.word2id[word] = word_id
        return word_id

    def get(self, word, default):
        """Return the id of *word*, or *default* if unknown (never inserts)."""
        return self.word2id.get(word, default)

    def __len__(self):
        """Return the number of distinct words."""
        return len(self.id2word)

    # Serialize the words starting at index `offset` to the C stream `f`.
    # Layout: an int record count, then for each word an int byte length
    # (including the trailing NUL) followed by that many bytes.
    cdef void write_handle(self, FILE* f, int offset=0):
        cdef int word_len
        cdef int num_words

        # Take the slice first so the count written in the header always
        # matches the number of records that follow, even when `offset` is
        # negative or larger than the vocabulary.
        words = self.id2word[offset:]
        num_words = len(words)
        fwrite(&(num_words), sizeof(int), 1, f)
        for word in words:
            # +1 so the NUL terminator of the underlying buffer is stored too.
            word_len = len(word) + 1
            fwrite(&(word_len), sizeof(int), 1, f)
            fwrite(<char *>word, sizeof(char), word_len, f)

    # Append the words stored in the C stream `f` (format produced by
    # write_handle) to this vocabulary. Raises IOError on a truncated or
    # malformed stream and MemoryError if the temporary buffer cannot be
    # allocated. NOTE(review): this function is declared `void` without an
    # exception clause, so depending on the Cython version the error is
    # either propagated or reported as unraisable; in both cases reading
    # stops instead of consuming garbage.
    cdef void read_handle(self, FILE* f):
        cdef int num_words, word_len
        cdef char* word
        # Signed on purpose: with an unsigned index a negative (corrupt)
        # count would be compared as a huge unsigned value.
        cdef int i
        cdef object py_word

        if fread(&(num_words), sizeof(int), 1, f) != 1:
            raise IOError('vocabulary: cannot read word count')
        for i in range(num_words):
            if fread(&(word_len), sizeof(int), 1, f) != 1:
                raise IOError('vocabulary: cannot read word length')
            if word_len <= 0:
                raise IOError('vocabulary: invalid word length')
            word = <char*> malloc(word_len * sizeof(char))
            if word == NULL:
                raise MemoryError()
            try:
                if fread(word, sizeof(char), word_len, f) != <size_t> word_len:
                    raise IOError('vocabulary: truncated word')
                # Guarantee termination before the char* -> object
                # conversion, whatever the file contained.
                word[word_len - 1] = 0
                # Convert once so the dict key and the list entry share
                # the same Python object.
                py_word = word
                self.word2id[py_word] = len(self.id2word)
                self.id2word.append(py_word)
            finally:
                free(word)

    # Append the words stored in the memory-mapped buffer `buf`: an int
    # count, then per word an int length followed by a character array.
    # NOTE(review): presumably the same layout write_handle produces and
    # read_char_array returns a NUL-terminated char* -- confirm against
    # MemoryMap.
    cdef void read_mmaped(self, MemoryMap buf):
        cdef int num_words, word_len
        cdef char* word
        # Signed on purpose, see read_handle.
        cdef int i
        cdef object py_word

        num_words = buf.read_int()
        for i in range(num_words):
            word_len = buf.read_int()
            word = buf.read_char_array(word_len)
            # Convert once so the dict key and the list entry share the
            # same Python object.
            py_word = word
            self.word2id[py_word] = len(self.id2word)
            self.id2word.append(py_word)