summaryrefslogtreecommitdiff
path: root/python/src/sa/vocabulary.pxi
blob: 9e816db52db5c25933db4a60659bdc66dc93fed6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
cdef class Vocabulary:
    """Two-way mapping between words and dense integer ids.

    ``id2word`` is a list whose index is the id; ``word2id`` is the inverse
    dict.  Ids are handed out in insertion order, starting at 0.

    NOTE(review): the binary (de)serialisers cast words to ``char *``, so
    words are presumably byte strings -- confirm against callers.
    """
    cdef object id2word, word2id

    def __init__(self, from_vocabulary=None):
        """Create a vocabulary, optionally seeded from an iterable of words.

        Words from ``from_vocabulary`` are interned in iteration order, so
        seeding from another ``Vocabulary`` reproduces its ids.
        """
        self.id2word = []
        self.word2id = {}
        # The argument used to be accepted but silently ignored.
        if from_vocabulary is not None:
            self.extend(from_vocabulary)

    def extend(self, vocabulary):
        """Intern every word of the iterable ``vocabulary``."""
        for word in vocabulary:
            # Indexing is used for its side effect: it assigns an id to
            # words that are not known yet.
            self[word]

    def __iter__(self):
        """Iterate over the words in id order."""
        return iter(self.id2word)

    def __getitem__(self, word):
        """Return the id of ``word``, assigning the next free id if unseen.

        Unlike a plain dict lookup this never raises ``KeyError``; use
        ``get`` for a lookup that does not modify the vocabulary.
        """
        v = self.word2id.get(word, -1)
        if v != -1:
            return v
        v = len(self.id2word)
        self.id2word.append(word)
        self.word2id[word] = v
        return v

    def get(self, word, default=None):
        """Return the id of ``word`` or ``default``, without interning it."""
        return self.word2id.get(word, default)

    def __len__(self):
        """Return the number of words in the vocabulary."""
        return len(self.id2word)

    cdef void write_handle(self, FILE* f, int offset=0):
        """Write the words ``id2word[offset:]`` to ``f`` in binary form.

        Layout: an ``int`` record count, then for each word an ``int``
        length (including the trailing NUL) followed by that many bytes.

        Raises ``IOError`` on a short write.  NOTE(review): how an exception
        leaves a ``cdef void`` function depends on the Cython version's
        propagation rules -- confirm for the version in use.
        """
        cdef int word_len
        cdef int num_words

        # Take the count from the slice that is actually written; computing
        # ``len - offset`` disagrees with the slice (and can be negative)
        # when ``offset`` is negative or past the end.
        words = self.id2word[offset:]
        num_words = len(words)
        if fwrite(&(num_words), sizeof(int), 1, f) != 1:
            raise IOError('failed to write vocabulary size')
        for word in words:
            # +1 so the NUL terminator is stored along with the word.
            word_len = len(word) + 1
            if fwrite(&(word_len), sizeof(int), 1, f) != 1:
                raise IOError('failed to write word length')
            if fwrite(<char *>word, sizeof(char), word_len, f) != <size_t>word_len:
                raise IOError('failed to write word')

    cdef void read_handle(self, FILE* f):
        """Append the words stored in ``f`` by ``write_handle``.

        Each word read receives the next free id.  Raises ``IOError`` when
        the data is truncated or malformed and ``MemoryError`` when the
        temporary buffer cannot be allocated.  NOTE(review): how an
        exception leaves a ``cdef void`` function depends on the Cython
        version's propagation rules -- confirm for the version in use.
        """
        cdef int num_words, word_len
        cdef char* word
        cdef int i
        cdef object py_word

        if fread(&(num_words), sizeof(int), 1, f) != 1:
            raise IOError('failed to read vocabulary size')
        if num_words < 0:
            raise IOError('invalid vocabulary size')
        for i in range(num_words):
            if fread(&(word_len), sizeof(int), 1, f) != 1:
                raise IOError('failed to read word length')
            # A record always holds at least the NUL terminator.
            if word_len < 1:
                raise IOError('invalid word length')
            word = <char*> malloc (word_len * sizeof(char))
            if word == NULL:
                raise MemoryError()
            try:
                if fread(word, sizeof(char), word_len, f) != <size_t>word_len:
                    raise IOError('failed to read word')
                # Force termination so the char* -> object conversion
                # below cannot run past the buffer on corrupt input.
                word[word_len - 1] = 0
                # Convert once; the Python object owns its own copy.
                py_word = word
            finally:
                free(word)
            self.word2id[py_word] = len(self.id2word)
            self.id2word.append(py_word)

    cdef void read_mmaped(self, MemoryMap buf):
        """Append the words stored in the memory-mapped buffer ``buf``.

        Reads an ``int`` count, then for each word an ``int`` length and a
        character array of that length; each word receives the next free id.
        """
        cdef int num_words, word_len
        cdef char* word
        cdef int i
        cdef object py_word

        num_words = buf.read_int()
        for i in range(num_words):
            word_len = buf.read_int()
            word = buf.read_char_array(word_len)
            # Convert once and share the object between both containers.
            py_word = word
            self.word2id[py_word] = len(self.id2word)
            self.id2word.append(py_word)