From e66ce4f37f98af2e23c800c7dc6fd8fc83d07353 Mon Sep 17 00:00:00 2001 From: Victor Chahuneau Date: Sun, 23 Dec 2012 23:07:31 +0100 Subject: Memory mapping for IntList/FloatList + vocabulary class for DataArray & BiLex --- python/src/sa/precomputation.pxi | 55 ++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 19 deletions(-) (limited to 'python/src/sa/precomputation.pxi') diff --git a/python/src/sa/precomputation.pxi b/python/src/sa/precomputation.pxi index a3527f47..272aa56e 100644 --- a/python/src/sa/precomputation.pxi +++ b/python/src/sa/precomputation.pxi @@ -73,7 +73,6 @@ cdef trie_node_data_extend(_Trie_Node* node, int* vals, int num_vals): memcpy(node.arr + node.arr_len, vals, num_vals*sizeof(int)) node.arr_len = new_len - cdef _Trie_Node* trie_insert(_Trie_Node* node, int val): cdef _Trie_Edge** cur cur = &node.root @@ -116,7 +115,6 @@ cdef class TrieMap: self.root = <_Trie_Node**> malloc(self.V * sizeof(_Trie_Node*)) memset(self.root, 0, self.V * sizeof(_Trie_Node*)) - def __dealloc__(self): cdef int i for i from 0 <= i < self.V: @@ -124,7 +122,6 @@ cdef class TrieMap: free_trie_node(self.root[i]) free(self.root) - def insert(self, pattern): cdef int* p cdef int i, l @@ -135,7 +132,6 @@ cdef class TrieMap: self._insert(p,l) free(p) - cdef _Trie_Node* _insert(self, int* pattern, int pattern_len): cdef int i cdef _Trie_Node* node @@ -184,7 +180,6 @@ cdef class TrieMap: trie_node_to_map(self.root[i], result, (i,), include_zeros) return result - cdef class Precomputation: cdef int precompute_rank cdef int precompute_secondary_rank @@ -194,10 +189,8 @@ cdef class Precomputation: cdef int train_min_gap_size cdef precomputed_index cdef precomputed_collocations - cdef read_map(self, FILE* f) - cdef write_map(self, m, FILE* f) - def __cinit__(self, fsarray=None, from_stats=None, from_binary=None, + def __cinit__(self, fsarray=None, from_stats=None, from_binary=None, mmaped=False, precompute_rank=1000, precompute_secondary_rank=20, max_length=5, max_nonterminals=2, train_max_initial_size=10, train_min_gap_size=2): @@ -208,12 +201,14 @@ cdef class Precomputation: self.train_max_initial_size = train_max_initial_size self.train_min_gap_size = train_min_gap_size if from_binary: - self.read_binary(from_binary) + if mmaped: + self.read_mmaped(MemoryMap(from_binary)) + else: + self.read_binary(from_binary) elif from_stats: self.precompute(from_stats, fsarray) - - def read_binary(self, char* filename): + def read_binary(self, bytes filename): cdef FILE* f f = fopen(filename, "r") fread(&(self.precompute_rank), sizeof(int), 1, f) @@ -226,8 +221,17 @@ cdef class Precomputation: self.precomputed_collocations = self.read_map(f) fclose(f) - - def write_binary(self, char* filename): + def read_mmaped(self, MemoryMap buf): + self.precompute_rank = buf.read_int() + self.precompute_secondary_rank = buf.read_int() + self.max_length = buf.read_int() + self.max_nonterminals = buf.read_int() + self.train_max_initial_size = buf.read_int() + self.train_min_gap_size = buf.read_int() + self.precomputed_index = self.read_mmaped_map(buf) + self.precomputed_collocations = self.read_mmaped_map(buf) + + def write_binary(self, bytes filename): cdef FILE* f f = fopen(filename, "w") fwrite(&(self.precompute_rank), sizeof(int), 1, f) @@ -240,7 +244,6 @@ cdef class Precomputation: self.write_map(self.precomputed_collocations, f) fclose(f) - cdef write_map(self, m, FILE* f): cdef int i, N cdef IntList arr @@ -256,7 +259,6 @@ cdef class Precomputation: arr = val arr.write_handle(f) - cdef read_map(self, FILE* f): cdef int i, j, k, word_id, N cdef IntList arr @@ -274,6 +276,21 @@ cdef class Precomputation: m[key] = arr return m + cdef read_mmaped_map(self, MemoryMap buf): + cdef int i, j, k, word_id, N + cdef IntList arr + + m = {} + N = buf.read_int() + for j in range(N): + key_size = buf.read_int() + key = [] + for k in range(key_size): + key.append(buf.read_int()) + arr = IntList() + arr.read_mmaped(buf) + m[tuple(key)] = arr + return m def precompute(self, stats, SuffixArray sarray): cdef int i, l, N, max_pattern_len, i1, l1, i2, l2, i3, l3, ptr1, ptr2, ptr3, is_super, sent_count, max_rank @@ -284,9 +301,9 @@ cdef class Precomputation: data = darray.data - frequent_patterns = TrieMap(len(darray.id2word)) - super_frequent_patterns = TrieMap(len(darray.id2word)) - collocations = TrieMap(len(darray.id2word)) + frequent_patterns = TrieMap(len(darray.voc.id2word)) + super_frequent_patterns = TrieMap(len(darray.voc.id2word)) + collocations = TrieMap(len(darray.voc.id2word)) I_set = set() J_set = set() @@ -419,7 +436,7 @@ cdef class Precomputation: if word_id == -1: s = s + "X " else: - s = s + darray.id2word[word_id] + " " + s = s + darray.voc.id2word[word_id] + " " logger.warn("ERROR: unexpected pattern %s in set of precomputed collocations", s) else: chunk = () -- cgit v1.2.3