diff options
Diffstat (limited to 'sa-extract/precomputation.pyx')
-rw-r--r-- | sa-extract/precomputation.pyx | 478 |
1 files changed, 478 insertions, 0 deletions
diff --git a/sa-extract/precomputation.pyx b/sa-extract/precomputation.pyx new file mode 100644 index 00000000..ce4c21aa --- /dev/null +++ b/sa-extract/precomputation.pyx @@ -0,0 +1,478 @@ +# precomputes a set of collocations by advancing over the text. +# warning: nasty C code + +import log +import monitor + +cimport csuf +cimport cdat +cimport cintlist + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, memcpy + +cdef struct _Trie_Node # forward decl + +cdef struct _Trie_Edge: + int val + _Trie_Node* node + _Trie_Edge* bigger + _Trie_Edge* smaller + +cdef struct _Trie_Node: + _Trie_Edge* root + int* arr + int arr_len + +cdef _Trie_Node* new_trie_node(): + cdef _Trie_Node* node + node = <_Trie_Node*> malloc(sizeof(_Trie_Node)) + node.root = NULL + node.arr_len = 0 + node.arr = <int*> malloc(sizeof(0*sizeof(int))) + return node + +cdef _Trie_Edge* new_trie_edge(int val): + cdef _Trie_Edge* edge + edge = <_Trie_Edge*> malloc(sizeof(_Trie_Edge)) + edge.node = new_trie_node() + edge.bigger = NULL + edge.smaller = NULL + edge.val = val + return edge + +cdef free_trie_node(_Trie_Node* node): + if node != NULL: + free_trie_edge(node.root) + free(node.arr) + +cdef free_trie_edge(_Trie_Edge* edge): + if edge != NULL: + free_trie_node(edge.node) + free_trie_edge(edge.bigger) + free_trie_edge(edge.smaller) + +cdef _Trie_Node* trie_find(_Trie_Node* node, int val): + cdef _Trie_Edge* cur + cur = node.root + while cur != NULL and cur.val != val: + if val > cur.val: + cur = cur.bigger + elif val < cur.val: + cur = cur.smaller + if cur == NULL: + return NULL + else: + return cur.node + +cdef trie_node_data_append(_Trie_Node* node, int val): + cdef int new_len + new_len = node.arr_len + 1 + node.arr = <int*> realloc(node.arr, new_len*sizeof(int)) + node.arr[node.arr_len] = val + node.arr_len = new_len + +cdef trie_node_data_extend(_Trie_Node* node, int* vals, int num_vals): + cdef int new_len + new_len = node.arr_len + num_vals + node.arr = <int*> realloc(node.arr, new_len*sizeof(int)) + memcpy(node.arr + node.arr_len, vals, num_vals*sizeof(int)) + node.arr_len = new_len + + +cdef _Trie_Node* trie_insert(_Trie_Node* node, int val): + cdef _Trie_Edge** cur + cur = &node.root + while cur[0] != NULL and cur[0].val != val: + if val > cur[0].val: + cur = &cur[0].bigger + elif val < cur[0].val: + cur = &cur[0].smaller + if cur[0] == NULL: + cur[0] = new_trie_edge(val) + return cur[0].node + +cdef trie_node_to_map(_Trie_Node* node, result, prefix, int include_zeros): + cdef cintlist.CIntList arr + + if include_zeros or node.arr_len > 0: + arr = cintlist.CIntList() + free(arr.arr) + arr.arr = <int*> malloc(node.arr_len * sizeof(int)) + memcpy(arr.arr, node.arr, node.arr_len * sizeof(int)) + arr.len = node.arr_len + arr.size = node.arr_len + result[prefix] = arr + trie_edge_to_map(node.root, result, prefix, include_zeros) + +cdef trie_edge_to_map(_Trie_Edge* edge, result, prefix, int include_zeros): + if edge != NULL: + trie_edge_to_map(edge.smaller, result, prefix, include_zeros) + trie_edge_to_map(edge.bigger, result, prefix, include_zeros) + prefix = prefix + (edge.val,) + trie_node_to_map(edge.node, result, prefix, include_zeros) + +cdef class TrieMap: + + cdef _Trie_Node** root + cdef int V + + def __init__(self, alphabet_size): + self.V = alphabet_size + self.root = <_Trie_Node**> malloc(self.V * sizeof(_Trie_Node*)) + memset(self.root, 0, self.V * sizeof(_Trie_Node*)) + + + def __dealloc__(self): + cdef int i + for i from 0 <= i < self.V: + if self.root[i] != NULL: + free_trie_node(self.root[i]) + free(self.root) + + + def insert(self, pattern): + cdef int* p + cdef int i, l + l = len(pattern) + p = <int*> malloc(l*sizeof(int)) + for i from 0 <= i < l: + p[i] = pattern[i] + self._insert(p,l) + free(p) + + + cdef _Trie_Node* _insert(self, int* pattern, int pattern_len): + cdef int i + cdef _Trie_Node* node + if self.root[pattern[0]] == NULL: + self.root[pattern[0]] = new_trie_node() + node = self.root[pattern[0]] + for i from 1 <= i < pattern_len: + node = trie_insert(node, pattern[i]) + return node + + def contains(self, pattern): + cdef int* p + cdef int i, l + cdef _Trie_Node* node + l = len(pattern) + p = <int*> malloc(l*sizeof(int)) + for i from 0 <= i < l: + p[i] = pattern[i] + node = self._contains(p,l) + free(p) + if node == NULL: + return False + else: + return True + + cdef _Trie_Node* _contains(self, int* pattern, int pattern_len): + cdef int i + cdef _Trie_Node* node + node = self.root[pattern[0]] + i = 1 + while node != NULL and i < pattern_len: + node = trie_find(node, pattern[i]) + i = i+1 + return node + + def toMap(self, flag): + cdef int i, include_zeros + + if flag: + include_zeros=1 + else: + include_zeros=0 + result = {} + for i from 0 <= i < self.V: + if self.root[i] != NULL: + trie_node_to_map(self.root[i], result, (i,), include_zeros) + return result + + +cdef class Precomputation: + +# Defined in .pxd file, here for reference: +# cdef int precompute_rank +# cdef int precompute_secondary_rank +# cdef int max_length +# cdef int max_nonterminals +# cdef int train_max_initial_size +# cdef int train_min_gap_size +# cdef precomputed_index +# cdef precomputed_collocations + + def __init__(self, filename, sa=None, precompute_rank=1000, precompute_secondary_rank=20, max_length=5, + max_nonterminals=2, train_max_initial_size=10, train_min_gap_size=2, from_binary=False): + self.precompute_rank = precompute_rank + self.precompute_secondary_rank = precompute_secondary_rank + self.max_length = max_length + self.max_nonterminals = max_nonterminals + self.train_max_initial_size = train_max_initial_size + self.train_min_gap_size = train_min_gap_size + if from_binary: + self.read_binary(filename) + else: + self.precompute(filename, sa) + + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + fread(&(self.precompute_rank), sizeof(int), 1, f) + fread(&(self.precompute_secondary_rank), sizeof(int), 1, f) + fread(&(self.max_length), sizeof(int), 1, f) + fread(&(self.max_nonterminals), sizeof(int), 1, f) + fread(&(self.train_max_initial_size), sizeof(int), 1, f) + fread(&(self.train_min_gap_size), sizeof(int), 1, f) + self.precomputed_index = self.read_map(f) + self.precomputed_collocations = self.read_map(f) + fclose(f) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + + f = fopen(cfilename, "w") + fwrite(&(self.precompute_rank), sizeof(int), 1, f) + fwrite(&(self.precompute_secondary_rank), sizeof(int), 1, f) + fwrite(&(self.max_length), sizeof(int), 1, f) + fwrite(&(self.max_nonterminals), sizeof(int), 1, f) + fwrite(&(self.train_max_initial_size), sizeof(int), 1, f) + fwrite(&(self.train_min_gap_size), sizeof(int), 1, f) + self.write_map(self.precomputed_index, f) + self.write_map(self.precomputed_collocations, f) + fclose(f) + + + cdef write_map(self, m, FILE* f): + cdef int i, N + cdef cintlist.CIntList arr + + N = len(m) + fwrite(&(N), sizeof(int), 1, f) + for pattern, val in m.iteritems(): + N = len(pattern) + fwrite(&(N), sizeof(int), 1, f) + for word_id in pattern: + i = word_id + fwrite(&(i), sizeof(int), 1, f) + arr = val + arr.write_handle(f) + + + cdef read_map(self, FILE* f): + cdef int i, j, k, word_id, N + cdef cintlist.CIntList arr + + m = {} + fread(&(N), sizeof(int), 1, f) + for j from 0 <= j < N: + fread(&(i), sizeof(int), 1, f) + key = () + for k from 0 <= k < i: + fread(&(word_id), sizeof(int), 1, f) + key = key + (word_id,) + arr = cintlist.CIntList() + arr.read_handle(f) + m[key] = arr + return m + + + def precompute(self, filename, sa): + cdef int i, l, N, max_pattern_len, i1, l1, i2, l2, i3, l3, ptr1, ptr2, ptr3, is_super, sent_count, max_rank + cdef csuf.SuffixArray sarray + cdef cdat.DataArray darray + cdef cintlist.CIntList data, queue, cost_by_rank, count_by_rank + cdef TrieMap frequent_patterns, super_frequent_patterns, collocations + cdef _Trie_Node* node + + sarray = sa + darray = sarray.darray + data = darray.data + + frequent_patterns = TrieMap(len(darray.id2word)) + super_frequent_patterns = TrieMap(len(darray.id2word)) + collocations = TrieMap(len(darray.id2word)) + + I_set = set() + J_set = set() + J2_set = set() + IJ_set = set() + pattern_rank = {} + + log.writeln("Precomputing frequent intersections\n", 1) + start_time = monitor.cpu() + + max_pattern_len = 0 + if filename is not None: + precompute_file = open(filename) + for rank, line in enumerate(precompute_file): + if rank >= self.precompute_rank: + break + phrase_words = line.split()[2:] + phrase = () + for word in phrase_words: + phrase = phrase + (darray.word2id[word],) + max_pattern_len = max(max_pattern_len, len(phrase)) + frequent_patterns.insert(phrase) + I_set.add(phrase) + pattern_rank[phrase] = rank + if rank < self.precompute_secondary_rank: + super_frequent_patterns.insert(phrase) + J_set.add(phrase) + precompute_file.close() + + queue = cintlist.CIntList(increment=1000) + + log.writeln(" Computing inverted indexes...", 1) + N = len(data) + for i from 0 <= i < N: + sa_word_id = data.arr[i] + if sa_word_id == 1: + queue._append(-1) + else: + for l from 1 <= l <= max_pattern_len: + node = frequent_patterns._contains(data.arr+i, l) + if node == NULL: + break + queue._append(i) + queue._append(l) + trie_node_data_append(node, i) + + log.writeln(" Computing collocations...", 1) + N = len(queue) + ptr1 = 0 + sent_count = 0 + while ptr1 < N: # main loop + i1 = queue.arr[ptr1] + if i1 > -1: + l1 = queue.arr[ptr1+1] + ptr2 = ptr1 + 2 + while ptr2 < N: + i2 = queue.arr[ptr2] + if i2 == -1 or i2 - i1 >= self.train_max_initial_size: + break + l2 = queue.arr[ptr2+1] + if i2 - i1 - l1 >= self.train_min_gap_size and i2 + l2 - i1 <= self.train_max_initial_size and l1+l2+1 <= self.max_length: + node = collocations._insert(data.arr+i1, l1) + node = trie_insert(node, -1) + for i from i2 <= i < i2+l2: + node = trie_insert(node, data.arr[i]) + trie_node_data_append(node, i1) + trie_node_data_append(node, i2) + if super_frequent_patterns._contains(data.arr+i2, l2) != NULL: + if super_frequent_patterns._contains(data.arr+i1, l1) != NULL: + is_super = 1 + else: + is_super = 0 + ptr3 = ptr2 + 2 + while ptr3 < N: + i3 = queue.arr[ptr3] + if i3 == -1 or i3 - i1 >= self.train_max_initial_size: + break + l3 = queue.arr[ptr3+1] + if i3 - i2 - l2 >= self.train_min_gap_size and i3 + l3 - i1 <= self.train_max_initial_size and l1+l2+l3+2 <= self.max_length: + if is_super or super_frequent_patterns._contains(data.arr+i3, l3) != NULL: + node = collocations._insert(data.arr+i1, l1) + node = trie_insert(node, -1) + for i from i2 <= i < i2+l2: + node = trie_insert(node, data.arr[i]) + node = trie_insert(node, -1) + for i from i3 <= i < i3+l3: + node = trie_insert(node, data.arr[i]) + trie_node_data_append(node, i1) + trie_node_data_append(node, i2) + trie_node_data_append(node, i3) + ptr3 = ptr3 + 2 + ptr2 = ptr2 + 2 + ptr1 = ptr1 + 2 + else: + sent_count = sent_count + 1 + if sent_count % 10000 == 0: + log.writeln(" %d sentences" % sent_count) + ptr1 = ptr1 + 1 + + self.precomputed_collocations = collocations.toMap(False) + self.precomputed_index = frequent_patterns.toMap(True) + + x = 0 + for pattern1 in J_set: + for pattern2 in J_set: + if len(pattern1) + len(pattern2) + 1 < self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + J2_set.add(combined_pattern) + + for pattern1 in I_set: + for pattern2 in I_set: + x = x+1 + if len(pattern1) + len(pattern2) + 1 <= self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + IJ_set.add(combined_pattern) + + for pattern1 in I_set: + for pattern2 in J2_set: + x = x+2 + if len(pattern1) + len(pattern2) + 1<= self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + IJ_set.add(combined_pattern) + combined_pattern = pattern2 + (-1,) + pattern1 + IJ_set.add(combined_pattern) + + N = len(pattern_rank) + cost_by_rank = cintlist.CIntList(initial_len=N) + count_by_rank = cintlist.CIntList(initial_len=N) + for pattern, arr in self.precomputed_collocations.iteritems(): + if pattern not in IJ_set: + s = "" + for word_id in pattern: + if word_id == -1: + s = s + "X " + else: + s = s + darray.id2word[word_id] + " " + log.writeln("ERROR: unexpected pattern %s in set of precomputed collocations" % (s), 1) + else: + chunk = () + max_rank = 0 + arity = 0 + for word_id in pattern: + if word_id == -1: + max_rank = max(max_rank, pattern_rank[chunk]) + arity = arity + 1 + chunk = () + else: + chunk = chunk + (word_id,) + max_rank = max(max_rank, pattern_rank[chunk]) + cost_by_rank.arr[max_rank] = cost_by_rank.arr[max_rank] + (4*len(arr)) + count_by_rank.arr[max_rank] = count_by_rank.arr[max_rank] + (len(arr)/(arity+1)) + + cumul_cost = 0 + cumul_count = 0 + for i from 0 <= i < N: + cumul_cost = cumul_cost + cost_by_rank.arr[i] + cumul_count = cumul_count + count_by_rank.arr[i] + log.writeln("RANK %d\tCOUNT, COST: %d %d\tCUMUL: %d, %d" % (i, count_by_rank.arr[i], cost_by_rank.arr[i], cumul_count, cumul_cost)) + + num_found_patterns = len(self.precomputed_collocations) + for pattern in IJ_set: + if pattern not in self.precomputed_collocations: + self.precomputed_collocations[pattern] = cintlist.CIntList() + + stop_time = monitor.cpu() + log.writeln("Precomputed collocations for %d patterns out of %d possible (upper bound %d)" % (num_found_patterns,len(self.precomputed_collocations),x)) + log.writeln("Precomputed inverted index for %d patterns " % len(self.precomputed_index)) + log.writeln("Precomputation took %f seconds" % (stop_time - start_time)) + log.writeln("Detailed statistics:") + + + + + + + |