Diffstat (limited to 'sa-extract/precomputation.pyx')
-rw-r--r--  sa-extract/precomputation.pyx  478
1 file changed, 0 insertions(+), 478 deletions(-)
diff --git a/sa-extract/precomputation.pyx b/sa-extract/precomputation.pyx
deleted file mode 100644
index ce4c21aa..00000000
--- a/sa-extract/precomputation.pyx
+++ /dev/null
@@ -1,478 +0,0 @@
-# precomputes a set of collocations by advancing over the text.
-# warning: nasty C code
-
-import log
-import monitor
-
-cimport csuf
-cimport cdat
-cimport cintlist
-
-from libc.stdio cimport FILE, fopen, fread, fwrite, fclose
-from libc.stdlib cimport malloc, realloc, free
-from libc.string cimport memset, memcpy
-
-cdef struct _Trie_Node # forward decl
-
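-# A pattern trie over word ids.  Each node stores its payload in a plain int
-# array (arr/arr_len); the children of a node are kept as an unbalanced
-# binary search tree of edges, ordered by the edge's word id (val).
-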
-cdef struct _Trie_Edge:
- int val
- _Trie_Node* node
- _Trie_Edge* bigger
- _Trie_Edge* smaller
-
-cdef struct _Trie_Node:
- _Trie_Edge* root
- int* arr
- int arr_len
-
-cdef _Trie_Node* new_trie_node():
- cdef _Trie_Node* node
- node = <_Trie_Node*> malloc(sizeof(_Trie_Node))
- node.root = NULL
- node.arr_len = 0
-	node.arr = <int*> malloc(0*sizeof(int))  # start empty; grown by realloc in trie_node_data_append
- return node
-
-cdef _Trie_Edge* new_trie_edge(int val):
- cdef _Trie_Edge* edge
- edge = <_Trie_Edge*> malloc(sizeof(_Trie_Edge))
- edge.node = new_trie_node()
- edge.bigger = NULL
- edge.smaller = NULL
- edge.val = val
- return edge
-
-cdef free_trie_node(_Trie_Node* node):
-	if node != NULL:
-		free_trie_edge(node.root)
-		free(node.arr)
-		free(node)  # release the node struct itself, not just its contents
-
-cdef free_trie_edge(_Trie_Edge* edge):
-	if edge != NULL:
-		free_trie_node(edge.node)
-		free_trie_edge(edge.bigger)
-		free_trie_edge(edge.smaller)
-		free(edge)  # release the edge struct itself, not just its subtrees
-
-cdef _Trie_Node* trie_find(_Trie_Node* node, int val):
- cdef _Trie_Edge* cur
- cur = node.root
- while cur != NULL and cur.val != val:
- if val > cur.val:
- cur = cur.bigger
- elif val < cur.val:
- cur = cur.smaller
- if cur == NULL:
- return NULL
- else:
- return cur.node
-
-cdef trie_node_data_append(_Trie_Node* node, int val):
- cdef int new_len
- new_len = node.arr_len + 1
- node.arr = <int*> realloc(node.arr, new_len*sizeof(int))
- node.arr[node.arr_len] = val
- node.arr_len = new_len
-
-cdef trie_node_data_extend(_Trie_Node* node, int* vals, int num_vals):
- cdef int new_len
- new_len = node.arr_len + num_vals
- node.arr = <int*> realloc(node.arr, new_len*sizeof(int))
- memcpy(node.arr + node.arr_len, vals, num_vals*sizeof(int))
- node.arr_len = new_len
-
-
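-# Like trie_find, but creates the edge if it is missing.  The double pointer
-# lets the new edge be linked into whichever field (root/bigger/smaller) the
-# search ended on.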
-cdef _Trie_Node* trie_insert(_Trie_Node* node, int val):
- cdef _Trie_Edge** cur
- cur = &node.root
- while cur[0] != NULL and cur[0].val != val:
- if val > cur[0].val:
- cur = &cur[0].bigger
- elif val < cur[0].val:
- cur = &cur[0].smaller
- if cur[0] == NULL:
- cur[0] = new_trie_edge(val)
- return cur[0].node
-
-cdef trie_node_to_map(_Trie_Node* node, result, prefix, int include_zeros):
- cdef cintlist.CIntList arr
-
- if include_zeros or node.arr_len > 0:
- arr = cintlist.CIntList()
-		free(arr.arr)  # swap out the fresh CIntList's buffer for a copy of the node payload
- arr.arr = <int*> malloc(node.arr_len * sizeof(int))
- memcpy(arr.arr, node.arr, node.arr_len * sizeof(int))
- arr.len = node.arr_len
- arr.size = node.arr_len
- result[prefix] = arr
- trie_edge_to_map(node.root, result, prefix, include_zeros)
-
-cdef trie_edge_to_map(_Trie_Edge* edge, result, prefix, int include_zeros):
- if edge != NULL:
- trie_edge_to_map(edge.smaller, result, prefix, include_zeros)
- trie_edge_to_map(edge.bigger, result, prefix, include_zeros)
- prefix = prefix + (edge.val,)
- trie_node_to_map(edge.node, result, prefix, include_zeros)
-
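-# TrieMap: maps int-tuple patterns to occurrence data.  The first symbol of a
-# pattern indexes directly into a V-sized root array; remaining symbols
-# descend through trie edges.  A usage sketch (values are illustrative):
-#
-#   tm = TrieMap(vocab_size)
-#   tm.insert((3, 7))       # register a pattern
-#   tm.contains((3, 7))     # -> True
-#   tm.toMap(True)          # {(3,): CIntList(...), (3, 7): CIntList(...), ...}
-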
-cdef class TrieMap:
-
- cdef _Trie_Node** root
- cdef int V
-
- def __init__(self, alphabet_size):
- self.V = alphabet_size
- self.root = <_Trie_Node**> malloc(self.V * sizeof(_Trie_Node*))
- memset(self.root, 0, self.V * sizeof(_Trie_Node*))
-
-
- def __dealloc__(self):
- cdef int i
- for i from 0 <= i < self.V:
- if self.root[i] != NULL:
- free_trie_node(self.root[i])
- free(self.root)
-
-
- def insert(self, pattern):
- cdef int* p
- cdef int i, l
- l = len(pattern)
- p = <int*> malloc(l*sizeof(int))
- for i from 0 <= i < l:
- p[i] = pattern[i]
- self._insert(p,l)
- free(p)
-
-
- cdef _Trie_Node* _insert(self, int* pattern, int pattern_len):
- cdef int i
- cdef _Trie_Node* node
- if self.root[pattern[0]] == NULL:
- self.root[pattern[0]] = new_trie_node()
- node = self.root[pattern[0]]
- for i from 1 <= i < pattern_len:
- node = trie_insert(node, pattern[i])
- return node
-
- def contains(self, pattern):
- cdef int* p
- cdef int i, l
- cdef _Trie_Node* node
- l = len(pattern)
- p = <int*> malloc(l*sizeof(int))
- for i from 0 <= i < l:
- p[i] = pattern[i]
- node = self._contains(p,l)
- free(p)
-		return node != NULL
-
- cdef _Trie_Node* _contains(self, int* pattern, int pattern_len):
- cdef int i
- cdef _Trie_Node* node
- node = self.root[pattern[0]]
- i = 1
- while node != NULL and i < pattern_len:
- node = trie_find(node, pattern[i])
- i = i+1
- return node
-
- def toMap(self, flag):
- cdef int i, include_zeros
-
- if flag:
- include_zeros=1
- else:
- include_zeros=0
- result = {}
- for i from 0 <= i < self.V:
- if self.root[i] != NULL:
- trie_node_to_map(self.root[i], result, (i,), include_zeros)
- return result
-
-
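-# Precomputation: for the precompute_rank most frequent phrases of a
-# suffix-array-indexed corpus, computes (a) an inverted index of their
-# occurrences and (b) occurrence lists for their one- and two-gap
-# collocations, under the same window/gap limits used at extraction time.
-# A usage sketch (file names and variable values are illustrative):
-#
-#   pre = Precomputation("frequent_phrases.txt", sa=suffix_array,
-#                        precompute_rank=1000, precompute_secondary_rank=20)
-#   pre.write_binary("precomputed.bin")
-#   pre2 = Precomputation("precomputed.bin", from_binary=True)
-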
-cdef class Precomputation:
-
-# Defined in .pxd file, here for reference:
-# cdef int precompute_rank
-# cdef int precompute_secondary_rank
-# cdef int max_length
-# cdef int max_nonterminals
-# cdef int train_max_initial_size
-# cdef int train_min_gap_size
-# cdef precomputed_index
-# cdef precomputed_collocations
-
- def __init__(self, filename, sa=None, precompute_rank=1000, precompute_secondary_rank=20, max_length=5,
- max_nonterminals=2, train_max_initial_size=10, train_min_gap_size=2, from_binary=False):
- self.precompute_rank = precompute_rank
- self.precompute_secondary_rank = precompute_secondary_rank
- self.max_length = max_length
- self.max_nonterminals = max_nonterminals
- self.train_max_initial_size = train_max_initial_size
- self.train_min_gap_size = train_min_gap_size
- if from_binary:
- self.read_binary(filename)
- else:
- self.precompute(filename, sa)
-
-
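-	# Binary layout: the six int parameters in the order read below, then the
-	# precomputed index map, then the collocations map (see write_map for the
-	# map encoding).  The format is architecture-dependent: raw C ints in
-	# native byte order.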
- def read_binary(self, filename):
- cdef FILE* f
- cdef bytes bfilename = filename
- cdef char* cfilename = bfilename
-		f = fopen(cfilename, "rb")  # binary mode: the file holds raw ints and CIntList dumps
- fread(&(self.precompute_rank), sizeof(int), 1, f)
- fread(&(self.precompute_secondary_rank), sizeof(int), 1, f)
- fread(&(self.max_length), sizeof(int), 1, f)
- fread(&(self.max_nonterminals), sizeof(int), 1, f)
- fread(&(self.train_max_initial_size), sizeof(int), 1, f)
- fread(&(self.train_min_gap_size), sizeof(int), 1, f)
- self.precomputed_index = self.read_map(f)
- self.precomputed_collocations = self.read_map(f)
- fclose(f)
-
-
- def write_binary(self, filename):
- cdef FILE* f
- cdef bytes bfilename = filename
- cdef char* cfilename = bfilename
-
-		f = fopen(cfilename, "wb")  # binary mode, matching read_binary
- fwrite(&(self.precompute_rank), sizeof(int), 1, f)
- fwrite(&(self.precompute_secondary_rank), sizeof(int), 1, f)
- fwrite(&(self.max_length), sizeof(int), 1, f)
- fwrite(&(self.max_nonterminals), sizeof(int), 1, f)
- fwrite(&(self.train_max_initial_size), sizeof(int), 1, f)
- fwrite(&(self.train_min_gap_size), sizeof(int), 1, f)
- self.write_map(self.precomputed_index, f)
- self.write_map(self.precomputed_collocations, f)
- fclose(f)
-
-
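-	# Map encoding: the number of entries, then for each entry the key
-	# length, the key's word ids, and its CIntList payload (written and read
-	# via the list's write_handle/read_handle).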
- cdef write_map(self, m, FILE* f):
- cdef int i, N
- cdef cintlist.CIntList arr
-
- N = len(m)
- fwrite(&(N), sizeof(int), 1, f)
- for pattern, val in m.iteritems():
- N = len(pattern)
- fwrite(&(N), sizeof(int), 1, f)
- for word_id in pattern:
- i = word_id
- fwrite(&(i), sizeof(int), 1, f)
- arr = val
- arr.write_handle(f)
-
-
- cdef read_map(self, FILE* f):
- cdef int i, j, k, word_id, N
- cdef cintlist.CIntList arr
-
- m = {}
- fread(&(N), sizeof(int), 1, f)
- for j from 0 <= j < N:
- fread(&(i), sizeof(int), 1, f)
- key = ()
- for k from 0 <= k < i:
- fread(&(word_id), sizeof(int), 1, f)
- key = key + (word_id,)
- arr = cintlist.CIntList()
- arr.read_handle(f)
- m[key] = arr
- return m
-
-
- def precompute(self, filename, sa):
- cdef int i, l, N, max_pattern_len, i1, l1, i2, l2, i3, l3, ptr1, ptr2, ptr3, is_super, sent_count, max_rank
- cdef csuf.SuffixArray sarray
- cdef cdat.DataArray darray
- cdef cintlist.CIntList data, queue, cost_by_rank, count_by_rank
- cdef TrieMap frequent_patterns, super_frequent_patterns, collocations
- cdef _Trie_Node* node
-
- sarray = sa
- darray = sarray.darray
- data = darray.data
-
- frequent_patterns = TrieMap(len(darray.id2word))
- super_frequent_patterns = TrieMap(len(darray.id2word))
- collocations = TrieMap(len(darray.id2word))
-
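-		# I_set holds the precompute_rank most frequent phrases; J_set the
-		# precompute_secondary_rank most frequent ("super-frequent") subset.
-		# J2_set will hold super-frequent pairs joined by a gap, IJ_set every
-		# gappy pattern we precompute, and pattern_rank each phrase's rank.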
- I_set = set()
- J_set = set()
- J2_set = set()
- IJ_set = set()
- pattern_rank = {}
-
- log.writeln("Precomputing frequent intersections\n", 1)
- start_time = monitor.cpu()
-
- max_pattern_len = 0
- if filename is not None:
- precompute_file = open(filename)
- for rank, line in enumerate(precompute_file):
- if rank >= self.precompute_rank:
- break
-				phrase_words = line.split()[2:]  # skip the first two fields; the rest of the line is the phrase
- phrase = ()
- for word in phrase_words:
- phrase = phrase + (darray.word2id[word],)
- max_pattern_len = max(max_pattern_len, len(phrase))
- frequent_patterns.insert(phrase)
- I_set.add(phrase)
- pattern_rank[phrase] = rank
- if rank < self.precompute_secondary_rank:
- super_frequent_patterns.insert(phrase)
- J_set.add(phrase)
- precompute_file.close()
-
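-		# The queue is a flat sequence of (start, length) pairs, one pair per
-		# occurrence of each frequent pattern, in text order; a lone -1 entry
-		# marks each sentence boundary.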
- queue = cintlist.CIntList(increment=1000)
-
- log.writeln(" Computing inverted indexes...", 1)
- N = len(data)
- for i from 0 <= i < N:
- sa_word_id = data.arr[i]
-			if sa_word_id == 1:  # id 1 is the sentence-boundary marker in the data array
- queue._append(-1)
- else:
- for l from 1 <= l <= max_pattern_len:
- node = frequent_patterns._contains(data.arr+i, l)
- if node == NULL:
- break
- queue._append(i)
- queue._append(l)
- trie_node_data_append(node, i)
-
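-		# Scan the queue for pairs of pattern occurrences that fit inside one
-		# initial phrase: within train_max_initial_size of each other,
-		# separated by at least train_min_gap_size, and short enough once the
-		# gap symbol is counted.  A third occurrence is tried when the middle
-		# pattern is super-frequent, and kept if the first or third is too.
-		# Each collocation is keyed in the trie as the patterns' word ids
-		# joined by -1 (the gap), with matching start positions as payload.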
- log.writeln(" Computing collocations...", 1)
- N = len(queue)
- ptr1 = 0
- sent_count = 0
- while ptr1 < N: # main loop
- i1 = queue.arr[ptr1]
- if i1 > -1:
- l1 = queue.arr[ptr1+1]
- ptr2 = ptr1 + 2
- while ptr2 < N:
- i2 = queue.arr[ptr2]
- if i2 == -1 or i2 - i1 >= self.train_max_initial_size:
- break
- l2 = queue.arr[ptr2+1]
- if i2 - i1 - l1 >= self.train_min_gap_size and i2 + l2 - i1 <= self.train_max_initial_size and l1+l2+1 <= self.max_length:
- node = collocations._insert(data.arr+i1, l1)
- node = trie_insert(node, -1)
- for i from i2 <= i < i2+l2:
- node = trie_insert(node, data.arr[i])
- trie_node_data_append(node, i1)
- trie_node_data_append(node, i2)
- if super_frequent_patterns._contains(data.arr+i2, l2) != NULL:
- if super_frequent_patterns._contains(data.arr+i1, l1) != NULL:
- is_super = 1
- else:
- is_super = 0
- ptr3 = ptr2 + 2
- while ptr3 < N:
- i3 = queue.arr[ptr3]
- if i3 == -1 or i3 - i1 >= self.train_max_initial_size:
- break
- l3 = queue.arr[ptr3+1]
- if i3 - i2 - l2 >= self.train_min_gap_size and i3 + l3 - i1 <= self.train_max_initial_size and l1+l2+l3+2 <= self.max_length:
- if is_super or super_frequent_patterns._contains(data.arr+i3, l3) != NULL:
- node = collocations._insert(data.arr+i1, l1)
- node = trie_insert(node, -1)
- for i from i2 <= i < i2+l2:
- node = trie_insert(node, data.arr[i])
- node = trie_insert(node, -1)
- for i from i3 <= i < i3+l3:
- node = trie_insert(node, data.arr[i])
- trie_node_data_append(node, i1)
- trie_node_data_append(node, i2)
- trie_node_data_append(node, i3)
- ptr3 = ptr3 + 2
- ptr2 = ptr2 + 2
- ptr1 = ptr1 + 2
- else:
- sent_count = sent_count + 1
- if sent_count % 10000 == 0:
- log.writeln(" %d sentences" % sent_count)
- ptr1 = ptr1 + 1
-
- self.precomputed_collocations = collocations.toMap(False)
- self.precomputed_index = frequent_patterns.toMap(True)
-
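-		# Enumerate every pattern shape the extractor may later ask for:
-		# J2_set pairs super-frequent phrases across one gap, and IJ_set adds
-		# all frequent pairs with one gap plus all frequent/J2 combinations,
-		# i.e. patterns with up to two gaps.  x tracks an upper bound on the
-		# number of candidates for the summary logged below.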
- x = 0
- for pattern1 in J_set:
- for pattern2 in J_set:
- if len(pattern1) + len(pattern2) + 1 < self.max_length:
- combined_pattern = pattern1 + (-1,) + pattern2
- J2_set.add(combined_pattern)
-
- for pattern1 in I_set:
- for pattern2 in I_set:
- x = x+1
- if len(pattern1) + len(pattern2) + 1 <= self.max_length:
- combined_pattern = pattern1 + (-1,) + pattern2
- IJ_set.add(combined_pattern)
-
- for pattern1 in I_set:
- for pattern2 in J2_set:
- x = x+2
-				if len(pattern1) + len(pattern2) + 1 <= self.max_length:
- combined_pattern = pattern1 + (-1,) + pattern2
- IJ_set.add(combined_pattern)
- combined_pattern = pattern2 + (-1,) + pattern1
- IJ_set.add(combined_pattern)
-
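-		# Charge each collocation's storage cost (4 bytes per stored
-		# position) and its occurrence count to the worst (largest) rank
-		# among its contiguous chunks; the cumulative figures logged below
-		# show the marginal cost of raising precompute_rank.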
- N = len(pattern_rank)
- cost_by_rank = cintlist.CIntList(initial_len=N)
- count_by_rank = cintlist.CIntList(initial_len=N)
- for pattern, arr in self.precomputed_collocations.iteritems():
- if pattern not in IJ_set:
- s = ""
- for word_id in pattern:
- if word_id == -1:
- s = s + "X "
- else:
- s = s + darray.id2word[word_id] + " "
- log.writeln("ERROR: unexpected pattern %s in set of precomputed collocations" % (s), 1)
- else:
- chunk = ()
- max_rank = 0
- arity = 0
- for word_id in pattern:
- if word_id == -1:
- max_rank = max(max_rank, pattern_rank[chunk])
- arity = arity + 1
- chunk = ()
- else:
- chunk = chunk + (word_id,)
- max_rank = max(max_rank, pattern_rank[chunk])
- cost_by_rank.arr[max_rank] = cost_by_rank.arr[max_rank] + (4*len(arr))
- count_by_rank.arr[max_rank] = count_by_rank.arr[max_rank] + (len(arr)/(arity+1))
-
- cumul_cost = 0
- cumul_count = 0
- for i from 0 <= i < N:
- cumul_cost = cumul_cost + cost_by_rank.arr[i]
- cumul_count = cumul_count + count_by_rank.arr[i]
- log.writeln("RANK %d\tCOUNT, COST: %d %d\tCUMUL: %d, %d" % (i, count_by_rank.arr[i], cost_by_rank.arr[i], cumul_count, cumul_cost))
-
- num_found_patterns = len(self.precomputed_collocations)
- for pattern in IJ_set:
- if pattern not in self.precomputed_collocations:
- self.precomputed_collocations[pattern] = cintlist.CIntList()
-
- stop_time = monitor.cpu()
- log.writeln("Precomputed collocations for %d patterns out of %d possible (upper bound %d)" % (num_found_patterns,len(self.precomputed_collocations),x))
-	log.writeln("Precomputed inverted index for %d patterns" % len(self.precomputed_index))
- log.writeln("Precomputation took %f seconds" % (stop_time - start_time))
- log.writeln("Detailed statistics:")
-
-
-
-
-
-
-