1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
#!/usr/bin/env python2.4
'''Compute LCP array for a suffix array using the Kasai et al. algorithm'''
'''Can also be used to compute statistics such
as k most frequent n-grams'''
import sys
cimport cintlist
cimport csuf
cimport cdat
cimport cveb
cdef class LCP:
cdef csuf.SuffixArray sa
cdef cintlist.CIntList lcp
def __init__(self, sa):
self._construct(sa)
cdef _construct(self, csuf.SuffixArray sa):
cdef int i, k, j, h, n
cdef cintlist.CIntList rank
sys.stderr.write("Constructing LCP array\n")
self.sa = sa
n = self.sa.sa.len
self.lcp = cintlist.CIntList(initial_len=n)
rank = cintlist.CIntList(initial_len=n)
for i from 0 <= i < n:
rank.arr[sa.sa.arr[i]] = i
h = 0
for i from 0 <= i < n:
k = rank.arr[i]
if k == 0:
self.lcp.arr[k] = -1
else:
j = sa.sa.arr[k-1]
while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]:
h = h+1
self.lcp.arr[k] = h
if h > 0:
h = h-1
sys.stderr.write("LCP array completed\n")
def compute_stats(self, max_n):
self._compute_stats(max_n)
cdef _compute_stats(self, int max_n):
'''Note: the output of this function is not exact. In
particular, the frequency associated with each word is
not guaranteed to be correct. This is due to a bit of
laxness in the design; the function is intended only to
obtain a list of the most frequent words; for this
purpose it is perfectly fine'''
cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid
cdef cintlist.CIntList run_start
cdef cintlist.CIntList ngram_start
cdef cveb.VEB veb
N = self.sa.sa.len
ngram_starts = []
for n from 0 <= n < max_n:
ngram_starts.append(cintlist.CIntList(initial_len=N))
run_start = cintlist.CIntList(initial_len=max_n)
veb = cveb.VEB(N)
for i from 0 <= i < N:
h = self.lcp.arr[i]
if h < 0:
h = 0
for n from h <= n < max_n:
rs = run_start.arr[n]
run_start.arr[n] = i
freq = i - rs
if freq > 1000: # arbitrary, but see note below
veb._insert(freq)
ngram_start = ngram_starts[n]
while ngram_start.arr[freq] > 0:
freq = freq + 1 # cheating a bit, should be ok for sparse histogram
ngram_start.arr[freq] = rs
i = veb.veb.min_val
while i != -1:
ii = veb._findsucc(i)
for n from 0 <= n < max_n:
ngram_start = ngram_starts[n]
iii = i
rs = ngram_start.arr[iii]
while (ii==-1 or iii < ii) and rs != 0:
j = self.sa.sa.arr[rs]
valid = 1
for k from 0 <= k < n+1:
if self.sa.darray.data.arr[j+k] < 2:
valid = 0
if valid:
ngram = ""
for k from 0 <= k < n+1:
ngram= ngram+ self.sa.darray.id2word[self.sa.darray.data.arr[j+k]] + " "
print i, n+1, ngram
iii = iii + 1
rs = ngram_start.arr[iii]
i = ii
|