summaryrefslogtreecommitdiff
path: root/python/src/sa/lcp.pxi
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-08-01 17:32:37 +0200
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-08-01 17:32:37 +0200
commit3f8e33cfe481a09c121a410e66a6074b5d05683e (patch)
treea41ecaf0bbb69fa91a581623abe89d41219c04f8 /python/src/sa/lcp.pxi
parentc139ce495861bb341e1b86a85ad4559f9ad53c14 (diff)
parent9fe0219562e5db25171cce8776381600ff9a5649 (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'python/src/sa/lcp.pxi')
-rw-r--r--python/src/sa/lcp.pxi89
1 files changed, 89 insertions, 0 deletions
diff --git a/python/src/sa/lcp.pxi b/python/src/sa/lcp.pxi
new file mode 100644
index 00000000..8e86e611
--- /dev/null
+++ b/python/src/sa/lcp.pxi
@@ -0,0 +1,89 @@
+"""Compute LCP array for a suffix array using the Kasai et al. algorithm
+Can also be used to compute statistics such
+as k most frequent n-grams"""
+
+cdef class LCP:
+ cdef SuffixArray sa
+ cdef IntList lcp
+
+ def __cinit__(self, SuffixArray sa):
+ cdef int i, k, j, h, n
+ cdef IntList rank
+
+ logger.info("Constructing LCP array")
+ self.sa = sa
+ n = self.sa.sa.len
+ self.lcp = IntList(initial_len=n)
+
+ rank = IntList(initial_len=n)
+ for i from 0 <= i < n:
+ rank.arr[sa.sa.arr[i]] = i
+
+ h = 0
+ for i from 0 <= i < n:
+ k = rank.arr[i]
+ if k == 0:
+ self.lcp.arr[k] = -1
+ else:
+ j = sa.sa.arr[k-1]
+ while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]:
+ h = h+1
+ self.lcp.arr[k] = h
+ if h > 0:
+ h = h-1
+ logger.info("LCP array completed")
+
+ def compute_stats(self, int max_n):
+ """Note: the output of this function is not exact. In
+ particular, the frequency associated with each word is
+ not guaranteed to be correct. This is due to a bit of
+ laxness in the design; the function is intended only to
+ obtain a list of the most frequent words; for this
+ purpose it is perfectly fine"""
+ cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid
+ cdef IntList run_start
+ cdef IntList ngram_start
+ cdef VEB veb
+
+ N = self.sa.sa.len
+
+ ngram_starts = []
+ for n from 0 <= n < max_n:
+ ngram_starts.append(IntList(initial_len=N))
+
+ run_start = IntList(initial_len=max_n)
+ veb = VEB(N)
+
+ for i from 0 <= i < N:
+ h = self.lcp.arr[i]
+ if h < 0:
+ h = 0
+ for n from h <= n < max_n:
+ rs = run_start.arr[n]
+ run_start.arr[n] = i
+ freq = i - rs
+ if freq > 1000: # arbitrary, but see note below
+ veb._insert(freq)
+ ngram_start = ngram_starts[n]
+ while ngram_start.arr[freq] > 0:
+ freq = freq + 1 # cheating a bit, should be ok for sparse histogram
+ ngram_start.arr[freq] = rs
+ i = veb.veb.min_val
+ while i != -1:
+ ii = veb._findsucc(i)
+ for n from 0 <= n < max_n:
+ ngram_start = ngram_starts[n]
+ iii = i
+ rs = ngram_start.arr[iii]
+ while (ii==-1 or iii < ii) and rs != 0:
+ j = self.sa.sa.arr[rs]
+ valid = 1
+ for k from 0 <= k < n+1:
+ if self.sa.darray.data.arr[j+k] < 2:
+ valid = 0
+ if valid:
+ ngram = tuple([self.sa.darray.data.arr[j+k] for k in range(n+1)])
+ yield i, n+1, ngram
+ iii = iii + 1
+ rs = ngram_start.arr[iii]
+ i = ii