diff options
author | Victor Chahuneau <vchahune@cs.cmu.edu> | 2013-08-26 20:12:32 -0400 |
---|---|---|
committer | Victor Chahuneau <vchahune@cs.cmu.edu> | 2013-08-26 20:12:32 -0400 |
commit | 03799a2d330c6dbbe12154d4bcea236210b4f6ed (patch) | |
tree | 7adb0bc8dd2987fa32ee1299d8821dd8b7b06706 /python/cdec/sa/lcp.pxi | |
parent | 8b491e57f8a011f4f8496e44bed7eb7a4360bc93 (diff) |
Improve the package structure of pycdec
This change should not break anything, but now you can run:
python setup.py build_ext --inplace
and use the cleaner:
PYTHONPATH=/path/to/cdec/python python -m ...
Diffstat (limited to 'python/cdec/sa/lcp.pxi')
-rw-r--r-- | python/cdec/sa/lcp.pxi | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/python/cdec/sa/lcp.pxi b/python/cdec/sa/lcp.pxi new file mode 100644 index 00000000..8e86e611 --- /dev/null +++ b/python/cdec/sa/lcp.pxi @@ -0,0 +1,89 @@ +"""Compute LCP array for a suffix array using the Kasai et al. algorithm +Can also be used to compute statistics such +as k most frequent n-grams""" + +cdef class LCP: + cdef SuffixArray sa + cdef IntList lcp + + def __cinit__(self, SuffixArray sa): + cdef int i, k, j, h, n + cdef IntList rank + + logger.info("Constructing LCP array") + self.sa = sa + n = self.sa.sa.len + self.lcp = IntList(initial_len=n) + + rank = IntList(initial_len=n) + for i from 0 <= i < n: + rank.arr[sa.sa.arr[i]] = i + + h = 0 + for i from 0 <= i < n: + k = rank.arr[i] + if k == 0: + self.lcp.arr[k] = -1 + else: + j = sa.sa.arr[k-1] + while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]: + h = h+1 + self.lcp.arr[k] = h + if h > 0: + h = h-1 + logger.info("LCP array completed") + + def compute_stats(self, int max_n): + """Note: the output of this function is not exact. In + particular, the frequency associated with each word is + not guaranteed to be correct. This is due to a bit of + laxness in the design; the function is intended only to + obtain a list of the most frequent words; for this + purpose it is perfectly fine""" + cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid + cdef IntList run_start + cdef IntList ngram_start + cdef VEB veb + + N = self.sa.sa.len + + ngram_starts = [] + for n from 0 <= n < max_n: + ngram_starts.append(IntList(initial_len=N)) + + run_start = IntList(initial_len=max_n) + veb = VEB(N) + + for i from 0 <= i < N: + h = self.lcp.arr[i] + if h < 0: + h = 0 + for n from h <= n < max_n: + rs = run_start.arr[n] + run_start.arr[n] = i + freq = i - rs + if freq > 1000: # arbitrary, but see note below + veb._insert(freq) + ngram_start = ngram_starts[n] + while ngram_start.arr[freq] > 0: + freq = freq + 1 # cheating a bit, should be ok for sparse histogram + ngram_start.arr[freq] = rs + i = veb.veb.min_val + while i != -1: + ii = veb._findsucc(i) + for n from 0 <= n < max_n: + ngram_start = ngram_starts[n] + iii = i + rs = ngram_start.arr[iii] + while (ii==-1 or iii < ii) and rs != 0: + j = self.sa.sa.arr[rs] + valid = 1 + for k from 0 <= k < n+1: + if self.sa.darray.data.arr[j+k] < 2: + valid = 0 + if valid: + ngram = tuple([self.sa.darray.data.arr[j+k] for k in range(n+1)]) + yield i, n+1, ngram + iii = iii + 1 + rs = ngram_start.arr[iii] + i = ii |