diff options
author | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-07-27 01:16:03 -0400 |
---|---|---|
committer | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-07-27 01:16:03 -0400 |
commit | b2a8bccb2bd713d9ec081cf3dad0162c2cb492d8 (patch) | |
tree | c661044fd2a3943cf2ad12109b916fd7b56a519e /python/src/sa/lcp.pxi | |
parent | 148b1168c2b07abf0c7757a31141377c28ec3d91 (diff) |
[python] Fork of the suffix-array extractor with surface improvements
Available as the cdec.sa module, with command-line helpers:
python -m cdec.sa.compile -f ... -e ... -a ... -o sa-out/ -c extract.ini
python -m cdec.sa.extract -c extract.ini -g grammars-out/ < input.txt > input.sgml
+ renamed cdec.scfg -> cdec.sa
+ Python README
Diffstat (limited to 'python/src/sa/lcp.pxi')
-rw-r--r-- | python/src/sa/lcp.pxi | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/python/src/sa/lcp.pxi b/python/src/sa/lcp.pxi new file mode 100644 index 00000000..8e86e611 --- /dev/null +++ b/python/src/sa/lcp.pxi @@ -0,0 +1,89 @@ +"""Compute LCP array for a suffix array using the Kasai et al. algorithm +Can also be used to compute statistics such +as k most frequent n-grams""" + +cdef class LCP: + cdef SuffixArray sa + cdef IntList lcp + + def __cinit__(self, SuffixArray sa): + cdef int i, k, j, h, n + cdef IntList rank + + logger.info("Constructing LCP array") + self.sa = sa + n = self.sa.sa.len + self.lcp = IntList(initial_len=n) + + rank = IntList(initial_len=n) + for i from 0 <= i < n: + rank.arr[sa.sa.arr[i]] = i + + h = 0 + for i from 0 <= i < n: + k = rank.arr[i] + if k == 0: + self.lcp.arr[k] = -1 + else: + j = sa.sa.arr[k-1] + while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]: + h = h+1 + self.lcp.arr[k] = h + if h > 0: + h = h-1 + logger.info("LCP array completed") + + def compute_stats(self, int max_n): + """Note: the output of this function is not exact. In + particular, the frequency associated with each word is + not guaranteed to be correct. 
This is due to a bit of + laxness in the design; the function is intended only to + obtain a list of the most frequent words; for this + purpose it is perfectly fine""" + cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid + cdef IntList run_start + cdef IntList ngram_start + cdef VEB veb + + N = self.sa.sa.len + + ngram_starts = [] + for n from 0 <= n < max_n: + ngram_starts.append(IntList(initial_len=N)) + + run_start = IntList(initial_len=max_n) + veb = VEB(N) + + for i from 0 <= i < N: + h = self.lcp.arr[i] + if h < 0: + h = 0 + for n from h <= n < max_n: + rs = run_start.arr[n] + run_start.arr[n] = i + freq = i - rs + if freq > 1000: # arbitrary, but see note below + veb._insert(freq) + ngram_start = ngram_starts[n] + while ngram_start.arr[freq] > 0: + freq = freq + 1 # cheating a bit, should be ok for sparse histogram + ngram_start.arr[freq] = rs + i = veb.veb.min_val + while i != -1: + ii = veb._findsucc(i) + for n from 0 <= n < max_n: + ngram_start = ngram_starts[n] + iii = i + rs = ngram_start.arr[iii] + while (ii==-1 or iii < ii) and rs != 0: + j = self.sa.sa.arr[rs] + valid = 1 + for k from 0 <= k < n+1: + if self.sa.darray.data.arr[j+k] < 2: + valid = 0 + if valid: + ngram = tuple([self.sa.darray.data.arr[j+k] for k in range(n+1)]) + yield i, n+1, ngram + iii = iii + 1 + rs = ngram_start.arr[iii] + i = ii |