- author: Patrick Simianer <p@simianer.de>, 2012-03-13 09:24:47 +0100
- committer: Patrick Simianer <p@simianer.de>, 2012-03-13 09:24:47 +0100
- commit: ef6085e558e26c8819f1735425761103021b6470 (patch)
- tree: 5cf70e4c48c64d838e1326b5a505c8c4061bff4a /sa-extract/lcp.pyx
- parent: 10a232656a0c882b3b955d2bcfac138ce11e8a2e (diff)
- parent: dfbc278c1057555fda9312291c8024049e00b7d8 (diff)
merge with upstream
Diffstat (limited to 'sa-extract/lcp.pyx')
-rw-r--r--  sa-extract/lcp.pyx  113
1 file changed, 113 insertions, 0 deletions
diff --git a/sa-extract/lcp.pyx b/sa-extract/lcp.pyx
new file mode 100644
index 00000000..a992d3ee
--- /dev/null
+++ b/sa-extract/lcp.pyx
@@ -0,0 +1,113 @@
+#!/usr/bin/env python2.4
+
+'''Compute LCP array for a suffix array using the Kasai et al. algorithm'''
+'''Can also be used to compute statistics such
+as k most frequent n-grams'''
+
+import sys
+
+cimport cintlist
+cimport csuf
+cimport cdat
+cimport cveb
+
+cdef class LCP:
+
+    cdef csuf.SuffixArray sa
+    cdef cintlist.CIntList lcp
+
+    def __init__(self, sa):
+        self._construct(sa)
+
+    cdef _construct(self, csuf.SuffixArray sa):
+        cdef int i, k, j, h, n
+        cdef cintlist.CIntList rank
+
+        sys.stderr.write("Constructing LCP array\n")
+        self.sa = sa
+        n = self.sa.sa.len
+        self.lcp = cintlist.CIntList(initial_len=n)
+
+        rank = cintlist.CIntList(initial_len=n)
+        for i from 0 <= i < n:
+            rank.arr[sa.sa.arr[i]] = i
+
+        h = 0
+        for i from 0 <= i < n:
+            k = rank.arr[i]
+            if k == 0:
+                self.lcp.arr[k] = -1
+            else:
+                j = sa.sa.arr[k-1]
+                while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]:
+                    h = h+1
+                self.lcp.arr[k] = h
+            if h > 0:
+                h = h-1
+        sys.stderr.write("LCP array completed\n")
+
+
+    def compute_stats(self, max_n):
+        self._compute_stats(max_n)
+
+    cdef _compute_stats(self, int max_n):
+        '''Note: the output of this function is not exact. In
+        particular, the frequency associated with each word is
+        not guaranteed to be correct. This is due to a bit of
+        laxness in the design; the function is intended only to
+        obtain a list of the most frequent words; for this
+        purpose it is perfectly fine'''
+        cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid
+        cdef cintlist.CIntList run_start
+        cdef cintlist.CIntList ngram_start
+        cdef cveb.VEB veb
+
+        N = self.sa.sa.len
+
+        ngram_starts = []
+        for n from 0 <= n < max_n:
+            ngram_starts.append(cintlist.CIntList(initial_len=N))
+
+        run_start = cintlist.CIntList(initial_len=max_n)
+        veb = cveb.VEB(N)
+
+        for i from 0 <= i < N:
+            h = self.lcp.arr[i]
+            if h < 0:
+                h = 0
+            for n from h <= n < max_n:
+                rs = run_start.arr[n]
+                run_start.arr[n] = i
+                freq = i - rs
+                if freq > 1000: # arbitrary, but see note below
+                    veb._insert(freq)
+                    ngram_start = ngram_starts[n]
+                    while ngram_start.arr[freq] > 0:
+                        freq = freq + 1 # cheating a bit, should be ok for sparse histogram
+                    ngram_start.arr[freq] = rs
+
+        i = veb.veb.min_val
+        while i != -1:
+            ii = veb._findsucc(i)
+            for n from 0 <= n < max_n:
+                ngram_start = ngram_starts[n]
+                iii = i
+                rs = ngram_start.arr[iii]
+                while (ii == -1 or iii < ii) and rs != 0:
+                    j = self.sa.sa.arr[rs]
+                    valid = 1
+                    for k from 0 <= k < n+1:
+                        if self.sa.darray.data.arr[j+k] < 2:
+                            valid = 0
+                    if valid:
+                        ngram = ""
+                        for k from 0 <= k < n+1:
+                            ngram = ngram + self.sa.darray.id2word[self.sa.darray.data.arr[j+k]] + " "
+                        print i, n+1, ngram
+                    iii = iii + 1
+                    rs = ngram_start.arr[iii]
+            i = ii
+
+
+
+
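For reference, below is a minimal pure-Python sketch of the standard Kasai et al. construction that `_construct` above implements. The function name `build_lcp` and the plain-list types are illustrative assumptions, not part of the repository; the committed code works on the packed `CIntList`/`SuffixArray` types instead.

```python
# Hypothetical sketch of Kasai's linear-time LCP construction,
# assuming `sa` is a precomputed suffix array over `text`.
def build_lcp(text, sa):
    """lcp[k] is the length of the longest common prefix of the
    suffixes starting at sa[k-1] and sa[k]; lcp[0] is -1, as in lcp.pyx."""
    n = len(sa)
    rank = [0] * n
    for k, start in enumerate(sa):
        rank[start] = k               # position of suffix `start` in the suffix array
    lcp = [0] * n
    h = 0
    for i in range(n):                # visit suffixes in text order, reusing h
        k = rank[i]
        if k == 0:
            lcp[k] = -1
            h = 0                     # smallest suffix has no predecessor
        else:
            j = sa[k - 1]             # suffix directly before suffix i in sorted order
            while i + h < n and j + h < n and text[i + h] == text[j + h]:
                h += 1
            lcp[k] = h
            if h > 0:
                h -= 1                # dropping the first char shortens the LCP by at most 1
    return lcp

# e.g. the suffix array of "banana" is [5, 3, 1, 0, 4, 2]:
# build_lcp("banana", [5, 3, 1, 0, 4, 2]) == [-1, 1, 3, 0, 0, 2]
```

Reusing `h` across iterations is what makes the construction linear: because removing the first character of a suffix can shorten its LCP with its suffix-array predecessor by at most one, `h` decreases at most n times in total, so the character comparisons amortize to O(n).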