diff options
| author | Kenneth Heafield <github@kheafield.com> | 2012-08-03 07:46:54 -0400 | 
|---|---|---|
| committer | Kenneth Heafield <github@kheafield.com> | 2012-08-03 07:46:54 -0400 | 
| commit | be1ab0a8937f9c5668ea5e6c31b798e87672e55e (patch) | |
| tree | a13aad60ab6cced213401bce6a38ac885ba171ba /python/src/sa/lcp.pxi | |
| parent | e5d6f4ae41009c26978ecd62668501af9762b0bc (diff) | |
| parent | 9fe0219562e5db25171cce8776381600ff9a5649 (diff) | |
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'python/src/sa/lcp.pxi')
| -rw-r--r-- | python/src/sa/lcp.pxi | 89 | 
1 files changed, 89 insertions, 0 deletions
| diff --git a/python/src/sa/lcp.pxi b/python/src/sa/lcp.pxi new file mode 100644 index 00000000..8e86e611 --- /dev/null +++ b/python/src/sa/lcp.pxi @@ -0,0 +1,89 @@ +"""Compute LCP array for a suffix array using the Kasai et al. algorithm +Can also be used to compute statistics such +as k most frequent n-grams""" + +cdef class LCP: +    cdef SuffixArray sa +    cdef IntList lcp + +    def __cinit__(self, SuffixArray sa): +        cdef int i, k, j, h, n +        cdef IntList rank + +        logger.info("Constructing LCP array") +        self.sa = sa +        n = self.sa.sa.len +        self.lcp = IntList(initial_len=n) + +        rank = IntList(initial_len=n) +        for i from 0 <= i < n: +            rank.arr[sa.sa.arr[i]] = i + +        h = 0 +        for i from 0 <= i < n: +            k = rank.arr[i] +            if k == 0: +                self.lcp.arr[k] = -1 +            else: +                j = sa.sa.arr[k-1] +                while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]: +                    h = h+1 +                self.lcp.arr[k] = h +            if h > 0: +                h = h-1 +        logger.info("LCP array completed") + +    def compute_stats(self, int max_n): +        """Note: the output of this function is not exact.  In +        particular, the frequency associated with each word is  +        not guaranteed to be correct.  This is due to a bit of +        laxness in the design; the function is intended only to +        obtain a list of the most frequent words; for this  +        purpose it is perfectly fine""" +        cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid +        cdef IntList run_start +        cdef IntList ngram_start +        cdef VEB veb +         +        N = self.sa.sa.len + +        ngram_starts = [] +        for n from 0 <= n < max_n: +            ngram_starts.append(IntList(initial_len=N)) + +        run_start = IntList(initial_len=max_n) +        veb = VEB(N) + +        for i from 0 <= i < N: +            h = self.lcp.arr[i] +            if h < 0: +                h = 0 +            for n from h <= n < max_n: +                rs = run_start.arr[n] +                run_start.arr[n] = i +                freq = i - rs +                if freq > 1000: # arbitrary, but see note below +                    veb._insert(freq) +                    ngram_start = ngram_starts[n] +                    while ngram_start.arr[freq] > 0: +                        freq = freq + 1 # cheating a bit, should be ok for sparse histogram +                    ngram_start.arr[freq] = rs +        i = veb.veb.min_val +        while i != -1: +            ii = veb._findsucc(i) +            for n from 0 <= n < max_n: +                ngram_start = ngram_starts[n] +                iii = i +                rs = ngram_start.arr[iii] +                while (ii==-1 or iii < ii) and rs != 0: +                    j = self.sa.sa.arr[rs] +                    valid = 1 +                    for k from 0 <= k < n+1: +                        if self.sa.darray.data.arr[j+k] < 2: +                            valid = 0 +                    if valid: +                        ngram = tuple([self.sa.darray.data.arr[j+k] for k in range(n+1)]) +                        yield i, n+1, ngram +                    iii = iii + 1 +                    rs = ngram_start.arr[iii] +            i = ii | 
