Merge branch 'master' of github.com:redpony/cdec

author: Kenneth Heafield <github@kheafield.com> 2012-08-03 07:46:54 -0400
committer: Kenneth Heafield <github@kheafield.com> 2012-08-03 07:46:54 -0400
commit: be1ab0a8937f9c5668ea5e6c31b798e87672e55e (patch)
tree: a13aad60ab6cced213401bce6a38ac885ba171ba /python/src/sa/lcp.pxi
parent: e5d6f4ae41009c26978ecd62668501af9762b0bc (diff)
parent: 9fe0219562e5db25171cce8776381600ff9a5649 (diff)
1 files changed, 89 insertions, 0 deletions
diff --git a/python/src/sa/lcp.pxi b/python/src/sa/lcp.pxi
new file mode 100644
index 00000000..8e86e611
--- /dev/null
+++ b/python/src/sa/lcp.pxi
@@ -0,0 +1,89 @@
+"""Compute LCP array for a suffix array using the Kasai et al. algorithm
+Can also be used to compute statistics such
+as k most frequent n-grams"""
+
+cdef class LCP:
+    cdef SuffixArray sa
+    cdef IntList lcp
+
+    def __cinit__(self, SuffixArray sa):
+        cdef int i, k, j, h, n
+        cdef IntList rank
+
+        logger.info("Constructing LCP array")
+        self.sa = sa
+        n = self.sa.sa.len
+        self.lcp = IntList(initial_len=n)
+
+        rank = IntList(initial_len=n)
+        for i from 0 <= i < n:
+            rank.arr[sa.sa.arr[i]] = i
+
+        h = 0
+        for i from 0 <= i < n:
+            k = rank.arr[i]
+            if k == 0:
+                self.lcp.arr[k] = -1
+            else:
+                j = sa.sa.arr[k-1]
+                while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]:
+                    h = h+1
+                self.lcp.arr[k] = h
+            if h > 0:
+                h = h-1
+        logger.info("LCP array completed")
+
+    def compute_stats(self, int max_n):
+        """Note: the output of this function is not exact.  In
+        particular, the frequency associated with each word is 
+        not guaranteed to be correct.  This is due to a bit of
+        laxness in the design; the function is intended only to
+        obtain a list of the most frequent words; for this 
+        purpose it is perfectly fine"""
+        cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid
+        cdef IntList run_start
+        cdef IntList ngram_start
+        cdef VEB veb
+        
+        N = self.sa.sa.len
+
+        ngram_starts = []
+        for n from 0 <= n < max_n:
+            ngram_starts.append(IntList(initial_len=N))
+
+        run_start = IntList(initial_len=max_n)
+        veb = VEB(N)
+
+        for i from 0 <= i < N:
+            h = self.lcp.arr[i]
+            if h < 0:
+                h = 0
+            for n from h <= n < max_n:
+                rs = run_start.arr[n]
+                run_start.arr[n] = i
+                freq = i - rs
+                if freq > 1000: # arbitrary, but see note below
+                    veb._insert(freq)
+                    ngram_start = ngram_starts[n]
+                    while ngram_start.arr[freq] > 0:
+                        freq = freq + 1 # cheating a bit, should be ok for sparse histogram
+                    ngram_start.arr[freq] = rs
+        i = veb.veb.min_val
+        while i != -1:
+            ii = veb._findsucc(i)
+            for n from 0 <= n < max_n:
+                ngram_start = ngram_starts[n]
+                iii = i
+                rs = ngram_start.arr[iii]
+                while (ii==-1 or iii < ii) and rs != 0:
+                    j = self.sa.sa.arr[rs]
+                    valid = 1
+                    for k from 0 <= k < n+1:
+                        if self.sa.darray.data.arr[j+k] < 2:
+                            valid = 0
+                    if valid:
+                        ngram = tuple([self.sa.darray.data.arr[j+k] for k in range(n+1)])
+                        yield i, n+1, ngram
+                    iii = iii + 1
+                    rs = ngram_start.arr[iii]
+            i = ii
author	Kenneth Heafield <github@kheafield.com>	2012-08-03 07:46:54 -0400
committer	Kenneth Heafield <github@kheafield.com>	2012-08-03 07:46:54 -0400
commit	be1ab0a8937f9c5668ea5e6c31b798e87672e55e (patch)
tree	a13aad60ab6cced213401bce6a38ac885ba171ba /python/src/sa/lcp.pxi
parent	e5d6f4ae41009c26978ecd62668501af9762b0bc (diff)
parent	9fe0219562e5db25171cce8776381600ff9a5649 (diff)