From 03799a2d330c6dbbe12154d4bcea236210b4f6ed Mon Sep 17 00:00:00 2001
From: Victor Chahuneau <vchahune@cs.cmu.edu>
Date: Mon, 26 Aug 2013 20:12:32 -0400
Subject: Improve the package structure of pycdec

This change should not break anything, but now you can run:

    python setup.py build_ext --inplace

and use the cleaner:

    PYTHONPATH=/path/to/cdec/python python -m ...
---
 python/src/sa/lcp.pxi | 89 ---------------------------------------------------
 1 file changed, 89 deletions(-)
 delete mode 100644 python/src/sa/lcp.pxi

(limited to 'python/src/sa/lcp.pxi')

diff --git a/python/src/sa/lcp.pxi b/python/src/sa/lcp.pxi
deleted file mode 100644
index 8e86e611..00000000
--- a/python/src/sa/lcp.pxi
+++ /dev/null
@@ -1,89 +0,0 @@
-"""Compute LCP array for a suffix array using the Kasai et al. algorithm
-Can also be used to compute statistics such
-as k most frequent n-grams"""
-
-cdef class LCP:
-    cdef SuffixArray sa
-    cdef IntList lcp
-
-    def __cinit__(self, SuffixArray sa):
-        cdef int i, k, j, h, n
-        cdef IntList rank
-
-        logger.info("Constructing LCP array")
-        self.sa = sa
-        n = self.sa.sa.len
-        self.lcp = IntList(initial_len=n)
-
-        rank = IntList(initial_len=n)
-        for i from 0 <= i < n:
-            rank.arr[sa.sa.arr[i]] = i
-
-        h = 0
-        for i from 0 <= i < n:
-            k = rank.arr[i]
-            if k == 0:
-                self.lcp.arr[k] = -1
-            else:
-                j = sa.sa.arr[k-1]
-                while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]:
-                    h = h+1
-                self.lcp.arr[k] = h
-            if h > 0:
-                h = h-1
-        logger.info("LCP array completed")
-
-    def compute_stats(self, int max_n):
-        """Note: the output of this function is not exact.  In
-        particular, the frequency associated with each word is 
-        not guaranteed to be correct.  This is due to a bit of
-        laxness in the design; the function is intended only to
-        obtain a list of the most frequent words; for this 
-        purpose it is perfectly fine"""
-        cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid
-        cdef IntList run_start
-        cdef IntList ngram_start
-        cdef VEB veb
-        
-        N = self.sa.sa.len
-
-        ngram_starts = []
-        for n from 0 <= n < max_n:
-            ngram_starts.append(IntList(initial_len=N))
-
-        run_start = IntList(initial_len=max_n)
-        veb = VEB(N)
-
-        for i from 0 <= i < N:
-            h = self.lcp.arr[i]
-            if h < 0:
-                h = 0
-            for n from h <= n < max_n:
-                rs = run_start.arr[n]
-                run_start.arr[n] = i
-                freq = i - rs
-                if freq > 1000: # arbitrary, but see note below
-                    veb._insert(freq)
-                    ngram_start = ngram_starts[n]
-                    while ngram_start.arr[freq] > 0:
-                        freq = freq + 1 # cheating a bit, should be ok for sparse histogram
-                    ngram_start.arr[freq] = rs
-        i = veb.veb.min_val
-        while i != -1:
-            ii = veb._findsucc(i)
-            for n from 0 <= n < max_n:
-                ngram_start = ngram_starts[n]
-                iii = i
-                rs = ngram_start.arr[iii]
-                while (ii==-1 or iii < ii) and rs != 0:
-                    j = self.sa.sa.arr[rs]
-                    valid = 1
-                    for k from 0 <= k < n+1:
-                        if self.sa.darray.data.arr[j+k] < 2:
-                            valid = 0
-                    if valid:
-                        ngram = tuple([self.sa.darray.data.arr[j+k] for k in range(n+1)])
-                        yield i, n+1, ngram
-                    iii = iii + 1
-                    rs = ngram_start.arr[iii]
-            i = ii
-- 
cgit v1.2.3