python/cdec/sa/lcp.pxi


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

"""Compute LCP array for a suffix array using the Kasai et al. algorithm
Can also be used to compute statistics such
as k most frequent n-grams"""

cdef class LCP:
    cdef SuffixArray sa
    cdef IntList lcp

    def __cinit__(self, SuffixArray sa):
        cdef int i, k, j, h, n
        cdef IntList rank

        logger.info("Constructing LCP array")
        self.sa = sa
        n = self.sa.sa.len
        self.lcp = IntList(initial_len=n)

        rank = IntList(initial_len=n)
        for i from 0 <= i < n:
            rank.arr[sa.sa.arr[i]] = i

        h = 0
        for i from 0 <= i < n:
            k = rank.arr[i]
            if k == 0:
                self.lcp.arr[k] = -1
            else:
                j = sa.sa.arr[k-1]
                while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]:
                    h = h+1
                self.lcp.arr[k] = h
            if h > 0:
                h = h-1
        logger.info("LCP array completed")

    def compute_stats(self, int max_n):
        """Note: the output of this function is not exact.  In
        particular, the frequency associated with each word is 
        not guaranteed to be correct.  This is due to a bit of
        laxness in the design; the function is intended only to
        obtain a list of the most frequent words; for this 
        purpose it is perfectly fine"""
        cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid
        cdef IntList run_start
        cdef IntList ngram_start
        cdef VEB veb
        
        N = self.sa.sa.len

        ngram_starts = []
        for n from 0 <= n < max_n:
            ngram_starts.append(IntList(initial_len=N))

        run_start = IntList(initial_len=max_n)
        veb = VEB(N)

        for i from 0 <= i < N:
            h = self.lcp.arr[i]
            if h < 0:
                h = 0
            for n from h <= n < max_n:
                rs = run_start.arr[n]
                run_start.arr[n] = i
                freq = i - rs
                if freq > 1000: # arbitrary, but see note below
                    veb._insert(freq)
                    ngram_start = ngram_starts[n]
                    while ngram_start.arr[freq] > 0:
                        freq = freq + 1 # cheating a bit, should be ok for sparse histogram
                    ngram_start.arr[freq] = rs
        i = veb.veb.min_val
        while i != -1:
            ii = veb._findsucc(i)
            for n from 0 <= n < max_n:
                ngram_start = ngram_starts[n]
                iii = i
                rs = ngram_start.arr[iii]
                while (ii==-1 or iii < ii) and rs != 0:
                    j = self.sa.sa.arr[rs]
                    valid = 1
                    for k from 0 <= k < n+1:
                        if self.sa.darray.data.arr[j+k] < 2:
                            valid = 0
                    if valid:
                        ngram = tuple([self.sa.darray.data.arr[j+k] for k in range(n+1)])
                        yield i, n+1, ngram
                    iii = iii + 1
                    rs = ngram_start.arr[iii]
            i = ii