sa-extract/lcp.pyx


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113

#!/usr/bin/env python2.4

'''Compute LCP array for a suffix array using the Kasai et al. algorithm'''
'''Can also be used to compute statistics such
as k most frequent n-grams'''

import sys

cimport cintlist
cimport csuf
cimport cdat
cimport cveb

cdef class LCP:

	cdef csuf.SuffixArray sa
	cdef cintlist.CIntList lcp

	def __init__(self, sa):
		self._construct(sa)

	cdef _construct(self, csuf.SuffixArray sa):
		cdef int i, k, j, h, n
		cdef cintlist.CIntList rank

		sys.stderr.write("Constructing LCP array\n")
		self.sa = sa
		n = self.sa.sa.len
		self.lcp = cintlist.CIntList(initial_len=n)

		rank = cintlist.CIntList(initial_len=n)
		for i from 0 <= i < n:
			rank.arr[sa.sa.arr[i]] = i

		h = 0
		for i from 0 <= i < n:
			k = rank.arr[i]
			if k == 0:
				self.lcp.arr[k] = -1
			else:
				j = sa.sa.arr[k-1]
				while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]:
					h = h+1
				self.lcp.arr[k] = h
			if h > 0:
				h = h-1
		sys.stderr.write("LCP array completed\n")


	def compute_stats(self, max_n):
		self._compute_stats(max_n)

	cdef _compute_stats(self, int max_n):
		'''Note: the output of this function is not exact.  In
		particular, the frequency associated with each word is 
		not guaranteed to be correct.  This is due to a bit of
		laxness in the design; the function is intended only to
		obtain a list of the most frequent words; for this 
		purpose it is perfectly fine'''
		cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid
		cdef cintlist.CIntList run_start
		cdef cintlist.CIntList ngram_start
		cdef cveb.VEB veb
		
		N = self.sa.sa.len

		ngram_starts = []
		for n from 0 <= n < max_n:
			ngram_starts.append(cintlist.CIntList(initial_len=N))

		run_start = cintlist.CIntList(initial_len=max_n)
		veb = cveb.VEB(N)

		for i from 0 <= i < N:
			h = self.lcp.arr[i]
			if h < 0:
				h = 0
			for n from h <= n < max_n:
				rs = run_start.arr[n]
				run_start.arr[n] = i
				freq = i - rs
				if freq > 1000: # arbitrary, but see note below
					veb._insert(freq)
					ngram_start = ngram_starts[n]
					while ngram_start.arr[freq] > 0:
						freq = freq + 1 # cheating a bit, should be ok for sparse histogram
					ngram_start.arr[freq] = rs
		i = veb.veb.min_val
		while i != -1:
			ii = veb._findsucc(i)
			for n from 0 <= n < max_n:
				ngram_start = ngram_starts[n]
				iii = i
				rs = ngram_start.arr[iii]
				while (ii==-1 or iii < ii) and rs != 0:
					j = self.sa.sa.arr[rs]
					valid = 1
					for k from 0 <= k < n+1:
						if self.sa.darray.data.arr[j+k] < 2:
							valid = 0
					if valid:
						ngram = ""
						for k from 0 <= k < n+1:
							ngram= ngram+ self.sa.darray.id2word[self.sa.darray.data.arr[j+k]] + " "
						print i, n+1, ngram
					iii = iii + 1
					rs = ngram_start.arr[iii]
			i = ii