Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2012-08-01 17:32:37 +0200
committer: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2012-08-01 17:32:37 +0200
commit: eb3ea4fd5dff1c94b237af792c9f7bf421d79d96 (patch)
tree: 2acd7674f36e6dc6e815c5856519fdea1a2d6bf8 /sa-extract/lcp.pyx
parent: e816274e337a066df1b1e86ef00136a021a17caf (diff)
parent: 193d137056c3c4f73d66f8db84691d63307de894 (diff)
1 files changed, 0 insertions, 113 deletions
diff --git a/sa-extract/lcp.pyx b/sa-extract/lcp.pyx
deleted file mode 100644
index a992d3ee..00000000
--- a/sa-extract/lcp.pyx
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/usr/bin/env python2.4
-
-'''Compute LCP array for a suffix array using the Kasai et al. algorithm'''
-'''Can also be used to compute statistics such
-as k most frequent n-grams'''
-
-import sys
-
-cimport cintlist
-cimport csuf
-cimport cdat
-cimport cveb
-
-cdef class LCP:
-
-	cdef csuf.SuffixArray sa
-	cdef cintlist.CIntList lcp
-
-	def __init__(self, sa):
-		self._construct(sa)
-
-	cdef _construct(self, csuf.SuffixArray sa):
-		cdef int i, k, j, h, n
-		cdef cintlist.CIntList rank
-
-		sys.stderr.write("Constructing LCP array\n")
-		self.sa = sa
-		n = self.sa.sa.len
-		self.lcp = cintlist.CIntList(initial_len=n)
-
-		rank = cintlist.CIntList(initial_len=n)
-		for i from 0 <= i < n:
-			rank.arr[sa.sa.arr[i]] = i
-
-		h = 0
-		for i from 0 <= i < n:
-			k = rank.arr[i]
-			if k == 0:
-				self.lcp.arr[k] = -1
-			else:
-				j = sa.sa.arr[k-1]
-				while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]:
-					h = h+1
-				self.lcp.arr[k] = h
-			if h > 0:
-				h = h-1
-		sys.stderr.write("LCP array completed\n")
-
-
-	def compute_stats(self, max_n):
-		self._compute_stats(max_n)
-
-	cdef _compute_stats(self, int max_n):
-		'''Note: the output of this function is not exact.  In
-		particular, the frequency associated with each word is 
-		not guaranteed to be correct.  This is due to a bit of
-		laxness in the design; the function is intended only to
-		obtain a list of the most frequent words; for this 
-		purpose it is perfectly fine'''
-		cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid
-		cdef cintlist.CIntList run_start
-		cdef cintlist.CIntList ngram_start
-		cdef cveb.VEB veb
-		
-		N = self.sa.sa.len
-
-		ngram_starts = []
-		for n from 0 <= n < max_n:
-			ngram_starts.append(cintlist.CIntList(initial_len=N))
-
-		run_start = cintlist.CIntList(initial_len=max_n)
-		veb = cveb.VEB(N)
-
-		for i from 0 <= i < N:
-			h = self.lcp.arr[i]
-			if h < 0:
-				h = 0
-			for n from h <= n < max_n:
-				rs = run_start.arr[n]
-				run_start.arr[n] = i
-				freq = i - rs
-				if freq > 1000: # arbitrary, but see note below
-					veb._insert(freq)
-					ngram_start = ngram_starts[n]
-					while ngram_start.arr[freq] > 0:
-						freq = freq + 1 # cheating a bit, should be ok for sparse histogram
-					ngram_start.arr[freq] = rs
-		i = veb.veb.min_val
-		while i != -1:
-			ii = veb._findsucc(i)
-			for n from 0 <= n < max_n:
-				ngram_start = ngram_starts[n]
-				iii = i
-				rs = ngram_start.arr[iii]
-				while (ii==-1 or iii < ii) and rs != 0:
-					j = self.sa.sa.arr[rs]
-					valid = 1
-					for k from 0 <= k < n+1:
-						if self.sa.darray.data.arr[j+k] < 2:
-							valid = 0
-					if valid:
-						ngram = ""
-						for k from 0 <= k < n+1:
-							ngram= ngram+ self.sa.darray.id2word[self.sa.darray.data.arr[j+k]] + " "
-						print i, n+1, ngram
-					iii = iii + 1
-					rs = ngram_start.arr[iii]
-			i = ii
-
-
-
-
-
author	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2012-08-01 17:32:37 +0200
committer	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2012-08-01 17:32:37 +0200
commit	eb3ea4fd5dff1c94b237af792c9f7bf421d79d96 (patch)
tree	2acd7674f36e6dc6e815c5856519fdea1a2d6bf8 /sa-extract/lcp.pyx
parent	e816274e337a066df1b1e86ef00136a021a17caf (diff)
parent	193d137056c3c4f73d66f8db84691d63307de894 (diff)