summaryrefslogtreecommitdiff
path: root/sa-extract/cveb.pyx
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2012-07-27 17:31:00 -0400
committerChris Dyer <cdyer@cs.cmu.edu>2012-07-27 17:31:00 -0400
commitb317e0efd2398d75d70e027bb1e2cf442e683981 (patch)
treeec34aff0ce4e8fb9704d1cd2b7abf00cb9a25b9a /sa-extract/cveb.pyx
parentb2a8bccb2bd713d9ec081cf3dad0162c2cb492d8 (diff)
remove old suffix array extractor (use the one in python/ instead)
Diffstat (limited to 'sa-extract/cveb.pyx')
-rw-r--r--sa-extract/cveb.pyx390
1 files changed, 0 insertions, 390 deletions
diff --git a/sa-extract/cveb.pyx b/sa-extract/cveb.pyx
deleted file mode 100644
index ca87becc..00000000
--- a/sa-extract/cveb.pyx
+++ /dev/null
@@ -1,390 +0,0 @@
-#!/usr/bin/env python2.4
-'''This module implements a partial stratified tree (van Emde Boas, 1977).
-Only insert, findsucc, __iter__, and __contains__ are implemented.
-Delete is currently not supported.
-There is very little error-checking in this code -- it is designed
-to be used in the limited situation described in Lopez (EMNLP-CoNLL 2007),
-which doesn't cover all of the possible ways that you could misuse it
-(e.g. trying to insert a key larger than the universe size)
-Other notes -- this code is really rather ugly C code masquerading as
-Pyrex/Python. Virtual function calls are bypassed by hand in several
-places for the sake of efficiency, and other Python niceties are
-removed for the same reason.'''
-
-from libc.stdlib cimport malloc, free
-from libc.math cimport log, ceil
-from libc.string cimport memset
-
-# Thresholds used throughout this module to choose between a _BitSet and a
-# nested _VEB: sizes are compared against MIN_BOTTOM_SIZE and bit counts
-# against MIN_BOTTOM_BITS (32 == 1 << 5).
-cdef int MIN_BOTTOM_SIZE
-cdef int MIN_BOTTOM_BITS
-
-MIN_BOTTOM_SIZE = 32
-MIN_BOTTOM_BITS = 5
-
-# Lookup table of low-order bit masks: lower_mask[k] has bits 0..k set,
-# so lower_mask[0] == 1 and lower_mask[4] == 31.
-cdef int lower_mask[32]
-cdef int i, mask
-
-# NOTE(review): the loop relies on the module-level C variable `mask` starting
-# at zero and on the last iteration (i == 31) wrapping to all ones -- confirm
-# both hold for the target compiler.
-for i from 0 <= i < MIN_BOTTOM_SIZE:
-    mask = (mask << 1) + 1
-    lower_mask[i] = mask
-
-
-# A small integer set stored in the bits of a single word, with cached
-# minimum, maximum and element count.
-cdef struct _BitSet:
-    long bitset    # bit i is set once key i has been inserted
-    int min_val    # smallest key inserted so far; -1 while the set is empty
-    int max_val    # largest key inserted so far; -1 while the set is empty
-    int size       # number of distinct keys inserted
-
-
-cdef _BitSet* new_BitSet():
-    '''Allocate an empty _BitSet with malloc and return it.
-
-    The set starts with no bits set, size 0, and min_val/max_val of -1.
-    The malloc result is not checked, and the caller is responsible for
-    releasing the returned pointer with free().'''
-    cdef _BitSet* fresh = <_BitSet*> malloc(sizeof(_BitSet))
-
-    fresh.size = 0
-    fresh.bitset = 0
-    fresh.max_val = -1
-    fresh.min_val = -1
-    return fresh
-
-
-cdef int bitset_findsucc(_BitSet* b, int i):
-    '''Return the smallest key in b that is strictly greater than i.
-
-    Returns -1 when b is empty or holds no key greater than i.  When i is
-    below the current minimum, the minimum itself is returned.  Otherwise the
-    answer is located by binary search over the bit positions i+1..max_val.'''
-    cdef int bitset, mask
-    cdef int low, high, mid
-
-    if b.max_val == -1 or i >= b.max_val:
-        return -1
-    if i < b.min_val:
-        return b.min_val
-
-    # Drop bits 0..i so only candidates above i remain.  The working copy is
-    # held in a C int, the same width as the masks it is combined with.
-    bitset = b.bitset & ~lower_mask[i]
-    low = i+1
-    high = b.max_val+1
-    # Invariant: the lowest remaining set bit lies in [low, high).
-    while low < high-1:
-        # `//` keeps this an integer division regardless of the Cython
-        # language level; a plain `/` becomes true division (a double)
-        # under language_level=3 and cannot be assigned to an int.
-        mid = (high + low) // 2
-        # mask selects every bit position except mid..high-1.
-        mask = ~(lower_mask[high-1] ^ lower_mask[mid-1])
-        if bitset & mask == 0:
-            # Nothing is set below mid, so the answer is in the upper half.
-            low = mid
-        else:
-            # Something is set below mid: discard the upper half.
-            bitset = bitset & mask
-            high = mid
-    return low
-
-
-cdef int bitset_insert(_BitSet* b, int i):
-    '''Add key i to b.
-
-    Returns 1 if the key was newly added and 0 if it was already present.
-    min_val, max_val and size are kept up to date.  No range check is made
-    on i.'''
-    cdef int bit = 1 << i
-
-    # Already a member: leave the set untouched.
-    if (b.bitset & bit) != 0:
-        return 0
-
-    b.bitset = b.bitset | bit
-    if b.size == 0:
-        b.min_val = i
-        b.max_val = i
-    elif i < b.min_val:
-        b.min_val = i
-    elif i > b.max_val:
-        b.max_val = i
-    b.size = b.size + 1
-    return 1
-
-
-cdef int bitset_contains(_BitSet* b, int i):
-    '''Return 1 if bit i of b.bitset is set, otherwise 0.
-
-    No range check is made on i.'''
-    cdef int bit = 1 << i
-
-    return 1 if (b.bitset & bit) != 0 else 0
-
-
-cdef class BitSetIterator:
-    '''Iterator that yields the keys of a _BitSet in increasing order.
-
-    Whoever creates the iterator must set `b` to the set being walked and
-    `next_val` to the first key to yield (-1 for an empty set).  The iterator
-    never frees `b`; the pointed-to set must stay alive while the iterator
-    is in use.'''
-    cdef _BitSet* b
-    cdef int next_val
-
-    def __iter__(self):
-        # An iterator is its own iterable, so iter(it) and nested for-loops work.
-        return self
-
-    def __next__(self):
-        cdef int ret_val
-
-        # -1 is the "no more keys" sentinel returned by bitset_findsucc.
-        if self.next_val == -1:
-            raise StopIteration()
-        ret_val = self.next_val
-        # Look up the following key now so the next call is a plain read.
-        self.next_val = bitset_findsucc(self.b, ret_val)
-        return ret_val
-
-
-
-# This is a Python wrapper class to give access to the
-# (entirely C-implemented) _BitSet struct.
-# Very slow; use only for debugging.
-# Unlike the C functions it wraps, this class range-checks keys: shifting
-# by a negative amount or by the word width or more is undefined in C, and
-# an out-of-range key would also corrupt min_val/max_val.
-cdef class BitSet:
-
-    cdef _BitSet* b
-
-    def __cinit__(self):
-        self.b = new_BitSet()
-
-    def __dealloc__(self):
-        free(self.b)
-
-    def __iter__(self):
-        '''Return an iterator over the stored keys in increasing order.'''
-        cdef BitSetIterator it
-        it = BitSetIterator()
-        it.b = self.b
-        it.next_val = self.b.min_val
-        return it
-
-    def insert(self, i):
-        '''Insert key i; return 1 if it was added, 0 if already present.
-
-        Raises ValueError if i is outside 0 <= i < MIN_BOTTOM_SIZE.'''
-        if not 0 <= i < MIN_BOTTOM_SIZE:
-            raise ValueError("BitSet key out of range: " + str(i))
-        return bitset_insert(self.b, i)
-
-    def findsucc(self, i):
-        '''Return the smallest stored key greater than i, or -1 if none.'''
-        return bitset_findsucc(self.b, i)
-
-    def __str__(self):
-        return dec2bin(self.b.bitset)+" ("+str(self.b.size)+","+str(self.b.min_val)+","+str(self.b.max_val)+")"
-
-    def min(self):
-        '''Return the smallest stored key, or -1 if the set is empty.'''
-        return self.b.min_val
-
-    def max(self):
-        '''Return the largest stored key, or -1 if the set is empty.'''
-        return self.b.max_val
-
-    def __len__(self):
-        return self.b.size
-
-    def __contains__(self, i):
-        # A key outside the representable range can never be a member.
-        if not 0 <= i < MIN_BOTTOM_SIZE:
-            return False
-        return bool(bitset_contains(self.b, i))
-
-
-def dec2bin(i):
-    '''Return the low MIN_BOTTOM_SIZE bits of i as a string of "0"/"1"
-    characters, most significant bit first.'''
-    cdef int pos
-    digits = []
-    # Collect bits starting from the least significant one ...
-    for pos in range(MIN_BOTTOM_SIZE):
-        digits.append("0" if (i & lower_mask[0]) == 0 else "1")
-        i = i >> 1
-    # ... then flip so the most significant bit comes first.
-    digits.reverse()
-    return "".join(digits)
-
-
-cdef _VEB* new_VEB(int n):
-    '''Allocate an empty _VEB for a universe of size n and return it.
-
-    Key bits are split in two: the low num_bottom_bits index into a bottom
-    structure, the remaining high bits select which bottom structure.
-    The array of bottom pointers starts out all NULL.  The top structure,
-    created eagerly, is a nested _VEB when it must cover more than
-    MIN_BOTTOM_SIZE values and a _BitSet otherwise.
-
-    malloc results are not checked.
-    NOTE(review): n is assumed to be >= 1; log(n) is not meaningful
-    otherwise -- confirm against callers.'''
-    cdef _VEB* veb
-    cdef int num_bits
-
-    veb = <_VEB*> malloc(sizeof(_VEB))
-
-    num_bits = int(ceil(log(n) / log(2)))
-    # `//` keeps this an integer division regardless of the Cython language
-    # level; a plain `/` becomes true division under language_level=3.
-    veb.num_bottom_bits = num_bits // 2
-    if veb.num_bottom_bits < MIN_BOTTOM_BITS:
-        veb.num_bottom_bits = MIN_BOTTOM_BITS
-    veb.top_universe_size = (n >> veb.num_bottom_bits) + 1
-
-    veb.bottom = <void**> malloc(veb.top_universe_size * sizeof(void*))
-    memset(veb.bottom, 0, veb.top_universe_size * sizeof(void*))
-
-    if veb.top_universe_size > MIN_BOTTOM_SIZE:
-        veb.top = new_VEB(veb.top_universe_size)
-    else:
-        veb.top = new_BitSet()
-
-    veb.max_val = -1
-    veb.min_val = -1
-    veb.size = 0
-    return veb
-
-
-cdef int VEB_insert(_VEB* veb, int i):
-    '''Insert key i into veb.
-
-    Returns 1 if the key was added and 0 if it was already present.
-    The minimum is held only in veb.min_val: the first key inserted is not
-    pushed into any bottom structure, and when a new smaller key arrives the
-    old minimum is the one that gets pushed down.  No range check is made
-    on i.'''
-    cdef _VEB* subv
-    cdef _BitSet* subb
-    cdef int a, b, tmp
-
-    if veb.size == 0:
-        # First key: recorded in min_val/max_val only.
-        veb.min_val = i
-        veb.max_val = i
-    elif i == veb.min_val or i == veb.max_val:
-        return 0
-    else:
-        if i < veb.min_val:
-            # New minimum: swap so the previous minimum is inserted below.
-            tmp = i
-            i = veb.min_val
-            veb.min_val = tmp
-        # a: which bottom structure (high bits); b: key within it (low bits).
-        a = i >> veb.num_bottom_bits
-        b = i & lower_mask[veb.num_bottom_bits-1]
-        if veb.bottom[a] == NULL:
-            # First key for slot a: record a in the top structure and
-            # create the bottom structure for it.
-            if veb.top_universe_size > MIN_BOTTOM_SIZE:
-                subv = <_VEB*> veb.top
-                VEB_insert(subv, a)
-            else:
-                subb = <_BitSet*> veb.top
-                bitset_insert(subb, a)
-            if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-                veb.bottom[a] = new_VEB(1 << veb.num_bottom_bits)
-            else:
-                veb.bottom[a] = new_BitSet()
-        # Insert b below; a 0 result means the key was already stored.
-        if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-            subv = <_VEB*> veb.bottom[a]
-            if VEB_insert(subv, b) == 0:
-                return 0
-        else:
-            subb = <_BitSet*> veb.bottom[a]
-            if bitset_insert(subb, b) == 0:
-                return 0
-
-        if i > veb.max_val:
-            veb.max_val = i
-    veb.size = veb.size + 1
-    return 1
-
-
-cdef del_VEB(_VEB* veb):
-    '''Free veb together with its bottom structures, its top structure and
-    its array of bottom pointers.
-
-    The slots to free are found by walking the keys stored in veb.top.'''
-    cdef int slot
-    cdef bint top_is_veb = veb.top_universe_size > MIN_BOTTOM_SIZE
-    cdef bint bottom_is_veb = veb.num_bottom_bits > MIN_BOTTOM_BITS
-
-    # Start from the first slot recorded in the top structure.
-    if top_is_veb:
-        slot = (<_VEB*> veb.top).min_val
-    else:
-        slot = (<_BitSet*> veb.top).min_val
-
-    while slot != -1:
-        # Release the bottom structure stored in this slot ...
-        if bottom_is_veb:
-            del_VEB(<_VEB*> veb.bottom[slot])
-        else:
-            free(<_BitSet*> veb.bottom[slot])
-
-        # ... then move on to the next recorded slot (-1 when done).
-        if top_is_veb:
-            slot = VEB_findsucc(<_VEB*> veb.top, slot)
-        else:
-            slot = bitset_findsucc(<_BitSet*> veb.top, slot)
-
-    # The top structure is still needed during the walk, so it goes last.
-    if top_is_veb:
-        del_VEB(<_VEB*> veb.top)
-    else:
-        free(<_BitSet*> veb.top)
-    free(veb.bottom)
-    free(veb)
-
-
-cdef int VEB_findsucc(_VEB* veb, int i):
-    '''Return the smallest key in veb that is strictly greater than i.
-
-    Returns -1 when veb is empty or i >= veb.max_val, and veb.min_val when
-    i is below the minimum.  Otherwise the bottom structure that i falls
-    into is searched first; if it holds nothing above i, the top structure
-    is used to find the next occupied slot and that slot's minimum is
-    returned.'''
-    cdef _VEB* subv
-    cdef _BitSet* subb
-    cdef int a, b, j, c, found
-
-    if veb.max_val == -1 or i>=veb.max_val:
-        return -1
-    if i < veb.min_val:
-        return veb.min_val
-
-    # a: which bottom structure (high bits); b: key within it (low bits).
-    a = i >> veb.num_bottom_bits
-    b = i & lower_mask[veb.num_bottom_bits-1]
-    found = 0
-    if veb.bottom[a] != NULL:
-        # Only descend when slot a actually holds something above b.
-        if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-            subv = <_VEB*> veb.bottom[a]
-            if subv.max_val > b:
-                j = (a << veb.num_bottom_bits) + VEB_findsucc(subv, b)
-                found = 1
-        else:
-            subb = <_BitSet*> veb.bottom[a]
-            if subb.max_val > b:
-                j = (a << veb.num_bottom_bits) + bitset_findsucc(subb, b)
-                found = 1
-    if found==0:
-        # Nothing above i in slot a: jump to the next occupied slot.
-        # NOTE(review): c is used as an index without being checked against
-        # -1; this assumes a later occupied slot always exists here.
-        if veb.top_universe_size > MIN_BOTTOM_SIZE:
-            subv = <_VEB*> veb.top
-            c = VEB_findsucc(subv, a)
-        else:
-            subb = <_BitSet*> veb.top
-            c = bitset_findsucc(subb, a)
-        if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-            subv = <_VEB*> veb.bottom[c]
-            j = (c << veb.num_bottom_bits) + subv.min_val
-        else:
-            subb = <_BitSet*> veb.bottom[c]
-            j = (c << veb.num_bottom_bits) + subb.min_val
-    return j
-
-
-cdef int VEB_contains(_VEB* veb, int i):
-    '''Return 1 if key i is stored in veb, otherwise 0.'''
-    cdef int slot, offset
-
-    # Quick rejections and the min_val shortcut.
-    if veb.size == 0 or i < veb.min_val or i > veb.max_val:
-        return 0
-    if i == veb.min_val:
-        return 1
-    if veb.size == 1:
-        return 0
-
-    # slot: which bottom structure (high bits); offset: key within it.
-    slot = i >> veb.num_bottom_bits
-    offset = i & lower_mask[veb.num_bottom_bits-1]
-    if veb.bottom[slot] == NULL:
-        return 0
-    if veb.num_bottom_bits > MIN_BOTTOM_BITS:
-        return VEB_contains(<_VEB*> veb.bottom[slot], offset)
-    return bitset_contains(<_BitSet*> veb.bottom[slot], offset)
-
-
-cdef class VEBIterator:
-    '''Iterator that yields the keys of a _VEB in increasing order.
-
-    Whoever creates the iterator must set `v` to the tree being walked and
-    `next_val` to the first key to yield (-1 for an empty tree).  The
-    iterator never frees `v`; the pointed-to tree must stay alive while the
-    iterator is in use.'''
-    cdef _VEB* v
-    cdef int next_val
-
-    def __iter__(self):
-        # An iterator is its own iterable, so iter(it) and nested for-loops work.
-        return self
-
-    def __next__(self):
-        cdef int ret_val
-
-        # -1 is the "no more keys" sentinel returned by VEB_findsucc.
-        if self.next_val == -1:
-            raise StopIteration()
-        ret_val = self.next_val
-        # Look up the following key now so the next call is a plain read.
-        self.next_val = VEB_findsucc(self.v, ret_val)
-        return ret_val
-
-
-# Python-visible wrapper around a C-level _VEB tree.  The def methods are
-# callable from Python; the cdef methods (_insert, _first, _findsucc) offer
-# the same operations with C int arguments and results.
-# NOTE(review): the `veb` attribute is not declared in this block; presumably
-# it is declared in an accompanying .pxd -- confirm.
-cdef class VEB:
-
-    # Nothing to do here: the tree is allocated in __cinit__.
-    def __init__(self, size):
-        pass
-
-    # Allocate the underlying tree for a universe of the given size.
-    def __cinit__(self, int size):
-        self.veb = new_VEB(size)
-
-    # Release the tree and everything it owns.
-    def __dealloc__(self):
-        del_VEB(self.veb)
-
-    # Iterate over the stored keys, starting from the minimum and following
-    # VEB_findsucc until it returns -1.
-    def __iter__(self):
-        cdef VEBIterator it
-        it = VEBIterator()
-        it.v = self.veb
-        it.next_val = self.veb.min_val
-        return it
-
-    # Insert key i; returns 1 if it was added, 0 if already present.
-    def insert(self, i):
-        return VEB_insert(self.veb, i)
-
-    # C-level counterpart of insert().
-    cdef int _insert(self, int i):
-        return VEB_insert(self.veb, i)
-
-    # Smallest stored key greater than i, or -1 if there is none.
-    def findsucc(self, i):
-        return VEB_findsucc(self.veb, i)
-
-    # Smallest stored key, or -1 if the tree is empty.
-    cdef int _first(self):
-        return self.veb.min_val
-
-    # C-level counterpart of findsucc().
-    cdef int _findsucc(self, int i):
-        return VEB_findsucc(self.veb, i)
-
-    def __len__(self):
-        return self.veb.size
-
-    def __contains__(self, i):
-        return VEB_contains(self.veb, i)
-
-
-
-
-