diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-07-27 17:31:00 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-07-27 17:31:00 -0400 |
commit | b317e0efd2398d75d70e027bb1e2cf442e683981 (patch) | |
tree | ec34aff0ce4e8fb9704d1cd2b7abf00cb9a25b9a /sa-extract/cveb.pyx | |
parent | b2a8bccb2bd713d9ec081cf3dad0162c2cb492d8 (diff) |
remove old suffix array extractor (use the one in python/ instead)
Diffstat (limited to 'sa-extract/cveb.pyx')
-rw-r--r-- | sa-extract/cveb.pyx | 390 |
1 files changed, 0 insertions, 390 deletions
diff --git a/sa-extract/cveb.pyx b/sa-extract/cveb.pyx deleted file mode 100644 index ca87becc..00000000 --- a/sa-extract/cveb.pyx +++ /dev/null @@ -1,390 +0,0 @@ -#!/usr/bin/env python2.4 -'''This module implements a partial stratified tree (van Emde Boas, 1977). -Only insert findsucc, __iter__, and __contains__ are implemented. -Delete is currently not supported. -There is very little error-checking in this code -- it is designed -to be used in the limited situation described in Lopez (EMNLP-CoNLL 2007), -which doesn't cover all of the possible ways that you could misuse it -(e.g. trying to insert a key larger than the universe size) -Other notes -- this code is really rather ugly C code masquerading as -Pyrex/Python. Virtual function calls are bypassed by hand in several -places for the sake of efficiency, and other Python niceties are -removed for the same reason.''' - -from libc.stdlib cimport malloc, free -from libc.math cimport log, ceil -from libc.string cimport memset - -cdef int MIN_BOTTOM_SIZE -cdef int MIN_BOTTOM_BITS - -MIN_BOTTOM_SIZE = 32 -MIN_BOTTOM_BITS = 5 - -cdef int lower_mask[32] -cdef int i, mask - -for i from 0 <= i < MIN_BOTTOM_SIZE: - mask = (mask << 1) + 1 - lower_mask[i] = mask - - -cdef struct _BitSet: - long bitset - int min_val - int max_val - int size - - -cdef _BitSet* new_BitSet(): - cdef _BitSet* b - - b = <_BitSet*> malloc(sizeof(_BitSet)) - b.bitset = 0 - b.min_val = -1 - b.max_val = -1 - b.size = 0 - return b - - -cdef int bitset_findsucc(_BitSet* b, int i): - cdef int bitset, mask - cdef int low, high, mid - - if b.max_val == -1 or i >= b.max_val: - return -1 - if i < b.min_val: - return b.min_val - - bitset = b.bitset & ~lower_mask[i] - low = i+1 - high = b.max_val+1 - while low < high-1: - mid = (high + low)/2 - mask = ~(lower_mask[high-1] ^ lower_mask[mid-1]) - if bitset & mask == 0: - low = mid - else: - bitset = bitset & mask - high = mid - return low - - -cdef int bitset_insert(_BitSet* b, int i): - cdef int val - - val = 1 << i - if b.bitset & val == 0: - b.bitset = b.bitset | val - if b.size == 0: - b.min_val = i - b.max_val = i - else: - if i < b.min_val: - b.min_val = i - if i > b.max_val: - b.max_val = i - b.size = b.size + 1 - return 1 - return 0 - - -cdef int bitset_contains(_BitSet* b, int i): - cdef int val - - val = 1 << i - if b.bitset & val == 0: - return 0 - else: - return 1 - - -cdef class BitSetIterator: - cdef _BitSet* b - cdef int next_val - - def __next__(self): - cdef int ret_val - - if self.next_val == -1: - raise StopIteration() - ret_val = self.next_val - self.next_val = bitset_findsucc(self.b, ret_val) - return ret_val - - - -# This is a Python wrapper class to give access to the -# (entirely C-implemented) _BitSet struct. -# Very slow; use only for debugging -cdef class BitSet: - - cdef _BitSet* b - - def __cinit__(self): - self.b = new_BitSet() - - def __dealloc__(self): - free(self.b) - - def __iter__(self): - cdef BitSetIterator it - it = BitSetIterator() - it.b = self.b - it.next_val = self.b.min_val - return it - - def insert(self, i): - return bitset_insert(self.b, i) - - def findsucc(self, i): - return bitset_findsucc(self.b, i) - - def __str__(self): - return dec2bin(self.b.bitset)+" ("+str(self.b.size)+","+str(self.b.min_val)+","+str(self.b.max_val)+")" - - def min(self): - return self.b.min_val - - def max(self): - return self.b.max_val - - def __len__(self): - return self.b.size - - def __contains__(self, i): - return bool(bitset_contains(self.b, i)) - - -def dec2bin(i): - cdef d - result = "" - for d from 0 <= d < MIN_BOTTOM_SIZE: - if i & lower_mask[0] == 0: - result = "0"+result - else: - result = "1"+result - i = i >> 1 - return result - - -cdef _VEB* new_VEB(int n): - cdef _VEB* veb - cdef int num_bits, num_top_bits, i - - veb = <_VEB*> malloc(sizeof(_VEB)) - - num_bits = int(ceil(log(n) / log(2))) - veb.num_bottom_bits = num_bits/2 - if veb.num_bottom_bits < MIN_BOTTOM_BITS: - veb.num_bottom_bits = MIN_BOTTOM_BITS - veb.top_universe_size = (n >> veb.num_bottom_bits) + 1 - - veb.bottom = <void**> malloc(veb.top_universe_size * sizeof(void*)) - memset(veb.bottom, 0, veb.top_universe_size * sizeof(void*)) - - if veb.top_universe_size > MIN_BOTTOM_SIZE: - veb.top = new_VEB(veb.top_universe_size) - else: - veb.top = new_BitSet() - - veb.max_val = -1 - veb.min_val = -1 - veb.size = 0 - return veb - - -cdef int VEB_insert(_VEB* veb, int i): - cdef _VEB* subv - cdef _BitSet* subb - cdef int a, b, tmp - - if veb.size == 0: - veb.min_val = i - veb.max_val = i - elif i == veb.min_val or i == veb.max_val: - return 0 - else: - if i < veb.min_val: - tmp = i - i = veb.min_val - veb.min_val = tmp - a = i >> veb.num_bottom_bits - b = i & lower_mask[veb.num_bottom_bits-1] - if veb.bottom[a] == NULL: - if veb.top_universe_size > MIN_BOTTOM_SIZE: - subv = <_VEB*> veb.top - VEB_insert(subv, a) - else: - subb = <_BitSet*> veb.top - bitset_insert(subb, a) - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - veb.bottom[a] = new_VEB(1 << veb.num_bottom_bits) - else: - veb.bottom[a] = new_BitSet() - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - subv = <_VEB*> veb.bottom[a] - if VEB_insert(subv, b) == 0: - return 0 - else: - subb = <_BitSet*> veb.bottom[a] - if bitset_insert(subb, b) == 0: - return 0 - - if i > veb.max_val: - veb.max_val = i - veb.size = veb.size + 1 - return 1 - - -cdef del_VEB(_VEB* veb): - cdef int i - - if veb.top_universe_size > MIN_BOTTOM_SIZE: - i = (<_VEB*> veb.top).min_val - else: - i = (<_BitSet*> veb.top).min_val - - while i != -1: - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - del_VEB(<_VEB*> veb.bottom[i]) - else: - free(<_BitSet*> veb.bottom[i]) - - if veb.top_universe_size > MIN_BOTTOM_SIZE: - i = VEB_findsucc(<_VEB*> veb.top, i) - else: - i = bitset_findsucc(<_BitSet*> veb.top, i) - - if veb.top_universe_size > MIN_BOTTOM_SIZE: - del_VEB(<_VEB*> veb.top) - else: - free(<_BitSet*> veb.top) - free(veb.bottom) - free(veb) - - -cdef int VEB_findsucc(_VEB* veb, int i): - cdef _VEB* subv - cdef _BitSet* subb - cdef int a, b, j, c, found - - if veb.max_val == -1 or i>=veb.max_val: - return -1 - if i < veb.min_val: - return veb.min_val - - a = i >> veb.num_bottom_bits - b = i & lower_mask[veb.num_bottom_bits-1] - found = 0 - if veb.bottom[a] != NULL: - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - subv = <_VEB*> veb.bottom[a] - if subv.max_val > b: - j = (a << veb.num_bottom_bits) + VEB_findsucc(subv, b) - found = 1 - else: - subb = <_BitSet*> veb.bottom[a] - if subb.max_val > b: - j = (a << veb.num_bottom_bits) + bitset_findsucc(subb, b) - found = 1 - if found==0: - if veb.top_universe_size > MIN_BOTTOM_SIZE: - subv = <_VEB*> veb.top - c = VEB_findsucc(subv, a) - else: - subb = <_BitSet*> veb.top - c = bitset_findsucc(subb, a) - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - subv = <_VEB*> veb.bottom[c] - j = (c << veb.num_bottom_bits) + subv.min_val - else: - subb = <_BitSet*> veb.bottom[c] - j = (c << veb.num_bottom_bits) + subb.min_val - return j - - -cdef int VEB_contains(_VEB* veb, int i): - cdef _VEB* subv - cdef _BitSet* subb - cdef int a, b - - if veb.size == 0 or i < veb.min_val or i > veb.max_val: - return 0 - - if veb.min_val == i: - return 1 - else: - if veb.size == 1: - return 0 - - a = i >> veb.num_bottom_bits - b = i & lower_mask[veb.num_bottom_bits-1] - if veb.bottom[a] == NULL: - return 0 - else: - if veb.num_bottom_bits > MIN_BOTTOM_BITS: - subv = <_VEB*> veb.bottom[a] - return VEB_contains(subv, b) - else: - subb = <_BitSet*> veb.bottom[a] - return bitset_contains(subb, b) - - -cdef class VEBIterator: - cdef _VEB* v - cdef int next_val - - def __next__(self): - cdef int ret_val - - if self.next_val == -1: - raise StopIteration() - ret_val = self.next_val - self.next_val = VEB_findsucc(self.v, ret_val) - return ret_val - - -cdef class VEB: - - def __init__(self, size): - pass - - def __cinit__(self, int size): - self.veb = new_VEB(size) - - def __dealloc__(self): - del_VEB(self.veb) - - def __iter__(self): - cdef VEBIterator it - it = VEBIterator() - it.v = self.veb - it.next_val = self.veb.min_val - return it - - def insert(self, i): - return VEB_insert(self.veb, i) - - cdef int _insert(self, int i): - return VEB_insert(self.veb, i) - - def findsucc(self, i): - return VEB_findsucc(self.veb, i) - - cdef int _first(self): - return self.veb.min_val - - cdef int _findsucc(self, int i): - return VEB_findsucc(self.veb, i) - - def __len__(self): - return self.veb.size - - def __contains__(self, i): - return VEB_contains(self.veb, i) - - - - - |