diff options
author | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-07-27 01:16:03 -0400 |
---|---|---|
committer | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-07-27 01:16:03 -0400 |
commit | 8fdc3681fb7551e7faeff9f720102cdd417ba077 (patch) | |
tree | 1129d2b79a3255c249e181141814cb92b52b4d4d /python/src/sa/sym.pxi | |
parent | 0aac9fd78f1c8b9ba3d91d702f592288075cbbde (diff) |
[python] Fork of the suffix-array extractor with surface improvements
Available as the cdec.sa module, with command-line helpers:
python -m cdec.sa.compile -f ... -e ... -a ... -o sa-out/ -c extract.ini
python -m cdec.sa.extract -c extract.ini -g grammars-out/ < input.txt > input.sgml
+ renamed cdec.scfg -> cdec.sa
+ Python README
Diffstat (limited to 'python/src/sa/sym.pxi')
-rw-r--r-- | python/src/sa/sym.pxi | 101 |
1 files changed, 101 insertions, 0 deletions
# diff --git a/python/src/sa/sym.pxi b/python/src/sa/sym.pxi
# new file mode 100644, index 00000000..3fd6c5a7, hunk @@ -0,0 +1,101 @@
#
# Symbol <-> integer encoding.
#
# A symbol is a C int:
#   * sym >= 0 : a terminal; the value is an id in Alphabet.terminals.
#   * sym <  0 : a nonterminal; in -sym the low INDEX_SHIFT bits hold the
#                index and the remaining high bits hold
#                (category id in Alphabet.nonterminals) + 1.
#
# NOTE(review): the string handling below ("\\"+s, "%s" formatting of char*
# values) looks like it assumes Python 2 byte strings -- confirm before
# building with Python 3 string semantics.

from libc.string cimport strrchr, strstr, strcpy, strlen
from libc.stdlib cimport malloc, realloc, strtol

# Number of low bits of a negated nonterminal symbol reserved for its index.
cdef int INDEX_SHIFT = 3
# Bit mask selecting those index bits (so an index must fit in 0..INDEX_MASK).
cdef int INDEX_MASK = (1<<INDEX_SHIFT)-1


cdef class Alphabet:
    """Bidirectional mapping between symbol strings and integer symbols."""

    # String tables for terminal words and nonterminal categories.
    cdef readonly StringMap terminals, nonterminals
    # Smallest-seen-first / largest category ids returned by nonterminals.index.
    cdef int first_nonterminal, last_nonterminal
    # Cache: nonterminal symbol -> its "[CAT]" / "[CAT,i]" string form.
    cdef dict id2sym

    def __cinit__(self):
        self.terminals = StringMap()
        self.nonterminals = StringMap()
        self.id2sym = {}
        # -1 marks "no nonterminal registered yet" (see fromcat).
        self.first_nonterminal = -1
        # NOTE(review): last_nonterminal is not assigned here; it relies on
        # Cython zero-initialising C attributes -- confirm 0 is intended.

    def __dealloc__(self):
        # Nothing to release explicitly.
        pass

    cdef int isvar(self, int sym):
        """Return non-zero if sym is a nonterminal (negative)."""
        return sym < 0

    cdef int isword(self, int sym):
        """Return non-zero if sym is a terminal (non-negative)."""
        return sym >= 0

    cdef int getindex(self, int sym):
        """Return the index stored in the low bits of nonterminal sym."""
        return (-sym) & INDEX_MASK

    cdef int setindex(self, int sym, int ind):
        """Return nonterminal sym with its index bits replaced by ind.

        ind is OR-ed in without masking, so a value larger than INDEX_MASK
        would spill into the category bits.
        """
        return -(((-sym) & ~INDEX_MASK) | ind)

    cdef int clearindex(self, int sym):
        """Return nonterminal sym with its index bits set to zero."""
        return -((-sym) & ~INDEX_MASK)

    cdef int match(self, int sym1, int sym2):
        """Return non-zero if both symbols are equal once indices are cleared."""
        return self.clearindex(sym1) == self.clearindex(sym2)

    cdef char* tocat(self, int sym):
        """Return the category name of nonterminal sym (index ignored)."""
        # Undo the "+1" applied in fromcat to recover the StringMap id.
        return self.nonterminals.word(((-sym) >> INDEX_SHIFT) - 1)

    cdef int fromcat(self, char *s):
        """Return the index-free nonterminal symbol for category name s.

        Also updates first_nonterminal / last_nonterminal with the id that
        nonterminals.index returned.
        """
        cdef int i
        i = self.nonterminals.index(s)
        if self.first_nonterminal == -1:
            self.first_nonterminal = i
        if i > self.last_nonterminal:
            self.last_nonterminal = i
        # "+1" keeps the result strictly negative even for category id 0.
        return -((i + 1) << INDEX_SHIFT)

    cdef char* tostring(self, int sym):
        """Return the string form of sym.

        Nonterminals are rendered as "[CAT]" (index 0) or "[CAT,i]" and the
        rendered string is cached in id2sym; the returned pointer refers to
        that cached object. Terminals are looked up in the terminals table.
        """
        cdef int ind
        if self.isvar(sym):
            if sym not in self.id2sym:
                ind = self.getindex(sym)
                if ind > 0:
                    self.id2sym[sym] = "[%s,%d]" % (self.tocat(sym), ind)
                else:
                    self.id2sym[sym] = "[%s]" % self.tocat(sym)
            return self.id2sym[sym]
        return self.terminals.word(sym)

    cdef int fromstring(self, char *s, bint terminal):
        """Return the integer symbol for the NUL-terminated string s.

        A string of length >= 3 of the form "[...]" that does not contain
        "_SEP_" is bracketed:
          * if terminal is true it is registered as a terminal with a
            backslash prepended;
          * otherwise it is parsed as "[CAT]" or "[CAT,i]" (split at the
            last comma, i read with strtol base 10) and the corresponding
            nonterminal symbol is returned.
        Any other string is registered as a terminal unchanged.

        s is never modified: the bracket interior is copied into a bytes
        object before parsing, so passing the buffer of an immutable Python
        bytes object is safe.
        """
        cdef int n = strlen(s)
        cdef int comma
        cdef bytes inner, cat, digits

        if (n >= 3 and s[0] == c'[' and s[n-1] == c']'
                and strstr(s, "_SEP_") == NULL):
            if terminal:
                s1 = "\\"+s
                return self.terminals.index(s1)
            # Copy of the text between the brackets; slicing a char* yields
            # a new bytes object, leaving the caller's buffer untouched.
            inner = s[1:n-1]
            comma = inner.rfind(b',')
            if comma >= 0:
                cat = inner[:comma]
                digits = inner[comma+1:]
                return self.setindex(self.fromcat(cat),
                                     strtol(digits, NULL, 10))
            return self.fromcat(inner)
        return self.terminals.index(s)


# Module-wide alphabet used by the helper functions below.
cdef Alphabet ALPHABET = Alphabet()


def sym_tostring(int sym):
    """Return the string form of sym using the module-wide alphabet."""
    return ALPHABET.tostring(sym)


def sym_fromstring(bytes string, bint terminal):
    """Return the integer symbol for string using the module-wide alphabet.

    See Alphabet.fromstring for how bracketed strings and the terminal flag
    are interpreted.
    """
    return ALPHABET.fromstring(string, terminal)


def sym_isvar(int sym):
    """Return non-zero if sym is a nonterminal."""
    return ALPHABET.isvar(sym)


cdef int sym_setindex(int sym, int id):
    """Return nonterminal sym with its index bits replaced by id."""
    return ALPHABET.setindex(sym, id)