From 8fdc3681fb7551e7faeff9f720102cdd417ba077 Mon Sep 17 00:00:00 2001 From: Victor Chahuneau Date: Fri, 27 Jul 2012 01:16:03 -0400 Subject: [python] Fork of the suffix-array extractor with surface improvements Available as the cdec.sa module, with command-line helpers: python -m cdec.sa.compile -f ... -e ... -a ... -o sa-out/ -c extract.ini python -m cdec.sa.extract -c extract.ini -g grammars-out/ < input.txt > input.sgml + renamed cdec.scfg -> cdec.sa + Python README --- python/src/sa/sym.pxi | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 python/src/sa/sym.pxi (limited to 'python/src/sa/sym.pxi') diff --git a/python/src/sa/sym.pxi b/python/src/sa/sym.pxi new file mode 100644 index 00000000..3fd6c5a7 --- /dev/null +++ b/python/src/sa/sym.pxi @@ -0,0 +1,101 @@ +from libc.string cimport strrchr, strstr, strcpy, strlen +from libc.stdlib cimport malloc, realloc, strtol + +cdef int INDEX_SHIFT = 3 +cdef int INDEX_MASK = (1<= 0 + + cdef int getindex(self, int sym): + return -sym & INDEX_MASK + + cdef int setindex(self, int sym, int ind): + return -(-sym & ~INDEX_MASK | ind) + + cdef int clearindex(self, int sym): + return -(-sym& ~INDEX_MASK) + + cdef int match(self, int sym1, int sym2): + return self.clearindex(sym1) == self.clearindex(sym2); + + cdef char* tocat(self, int sym): + return self.nonterminals.word((-sym >> INDEX_SHIFT)-1) + + cdef int fromcat(self, char *s): + cdef int i + i = self.nonterminals.index(s) + if self.first_nonterminal == -1: + self.first_nonterminal = i + if i > self.last_nonterminal: + self.last_nonterminal = i + return -(i+1 << INDEX_SHIFT) + + cdef char* tostring(self, int sym): + cdef int ind + if self.isvar(sym): + if sym in self.id2sym: + return self.id2sym[sym] + + ind = self.getindex(sym) + if ind > 0: + self.id2sym[sym] = "[%s,%d]" % (self.tocat(sym), ind) + else: + self.id2sym[sym] = "[%s]" % self.tocat(sym) + return self.id2sym[sym] + + else: + return 
self.terminals.word(sym) + + cdef int fromstring(self, char *s, bint terminal): + """Warning: this method is allowed to alter s.""" + cdef char *comma + cdef int n + n = strlen(s) + cdef char *sep + sep = strstr(s,"_SEP_") + if n >= 3 and s[0] == c'[' and s[n-1] == c']' and sep == NULL: + if terminal: + s1 = "\\"+s + return self.terminals.index(s1) + s[n-1] = c'\0' + s = s + 1 + comma = strrchr(s, c',') + if comma != NULL: + comma[0] = c'\0' + return self.setindex(self.fromcat(s), strtol(comma+1, NULL, 10)) + else: + return self.fromcat(s) + else: + return self.terminals.index(s) + +cdef Alphabet ALPHABET = Alphabet() + +def sym_tostring(int sym): + return ALPHABET.tostring(sym) + +def sym_fromstring(bytes string, bint terminal): + return ALPHABET.fromstring(string, terminal) + +def sym_isvar(int sym): + return ALPHABET.isvar(sym) + +cdef int sym_setindex(int sym, int id): + return ALPHABET.setindex(sym, id) -- cgit v1.2.3