diff options
author | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-09-05 14:55:11 +0100 |
---|---|---|
committer | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-09-05 14:55:11 +0100 |
commit | fb4a9cd8874976a1c013b880b342961b72a8c0d7 (patch) | |
tree | e75e21d414980b0f4d9953cea3eb43e8c8355261 /python | |
parent | b34a325f1496eb4bbc33c5fe156eb7e28e5add27 (diff) |
Expose new feature extraction API
Diffstat (limited to 'python')
-rw-r--r-- | python/pkg/cdec/sa/__init__.py | 6 | ||||
-rw-r--r-- | python/pkg/cdec/sa/extract.py | 35 | ||||
-rw-r--r-- | python/pkg/cdec/sa/extractor.py | 5 | ||||
-rw-r--r-- | python/pkg/cdec/sa/features.py | 52 | ||||
-rw-r--r-- | python/src/sa/_sa.c | 102 | ||||
-rw-r--r-- | python/src/sa/default_scorer.pxi | 74 | ||||
-rw-r--r-- | python/src/sa/features.pxi | 6 | ||||
-rw-r--r-- | python/src/sa/rulefactory.pxi | 20 |
8 files changed, 185 insertions, 115 deletions
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py index ab8be809..cc532fb9 100644 --- a/python/pkg/cdec/sa/__init__.py +++ b/python/pkg/cdec/sa/__init__.py @@ -2,3 +2,9 @@ from cdec.sa._sa import sym_fromstring,\ SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\ HieroCachingRuleFactory, Sampler, Scorer from cdec.sa.extractor import GrammarExtractor + +_SA_FEATURES = [] + +def feature(fn): + _SA_FEATURES.append(fn) + return fn diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py index 39eac824..b370c4ca 100644 --- a/python/pkg/cdec/sa/extract.py +++ b/python/pkg/cdec/sa/extract.py @@ -8,12 +8,20 @@ import signal import cdec.sa extractor, prefix = None, None -def make_extractor(config, grammars): +def make_extractor(config, grammars, features): global extractor, prefix signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C + if features: load_features(features) extractor = cdec.sa.GrammarExtractor(config) prefix = grammars +def load_features(features): + logging.info('Loading additional feature definitions from %s', features) + prefix = os.path.dirname(features) + sys.path.append(prefix) + __import__(os.path.basename(features).replace('.py', '')) + sys.path.remove(prefix) + def extract(inp): global extractor, prefix i, sentence = inp @@ -25,7 +33,6 @@ def extract(inp): grammar_file = os.path.abspath(grammar_file) return '<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence) - def main(): logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.') @@ -37,18 +44,28 @@ def main(): help='number of parallel extractors') parser.add_argument('-s', '--chunksize', type=int, default=10, help='number of sentences / chunk') + parser.add_argument('-f', '--features', type=str, default=None, + help='additional feature definitions') args = parser.parse_args() if not os.path.exists(args.grammars): os.mkdir(args.grammars) - - logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize) - pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars)) - try: - for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize): + if not args.features.endswith('.py'): + sys.stderr.write('Error: feature definition file should be a python module\n') + sys.exit(1) + + if args.jobs > 1: + logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize) + pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars, args.features)) + try: + for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize): + print(output) + except KeyboardInterrupt: + pool.terminate() + else: + make_extractor(args.config, args.grammars, args.features) + for output in map(extract, enumerate(sys.stdin)): print(output) - except KeyboardInterrupt: - pool.terminate() if __name__ == '__main__': main() diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py index 90cc4c51..89e35bf8 100644 --- a/python/pkg/cdec/sa/extractor.py +++ b/python/pkg/cdec/sa/extractor.py @@ -9,7 +9,7 @@ import cdec.sa MAX_INITIAL_SIZE = 15 class GrammarExtractor: - def __init__(self, config): + def __init__(self, config, features=None): if isinstance(config, str) or isinstance(config, unicode): if not os.path.exists(config): raise IOError('cannot read configuration from {0}'.format(config)) @@ -58,7 +58,8 @@ class GrammarExtractor: tt = cdec.sa.BiLex(from_binary=config['lex_file']) scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, - MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE) + MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE, + *cdec.sa._SA_FEATURES) fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file']) edarray = cdec.sa.DataArray(from_binary=config['e_file']) diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py index 8fd370cc..a4ae23e8 100644 --- a/python/pkg/cdec/sa/features.py +++ b/python/pkg/cdec/sa/features.py @@ -3,55 +3,55 @@ import math MAXSCORE = 99 -def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f) - return -math.log10(paircount/fcount) +def EgivenF(ctx): # p(e|f) = c(e, f)/c(f) + return -math.log10(ctx.paircount/ctx.fcount) -def CountEF(fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1 + paircount) +def CountEF(ctx): # c(e, f) + return math.log10(1 + ctx.paircount) -def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1 + fsample_count) +def SampleCountF(ctx): # sample c(f) + return math.log10(1 + ctx.fsample_count) -def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count): - prob = paircount/fsample_count +def EgivenFCoherent(ctx): # c(e, f) / sample c(f) + prob = ctx.paircount/ctx.fsample_count return -math.log10(prob) if prob > 0 else MAXSCORE -def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count): - return -math.log10(fcount/fsample_count) +def CoherenceProb(ctx): # c(f) / sample c(f) + return -math.log10(ctx.fcount/ctx.fsample_count) def MaxLexEgivenF(ttable): - def MaxLexEgivenF(fphrase, ephrase, paircount, fcount, fsample_count): - fwords = fphrase.words + def MaxLexEgivenF(ctx): + fwords = ctx.fphrase.words fwords.append('NULL') def score(): - for e in ephrase.words: + for e in ctx.ephrase.words: maxScore = max(ttable.get_score(f, e, 0) for f in fwords) yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE return sum(score()) return MaxLexEgivenF def MaxLexFgivenE(ttable): - def MaxLexFgivenE(fphrase, ephrase, paircount, fcount, fsample_count): - ewords = ephrase.words + def MaxLexFgivenE(ctx): + ewords = ctx.ephrase.words ewords.append('NULL') def score(): - for f in fphrase.words: + for f in ctx.fphrase.words: maxScore = max(ttable.get_score(f, e, 1) for e in ewords) yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE return sum(score()) return MaxLexFgivenE -def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount == 1) +def IsSingletonF(ctx): + return (ctx.fcount == 1) -def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount == 1) +def IsSingletonFE(ctx): + return (ctx.paircount == 1) -def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount > 1) +def IsNotSingletonF(ctx): + return (ctx.fcount > 1) -def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 1) +def IsNotSingletonFE(ctx): + return (ctx.paircount > 1) -def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 0.01) +def IsFEGreaterThanZero(ctx): + return (ctx.paircount > 0.01) diff --git a/python/src/sa/_sa.c b/python/src/sa/_sa.c index d04a8f98..a1530dda 100644 --- a/python/src/sa/_sa.c +++ b/python/src/sa/_sa.c @@ -1,4 +1,4 @@ -/* Generated by Cython 0.17 on Wed Sep 5 10:20:00 2012 */ +/* Generated by Cython 0.17 on Wed Sep 5 12:38:10 2012 */ #define PY_SSIZE_T_CLEAN #include "Python.h" @@ -54767,7 +54767,7 @@ static int __pyx_pf_3_sa_6Scorer___init__(struct __pyx_obj_3_sa_Scorer *__pyx_v_ * names = [FD.index(<char *>model.__name__) for model in models] * self.models = zip(names, models) # <<<<<<<<<<<<<< * - * cdef FeatureVector score(self, c): + * cdef FeatureVector score(self, ctx): */ __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); @@ -54804,12 +54804,12 @@ static int __pyx_pf_3_sa_6Scorer___init__(struct __pyx_obj_3_sa_Scorer *__pyx_v_ /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":29 * self.models = zip(names, models) * - * cdef FeatureVector score(self, c): # <<<<<<<<<<<<<< + * cdef FeatureVector score(self, ctx): # <<<<<<<<<<<<<< * cdef FeatureVector scores = FeatureVector() * for name, model in self.models: */ -static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __pyx_obj_3_sa_Scorer *__pyx_v_self, PyObject *__pyx_v_c) { +static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __pyx_obj_3_sa_Scorer *__pyx_v_self, PyObject *__pyx_v_ctx) { struct __pyx_obj_3_sa_FeatureVector *__pyx_v_scores = 0; PyObject *__pyx_v_name = NULL; PyObject *__pyx_v_model = NULL; @@ -54823,9 +54823,6 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __ PyObject *__pyx_t_6 = NULL; PyObject *__pyx_t_7 = NULL; PyObject *(*__pyx_t_8)(PyObject *); - PyObject *__pyx_t_9 = NULL; - PyObject *__pyx_t_10 = NULL; - PyObject *__pyx_t_11 = NULL; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; @@ -54833,10 +54830,10 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __ /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":30 * - * cdef FeatureVector score(self, c): + * cdef FeatureVector score(self, ctx): * cdef FeatureVector scores = FeatureVector() # <<<<<<<<<<<<<< * for name, model in self.models: - * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) + * scores.set(name, model(ctx)) */ __pyx_t_1 = PyObject_Call(((PyObject *)((PyObject*)__pyx_ptype_3_sa_FeatureVector)), ((PyObject *)__pyx_empty_tuple), NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); @@ -54844,10 +54841,10 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __ __pyx_t_1 = 0; /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":31 - * cdef FeatureVector score(self, c): + * cdef FeatureVector score(self, ctx): * cdef FeatureVector scores = FeatureVector() * for name, model in self.models: # <<<<<<<<<<<<<< - * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) + * scores.set(name, model(ctx)) * return scores */ if (PyList_CheckExact(__pyx_v_self->models) || PyTuple_CheckExact(__pyx_v_self->models)) { @@ -54943,60 +54940,38 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __ /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":32 * cdef FeatureVector scores = FeatureVector() * for name, model in self.models: - * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) # <<<<<<<<<<<<<< + * scores.set(name, model(ctx)) # <<<<<<<<<<<<<< * return scores */ __pyx_t_4 = PyObject_GetAttr(((PyObject *)__pyx_v_scores), __pyx_n_s__set); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); - __pyx_t_6 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fphrase); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyTuple_New(1); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); - __pyx_t_5 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__ephrase); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_INCREF(__pyx_v_ctx); + PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_v_ctx); + __Pyx_GIVEREF(__pyx_v_ctx); + __pyx_t_5 = PyObject_Call(__pyx_v_model, ((PyObject *)__pyx_t_6), NULL); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_5); - __pyx_t_7 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__paircount); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_7); - __pyx_t_9 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fcount); if (unlikely(!__pyx_t_9)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_9); - __pyx_t_10 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fsample_count); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_10); - __pyx_t_11 = PyTuple_New(5); if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_11); - PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_6); - __Pyx_GIVEREF(__pyx_t_6); - PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_5); - __Pyx_GIVEREF(__pyx_t_5); - PyTuple_SET_ITEM(__pyx_t_11, 2, __pyx_t_7); - __Pyx_GIVEREF(__pyx_t_7); - PyTuple_SET_ITEM(__pyx_t_11, 3, __pyx_t_9); - __Pyx_GIVEREF(__pyx_t_9); - PyTuple_SET_ITEM(__pyx_t_11, 4, __pyx_t_10); - __Pyx_GIVEREF(__pyx_t_10); - __pyx_t_6 = 0; - __pyx_t_5 = 0; - __pyx_t_7 = 0; - __pyx_t_9 = 0; - __pyx_t_10 = 0; - __pyx_t_10 = PyObject_Call(__pyx_v_model, ((PyObject *)__pyx_t_11), NULL); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_10); - __Pyx_DECREF(((PyObject *)__pyx_t_11)); __pyx_t_11 = 0; - __pyx_t_11 = PyTuple_New(2); if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(((PyObject *)__pyx_t_6)); __pyx_t_6 = 0; + __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); __Pyx_INCREF(__pyx_v_name); - PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_v_name); + PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_v_name); __Pyx_GIVEREF(__pyx_v_name); - PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_10); - __Pyx_GIVEREF(__pyx_t_10); - __pyx_t_10 = 0; - __pyx_t_10 = PyObject_Call(__pyx_t_4, ((PyObject *)__pyx_t_11), NULL); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_10); + PyTuple_SET_ITEM(__pyx_t_6, 1, __pyx_t_5); + __Pyx_GIVEREF(__pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = PyObject_Call(__pyx_t_4, ((PyObject *)__pyx_t_6), NULL); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __Pyx_DECREF(((PyObject *)__pyx_t_11)); __pyx_t_11 = 0; - __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_DECREF(((PyObject *)__pyx_t_6)); __pyx_t_6 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":33 * for name, model in self.models: - * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) + * scores.set(name, model(ctx)) * return scores # <<<<<<<<<<<<<< */ __Pyx_XDECREF(((PyObject *)__pyx_r)); @@ -55012,9 +54987,6 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __ __Pyx_XDECREF(__pyx_t_5); __Pyx_XDECREF(__pyx_t_6); __Pyx_XDECREF(__pyx_t_7); - __Pyx_XDECREF(__pyx_t_9); - __Pyx_XDECREF(__pyx_t_10); - __Pyx_XDECREF(__pyx_t_11); __Pyx_AddTraceback("_sa.Scorer.score", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = 0; __pyx_L0:; @@ -64732,7 +64704,7 @@ PyMODINIT_FUNC PyInit__sa(void) * * from collections import defaultdict, Counter, namedtuple # <<<<<<<<<<<<<< * - * FeatureContext = namedtuple("FeatureContext", + * FeatureContext = namedtuple('FeatureContext', */ __pyx_t_1 = PyList_New(3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); @@ -64777,19 +64749,19 @@ PyMODINIT_FUNC PyInit__sa(void) /* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":14 * from collections import defaultdict, Counter, namedtuple * - * FeatureContext = namedtuple("FeatureContext", # <<<<<<<<<<<<<< - * ["fphrase", - * "ephrase", + * FeatureContext = namedtuple('FeatureContext', # <<<<<<<<<<<<<< + * ['fphrase', + * 'ephrase', */ __pyx_t_2 = __Pyx_GetName(__pyx_m, __pyx_n_s__namedtuple); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); /* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":15 * - * FeatureContext = namedtuple("FeatureContext", - * ["fphrase", # <<<<<<<<<<<<<< - * "ephrase", - * "paircount", + * FeatureContext = namedtuple('FeatureContext', + * ['fphrase', # <<<<<<<<<<<<<< + * 'ephrase', + * 'paircount', */ __pyx_t_1 = PyList_New(8); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); @@ -64833,7 +64805,7 @@ PyMODINIT_FUNC PyInit__sa(void) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":25 - * ]) + * ]) * * cdef int PRECOMPUTE = 0 # <<<<<<<<<<<<<< * cdef int MERGE = 1 @@ -64900,7 +64872,7 @@ PyMODINIT_FUNC PyInit__sa(void) __pyx_t_3 = 0; /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":1 - * cdef StringMap FD = StringMap() # <<<<<<<<<<<<<< + * cdef StringMap FD = StringMap() # Feature name dictionary # <<<<<<<<<<<<<< * * INITIAL_CAPACITY = 7 # default number of features */ @@ -64913,7 +64885,7 @@ PyMODINIT_FUNC PyInit__sa(void) __pyx_t_3 = 0; /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":3 - * cdef StringMap FD = StringMap() + * cdef StringMap FD = StringMap() # Feature name dictionary * * INITIAL_CAPACITY = 7 # default number of features # <<<<<<<<<<<<<< * INCREMENT = INITIAL_CAPACITY # double size diff --git a/python/src/sa/default_scorer.pxi b/python/src/sa/default_scorer.pxi new file mode 100644 index 00000000..483f4743 --- /dev/null +++ b/python/src/sa/default_scorer.pxi @@ -0,0 +1,74 @@ +from libc.stdlib cimport malloc, realloc, free +from libc.math cimport log10 + +MAXSCORE = -99 +EgivenFCoherent = 0 +SampleCountF = 1 +CountEF = 2 +MaxLexFgivenE = 3 +MaxLexEgivenF = 4 +IsSingletonF = 5 +IsSingletonFE = 6 +NFEATURES = 7 + +cdef class DefaultScorer(Scorer): + cdef BiLex ttable + cdef int* fid + + def __dealloc__(self): + free(self.fid) + + def __init__(self, BiLex ttable): + self.ttable = ttable + self.fid = <int*> malloc(NFEATURES*sizeof(int)) + cdef unsigned i + for i, fnames in enumerate(('EgivenFCoherent', 'SampleCountF', 'CountEF', + 'MaxLexFgivenE', 'MaxLexEgivenF', 'IsSingletonF', 'IsSingletonFE')): + self.fid[i] = FD.index(fnames) + + cdef FeatureVector score(self, Phrase fphrase, Phrase ephrase, + unsigned paircount, unsigned fcount, unsigned fsample_count): + cdef FeatureVector scores = FeatureVector() + + # EgivenFCoherent + cdef float efc = <float>paircount/fsample_count + scores.set(self.fid[EgivenFCoherent], -log10(efc) if efc > 0 else MAXSCORE) + + # SampleCountF + scores.set(self.fid[SampleCountF], log10(1 + fsample_count)) + + # CountEF + scores.set(self.fid[CountEF], log10(1 + paircount)) + + # MaxLexFgivenE TODO typify + ewords = ephrase.words + ewords.append('NULL') + cdef float mlfe = 0, max_score = -1 + for f in fphrase.words: + for e in ewords: + score = self.ttable.get_score(f, e, 1) + if score > max_score: + max_score = score + mlfe += -log10(max_score) if max_score > 0 else MAXSCORE + scores.set(self.fid[MaxLexFgivenE], mlfe) + + # MaxLexEgivenF TODO same + fwords = fphrase.words + fwords.append('NULL') + cdef float mlef = 0 + max_score = -1 + for e in ephrase.words: + for f in fwords: + score = self.ttable.get_score(f, e, 0) + if score > max_score: + max_score = score + mlef += -log10(max_score) if max_score > 0 else MAXSCORE + scores.set(self.fid[MaxLexEgivenF], mlef) + + # IsSingletonF + scores.set(self.fid[IsSingletonF], (fcount == 1)) + + # IsSingletonFE + scores.set(self.fid[IsSingletonFE], (paircount == 1)) + + return scores diff --git a/python/src/sa/features.pxi b/python/src/sa/features.pxi index eeef4feb..9b9ecf3c 100644 --- a/python/src/sa/features.pxi +++ b/python/src/sa/features.pxi @@ -1,4 +1,4 @@ -cdef StringMap FD = StringMap() +cdef StringMap FD = StringMap() # Feature name dictionary INITIAL_CAPACITY = 7 # default number of features INCREMENT = INITIAL_CAPACITY # double size @@ -26,8 +26,8 @@ cdef class Scorer: names = [FD.index(<char *>model.__name__) for model in models] self.models = zip(names, models) - cdef FeatureVector score(self, c): + cdef FeatureVector score(self, ctx): cdef FeatureVector scores = FeatureVector() for name, model in self.models: - scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) + scores.set(name, model(ctx)) return scores diff --git a/python/src/sa/rulefactory.pxi b/python/src/sa/rulefactory.pxi index 69cadac9..287b9a67 100644 --- a/python/src/sa/rulefactory.pxi +++ b/python/src/sa/rulefactory.pxi @@ -11,16 +11,16 @@ from libc.math cimport fmod, ceil, floor, log from collections import defaultdict, Counter, namedtuple -FeatureContext = namedtuple("FeatureContext", - ["fphrase", - "ephrase", - "paircount", - "fcount", - "fsample_count", - "input_span", - "matches", - "test_sentence" - ]) +FeatureContext = namedtuple('FeatureContext', + ['fphrase', + 'ephrase', + 'paircount', + 'fcount', + 'fsample_count', + 'input_span', + 'matches', + 'test_sentence' + ]) cdef int PRECOMPUTE = 0 cdef int MERGE = 1 |