From 6fb3cc36cc4113c9f3510d87b3ae3b9c9351bf4e Mon Sep 17 00:00:00 2001 From: Victor Chahuneau Date: Wed, 5 Sep 2012 14:55:11 +0100 Subject: Expose new feature extraction API --- python/src/sa/_sa.c | 102 ++++++++++++++------------------------- python/src/sa/default_scorer.pxi | 74 ++++++++++++++++++++++++++++ python/src/sa/features.pxi | 6 +-- python/src/sa/rulefactory.pxi | 20 ++++---- 4 files changed, 124 insertions(+), 78 deletions(-) create mode 100644 python/src/sa/default_scorer.pxi (limited to 'python/src') diff --git a/python/src/sa/_sa.c b/python/src/sa/_sa.c index d04a8f98..a1530dda 100644 --- a/python/src/sa/_sa.c +++ b/python/src/sa/_sa.c @@ -1,4 +1,4 @@ -/* Generated by Cython 0.17 on Wed Sep 5 10:20:00 2012 */ +/* Generated by Cython 0.17 on Wed Sep 5 12:38:10 2012 */ #define PY_SSIZE_T_CLEAN #include "Python.h" @@ -54767,7 +54767,7 @@ static int __pyx_pf_3_sa_6Scorer___init__(struct __pyx_obj_3_sa_Scorer *__pyx_v_ * names = [FD.index(model.__name__) for model in models] * self.models = zip(names, models) # <<<<<<<<<<<<<< * - * cdef FeatureVector score(self, c): + * cdef FeatureVector score(self, ctx): */ __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); @@ -54804,12 +54804,12 @@ static int __pyx_pf_3_sa_6Scorer___init__(struct __pyx_obj_3_sa_Scorer *__pyx_v_ /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":29 * self.models = zip(names, models) * - * cdef FeatureVector score(self, c): # <<<<<<<<<<<<<< + * cdef FeatureVector score(self, ctx): # <<<<<<<<<<<<<< * cdef FeatureVector scores = FeatureVector() * for name, model in self.models: */ -static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __pyx_obj_3_sa_Scorer *__pyx_v_self, PyObject *__pyx_v_c) { +static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __pyx_obj_3_sa_Scorer *__pyx_v_self, PyObject *__pyx_v_ctx) { struct __pyx_obj_3_sa_FeatureVector *__pyx_v_scores = 0; PyObject *__pyx_v_name = NULL; PyObject *__pyx_v_model = NULL; @@ -54823,9 +54823,6 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __ PyObject *__pyx_t_6 = NULL; PyObject *__pyx_t_7 = NULL; PyObject *(*__pyx_t_8)(PyObject *); - PyObject *__pyx_t_9 = NULL; - PyObject *__pyx_t_10 = NULL; - PyObject *__pyx_t_11 = NULL; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; @@ -54833,10 +54830,10 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __ /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":30 * - * cdef FeatureVector score(self, c): + * cdef FeatureVector score(self, ctx): * cdef FeatureVector scores = FeatureVector() # <<<<<<<<<<<<<< * for name, model in self.models: - * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) + * scores.set(name, model(ctx)) */ __pyx_t_1 = PyObject_Call(((PyObject *)((PyObject*)__pyx_ptype_3_sa_FeatureVector)), ((PyObject *)__pyx_empty_tuple), NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); @@ -54844,10 +54841,10 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __ __pyx_t_1 = 0; /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":31 - * cdef FeatureVector score(self, c): + * cdef FeatureVector score(self, ctx): * cdef FeatureVector scores = FeatureVector() * for name, model in self.models: # <<<<<<<<<<<<<< - * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) + * scores.set(name, model(ctx)) * return scores */ if (PyList_CheckExact(__pyx_v_self->models) || PyTuple_CheckExact(__pyx_v_self->models)) { @@ -54943,60 +54940,38 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __ /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":32 * cdef FeatureVector scores = FeatureVector() * for name, model in self.models: - * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) # <<<<<<<<<<<<<< + * scores.set(name, model(ctx)) # <<<<<<<<<<<<<< * return scores */ __pyx_t_4 = PyObject_GetAttr(((PyObject *)__pyx_v_scores), __pyx_n_s__set); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); - __pyx_t_6 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fphrase); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = PyTuple_New(1); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); - __pyx_t_5 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__ephrase); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_INCREF(__pyx_v_ctx); + PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_v_ctx); + __Pyx_GIVEREF(__pyx_v_ctx); + __pyx_t_5 = PyObject_Call(__pyx_v_model, ((PyObject *)__pyx_t_6), NULL); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_5); - __pyx_t_7 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__paircount); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_7); - __pyx_t_9 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fcount); if (unlikely(!__pyx_t_9)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_9); - __pyx_t_10 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fsample_count); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_10); - __pyx_t_11 = PyTuple_New(5); if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_11); - PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_6); - __Pyx_GIVEREF(__pyx_t_6); - PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_5); - __Pyx_GIVEREF(__pyx_t_5); - PyTuple_SET_ITEM(__pyx_t_11, 2, __pyx_t_7); - __Pyx_GIVEREF(__pyx_t_7); - PyTuple_SET_ITEM(__pyx_t_11, 3, __pyx_t_9); - __Pyx_GIVEREF(__pyx_t_9); - PyTuple_SET_ITEM(__pyx_t_11, 4, __pyx_t_10); - __Pyx_GIVEREF(__pyx_t_10); - __pyx_t_6 = 0; - __pyx_t_5 = 0; - __pyx_t_7 = 0; - __pyx_t_9 = 0; - __pyx_t_10 = 0; - __pyx_t_10 = PyObject_Call(__pyx_v_model, ((PyObject *)__pyx_t_11), NULL); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_10); - __Pyx_DECREF(((PyObject *)__pyx_t_11)); __pyx_t_11 = 0; - __pyx_t_11 = PyTuple_New(2); if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_11); + __Pyx_DECREF(((PyObject *)__pyx_t_6)); __pyx_t_6 = 0; + __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); __Pyx_INCREF(__pyx_v_name); - PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_v_name); + PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_v_name); __Pyx_GIVEREF(__pyx_v_name); - PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_10); - __Pyx_GIVEREF(__pyx_t_10); - __pyx_t_10 = 0; - __pyx_t_10 = PyObject_Call(__pyx_t_4, ((PyObject *)__pyx_t_11), NULL); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_10); + PyTuple_SET_ITEM(__pyx_t_6, 1, __pyx_t_5); + __Pyx_GIVEREF(__pyx_t_5); + __pyx_t_5 = 0; + __pyx_t_5 = PyObject_Call(__pyx_t_4, ((PyObject *)__pyx_t_6), NULL); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __Pyx_DECREF(((PyObject *)__pyx_t_11)); __pyx_t_11 = 0; - __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_DECREF(((PyObject *)__pyx_t_6)); __pyx_t_6 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":33 * for name, model in self.models: - * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) + * scores.set(name, model(ctx)) * return scores # <<<<<<<<<<<<<< */ __Pyx_XDECREF(((PyObject *)__pyx_r)); @@ -55012,9 +54987,6 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __ __Pyx_XDECREF(__pyx_t_5); __Pyx_XDECREF(__pyx_t_6); __Pyx_XDECREF(__pyx_t_7); - __Pyx_XDECREF(__pyx_t_9); - __Pyx_XDECREF(__pyx_t_10); - __Pyx_XDECREF(__pyx_t_11); __Pyx_AddTraceback("_sa.Scorer.score", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = 0; __pyx_L0:; @@ -64732,7 +64704,7 @@ PyMODINIT_FUNC PyInit__sa(void) * * from collections import defaultdict, Counter, namedtuple # <<<<<<<<<<<<<< * - * FeatureContext = namedtuple("FeatureContext", + * FeatureContext = namedtuple('FeatureContext', */ __pyx_t_1 = PyList_New(3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); @@ -64777,19 +64749,19 @@ PyMODINIT_FUNC PyInit__sa(void) /* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":14 * from collections import defaultdict, Counter, namedtuple * - * FeatureContext = namedtuple("FeatureContext", # <<<<<<<<<<<<<< - * ["fphrase", - * "ephrase", + * FeatureContext = namedtuple('FeatureContext', # <<<<<<<<<<<<<< + * ['fphrase', + * 'ephrase', */ __pyx_t_2 = __Pyx_GetName(__pyx_m, __pyx_n_s__namedtuple); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); /* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":15 * - * FeatureContext = namedtuple("FeatureContext", - * ["fphrase", # <<<<<<<<<<<<<< - * "ephrase", - * "paircount", + * FeatureContext = namedtuple('FeatureContext', + * ['fphrase', # <<<<<<<<<<<<<< + * 'ephrase', + * 'paircount', */ __pyx_t_1 = PyList_New(8); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); @@ -64833,7 +64805,7 @@ PyMODINIT_FUNC PyInit__sa(void) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":25 - * ]) + * ]) * * cdef int PRECOMPUTE = 0 # <<<<<<<<<<<<<< * cdef int MERGE = 1 @@ -64900,7 +64872,7 @@ PyMODINIT_FUNC PyInit__sa(void) __pyx_t_3 = 0; /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":1 - * cdef StringMap FD = StringMap() # <<<<<<<<<<<<<< + * cdef StringMap FD = StringMap() # Feature name dictionary # <<<<<<<<<<<<<< * * INITIAL_CAPACITY = 7 # default number of features */ @@ -64913,7 +64885,7 @@ PyMODINIT_FUNC PyInit__sa(void) __pyx_t_3 = 0; /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":3 - * cdef StringMap FD = StringMap() + * cdef StringMap FD = StringMap() # Feature name dictionary * * INITIAL_CAPACITY = 7 # default number of features # <<<<<<<<<<<<<< * INCREMENT = INITIAL_CAPACITY # double size diff --git a/python/src/sa/default_scorer.pxi b/python/src/sa/default_scorer.pxi new file mode 100644 index 00000000..483f4743 --- /dev/null +++ b/python/src/sa/default_scorer.pxi @@ -0,0 +1,74 @@ +from libc.stdlib cimport malloc, realloc, free +from libc.math cimport log10 + +MAXSCORE = -99 +EgivenFCoherent = 0 +SampleCountF = 1 +CountEF = 2 +MaxLexFgivenE = 3 +MaxLexEgivenF = 4 +IsSingletonF = 5 +IsSingletonFE = 6 +NFEATURES = 7 + +cdef class DefaultScorer(Scorer): + cdef BiLex ttable + cdef int* fid + + def __dealloc__(self): + free(self.fid) + + def __init__(self, BiLex ttable): + self.ttable = ttable + self.fid = malloc(NFEATURES*sizeof(int)) + cdef unsigned i + for i, fnames in enumerate(('EgivenFCoherent', 'SampleCountF', 'CountEF', + 'MaxLexFgivenE', 'MaxLexEgivenF', 'IsSingletonF', 'IsSingletonFE')): + self.fid[i] = FD.index(fnames) + + cdef FeatureVector score(self, Phrase fphrase, Phrase ephrase, + unsigned paircount, unsigned fcount, unsigned fsample_count): + cdef FeatureVector scores = FeatureVector() + + # EgivenFCoherent + cdef float efc = paircount/fsample_count + scores.set(self.fid[EgivenFCoherent], -log10(efc) if efc > 0 else MAXSCORE) + + # SampleCountF + scores.set(self.fid[SampleCountF], log10(1 + fsample_count)) + + # CountEF + scores.set(self.fid[CountEF], log10(1 + paircount)) + + # MaxLexFgivenE TODO typify + ewords = ephrase.words + ewords.append('NULL') + cdef float mlfe = 0, max_score = -1 + for f in fphrase.words: + for e in ewords: + score = self.ttable.get_score(f, e, 1) + if score > max_score: + max_score = score + mlfe += -log10(max_score) if max_score > 0 else MAXSCORE + scores.set(self.fid[MaxLexFgivenE], mlfe) + + # MaxLexEgivenF TODO same + fwords = fphrase.words + fwords.append('NULL') + cdef float mlef = 0 + max_score = -1 + for e in ephrase.words: + for f in fwords: + score = self.ttable.get_score(f, e, 0) + if score > max_score: + max_score = score + mlef += -log10(max_score) if max_score > 0 else MAXSCORE + scores.set(self.fid[MaxLexEgivenF], mlef) + + # IsSingletonF + scores.set(self.fid[IsSingletonF], (fcount == 1)) + + # IsSingletonFE + scores.set(self.fid[IsSingletonFE], (paircount == 1)) + + return scores diff --git a/python/src/sa/features.pxi b/python/src/sa/features.pxi index eeef4feb..9b9ecf3c 100644 --- a/python/src/sa/features.pxi +++ b/python/src/sa/features.pxi @@ -1,4 +1,4 @@ -cdef StringMap FD = StringMap() +cdef StringMap FD = StringMap() # Feature name dictionary INITIAL_CAPACITY = 7 # default number of features INCREMENT = INITIAL_CAPACITY # double size @@ -26,8 +26,8 @@ cdef class Scorer: names = [FD.index(model.__name__) for model in models] self.models = zip(names, models) - cdef FeatureVector score(self, c): + cdef FeatureVector score(self, ctx): cdef FeatureVector scores = FeatureVector() for name, model in self.models: - scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) + scores.set(name, model(ctx)) return scores diff --git a/python/src/sa/rulefactory.pxi b/python/src/sa/rulefactory.pxi index 69cadac9..287b9a67 100644 --- a/python/src/sa/rulefactory.pxi +++ b/python/src/sa/rulefactory.pxi @@ -11,16 +11,16 @@ from libc.math cimport fmod, ceil, floor, log from collections import defaultdict, Counter, namedtuple -FeatureContext = namedtuple("FeatureContext", - ["fphrase", - "ephrase", - "paircount", - "fcount", - "fsample_count", - "input_span", - "matches", - "test_sentence" - ]) +FeatureContext = namedtuple('FeatureContext', + ['fphrase', + 'ephrase', + 'paircount', + 'fcount', + 'fsample_count', + 'input_span', + 'matches', + 'test_sentence' + ]) cdef int PRECOMPUTE = 0 cdef int MERGE = 1 -- cgit v1.2.3