summary refs log tree commit diff
path: root/python/src/sa
diff options
context:
space:
mode:
author Victor Chahuneau <vchahune@cs.cmu.edu> 2012-09-05 14:55:11 +0100
committer Victor Chahuneau <vchahune@cs.cmu.edu> 2012-09-05 14:55:11 +0100
commit 6fb3cc36cc4113c9f3510d87b3ae3b9c9351bf4e (patch)
tree ae29f2c831037665ec39e24df0cdf2657dfadc5e /python/src/sa
parent 1fd5b40da3bc9c55fd2fba03bb7fdb43eabee63c (diff)
Expose new feature extraction API
Diffstat (limited to 'python/src/sa')
-rw-r--r-- python/src/sa/_sa.c 102
-rw-r--r-- python/src/sa/default_scorer.pxi 74
-rw-r--r-- python/src/sa/features.pxi 6
-rw-r--r-- python/src/sa/rulefactory.pxi 20
4 files changed, 124 insertions, 78 deletions
diff --git a/python/src/sa/_sa.c b/python/src/sa/_sa.c
index d04a8f98..a1530dda 100644
--- a/python/src/sa/_sa.c
+++ b/python/src/sa/_sa.c
@@ -1,4 +1,4 @@
-/* Generated by Cython 0.17 on Wed Sep 5 10:20:00 2012 */
+/* Generated by Cython 0.17 on Wed Sep 5 12:38:10 2012 */
#define PY_SSIZE_T_CLEAN
#include "Python.h"
@@ -54767,7 +54767,7 @@ static int __pyx_pf_3_sa_6Scorer___init__(struct __pyx_obj_3_sa_Scorer *__pyx_v_
* names = [FD.index(<char *>model.__name__) for model in models]
* self.models = zip(names, models) # <<<<<<<<<<<<<<
*
- * cdef FeatureVector score(self, c):
+ * cdef FeatureVector score(self, ctx):
*/
__pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_1);
@@ -54804,12 +54804,12 @@ static int __pyx_pf_3_sa_6Scorer___init__(struct __pyx_obj_3_sa_Scorer *__pyx_v_
/* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":29
* self.models = zip(names, models)
*
- * cdef FeatureVector score(self, c): # <<<<<<<<<<<<<<
+ * cdef FeatureVector score(self, ctx): # <<<<<<<<<<<<<<
* cdef FeatureVector scores = FeatureVector()
* for name, model in self.models:
*/
-static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __pyx_obj_3_sa_Scorer *__pyx_v_self, PyObject *__pyx_v_c) {
+static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __pyx_obj_3_sa_Scorer *__pyx_v_self, PyObject *__pyx_v_ctx) {
struct __pyx_obj_3_sa_FeatureVector *__pyx_v_scores = 0;
PyObject *__pyx_v_name = NULL;
PyObject *__pyx_v_model = NULL;
@@ -54823,9 +54823,6 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __
PyObject *__pyx_t_6 = NULL;
PyObject *__pyx_t_7 = NULL;
PyObject *(*__pyx_t_8)(PyObject *);
- PyObject *__pyx_t_9 = NULL;
- PyObject *__pyx_t_10 = NULL;
- PyObject *__pyx_t_11 = NULL;
int __pyx_lineno = 0;
const char *__pyx_filename = NULL;
int __pyx_clineno = 0;
@@ -54833,10 +54830,10 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __
/* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":30
*
- * cdef FeatureVector score(self, c):
+ * cdef FeatureVector score(self, ctx):
* cdef FeatureVector scores = FeatureVector() # <<<<<<<<<<<<<<
* for name, model in self.models:
- * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count))
+ * scores.set(name, model(ctx))
*/
__pyx_t_1 = PyObject_Call(((PyObject *)((PyObject*)__pyx_ptype_3_sa_FeatureVector)), ((PyObject *)__pyx_empty_tuple), NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_1);
@@ -54844,10 +54841,10 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __
__pyx_t_1 = 0;
/* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":31
- * cdef FeatureVector score(self, c):
+ * cdef FeatureVector score(self, ctx):
* cdef FeatureVector scores = FeatureVector()
* for name, model in self.models: # <<<<<<<<<<<<<<
- * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count))
+ * scores.set(name, model(ctx))
* return scores
*/
if (PyList_CheckExact(__pyx_v_self->models) || PyTuple_CheckExact(__pyx_v_self->models)) {
@@ -54943,60 +54940,38 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __
/* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":32
* cdef FeatureVector scores = FeatureVector()
* for name, model in self.models:
- * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count)) # <<<<<<<<<<<<<<
+ * scores.set(name, model(ctx)) # <<<<<<<<<<<<<<
* return scores
*/
__pyx_t_4 = PyObject_GetAttr(((PyObject *)__pyx_v_scores), __pyx_n_s__set); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_4);
- __pyx_t_6 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fphrase); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_t_6 = PyTuple_New(1); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_6);
- __pyx_t_5 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__ephrase); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_INCREF(__pyx_v_ctx);
+ PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_v_ctx);
+ __Pyx_GIVEREF(__pyx_v_ctx);
+ __pyx_t_5 = PyObject_Call(__pyx_v_model, ((PyObject *)__pyx_t_6), NULL); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_5);
- __pyx_t_7 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__paircount); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
- __Pyx_GOTREF(__pyx_t_7);
- __pyx_t_9 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fcount); if (unlikely(!__pyx_t_9)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
- __Pyx_GOTREF(__pyx_t_9);
- __pyx_t_10 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fsample_count); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
- __Pyx_GOTREF(__pyx_t_10);
- __pyx_t_11 = PyTuple_New(5); if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
- __Pyx_GOTREF(__pyx_t_11);
- PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_6);
- __Pyx_GIVEREF(__pyx_t_6);
- PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_5);
- __Pyx_GIVEREF(__pyx_t_5);
- PyTuple_SET_ITEM(__pyx_t_11, 2, __pyx_t_7);
- __Pyx_GIVEREF(__pyx_t_7);
- PyTuple_SET_ITEM(__pyx_t_11, 3, __pyx_t_9);
- __Pyx_GIVEREF(__pyx_t_9);
- PyTuple_SET_ITEM(__pyx_t_11, 4, __pyx_t_10);
- __Pyx_GIVEREF(__pyx_t_10);
- __pyx_t_6 = 0;
- __pyx_t_5 = 0;
- __pyx_t_7 = 0;
- __pyx_t_9 = 0;
- __pyx_t_10 = 0;
- __pyx_t_10 = PyObject_Call(__pyx_v_model, ((PyObject *)__pyx_t_11), NULL); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
- __Pyx_GOTREF(__pyx_t_10);
- __Pyx_DECREF(((PyObject *)__pyx_t_11)); __pyx_t_11 = 0;
- __pyx_t_11 = PyTuple_New(2); if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
- __Pyx_GOTREF(__pyx_t_11);
+ __Pyx_DECREF(((PyObject *)__pyx_t_6)); __pyx_t_6 = 0;
+ __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_6);
__Pyx_INCREF(__pyx_v_name);
- PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_v_name);
+ PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_v_name);
__Pyx_GIVEREF(__pyx_v_name);
- PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_10);
- __Pyx_GIVEREF(__pyx_t_10);
- __pyx_t_10 = 0;
- __pyx_t_10 = PyObject_Call(__pyx_t_4, ((PyObject *)__pyx_t_11), NULL); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
- __Pyx_GOTREF(__pyx_t_10);
+ PyTuple_SET_ITEM(__pyx_t_6, 1, __pyx_t_5);
+ __Pyx_GIVEREF(__pyx_t_5);
+ __pyx_t_5 = 0;
+ __pyx_t_5 = PyObject_Call(__pyx_t_4, ((PyObject *)__pyx_t_6), NULL); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_5);
__Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
- __Pyx_DECREF(((PyObject *)__pyx_t_11)); __pyx_t_11 = 0;
- __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;
+ __Pyx_DECREF(((PyObject *)__pyx_t_6)); __pyx_t_6 = 0;
+ __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
}
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
/* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":33
* for name, model in self.models:
- * scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count))
+ * scores.set(name, model(ctx))
* return scores # <<<<<<<<<<<<<<
*/
__Pyx_XDECREF(((PyObject *)__pyx_r));
@@ -55012,9 +54987,6 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __
__Pyx_XDECREF(__pyx_t_5);
__Pyx_XDECREF(__pyx_t_6);
__Pyx_XDECREF(__pyx_t_7);
- __Pyx_XDECREF(__pyx_t_9);
- __Pyx_XDECREF(__pyx_t_10);
- __Pyx_XDECREF(__pyx_t_11);
__Pyx_AddTraceback("_sa.Scorer.score", __pyx_clineno, __pyx_lineno, __pyx_filename);
__pyx_r = 0;
__pyx_L0:;
@@ -64732,7 +64704,7 @@ PyMODINIT_FUNC PyInit__sa(void)
*
* from collections import defaultdict, Counter, namedtuple # <<<<<<<<<<<<<<
*
- * FeatureContext = namedtuple("FeatureContext",
+ * FeatureContext = namedtuple('FeatureContext',
*/
__pyx_t_1 = PyList_New(3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_1);
@@ -64777,19 +64749,19 @@ PyMODINIT_FUNC PyInit__sa(void)
/* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":14
* from collections import defaultdict, Counter, namedtuple
*
- * FeatureContext = namedtuple("FeatureContext", # <<<<<<<<<<<<<<
- * ["fphrase",
- * "ephrase",
+ * FeatureContext = namedtuple('FeatureContext', # <<<<<<<<<<<<<<
+ * ['fphrase',
+ * 'ephrase',
*/
__pyx_t_2 = __Pyx_GetName(__pyx_m, __pyx_n_s__namedtuple); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_2);
/* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":15
*
- * FeatureContext = namedtuple("FeatureContext",
- * ["fphrase", # <<<<<<<<<<<<<<
- * "ephrase",
- * "paircount",
+ * FeatureContext = namedtuple('FeatureContext',
+ * ['fphrase', # <<<<<<<<<<<<<<
+ * 'ephrase',
+ * 'paircount',
*/
__pyx_t_1 = PyList_New(8); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
__Pyx_GOTREF(__pyx_t_1);
@@ -64833,7 +64805,7 @@ PyMODINIT_FUNC PyInit__sa(void)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
/* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":25
- * ])
+ * ])
*
* cdef int PRECOMPUTE = 0 # <<<<<<<<<<<<<<
* cdef int MERGE = 1
@@ -64900,7 +64872,7 @@ PyMODINIT_FUNC PyInit__sa(void)
__pyx_t_3 = 0;
/* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":1
- * cdef StringMap FD = StringMap() # <<<<<<<<<<<<<<
+ * cdef StringMap FD = StringMap() # Feature name dictionary # <<<<<<<<<<<<<<
*
* INITIAL_CAPACITY = 7 # default number of features
*/
@@ -64913,7 +64885,7 @@ PyMODINIT_FUNC PyInit__sa(void)
__pyx_t_3 = 0;
/* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":3
- * cdef StringMap FD = StringMap()
+ * cdef StringMap FD = StringMap() # Feature name dictionary
*
* INITIAL_CAPACITY = 7 # default number of features # <<<<<<<<<<<<<<
* INCREMENT = INITIAL_CAPACITY # double size
diff --git a/python/src/sa/default_scorer.pxi b/python/src/sa/default_scorer.pxi
new file mode 100644
index 00000000..483f4743
--- /dev/null
+++ b/python/src/sa/default_scorer.pxi
@@ -0,0 +1,74 @@
+from libc.stdlib cimport malloc, realloc, free
+from libc.math cimport log10
+
+MAXSCORE = -99
+EgivenFCoherent = 0
+SampleCountF = 1
+CountEF = 2
+MaxLexFgivenE = 3
+MaxLexEgivenF = 4
+IsSingletonF = 5
+IsSingletonFE = 6
+NFEATURES = 7
+
+cdef class DefaultScorer(Scorer):
+ cdef BiLex ttable
+ cdef int* fid
+
+ def __dealloc__(self):
+ free(self.fid)
+
+ def __init__(self, BiLex ttable):
+ self.ttable = ttable
+ self.fid = <int*> malloc(NFEATURES*sizeof(int))
+ cdef unsigned i
+ for i, fnames in enumerate(('EgivenFCoherent', 'SampleCountF', 'CountEF',
+ 'MaxLexFgivenE', 'MaxLexEgivenF', 'IsSingletonF', 'IsSingletonFE')):
+ self.fid[i] = FD.index(fnames)
+
+ cdef FeatureVector score(self, Phrase fphrase, Phrase ephrase,
+ unsigned paircount, unsigned fcount, unsigned fsample_count):
+ cdef FeatureVector scores = FeatureVector()
+
+ # EgivenFCoherent
+ cdef float efc = <float>paircount/fsample_count
+ scores.set(self.fid[EgivenFCoherent], -log10(efc) if efc > 0 else MAXSCORE)
+
+ # SampleCountF
+ scores.set(self.fid[SampleCountF], log10(1 + fsample_count))
+
+ # CountEF
+ scores.set(self.fid[CountEF], log10(1 + paircount))
+
+ # MaxLexFgivenE TODO typify
+ ewords = ephrase.words
+ ewords.append('NULL')
+ cdef float mlfe = 0, max_score = -1
+ for f in fphrase.words:
+ for e in ewords:
+ score = self.ttable.get_score(f, e, 1)
+ if score > max_score:
+ max_score = score
+ mlfe += -log10(max_score) if max_score > 0 else MAXSCORE
+ scores.set(self.fid[MaxLexFgivenE], mlfe)
+
+ # MaxLexEgivenF TODO same
+ fwords = fphrase.words
+ fwords.append('NULL')
+ cdef float mlef = 0
+ max_score = -1
+ for e in ephrase.words:
+ for f in fwords:
+ score = self.ttable.get_score(f, e, 0)
+ if score > max_score:
+ max_score = score
+ mlef += -log10(max_score) if max_score > 0 else MAXSCORE
+ scores.set(self.fid[MaxLexEgivenF], mlef)
+
+ # IsSingletonF
+ scores.set(self.fid[IsSingletonF], (fcount == 1))
+
+ # IsSingletonFE
+ scores.set(self.fid[IsSingletonFE], (paircount == 1))
+
+ return scores
diff --git a/python/src/sa/features.pxi b/python/src/sa/features.pxi
index eeef4feb..9b9ecf3c 100644
--- a/python/src/sa/features.pxi
+++ b/python/src/sa/features.pxi
@@ -1,4 +1,4 @@
-cdef StringMap FD = StringMap()
+cdef StringMap FD = StringMap() # Feature name dictionary
INITIAL_CAPACITY = 7 # default number of features
INCREMENT = INITIAL_CAPACITY # double size
@@ -26,8 +26,8 @@ cdef class Scorer:
names = [FD.index(<char *>model.__name__) for model in models]
self.models = zip(names, models)
- cdef FeatureVector score(self, c):
+ cdef FeatureVector score(self, ctx):
cdef FeatureVector scores = FeatureVector()
for name, model in self.models:
- scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count))
+ scores.set(name, model(ctx))
return scores
diff --git a/python/src/sa/rulefactory.pxi b/python/src/sa/rulefactory.pxi
index 69cadac9..287b9a67 100644
--- a/python/src/sa/rulefactory.pxi
+++ b/python/src/sa/rulefactory.pxi
@@ -11,16 +11,16 @@ from libc.math cimport fmod, ceil, floor, log
from collections import defaultdict, Counter, namedtuple
-FeatureContext = namedtuple("FeatureContext",
- ["fphrase",
- "ephrase",
- "paircount",
- "fcount",
- "fsample_count",
- "input_span",
- "matches",
- "test_sentence"
- ])
+FeatureContext = namedtuple('FeatureContext',
+ ['fphrase',
+ 'ephrase',
+ 'paircount',
+ 'fcount',
+ 'fsample_count',
+ 'input_span',
+ 'matches',
+ 'test_sentence'
+ ])
cdef int PRECOMPUTE = 0
cdef int MERGE = 1