Expose new feature extraction API

author: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-09-05 14:55:11 +0100
committer: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-09-05 14:55:11 +0100
commit: fb4a9cd8874976a1c013b880b342961b72a8c0d7 (patch)
tree: e75e21d414980b0f4d9953cea3eb43e8c8355261 /python
parent: b34a325f1496eb4bbc33c5fe156eb7e28e5add27 (diff)
8 files changed, 185 insertions, 115 deletions
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
index ab8be809..cc532fb9 100644
--- a/python/pkg/cdec/sa/__init__.py
+++ b/python/pkg/cdec/sa/__init__.py
@@ -2,3 +2,9 @@ from cdec.sa._sa import sym_fromstring,\
         SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
         HieroCachingRuleFactory, Sampler, Scorer
 from cdec.sa.extractor import GrammarExtractor
+
+_SA_FEATURES = []
+
+def feature(fn):
+    _SA_FEATURES.append(fn)
+    return fn
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
index 39eac824..b370c4ca 100644
--- a/python/pkg/cdec/sa/extract.py
+++ b/python/pkg/cdec/sa/extract.py
@@ -8,12 +8,20 @@ import signal
 import cdec.sa
 
 extractor, prefix = None, None
-def make_extractor(config, grammars):
+def make_extractor(config, grammars, features):
     global extractor, prefix
     signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C
+    if features: load_features(features)
     extractor = cdec.sa.GrammarExtractor(config)
     prefix = grammars
 
+def load_features(features):  # import the module at path `features` for its side effects
+    logging.info('Loading additional feature definitions from %s', features)
+    prefix = os.path.dirname(features)
+    sys.path.append(prefix)  # temporarily make the module's directory importable
+    __import__(os.path.splitext(os.path.basename(features))[0])  # strip only the extension
+    sys.path.remove(prefix)
+
 def extract(inp):
     global extractor, prefix
     i, sentence = inp
@@ -25,7 +33,6 @@ def extract(inp):
     grammar_file = os.path.abspath(grammar_file)
     return '<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence)
 
-
 def main():
     logging.basicConfig(level=logging.INFO)
     parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.')
@@ -37,18 +44,28 @@ def main():
                         help='number of parallel extractors')
     parser.add_argument('-s', '--chunksize', type=int, default=10,
                         help='number of sentences / chunk')
+    parser.add_argument('-f', '--features', type=str, default=None,
+                        help='additional feature definitions')
     args = parser.parse_args()
 
     if not os.path.exists(args.grammars):
         os.mkdir(args.grammars)
-
-    logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)
-    pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars))
-    try:
-        for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize):
+    if args.features and not args.features.endswith('.py'):  # -f is optional (default None)
+        sys.stderr.write('Error: feature definition file should be a python module\n')
+        sys.exit(1)
+    
+    if args.jobs > 1:
+        logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)
+        pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars, args.features))
+        try:
+            for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize):
+                print(output)
+        except KeyboardInterrupt:
+            pool.terminate()
+    else:
+        make_extractor(args.config, args.grammars, args.features)
+        for output in map(extract, enumerate(sys.stdin)):
             print(output)
-    except KeyboardInterrupt:
-        pool.terminate()
 
 if __name__ == '__main__':
     main()
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index 90cc4c51..89e35bf8 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -9,7 +9,7 @@ import cdec.sa
 MAX_INITIAL_SIZE = 15
 
 class GrammarExtractor:
-    def __init__(self, config):
+    def __init__(self, config, features=None):
         if isinstance(config, str) or isinstance(config, unicode):
             if not os.path.exists(config):
                 raise IOError('cannot read configuration from {0}'.format(config))
@@ -58,7 +58,8 @@ class GrammarExtractor:
         tt = cdec.sa.BiLex(from_binary=config['lex_file'])
 
         scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, 
-            MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
+            MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
+            *cdec.sa._SA_FEATURES)
 
         fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
         edarray = cdec.sa.DataArray(from_binary=config['e_file'])
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index 8fd370cc..a4ae23e8 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -3,55 +3,55 @@ import math
 
 MAXSCORE = 99
 
-def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f)
-    return -math.log10(paircount/fcount)
+def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
+    return -math.log10(ctx.paircount/ctx.fcount)
 
-def CountEF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return math.log10(1 + paircount)
+def CountEF(ctx): # c(e, f)
+    return math.log10(1 + ctx.paircount)
 
-def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return math.log10(1 + fsample_count)
+def SampleCountF(ctx): # sample c(f)
+    return math.log10(1 + ctx.fsample_count)
 
-def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count):
-    prob = paircount/fsample_count
+def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
+    prob = ctx.paircount/ctx.fsample_count
     return -math.log10(prob) if prob > 0 else MAXSCORE
 
-def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
-    return -math.log10(fcount/fsample_count)
+def CoherenceProb(ctx): # c(f) / sample c(f)
+    return -math.log10(ctx.fcount/ctx.fsample_count)
 
 def MaxLexEgivenF(ttable):
-    def MaxLexEgivenF(fphrase, ephrase, paircount, fcount, fsample_count):
-        fwords = fphrase.words
+    def MaxLexEgivenF(ctx):
+        fwords = ctx.fphrase.words
         fwords.append('NULL')
         def score():
-            for e in ephrase.words:
+            for e in ctx.ephrase.words:
               maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
               yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
         return sum(score())
     return MaxLexEgivenF
 
 def MaxLexFgivenE(ttable):
-    def MaxLexFgivenE(fphrase, ephrase, paircount, fcount, fsample_count):
-        ewords = ephrase.words
+    def MaxLexFgivenE(ctx):
+        ewords = ctx.ephrase.words
         ewords.append('NULL')
         def score():
-            for f in fphrase.words:
+            for f in ctx.fphrase.words:
               maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
               yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
         return sum(score())
     return MaxLexFgivenE
 
-def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (fcount == 1)
+def IsSingletonF(ctx):
+    return (ctx.fcount == 1)
 
-def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount == 1)
+def IsSingletonFE(ctx):
+    return (ctx.paircount == 1)
 
-def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (fcount > 1)
+def IsNotSingletonF(ctx):
+    return (ctx.fcount > 1)
 
-def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount > 1)
+def IsNotSingletonFE(ctx):
+    return (ctx.paircount > 1)
 
-def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount > 0.01)
+def IsFEGreaterThanZero(ctx):
+    return (ctx.paircount > 0.01)
diff --git a/python/src/sa/_sa.c b/python/src/sa/_sa.c
index d04a8f98..a1530dda 100644
--- a/python/src/sa/_sa.c
+++ b/python/src/sa/_sa.c
@@ -1,4 +1,4 @@
-/* Generated by Cython 0.17 on Wed Sep  5 10:20:00 2012 */
+/* Generated by Cython 0.17 on Wed Sep  5 12:38:10 2012 */
 
 #define PY_SSIZE_T_CLEAN
 #include "Python.h"
@@ -54767,7 +54767,7 @@ static int __pyx_pf_3_sa_6Scorer___init__(struct __pyx_obj_3_sa_Scorer *__pyx_v_
  *         names = [FD.index(<char *>model.__name__) for model in models]
  *         self.models = zip(names, models)             # <<<<<<<<<<<<<<
  * 
- *     cdef FeatureVector score(self, c):
+ *     cdef FeatureVector score(self, ctx):
  */
   __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
   __Pyx_GOTREF(__pyx_t_1);
@@ -54804,12 +54804,12 @@ static int __pyx_pf_3_sa_6Scorer___init__(struct __pyx_obj_3_sa_Scorer *__pyx_v_
 /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":29
  *         self.models = zip(names, models)
  * 
- *     cdef FeatureVector score(self, c):             # <<<<<<<<<<<<<<
+ *     cdef FeatureVector score(self, ctx):             # <<<<<<<<<<<<<<
  *         cdef FeatureVector scores = FeatureVector()
  *         for name, model in self.models:
  */
 
-static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __pyx_obj_3_sa_Scorer *__pyx_v_self, PyObject *__pyx_v_c) {
+static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __pyx_obj_3_sa_Scorer *__pyx_v_self, PyObject *__pyx_v_ctx) {
   struct __pyx_obj_3_sa_FeatureVector *__pyx_v_scores = 0;
   PyObject *__pyx_v_name = NULL;
   PyObject *__pyx_v_model = NULL;
@@ -54823,9 +54823,6 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __
   PyObject *__pyx_t_6 = NULL;
   PyObject *__pyx_t_7 = NULL;
   PyObject *(*__pyx_t_8)(PyObject *);
-  PyObject *__pyx_t_9 = NULL;
-  PyObject *__pyx_t_10 = NULL;
-  PyObject *__pyx_t_11 = NULL;
   int __pyx_lineno = 0;
   const char *__pyx_filename = NULL;
   int __pyx_clineno = 0;
@@ -54833,10 +54830,10 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __
 
   /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":30
  * 
- *     cdef FeatureVector score(self, c):
+ *     cdef FeatureVector score(self, ctx):
  *         cdef FeatureVector scores = FeatureVector()             # <<<<<<<<<<<<<<
  *         for name, model in self.models:
- *             scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count))
+ *             scores.set(name, model(ctx))
  */
   __pyx_t_1 = PyObject_Call(((PyObject *)((PyObject*)__pyx_ptype_3_sa_FeatureVector)), ((PyObject *)__pyx_empty_tuple), NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
   __Pyx_GOTREF(__pyx_t_1);
@@ -54844,10 +54841,10 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __
   __pyx_t_1 = 0;
 
   /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":31
- *     cdef FeatureVector score(self, c):
+ *     cdef FeatureVector score(self, ctx):
  *         cdef FeatureVector scores = FeatureVector()
  *         for name, model in self.models:             # <<<<<<<<<<<<<<
- *             scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count))
+ *             scores.set(name, model(ctx))
  *         return scores
  */
   if (PyList_CheckExact(__pyx_v_self->models) || PyTuple_CheckExact(__pyx_v_self->models)) {
@@ -54943,60 +54940,38 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __
     /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":32
  *         cdef FeatureVector scores = FeatureVector()
  *         for name, model in self.models:
- *             scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count))             # <<<<<<<<<<<<<<
+ *             scores.set(name, model(ctx))             # <<<<<<<<<<<<<<
  *         return scores
  */
     __pyx_t_4 = PyObject_GetAttr(((PyObject *)__pyx_v_scores), __pyx_n_s__set); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
     __Pyx_GOTREF(__pyx_t_4);
-    __pyx_t_6 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fphrase); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __pyx_t_6 = PyTuple_New(1); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
     __Pyx_GOTREF(__pyx_t_6);
-    __pyx_t_5 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__ephrase); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_INCREF(__pyx_v_ctx);
+    PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_v_ctx);
+    __Pyx_GIVEREF(__pyx_v_ctx);
+    __pyx_t_5 = PyObject_Call(__pyx_v_model, ((PyObject *)__pyx_t_6), NULL); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
     __Pyx_GOTREF(__pyx_t_5);
-    __pyx_t_7 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__paircount); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
-    __Pyx_GOTREF(__pyx_t_7);
-    __pyx_t_9 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fcount); if (unlikely(!__pyx_t_9)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
-    __Pyx_GOTREF(__pyx_t_9);
-    __pyx_t_10 = PyObject_GetAttr(__pyx_v_c, __pyx_n_s__fsample_count); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
-    __Pyx_GOTREF(__pyx_t_10);
-    __pyx_t_11 = PyTuple_New(5); if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
-    __Pyx_GOTREF(__pyx_t_11);
-    PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_6);
-    __Pyx_GIVEREF(__pyx_t_6);
-    PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_5);
-    __Pyx_GIVEREF(__pyx_t_5);
-    PyTuple_SET_ITEM(__pyx_t_11, 2, __pyx_t_7);
-    __Pyx_GIVEREF(__pyx_t_7);
-    PyTuple_SET_ITEM(__pyx_t_11, 3, __pyx_t_9);
-    __Pyx_GIVEREF(__pyx_t_9);
-    PyTuple_SET_ITEM(__pyx_t_11, 4, __pyx_t_10);
-    __Pyx_GIVEREF(__pyx_t_10);
-    __pyx_t_6 = 0;
-    __pyx_t_5 = 0;
-    __pyx_t_7 = 0;
-    __pyx_t_9 = 0;
-    __pyx_t_10 = 0;
-    __pyx_t_10 = PyObject_Call(__pyx_v_model, ((PyObject *)__pyx_t_11), NULL); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
-    __Pyx_GOTREF(__pyx_t_10);
-    __Pyx_DECREF(((PyObject *)__pyx_t_11)); __pyx_t_11 = 0;
-    __pyx_t_11 = PyTuple_New(2); if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
-    __Pyx_GOTREF(__pyx_t_11);
+    __Pyx_DECREF(((PyObject *)__pyx_t_6)); __pyx_t_6 = 0;
+    __pyx_t_6 = PyTuple_New(2); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_6);
     __Pyx_INCREF(__pyx_v_name);
-    PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_v_name);
+    PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_v_name);
     __Pyx_GIVEREF(__pyx_v_name);
-    PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_10);
-    __Pyx_GIVEREF(__pyx_t_10);
-    __pyx_t_10 = 0;
-    __pyx_t_10 = PyObject_Call(__pyx_t_4, ((PyObject *)__pyx_t_11), NULL); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
-    __Pyx_GOTREF(__pyx_t_10);
+    PyTuple_SET_ITEM(__pyx_t_6, 1, __pyx_t_5);
+    __Pyx_GIVEREF(__pyx_t_5);
+    __pyx_t_5 = 0;
+    __pyx_t_5 = PyObject_Call(__pyx_t_4, ((PyObject *)__pyx_t_6), NULL); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[13]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_5);
     __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-    __Pyx_DECREF(((PyObject *)__pyx_t_11)); __pyx_t_11 = 0;
-    __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;
+    __Pyx_DECREF(((PyObject *)__pyx_t_6)); __pyx_t_6 = 0;
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
   }
   __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
 
   /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":33
  *         for name, model in self.models:
- *             scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count))
+ *             scores.set(name, model(ctx))
  *         return scores             # <<<<<<<<<<<<<<
  */
   __Pyx_XDECREF(((PyObject *)__pyx_r));
@@ -55012,9 +54987,6 @@ static struct __pyx_obj_3_sa_FeatureVector *__pyx_f_3_sa_6Scorer_score(struct __
   __Pyx_XDECREF(__pyx_t_5);
   __Pyx_XDECREF(__pyx_t_6);
   __Pyx_XDECREF(__pyx_t_7);
-  __Pyx_XDECREF(__pyx_t_9);
-  __Pyx_XDECREF(__pyx_t_10);
-  __Pyx_XDECREF(__pyx_t_11);
   __Pyx_AddTraceback("_sa.Scorer.score", __pyx_clineno, __pyx_lineno, __pyx_filename);
   __pyx_r = 0;
   __pyx_L0:;
@@ -64732,7 +64704,7 @@ PyMODINIT_FUNC PyInit__sa(void)
  * 
  * from collections import defaultdict, Counter, namedtuple             # <<<<<<<<<<<<<<
  * 
- * FeatureContext = namedtuple("FeatureContext",
+ * FeatureContext = namedtuple('FeatureContext',
  */
   __pyx_t_1 = PyList_New(3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
   __Pyx_GOTREF(__pyx_t_1);
@@ -64777,19 +64749,19 @@ PyMODINIT_FUNC PyInit__sa(void)
   /* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":14
  * from collections import defaultdict, Counter, namedtuple
  * 
- * FeatureContext = namedtuple("FeatureContext",             # <<<<<<<<<<<<<<
- *   ["fphrase",
- *    "ephrase",
+ * FeatureContext = namedtuple('FeatureContext',             # <<<<<<<<<<<<<<
+ *     ['fphrase',
+ *      'ephrase',
  */
   __pyx_t_2 = __Pyx_GetName(__pyx_m, __pyx_n_s__namedtuple); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
   __Pyx_GOTREF(__pyx_t_2);
 
   /* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":15
  * 
- * FeatureContext = namedtuple("FeatureContext",
- *   ["fphrase",             # <<<<<<<<<<<<<<
- *    "ephrase",
- *    "paircount",
+ * FeatureContext = namedtuple('FeatureContext',
+ *     ['fphrase',             # <<<<<<<<<<<<<<
+ *      'ephrase',
+ *      'paircount',
  */
   __pyx_t_1 = PyList_New(8); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[8]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
   __Pyx_GOTREF(__pyx_t_1);
@@ -64833,7 +64805,7 @@ PyMODINIT_FUNC PyInit__sa(void)
   __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
 
   /* "/Users/vchahun/Sandbox/cdec/python/src/sa/rulefactory.pxi":25
- *   ])
+ *     ])
  * 
  * cdef int PRECOMPUTE = 0             # <<<<<<<<<<<<<<
  * cdef int MERGE = 1
@@ -64900,7 +64872,7 @@ PyMODINIT_FUNC PyInit__sa(void)
   __pyx_t_3 = 0;
 
   /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":1
- * cdef StringMap FD = StringMap()             # <<<<<<<<<<<<<<
+ * cdef StringMap FD = StringMap() # Feature name dictionary             # <<<<<<<<<<<<<<
  * 
  * INITIAL_CAPACITY = 7 # default number of features
  */
@@ -64913,7 +64885,7 @@ PyMODINIT_FUNC PyInit__sa(void)
   __pyx_t_3 = 0;
 
   /* "/Users/vchahun/Sandbox/cdec/python/src/sa/features.pxi":3
- * cdef StringMap FD = StringMap()
+ * cdef StringMap FD = StringMap() # Feature name dictionary
  * 
  * INITIAL_CAPACITY = 7 # default number of features             # <<<<<<<<<<<<<<
  * INCREMENT = INITIAL_CAPACITY # double size
diff --git a/python/src/sa/default_scorer.pxi b/python/src/sa/default_scorer.pxi
new file mode 100644
index 00000000..483f4743
--- /dev/null
+++ b/python/src/sa/default_scorer.pxi
@@ -0,0 +1,74 @@
+from libc.stdlib cimport malloc, realloc, free
+from libc.math cimport log10
+
+MAXSCORE = 99 # cost used instead of -log10(p) when p <= 0; same value as cdec/sa/features.py
+EgivenFCoherent = 0 # the constants below are positions in DefaultScorer.fid
+SampleCountF = 1
+CountEF = 2
+MaxLexFgivenE = 3
+MaxLexEgivenF = 4
+IsSingletonF = 5
+IsSingletonFE = 6
+NFEATURES = 7 # number of entries allocated for DefaultScorer.fid
+
+cdef class DefaultScorer(Scorer):
+    """Scorer that computes the seven default features inline in a single score() call."""
+    cdef BiLex ttable
+    cdef int* fid # FD ids of the default features, indexed by the module-level constants
+
+    def __dealloc__(self):
+        free(self.fid)
+
+    def __init__(self, BiLex ttable):
+        cdef unsigned i
+        self.ttable = ttable
+        self.fid = <int*> malloc(NFEATURES*sizeof(int))
+        if self.fid == NULL: raise MemoryError()
+        for i, fname in enumerate(('EgivenFCoherent', 'SampleCountF', 'CountEF',
+                'MaxLexFgivenE', 'MaxLexEgivenF', 'IsSingletonF', 'IsSingletonFE')):
+            self.fid[i] = FD.index(fname)
+
+    cdef FeatureVector score(self, Phrase fphrase, Phrase ephrase,
+            unsigned paircount, unsigned fcount, unsigned fsample_count):
+        """Return a FeatureVector holding the default feature values for (fphrase, ephrase)."""
+        cdef FeatureVector scores = FeatureVector()
+
+        #  EgivenFCoherent
+        cdef float efc = <float>paircount/fsample_count
+        scores.set(self.fid[EgivenFCoherent], -log10(efc) if efc > 0 else MAXSCORE)
+
+        # SampleCountF
+        scores.set(self.fid[SampleCountF], log10(1 + fsample_count))
+
+        # CountEF
+        scores.set(self.fid[CountEF], log10(1 + paircount))
+
+        # MaxLexFgivenE TODO typify
+        ewords = ephrase.words
+        ewords.append('NULL')
+        cdef float mlfe = 0, max_score
+        for f in fphrase.words:
+            max_score = -1 # reset per source word so the maximum does not leak between words
+            for e in ewords:
+                max_score = max(max_score, self.ttable.get_score(f, e, 1))
+            mlfe += -log10(max_score) if max_score > 0 else MAXSCORE
+        scores.set(self.fid[MaxLexFgivenE], mlfe)
+
+        # MaxLexEgivenF TODO same
+        fwords = fphrase.words
+        fwords.append('NULL')
+        cdef float mlef = 0
+        for e in ephrase.words:
+            max_score = -1 # reset per target word, as in the Python MaxLexEgivenF feature
+            for f in fwords:
+                max_score = max(max_score, self.ttable.get_score(f, e, 0))
+            mlef += -log10(max_score) if max_score > 0 else MAXSCORE
+        scores.set(self.fid[MaxLexEgivenF], mlef)
+
+        # IsSingletonF
+        scores.set(self.fid[IsSingletonF], (fcount == 1))
+
+        # IsSingletonFE
+        scores.set(self.fid[IsSingletonFE], (paircount == 1))
+
+        return scores
diff --git a/python/src/sa/features.pxi b/python/src/sa/features.pxi
index eeef4feb..9b9ecf3c 100644
--- a/python/src/sa/features.pxi
+++ b/python/src/sa/features.pxi
@@ -1,4 +1,4 @@
-cdef StringMap FD = StringMap()
+cdef StringMap FD = StringMap() # Feature name dictionary
 
 INITIAL_CAPACITY = 7 # default number of features
 INCREMENT = INITIAL_CAPACITY # double size
@@ -26,8 +26,8 @@ cdef class Scorer:
         names = [FD.index(<char *>model.__name__) for model in models]
         self.models = zip(names, models)
 
-    cdef FeatureVector score(self, c):
+    cdef FeatureVector score(self, ctx):
         cdef FeatureVector scores = FeatureVector()
         for name, model in self.models:
-            scores.set(name, model(c.fphrase, c.ephrase, c.paircount, c.fcount, c.fsample_count))
+            scores.set(name, model(ctx))
         return scores
diff --git a/python/src/sa/rulefactory.pxi b/python/src/sa/rulefactory.pxi
index 69cadac9..287b9a67 100644
--- a/python/src/sa/rulefactory.pxi
+++ b/python/src/sa/rulefactory.pxi
@@ -11,16 +11,16 @@ from libc.math cimport fmod, ceil, floor, log
 
 from collections import defaultdict, Counter, namedtuple
 
-FeatureContext = namedtuple("FeatureContext",
-  ["fphrase", 
-   "ephrase", 
-   "paircount", 
-   "fcount", 
-   "fsample_count",
-   "input_span",
-   "matches",
-   "test_sentence"
-  ])
+FeatureContext = namedtuple('FeatureContext',
+    ['fphrase', 
+     'ephrase', 
+     'paircount', 
+     'fcount', 
+     'fsample_count',
+     'input_span',
+     'matches',
+     'test_sentence'
+    ])
 
 cdef int PRECOMPUTE = 0
 cdef int MERGE = 1
author	Victor Chahuneau <vchahune@cs.cmu.edu>	2012-09-05 14:55:11 +0100
committer	Victor Chahuneau <vchahune@cs.cmu.edu>	2012-09-05 14:55:11 +0100
commit	fb4a9cd8874976a1c013b880b342961b72a8c0d7 (patch)
tree	e75e21d414980b0f4d9953cea3eb43e8c8355261 /python
parent	b34a325f1496eb4bbc33c5fe156eb7e28e5add27 (diff)