4 files changed, 119 insertions, 50 deletions
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
index fd4a4148..e0a344b7 100644
--- a/python/pkg/cdec/sa/__init__.py
+++ b/python/pkg/cdec/sa/__init__.py
@@ -1,4 +1,31 @@
-from cdec.sa._sa import sym_fromstring,\
+from cdec.sa._sa import make_lattice, decode_lattice, decode_sentence,\
         SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
-        HieroCachingRuleFactory, Sampler
+        HieroCachingRuleFactory, Sampler, Scorer
 from cdec.sa.extractor import GrammarExtractor
+
+# Registries populated by the decorators below.
+_SA_FEATURES = []
+_SA_ANNOTATORS = {}
+_SA_CONFIGURE = []
+
+def feature(fn):
+    """Decorator: register fn in _SA_FEATURES and return it unchanged."""
+    _SA_FEATURES.append(fn)
+    return fn
+
+def annotator(fn):
+    """Decorator: register fn in _SA_ANNOTATORS under its __name__."""
+    _SA_ANNOTATORS[fn.__name__] = fn
+    return fn  # keep the decorated name bound to the function, not None
+
+def annotate(sentence):
+    """Call every registered annotator on sentence; return {name: result}."""
+    meta = {}
+    for name, fn in _SA_ANNOTATORS.iteritems():
+        meta[name] = fn(sentence)
+    return meta
+
+def configure(fn):
+    """Decorator: register fn in _SA_CONFIGURE (hooks called with the config)."""
+    _SA_CONFIGURE.append(fn)
+    return fn  # keep the decorated name bound to the function, not None
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
index 875bf42e..10a81556 100644
--- a/python/pkg/cdec/sa/extract.py
+++ b/python/pkg/cdec/sa/extract.py
@@ -3,29 +3,72 @@ import sys
 import os
 import argparse
 import logging
+import multiprocessing as mp
+import signal
 import cdec.sa
 
+extractor, prefix = None, None # per-process state: set by make_extractor(), read by extract()
+def make_extractor(config, grammars, features): # per-process setup; also passed to mp.Pool as the worker initializer
+    global extractor, prefix
+    if mp.current_process().name != 'MainProcess': # pool worker: let parent process catch Ctrl+C
+        signal.signal(signal.SIGINT, signal.SIG_IGN) # (ignoring it in the main process would make -j 1 uninterruptible)
+    load_features(features) # import feature modules first: GrammarExtractor reads cdec.sa._SA_FEATURES when built
+    extractor, prefix = cdec.sa.GrammarExtractor(config), grammars
+
+def load_features(features): # import each feature-definition file by its module name
+    for featdef in features:
+        logging.info('Loading additional feature definitions from %s', featdef)
+        module_dir, filename = os.path.split(featdef)
+        sys.path.append(module_dir) # make the file's directory importable for the duration of the import
+        __import__(filename.replace('.py', ''))
+        sys.path.remove(module_dir)
+
+def extract(inp):
+    """Write the grammar for one (index, line) pair to <prefix>/grammar.<index>; return its <seg> tag."""
+    i, sentence = inp
+    sentence = sentence.rstrip('\n') # drop only the newline: [:-1] also ate the last character of an unterminated final line
+    grammar_file = os.path.join(prefix, 'grammar.{0}'.format(i))
+    with open(grammar_file, 'w') as output:
+        for rule in extractor.grammar(sentence):
+            output.write(str(rule)+'\n')
+    grammar_file = os.path.abspath(grammar_file)
+    return '<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence)
+
 def main():
     logging.basicConfig(level=logging.INFO)
     parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.')
     parser.add_argument('-c', '--config', required=True,
-                        help='Extractor configuration')
+                        help='extractor configuration')
     parser.add_argument('-g', '--grammars', required=True,
-                        help='Grammar output path')
+                        help='grammar output path')
+    parser.add_argument('-j', '--jobs', type=int, default=1,
+                        help='number of parallel extractors')
+    parser.add_argument('-s', '--chunksize', type=int, default=10,
+                        help='number of sentences / chunk')
+    parser.add_argument('-f', '--features', nargs='*', default=[],
+                        help='additional feature definitions')
     args = parser.parse_args()
 
     if not os.path.exists(args.grammars):
         os.mkdir(args.grammars)
-
-    extractor = cdec.sa.GrammarExtractor(args.config)
-    for i, sentence in enumerate(sys.stdin):
-        sentence = sentence[:-1]
-        grammar_file = os.path.join(args.grammars, 'grammar.{0}'.format(i))
-        with open(grammar_file, 'w') as output:
-            for rule in extractor.grammar(sentence):
-                output.write(str(rule)+'\n')
-        grammar_file = os.path.abspath(grammar_file)
-        print('<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence))
+    for featdef in args.features: # fail fast: load_features() imports these files by module name
+        if not featdef.endswith('.py'):
+            sys.stderr.write('Error: feature definition file <{0}>'
+                    ' should be a python module\n'.format(featdef))
+            sys.exit(1)
+
+    if args.jobs > 1:
+        logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)
+        pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars, args.features))
+        try: # imap yields results in input order, so <seg> ids stay sequential
+            for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize):
+                print(output)
+        except KeyboardInterrupt:
+            pool.terminate()
+    else:
+        make_extractor(args.config, args.grammars, args.features)
+        for inp in enumerate(sys.stdin): # stream: Python 2 map() would buffer every result before printing
+            print(extract(inp))
 
 if __name__ == '__main__':
     main()
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index bb912e16..a5ce8a68 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -9,7 +9,7 @@ import cdec.sa
 MAX_INITIAL_SIZE = 15
 
 class GrammarExtractor:
-    def __init__(self, config):
+    def __init__(self, config, features=None):
         if isinstance(config, str) or isinstance(config, unicode):
             if not os.path.exists(config):
                 raise IOError('cannot read configuration from {0}'.format(config))
@@ -57,8 +57,11 @@ class GrammarExtractor:
         # lexical weighting tables
         tt = cdec.sa.BiLex(from_binary=config['lex_file'])
 
-        self.models = (EgivenFCoherent, SampleCountF, CountEF, 
-                MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
+        # TODO: use @cdec.sa.feature decorator for standard features too
+        # + add a mask to disable features
+        scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, 
+            MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
+            *cdec.sa._SA_FEATURES)
 
         fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
         edarray = cdec.sa.DataArray(from_binary=config['e_file'])
@@ -67,12 +70,15 @@ class GrammarExtractor:
         # -1 = don't sample, use all data (VERY SLOW!)
         sampler = cdec.sa.Sampler(300, fsarray)
 
-        self.factory.configure(fsarray, edarray, sampler)
+        self.factory.configure(fsarray, edarray, sampler, scorer)
+        # Initialize feature definitions with configuration
+        for fn in cdec.sa._SA_CONFIGURE:
+            fn(config)
 
     def grammar(self, sentence):
         if isinstance(sentence, unicode):
             sentence = sentence.encode('utf8')
-        cnet = chain(('<s>',), sentence.split(), ('</s>',))
-        cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet)
-        cnet = tuple(((word, None, 1), ) for word in cnet)
-        return self.factory.input(cnet, self.models)
+        words = tuple(chain(('<s>',), sentence.split(), ('</s>',))) # whitespace tokens wrapped in <s> ... </s>
+        meta = cdec.sa.annotate(words) # {annotator name: annotator(words)} for every registered annotator
+        cnet = cdec.sa.make_lattice(words) # NOTE(review): presumably the one-path lattice over words - confirm in _sa
+        return self.factory.input(cnet, meta)
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index 325b9e13..a4ae23e8 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -3,55 +3,55 @@ import math
 
 MAXSCORE = 99
 
-def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f)
-    return -math.log10(paircount/fcount)
+def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
+    return -math.log10(ctx.paircount/ctx.fcount) # returned as -log10 of the ratio
 
-def CountEF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return math.log10(1 + paircount)
+def CountEF(ctx): # c(e, f)
+    return math.log10(1 + ctx.paircount) # log10(1 + count), so a zero count maps to 0
 
-def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return math.log10(1 + fsample_count)
+def SampleCountF(ctx): # sample c(f)
+    return math.log10(1 + ctx.fsample_count) # log10(1 + count), so a zero count maps to 0
 
-def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count):
-    prob = paircount/fsample_count
+def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
+    prob = ctx.paircount/ctx.fsample_count # NOTE(review): truncates under Python 2 if both counts are ints - confirm they are floats
     return -math.log10(prob) if prob > 0 else MAXSCORE
 
-def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
-    return -math.log10(fcount/fsample_count)
+def CoherenceProb(ctx): # c(f) / sample c(f)
+    return -math.log10(ctx.fcount/ctx.fsample_count) # -log10 of the ratio; unlike EgivenFCoherent, no MAXSCORE guard for a zero ratio
 
 def MaxLexEgivenF(ttable):
-    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
-        fwords = fphrase.words
+    def MaxLexEgivenF(ctx): # closure over ttable; sums one cost per word of ctx.ephrase
+        fwords = ctx.fphrase.words # NOTE(review): the append below mutates this list; assumes .words returns a fresh copy - confirm
         fwords.append('NULL')
         def score():
-            for e in ephrase.words:
+            for e in ctx.ephrase.words: # best ttable score for e over fwords (incl. 'NULL')
               maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
               yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
         return sum(score())
-    return feature
+    return MaxLexEgivenF # the inner function; it shadows the factory's name inside this scope
 
 def MaxLexFgivenE(ttable):
-    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
-        ewords = ephrase.words
+    def MaxLexFgivenE(ctx): # closure over ttable; sums one cost per word of ctx.fphrase
+        ewords = ctx.ephrase.words # NOTE(review): the append below mutates this list; assumes .words returns a fresh copy - confirm
         ewords.append('NULL')
         def score():
-            for f in fphrase.words:
+            for f in ctx.fphrase.words: # best ttable score for f over ewords (incl. 'NULL')
               maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
               yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
         return sum(score())
-    return feature
+    return MaxLexFgivenE # the inner function; it shadows the factory's name inside this scope
 
-def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (fcount == 1)
+def IsSingletonF(ctx): # True iff c(f) == 1
+    return (ctx.fcount == 1)
 
-def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount == 1)
+def IsSingletonFE(ctx): # True iff c(e, f) == 1
+    return (ctx.paircount == 1)
 
-def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (fcount > 1)
+def IsNotSingletonF(ctx): # True iff c(f) > 1
+    return (ctx.fcount > 1)
 
-def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount > 1)
+def IsNotSingletonFE(ctx): # True iff c(e, f) > 1
+    return (ctx.paircount > 1)
 
-def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount > 0.01)
+def IsFEGreaterThanZero(ctx): # True iff c(e, f) > 0.01 (threshold used in place of an exact zero test)
+    return (ctx.paircount > 0.01)