From 6fb3cc36cc4113c9f3510d87b3ae3b9c9351bf4e Mon Sep 17 00:00:00 2001
From: Victor Chahuneau <vchahune@cs.cmu.edu>
Date: Wed, 5 Sep 2012 14:55:11 +0100
Subject: Expose new feature extraction API

---
 python/pkg/cdec/sa/__init__.py  |  6 +++++
 python/pkg/cdec/sa/extract.py   | 35 ++++++++++++++++++++-------
 python/pkg/cdec/sa/extractor.py |  5 ++--
 python/pkg/cdec/sa/features.py  | 52 ++++++++++++++++++++---------------------
 4 files changed, 61 insertions(+), 37 deletions(-)

(limited to 'python/pkg/cdec')
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
index ab8be809..cc532fb9 100644
--- a/python/pkg/cdec/sa/__init__.py
+++ b/python/pkg/cdec/sa/__init__.py
@@ -2,3 +2,9 @@ from cdec.sa._sa import sym_fromstring,\
         SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
         HieroCachingRuleFactory, Sampler, Scorer
 from cdec.sa.extractor import GrammarExtractor
+
+_SA_FEATURES = []
+
+def feature(fn):
+    _SA_FEATURES.append(fn)
+    return fn
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
index 39eac824..b370c4ca 100644
--- a/python/pkg/cdec/sa/extract.py
+++ b/python/pkg/cdec/sa/extract.py
@@ -8,12 +8,20 @@ import signal
 import cdec.sa
 
 extractor, prefix = None, None
-def make_extractor(config, grammars):
+def make_extractor(config, grammars, features=None):  # features: optional path of a .py module with extra feature definitions
     global extractor, prefix
     signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C
+    if features: load_features(features)
     extractor = cdec.sa.GrammarExtractor(config)
     prefix = grammars
 
+def load_features(features):
+    logging.info('Loading additional feature definitions from %s', features)
+    prefix = os.path.dirname(features)
+    sys.path.append(prefix)
+    __import__(os.path.basename(features).replace('.py', ''))
+    sys.path.remove(prefix)
+
 def extract(inp):
     global extractor, prefix
     i, sentence = inp
@@ -25,7 +33,6 @@ def extract(inp):
     grammar_file = os.path.abspath(grammar_file)
     return '<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence)
 
-
 def main():
     logging.basicConfig(level=logging.INFO)
     parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.')
@@ -37,18 +44,28 @@ def main():
                         help='number of parallel extractors')
     parser.add_argument('-s', '--chunksize', type=int, default=10,
                         help='number of sentences / chunk')
+    parser.add_argument('-f', '--features', type=str, default=None,
+                        help='additional feature definitions')
     args = parser.parse_args()
 
     if not os.path.exists(args.grammars):
         os.mkdir(args.grammars)
-
-    logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)
-    pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars))
-    try:
-        for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize):
+    if args.features and not args.features.endswith('.py'):  # -f is optional (default None): only validate when given
+        sys.stderr.write('Error: feature definition file should be a python module\n')
+        sys.exit(1)
+    
+    if args.jobs > 1:
+        logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)
+        pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars, args.features))
+        try:
+            for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize):
+                print(output)
+        except KeyboardInterrupt:
+            pool.terminate()
+    else:
+        make_extractor(args.config, args.grammars, args.features)
+        for output in map(extract, enumerate(sys.stdin)):
             print(output)
-    except KeyboardInterrupt:
-        pool.terminate()
 
 if __name__ == '__main__':
     main()
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index 90cc4c51..89e35bf8 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -9,7 +9,7 @@ import cdec.sa
 MAX_INITIAL_SIZE = 15
 
 class GrammarExtractor:
-    def __init__(self, config):
+    def __init__(self, config, features=None):
         if isinstance(config, str) or isinstance(config, unicode):
             if not os.path.exists(config):
                 raise IOError('cannot read configuration from {0}'.format(config))
@@ -58,7 +58,8 @@ class GrammarExtractor:
         tt = cdec.sa.BiLex(from_binary=config['lex_file'])
 
         scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, 
-            MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
+            MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
+            *cdec.sa._SA_FEATURES)
 
         fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
         edarray = cdec.sa.DataArray(from_binary=config['e_file'])
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index 8fd370cc..a4ae23e8 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -3,55 +3,55 @@ import math
 
 MAXSCORE = 99
 
-def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f)
-    return -math.log10(paircount/fcount)
+def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
+    return -math.log10(ctx.paircount/ctx.fcount)
 
-def CountEF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return math.log10(1 + paircount)
+def CountEF(ctx): # c(e, f)
+    return math.log10(1 + ctx.paircount)
 
-def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return math.log10(1 + fsample_count)
+def SampleCountF(ctx): # sample c(f)
+    return math.log10(1 + ctx.fsample_count)
 
-def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count):
-    prob = paircount/fsample_count
+def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
+    prob = ctx.paircount/ctx.fsample_count
     return -math.log10(prob) if prob > 0 else MAXSCORE
 
-def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
-    return -math.log10(fcount/fsample_count)
+def CoherenceProb(ctx): # c(f) / sample c(f)
+    return -math.log10(ctx.fcount/ctx.fsample_count)
 
 def MaxLexEgivenF(ttable):
-    def MaxLexEgivenF(fphrase, ephrase, paircount, fcount, fsample_count):
-        fwords = fphrase.words
+    def MaxLexEgivenF(ctx):
+        fwords = ctx.fphrase.words
         fwords.append('NULL')
         def score():
-            for e in ephrase.words:
+            for e in ctx.ephrase.words:
               maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
               yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
         return sum(score())
     return MaxLexEgivenF
 
 def MaxLexFgivenE(ttable):
-    def MaxLexFgivenE(fphrase, ephrase, paircount, fcount, fsample_count):
-        ewords = ephrase.words
+    def MaxLexFgivenE(ctx):
+        ewords = ctx.ephrase.words
         ewords.append('NULL')
         def score():
-            for f in fphrase.words:
+            for f in ctx.fphrase.words:
               maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
               yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
         return sum(score())
     return MaxLexFgivenE
 
-def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (fcount == 1)
+def IsSingletonF(ctx):
+    return (ctx.fcount == 1)
 
-def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount == 1)
+def IsSingletonFE(ctx):
+    return (ctx.paircount == 1)
 
-def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (fcount > 1)
+def IsNotSingletonF(ctx):
+    return (ctx.fcount > 1)
 
-def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount > 1)
+def IsNotSingletonFE(ctx):
+    return (ctx.paircount > 1)
 
-def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount > 0.01)
+def IsFEGreaterThanZero(ctx):
+    return (ctx.paircount > 0.01)
-- 
cgit v1.2.3