From ca3da3a815b6e85531d6ded07e7d6bec7852748c Mon Sep 17 00:00:00 2001
From: Michael Denkowski <michael.j.denkowski@gmail.com>
Date: Thu, 24 Jan 2013 16:55:21 -0500
Subject: Scored grammars from online extraction.  Don't trust them yet.

---
 python/pkg/cdec/sa/extract.py   |  6 ++--
 python/pkg/cdec/sa/extractor.py | 15 ++++++++--
 python/pkg/cdec/sa/features.py  | 61 +++++++++++++++++++++++++++++++++++------
 3 files changed, 68 insertions(+), 14 deletions(-)

(limited to 'python/pkg/cdec/sa')
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
index 9fc37345..20eab9dd 100644
--- a/python/pkg/cdec/sa/extract.py
+++ b/python/pkg/cdec/sa/extract.py
@@ -12,10 +12,10 @@ extractor, prefix = None, None
 online = False
 
 def make_extractor(config, grammars, features):
-    global extractor, prefix
+    global extractor, prefix, online
     signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C
     load_features(features)
-    extractor = cdec.sa.GrammarExtractor(config)
+    extractor = cdec.sa.GrammarExtractor(config, online=online)  # by keyword: the flag must not depend on parameter order
     prefix = grammars
 
 def load_features(features):
@@ -53,7 +53,7 @@ def extract(inp):
     # Add training instance _after_ extracting grammars
     if online:
         extractor.add_instance(sentence, reference, alignment)
-        extractor.dump_online_stats()
+        #extractor.dump_online_stats()
     grammar_file = os.path.abspath(grammar_file)
     return '<seg grammar="{0}" id="{1}"> {2} </seg>{3}'.format(grammar_file, i, sentence, suffix)
 
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index 62a251a7..5ef8041c 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -2,14 +2,15 @@ from itertools import chain
 import os, sys
 import cdec.configobj
 from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
-        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE
+        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE,\
+        IsSupportedOnline
 import cdec.sa
 
 # maximum span of a grammar rule in TEST DATA
 MAX_INITIAL_SIZE = 15
 
 class GrammarExtractor:
-    def __init__(self, config, features=None):
+    def __init__(self, config, online=False, features=None):
         if isinstance(config, basestring):
             if not os.path.exists(config):
                 raise IOError('cannot read configuration from {0}'.format(config))
@@ -57,11 +58,19 @@ class GrammarExtractor:
         # lexical weighting tables
         tt = cdec.sa.BiLex(from_binary=config['lex_file'])
 
+        # TODO: clean this up
+        extended_features = []
+        #extended_features.append(IsSupportedOnline)
+        if online:
+            extended_features.append(IsSupportedOnline)
+            
         # TODO: use @cdec.sa.features decorator for standard features too
         # + add a mask to disable features
+        for f in cdec.sa._SA_FEATURES:
+            extended_features.append(f)
         scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, 
             MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
-            *cdec.sa._SA_FEATURES)
+            *extended_features)
 
         fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
         edarray = cdec.sa.DataArray(from_binary=config['e_file'])
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index a4ae23e8..cede5304 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -4,20 +4,39 @@ import math
 MAXSCORE = 99
 
 def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
-    return -math.log10(ctx.paircount/ctx.fcount)
+    if not ctx.online:  # ctx.online is falsy: use the base counts only
+        prob = ctx.paircount/ctx.fcount  # NOTE(review): `/` truncates under Python 2 if both counts are ints -- confirm they are floats
+    else:
+        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fcount + ctx.online.fcount)  # pool base and online counts
+    return -math.log10(prob)  # NOTE(review): prob == 0 is not guarded here, unlike EgivenFCoherent -- confirm it cannot occur
 
 def CountEF(ctx): # c(e, f)
-    return math.log10(1 + ctx.paircount)
+    if not ctx.online:  # ctx.online is falsy: use the base pair count only
+        count = 1 + ctx.paircount
+    else:
+        count = 1 + ctx.paircount + ctx.online.paircount  # add the online pair count to the base pair count
+    return math.log10(count)  # the added 1 keeps the log argument >= 1 as long as the counts are non-negative
 
 def SampleCountF(ctx): # sample c(f)
-    return math.log10(1 + ctx.fsample_count)
+    if not ctx.online:  # ctx.online is falsy: use the sampled base count only
+        count = 1 + ctx.fsample_count
+    else:
+        count = 1 + ctx.fsample_count + ctx.online.fcount  # online side contributes its fcount to the sampled count
+    return math.log10(count)  # the added 1 keeps the log argument >= 1 as long as the counts are non-negative
 
 def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
-    prob = ctx.paircount/ctx.fsample_count
+    if not ctx.online:  # ctx.online is falsy: use the base counts only
+        prob = ctx.paircount/ctx.fsample_count  # NOTE(review): `/` truncates under Python 2 if both counts are ints -- confirm they are floats
+    else:
+        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fsample_count + ctx.online.fcount)  # pool base and online counts
     return -math.log10(prob) if prob > 0 else MAXSCORE
 
 def CoherenceProb(ctx): # c(f) / sample c(f)
-    return -math.log10(ctx.fcount/ctx.fsample_count)
+    if not ctx.online:  # ctx.online is falsy: use the base counts only
+        prob = ctx.fcount/ctx.fsample_count
+    else:
+        prob = (ctx.fcount + ctx.online.fcount) / (ctx.fsample_count + ctx.online.fcount)  # online fcount is added to both numerator and denominator
+    return -math.log10(prob)  # NOTE(review): prob == 0 is not guarded here, unlike EgivenFCoherent -- confirm it cannot occur
 
 def MaxLexEgivenF(ttable):
     def MaxLexEgivenF(ctx):
@@ -42,16 +61,42 @@ def MaxLexFgivenE(ttable):
     return MaxLexFgivenE
 
 def IsSingletonF(ctx):
-    return (ctx.fcount == 1)
+    if not ctx.online:  # ctx.online is falsy: use the base fcount only
+        count = ctx.fcount
+    else:
+        count = ctx.fcount + ctx.online.fcount  # pool base and online fcount
+    return (count == 1)  # true only when the (pooled) fcount is exactly 1
 
 def IsSingletonFE(ctx):
-    return (ctx.paircount == 1)
+    if not ctx.online:  # ctx.online is falsy: use the base pair count only
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount  # pool base and online pair counts
+    return (count == 1)  # true only when the (pooled) pair count is exactly 1
 
 def IsNotSingletonF(ctx):
-    return (ctx.fcount > 1)
+    if not ctx.online:  # ctx.online is falsy: use the base fcount only
+        count = ctx.fcount
+    else:
+        count = ctx.fcount + ctx.online.fcount  # pool base and online fcount
+    return (count > 1)  # true when the (pooled) fcount exceeds 1
 
 def IsNotSingletonFE(ctx):
-    return (ctx.paircount > 1)
+    if not ctx.online:  # ctx.online is falsy: use the base pair count only
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount  # pool base and online pair counts
+    return (count > 1)  # test the pooled count (was ctx.paircount, which ignored the online stats)
 
 def IsFEGreaterThanZero(ctx):
-    return (ctx.paircount > 0.01)
+    if not ctx.online:  # ctx.online is falsy: use the base pair count only
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount  # pool base and online pair counts
+    return (count > 0.01)  # test the pooled count (was ctx.paircount, which ignored the online stats)
+
+def IsSupportedOnline(ctx):  # indicator feature driven only by the online fcount
+    if ctx.online:
+        return (ctx.online.fcount > 0.01)  # true when the online fcount is above the 0.01 threshold
+    else:
+        return False  # ctx.online is falsy, so there is nothing to test
\ No newline at end of file
-- 
cgit v1.2.3