Merge branch 'master' of github.com:pks/cdec-dtrain

author: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2013-03-03 12:06:43 +0100
committer: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2013-03-03 12:06:43 +0100
commit: f7f9048f8e4d34682f17bfd050d238005feb3ee3 (patch)
tree: fa20fa16b0f5a8009a9254622b65ebeaec049399 /python/pkg
parent: 9d306b30c9abba995ba35243e5cb461bb472a61f (diff)
parent: 12f2eab0e7dc7167af47cddf8ef88968656277da (diff)
4 files changed, 119 insertions, 24 deletions
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
index 418531d9..14ba5ecb 100644
--- a/python/pkg/cdec/sa/__init__.py
+++ b/python/pkg/cdec/sa/__init__.py
@@ -1,5 +1,5 @@
 from cdec.sa._sa import make_lattice, decode_lattice, decode_sentence,\
-        encode_words, decode_words,\
+        encode_words, decode_words, isvar,\
         SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
         HieroCachingRuleFactory, Sampler, Scorer
 from cdec.sa.extractor import GrammarExtractor
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
index 9fc37345..2e596bd3 100644
--- a/python/pkg/cdec/sa/extract.py
+++ b/python/pkg/cdec/sa/extract.py
@@ -12,10 +12,10 @@ extractor, prefix = None, None
 online = False
 
 def make_extractor(config, grammars, features):
-    global extractor, prefix
+    global extractor, prefix, online
     signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C
     load_features(features)
-    extractor = cdec.sa.GrammarExtractor(config)
+    extractor = cdec.sa.GrammarExtractor(config, online)
     prefix = grammars
 
 def load_features(features):
@@ -53,7 +53,6 @@ def extract(inp):
     # Add training instance _after_ extracting grammars
     if online:
         extractor.add_instance(sentence, reference, alignment)
-        extractor.dump_online_stats()
     grammar_file = os.path.abspath(grammar_file)
     return '<seg grammar="{0}" id="{1}"> {2} </seg>{3}'.format(grammar_file, i, sentence, suffix)
 
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index 62a251a7..acc13cbc 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -2,14 +2,15 @@ from itertools import chain
 import os, sys
 import cdec.configobj
 from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
-        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE
+        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE,\
+        IsSupportedOnline
 import cdec.sa
 
 # maximum span of a grammar rule in TEST DATA
 MAX_INITIAL_SIZE = 15
 
 class GrammarExtractor:
-    def __init__(self, config, features=None):
+    def __init__(self, config, online=False, features=None):
         if isinstance(config, basestring):
             if not os.path.exists(config):
                 raise IOError('cannot read configuration from {0}'.format(config))
@@ -57,11 +58,19 @@ class GrammarExtractor:
         # lexical weighting tables
         tt = cdec.sa.BiLex(from_binary=config['lex_file'])
 
+        # TODO: clean up how the extra feature list passed to the Scorer is assembled
+        extended_features = []
+        if online:
+            extended_features.append(IsSupportedOnline)
+            
         # TODO: use @cdec.sa.features decorator for standard features too
         # + add a mask to disable features
+        for f in cdec.sa._SA_FEATURES:
+            extended_features.append(f)
+            
         scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, 
             MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
-            *cdec.sa._SA_FEATURES)
+            *extended_features)
 
         fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
         edarray = cdec.sa.DataArray(from_binary=config['e_file'])
@@ -92,4 +101,6 @@ class GrammarExtractor:
     
     # Debugging
     def dump_online_stats(self):
-        self.factory.dump_online_stats()
-\ No newline at end of file
+        self.factory.dump_online_stats()
+    def dump_online_rules(self):
+        self.factory.dump_online_rules()
+\ No newline at end of file
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index a4ae23e8..46412cd5 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -1,57 +1,142 @@
 from __future__ import division
 import math
 
+from cdec.sa import isvar
+
 MAXSCORE = 99
 
 def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
-    return -math.log10(ctx.paircount/ctx.fcount)
+    if not ctx.online:
+        prob = ctx.paircount/ctx.fcount
+    else:
+        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fcount + ctx.online.fcount)
+    return -math.log10(prob)
 
 def CountEF(ctx): # c(e, f)
-    return math.log10(1 + ctx.paircount)
+    if not ctx.online:
+        count = 1 + ctx.paircount
+    else:
+        count = 1 + ctx.paircount + ctx.online.paircount
+    return math.log10(count)
 
 def SampleCountF(ctx): # sample c(f)
-    return math.log10(1 + ctx.fsample_count)
+    if not ctx.online:
+        count = 1 + ctx.fsample_count
+    else:
+        count = 1 + ctx.fsample_count + ctx.online.fsample_count
+    return math.log10(count)
 
 def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
-    prob = ctx.paircount/ctx.fsample_count
+    if not ctx.online:
+        prob = ctx.paircount/ctx.fsample_count
+    else:
+        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fsample_count + ctx.online.fsample_count)
     return -math.log10(prob) if prob > 0 else MAXSCORE
 
 def CoherenceProb(ctx): # c(f) / sample c(f)
-    return -math.log10(ctx.fcount/ctx.fsample_count)
+    if not ctx.online:
+        prob = ctx.fcount/ctx.fsample_count
+    else:
+        prob = (ctx.fcount + ctx.online.fcount) / (ctx.fsample_count + ctx.online.fsample_count)
+    return -math.log10(prob)
 
 def MaxLexEgivenF(ttable):
     def MaxLexEgivenF(ctx):
         fwords = ctx.fphrase.words
         fwords.append('NULL')
-        def score():
+        # Always use this ttable-only branch for now: the condition below is always true, so the else branch never runs
+        if not ctx.online or ctx.online:
+            maxOffScore = 0.0
+            for e in ctx.ephrase.words:
+                maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return maxOffScore
+        else:
+            # For now, a straight (unweighted) average of the ttable score and the online score
+            maxOffScore = 0.0
+            maxOnScore = 0.0
             for e in ctx.ephrase.words:
-              maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
-              yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-        return sum(score())
+                maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            for e in ctx.ephrase:
+                if not isvar(e):
+                    maxScore = 0.0
+                    for f in ctx.fphrase:
+                        if not isvar(f):
+                            b_f = ctx.online.bilex_f.get(f, 0)
+                            if b_f:
+                                maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e))
+                    maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return (maxOffScore + maxOnScore) / 2
     return MaxLexEgivenF
 
 def MaxLexFgivenE(ttable):
     def MaxLexFgivenE(ctx):
         ewords = ctx.ephrase.words
         ewords.append('NULL')
-        def score():
+        # Always use this ttable-only branch for now: the condition below is always true, so the else branch never runs
+        if not ctx.online or ctx.online:
+            maxOffScore = 0.0
             for f in ctx.fphrase.words:
-              maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
-              yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-        return sum(score())
+                maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return maxOffScore
+        else:
+            # For now, a straight (unweighted) average of the ttable score and the online score
+            maxOffScore = 0.0
+            maxOnScore = 0.0
+            for f in ctx.fphrase.words:
+                maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            for f in ctx.fphrase:
+                if not isvar(f):
+                    maxScore = 0.0
+                    for e in ctx.ephrase:
+                        if not isvar(e):
+                            b_e = ctx.online.bilex_e.get(e, 0)
+                            if b_e:
+                                maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_e )
+                    maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return (maxOffScore + maxOnScore) / 2
     return MaxLexFgivenE
 
 def IsSingletonF(ctx):
-    return (ctx.fcount == 1)
+    if not ctx.online:
+        count = ctx.fcount
+    else:
+        count = ctx.fcount + ctx.online.fcount  
+    return (count == 1)
 
 def IsSingletonFE(ctx):
-    return (ctx.paircount == 1)
+    if not ctx.online:
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount
+    return (count == 1)
 
 def IsNotSingletonF(ctx):
-    return (ctx.fcount > 1)
+    if not ctx.online:
+        count = ctx.fcount
+    else:
+        count = ctx.fcount + ctx.online.fcount
+    return (count > 1)
 
 def IsNotSingletonFE(ctx):
+    if not ctx.online:
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount
     return (ctx.paircount > 1)
 
 def IsFEGreaterThanZero(ctx):
+    if not ctx.online:
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount
     return (ctx.paircount > 0.01)
+
+def IsSupportedOnline(ctx): # Occurs in online data?
+    if ctx.online:
+        return (ctx.online.paircount > 0.01)
+    else:
+        return False
+\ No newline at end of file
author	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2013-03-03 12:06:43 +0100
committer	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2013-03-03 12:06:43 +0100
commit	f7f9048f8e4d34682f17bfd050d238005feb3ee3 (patch)
tree	fa20fa16b0f5a8009a9254622b65ebeaec049399 /python/pkg
parent	9d306b30c9abba995ba35243e5cb461bb472a61f (diff)
parent	12f2eab0e7dc7167af47cddf8ef88968656277da (diff)