Merge branch 'master' of github.com:pks/cdec-dtrain

author: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2013-03-03 12:06:43 +0100
committer: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2013-03-03 12:06:43 +0100
commit: f7f9048f8e4d34682f17bfd050d238005feb3ee3 (patch)
tree: fa20fa16b0f5a8009a9254622b65ebeaec049399 /python/pkg/cdec/sa/features.py
parent: 9d306b30c9abba995ba35243e5cb461bb472a61f (diff)
parent: 12f2eab0e7dc7167af47cddf8ef88968656277da (diff)
1 files changed, 101 insertions, 16 deletions
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index a4ae23e8..46412cd5 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -1,57 +1,142 @@
 from __future__ import division
 import math
 
+from cdec.sa import isvar
+
 MAXSCORE = 99
 
 def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
-    return -math.log10(ctx.paircount/ctx.fcount)
+    if not ctx.online:
+        prob = ctx.paircount/ctx.fcount
+    else:
+        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fcount + ctx.online.fcount)
+    return -math.log10(prob)
 
 def CountEF(ctx): # c(e, f)
-    return math.log10(1 + ctx.paircount)
+    if not ctx.online:
+        count = 1 + ctx.paircount
+    else:
+        count = 1 + ctx.paircount + ctx.online.paircount
+    return math.log10(count)
 
 def SampleCountF(ctx): # sample c(f)
-    return math.log10(1 + ctx.fsample_count)
+    if not ctx.online:
+        count = 1 + ctx.fsample_count
+    else:
+        count = 1 + ctx.fsample_count + ctx.online.fsample_count
+    return math.log10(count)
 
 def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
-    prob = ctx.paircount/ctx.fsample_count
+    if not ctx.online:
+        prob = ctx.paircount/ctx.fsample_count
+    else:
+        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fsample_count + ctx.online.fsample_count)
     return -math.log10(prob) if prob > 0 else MAXSCORE
 
 def CoherenceProb(ctx): # c(f) / sample c(f)
-    return -math.log10(ctx.fcount/ctx.fsample_count)
+    if not ctx.online:
+        prob = ctx.fcount/ctx.fsample_count
+    else:
+        prob = (ctx.fcount + ctx.online.fcount) / (ctx.fsample_count + ctx.online.fsample_count)
+    return -math.log10(prob)
 
 def MaxLexEgivenF(ttable):
     def MaxLexEgivenF(ctx):
         fwords = ctx.fphrase.words
         fwords.append('NULL')
-        def score():
+        # Always use this for now
+        if not ctx.online or ctx.online:
+            maxOffScore = 0.0
+            for e in ctx.ephrase.words:
+                maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return maxOffScore
+        else:
+            # For now, straight average
+            maxOffScore = 0.0
+            maxOnScore = 0.0
             for e in ctx.ephrase.words:
-              maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
-              yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-        return sum(score())
+                maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            for e in ctx.ephrase:
+                if not isvar(e):
+                    maxScore = 0.0
+                    for f in ctx.fphrase:
+                        if not isvar(f):
+                            b_f = ctx.online.bilex_f.get(f, 0)
+                            if b_f:
+                                maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e))
+                    maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return (maxOffScore + maxOnScore) / 2
     return MaxLexEgivenF
 
 def MaxLexFgivenE(ttable):
     def MaxLexFgivenE(ctx):
         ewords = ctx.ephrase.words
         ewords.append('NULL')
-        def score():
+        # Always use this for now
+        if not ctx.online or ctx.online:
+            maxOffScore = 0.0
             for f in ctx.fphrase.words:
-              maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
-              yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-        return sum(score())
+                maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return maxOffScore
+        else:
+            # For now, straight average
+            maxOffScore = 0.0
+            maxOnScore = 0.0
+            for f in ctx.fphrase.words:
+                maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            for f in ctx.fphrase:
+                if not isvar(f):
+                    maxScore = 0.0
+                    for e in ctx.ephrase:
+                        if not isvar(e):
+                            b_e = ctx.online.bilex_e.get(e, 0)
+                            if b_e:
+                                maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_e )
+                    maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return (maxOffScore + maxOnScore) / 2
     return MaxLexFgivenE
 
 def IsSingletonF(ctx):
-    return (ctx.fcount == 1)
+    if not ctx.online:
+        count = ctx.fcount
+    else:
+        count = ctx.fcount + ctx.online.fcount  
+    return (count == 1)
 
 def IsSingletonFE(ctx):
-    return (ctx.paircount == 1)
+    if not ctx.online:
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount
+    return (count == 1)
 
 def IsNotSingletonF(ctx):
-    return (ctx.fcount > 1)
+    if not ctx.online:
+        count = ctx.fcount
+    else:
+        count = ctx.fcount + ctx.online.fcount
+    return (count > 1)
 
 def IsNotSingletonFE(ctx):
+    if not ctx.online:
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount
     return (ctx.paircount > 1)
 
 def IsFEGreaterThanZero(ctx):
+    if not ctx.online:
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount
     return (ctx.paircount > 0.01)
+
+def IsSupportedOnline(ctx): # Occurs in online data?
+    if ctx.online:
+        return (ctx.online.paircount > 0.01)
+    else:
+        return False
+\ No newline at end of file
author	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2013-03-03 12:06:43 +0100
committer	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2013-03-03 12:06:43 +0100
commit	f7f9048f8e4d34682f17bfd050d238005feb3ee3 (patch)
tree	fa20fa16b0f5a8009a9254622b65ebeaec049399 /python/pkg/cdec/sa/features.py
parent	9d306b30c9abba995ba35243e5cb461bb472a61f (diff)
parent	12f2eab0e7dc7167af47cddf8ef88968656277da (diff)