Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2012-06-13 14:42:07 +0200
committer: Patrick Simianer <simianer@cl.uni-heidelberg.de> 2012-06-13 14:42:07 +0200
commit: e6d3c25191873ca0cf99db8e89702ed91d65277c (patch)
tree: b0697ece6f5e4a8229915758c68750793a23f776 /python/cdec/scfg/features.py
parent: 62c805c90c5347b844f92574e240db5c65578e12 (diff)
parent: 3acdf1e4b37637d6df86a7b54fb0f1b0464c172b (diff)
1 files changed, 90 insertions, 0 deletions
diff --git a/python/cdec/scfg/features.py b/python/cdec/scfg/features.py
new file mode 100644
index 00000000..6419cdd8
--- /dev/null
+++ b/python/cdec/scfg/features.py
@@ -0,0 +1,90 @@
+"""Feature functions over a phrase pair and its counts.
+
+Every feature is called as feature(fphrase, ephrase, paircount, fcount,
+fsample_count). Log-based scores are negated base-10 logarithms; a ratio
+that is not positive scores MAXSCORE, and a zero denominator still raises
+ZeroDivisionError.
+"""
+from __future__ import division
+import math
+import sym
+
+def contextless(feature):
+    """Attach feature to itself as compute_contextless_score and return it."""
+    feature.compute_contextless_score = feature
+    return feature
+
+# Score reported when a probability is not positive, so its log is undefined.
+MAXSCORE = 99
+
+def _neg_log10(prob):
+    """Return -log10(prob), or MAXSCORE when prob is not positive."""
+    return -math.log10(prob) if prob > 0 else MAXSCORE
+
+def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f)
+    """Return -log10(paircount/fcount), or MAXSCORE if the ratio is not positive."""
+    return _neg_log10(paircount/fcount)
+
+def CountEF(fphrase, ephrase, paircount, fcount, fsample_count):
+    """Return log10(1 + paircount)."""
+    return math.log10(1 + paircount)
+
+def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count):
+    """Return log10(1 + fsample_count)."""
+    return math.log10(1 + fsample_count)
+
+def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count):
+    """Return -log10(paircount/fsample_count), or MAXSCORE if not positive."""
+    return _neg_log10(paircount/fsample_count)
+
+def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
+    """Return -log10(fcount/fsample_count), or MAXSCORE if not positive."""
+    return _neg_log10(fcount/fsample_count)
+
+def MaxLexEgivenF(ttable):
+    """Build a feature that sums one lexical score per word of ephrase.
+
+    For each ephrase word e that sym.isvar rejects, take the largest
+    ttable.get_score(f, e, 0) over fphrase's non-variable words plus 'NULL',
+    and add -log10 of it, or MAXSCORE when that maximum is not positive.
+    """
+    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
+        fwords = [sym.tostring(w) for w in fphrase if not sym.isvar(w)] + ['NULL']
+        ewords = (sym.tostring(w) for w in ephrase if not sym.isvar(w))
+        return sum(_neg_log10(max(ttable.get_score(f, e, 0) for f in fwords))
+                   for e in ewords)
+    return feature
+
+def MaxLexFgivenE(ttable):
+    """Build a feature that sums one lexical score per word of fphrase.
+
+    For each fphrase word f that sym.isvar rejects, take the largest
+    ttable.get_score(f, e, 1) over ephrase's non-variable words plus 'NULL',
+    and add -log10 of it, or MAXSCORE when that maximum is not positive.
+    """
+    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
+        fwords = (sym.tostring(w) for w in fphrase if not sym.isvar(w))
+        ewords = [sym.tostring(w) for w in ephrase if not sym.isvar(w)] + ['NULL']
+        return sum(_neg_log10(max(ttable.get_score(f, e, 1) for e in ewords))
+                   for f in fwords)
+    return feature
+
+def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
+    """Return True when fcount equals 1."""
+    return (fcount == 1)
+
+def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
+    """Return True when paircount equals 1."""
+    return (paircount == 1)
+
+def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
+    """Return True when fcount is greater than 1."""
+    return (fcount > 1)
+
+def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
+    """Return True when paircount is greater than 1."""
+    return (paircount > 1)
+
+def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count):
+    """Return True when paircount exceeds the 0.01 threshold."""
+    return (paircount > 0.01)
author	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2012-06-13 14:42:07 +0200
committer	Patrick Simianer <simianer@cl.uni-heidelberg.de>	2012-06-13 14:42:07 +0200
commit	e6d3c25191873ca0cf99db8e89702ed91d65277c (patch)
tree	b0697ece6f5e4a8229915758c68750793a23f776 /python/cdec/scfg/features.py
parent	62c805c90c5347b844f92574e240db5c65578e12 (diff)
parent	3acdf1e4b37637d6df86a7b54fb0f1b0464c172b (diff)