summaryrefslogtreecommitdiff
path: root/python/cdec/sa/features.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/cdec/sa/features.py')
-rw-r--r--python/cdec/sa/features.py142
1 files changed, 142 insertions, 0 deletions
diff --git a/python/cdec/sa/features.py b/python/cdec/sa/features.py
new file mode 100644
index 00000000..c8fc1cca
--- /dev/null
+++ b/python/cdec/sa/features.py
@@ -0,0 +1,142 @@
+from __future__ import division
+import math
+
+from cdec.sa import isvar
+
+MAXSCORE = 99
+
+def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
+ if not ctx.online:
+ prob = ctx.paircount/ctx.fcount
+ else:
+ prob = (ctx.paircount + ctx.online.paircount) / (ctx.fcount + ctx.online.fcount)
+ return -math.log10(prob)
+
+def CountEF(ctx): # c(e, f)
+ if not ctx.online:
+ count = 1 + ctx.paircount
+ else:
+ count = 1 + ctx.paircount + ctx.online.paircount
+ return math.log10(count)
+
+def SampleCountF(ctx): # sample c(f)
+ if not ctx.online:
+ count = 1 + ctx.fsample_count
+ else:
+ count = 1 + ctx.fsample_count + ctx.online.fsample_count
+ return math.log10(count)
+
+def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
+ if not ctx.online:
+ prob = ctx.paircount/ctx.fsample_count
+ else:
+ prob = (ctx.paircount + ctx.online.paircount) / (ctx.fsample_count + ctx.online.fsample_count)
+ return -math.log10(prob) if prob > 0 else MAXSCORE
+
+def CoherenceProb(ctx): # c(f) / sample c(f)
+ if not ctx.online:
+ prob = ctx.fcount/ctx.fsample_count
+ else:
+ prob = (ctx.fcount + ctx.online.fcount) / (ctx.fsample_count + ctx.online.fsample_count)
+ return -math.log10(prob)
+
+def MaxLexEgivenF(ttable):
+ def MaxLexEgivenF(ctx):
+ fwords = ctx.fphrase.words
+ fwords.append('NULL')
+ # Always use this for now
+ if not ctx.online or ctx.online:
+ maxOffScore = 0.0
+ for e in ctx.ephrase.words:
+ maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
+ maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+ return maxOffScore
+ else:
+ # For now, straight average
+ maxOffScore = 0.0
+ maxOnScore = 0.0
+ for e in ctx.ephrase.words:
+ maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
+ maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+ for e in ctx.ephrase:
+ if not isvar(e):
+ maxScore = 0.0
+ for f in ctx.fphrase:
+ if not isvar(f):
+ b_f = ctx.online.bilex_f.get(f, 0)
+ if b_f:
+ maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e))
+ maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+ return (maxOffScore + maxOnScore) / 2
+ return MaxLexEgivenF
+
+def MaxLexFgivenE(ttable):
+ def MaxLexFgivenE(ctx):
+ ewords = ctx.ephrase.words
+ ewords.append('NULL')
+ # Always use this for now
+ if not ctx.online or ctx.online:
+ maxOffScore = 0.0
+ for f in ctx.fphrase.words:
+ maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
+ maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+ return maxOffScore
+ else:
+ # For now, straight average
+ maxOffScore = 0.0
+ maxOnScore = 0.0
+ for f in ctx.fphrase.words:
+ maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
+ maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+ for f in ctx.fphrase:
+ if not isvar(f):
+ maxScore = 0.0
+ for e in ctx.ephrase:
+ if not isvar(e):
+ b_e = ctx.online.bilex_e.get(e, 0)
+ if b_e:
+ maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_e )
+ maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+ return (maxOffScore + maxOnScore) / 2
+ return MaxLexFgivenE
+
+def IsSingletonF(ctx):
+ if not ctx.online:
+ count = ctx.fcount
+ else:
+ count = ctx.fcount + ctx.online.fcount
+ return math.fabs(count - 1) < 1e-6
+
+def IsSingletonFE(ctx):
+ if not ctx.online:
+ count = ctx.paircount
+ else:
+ count = ctx.paircount + ctx.online.paircount
+ return (count == 1)
+
+def IsNotSingletonF(ctx):
+ if not ctx.online:
+ count = ctx.fcount
+ else:
+ count = ctx.fcount + ctx.online.fcount
+ return (count > 1)
+
+def IsNotSingletonFE(ctx):
+ if not ctx.online:
+ count = ctx.paircount
+ else:
+ count = ctx.paircount + ctx.online.paircount
+ return (ctx.paircount > 1)
+
+def IsFEGreaterThanZero(ctx):
+ if not ctx.online:
+ count = ctx.paircount
+ else:
+ count = ctx.paircount + ctx.online.paircount
+ return (ctx.paircount > 0.01)
+
+def IsSupportedOnline(ctx): # Occurs in online data?
+ if ctx.online:
+ return (ctx.online.paircount > 0.01)
+ else:
+ return False