diff options
author | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2013-03-03 12:06:43 +0100 |
---|---|---|
committer | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2013-03-03 12:06:43 +0100 |
commit | f7f9048f8e4d34682f17bfd050d238005feb3ee3 (patch) | |
tree | fa20fa16b0f5a8009a9254622b65ebeaec049399 /python/pkg/cdec/sa/features.py | |
parent | 9d306b30c9abba995ba35243e5cb461bb472a61f (diff) | |
parent | 12f2eab0e7dc7167af47cddf8ef88968656277da (diff) |
Merge branch 'master' of github.com:pks/cdec-dtrain
Diffstat (limited to 'python/pkg/cdec/sa/features.py')
-rw-r--r-- | python/pkg/cdec/sa/features.py | 117 |
1 files changed, 101 insertions, 16 deletions
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index a4ae23e8..46412cd5 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -1,57 +1,142 @@
 from __future__ import division
 import math
 
+from cdec.sa import isvar
+
 MAXSCORE = 99
 
 def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
-    return -math.log10(ctx.paircount/ctx.fcount)
+    if not ctx.online:
+        prob = ctx.paircount/ctx.fcount
+    else:
+        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fcount + ctx.online.fcount)
+    return -math.log10(prob)
 
 def CountEF(ctx): # c(e, f)
-    return math.log10(1 + ctx.paircount)
+    if not ctx.online:
+        count = 1 + ctx.paircount
+    else:
+        count = 1 + ctx.paircount + ctx.online.paircount
+    return math.log10(count)
 
 def SampleCountF(ctx): # sample c(f)
-    return math.log10(1 + ctx.fsample_count)
+    if not ctx.online:
+        count = 1 + ctx.fsample_count
+    else:
+        count = 1 + ctx.fsample_count + ctx.online.fsample_count
+    return math.log10(count)
 
 def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
-    prob = ctx.paircount/ctx.fsample_count
+    if not ctx.online:
+        prob = ctx.paircount/ctx.fsample_count
+    else:
+        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fsample_count + ctx.online.fsample_count)
     return -math.log10(prob) if prob > 0 else MAXSCORE
 
 def CoherenceProb(ctx): # c(f) / sample c(f)
-    return -math.log10(ctx.fcount/ctx.fsample_count)
+    if not ctx.online:
+        prob = ctx.fcount/ctx.fsample_count
+    else:
+        prob = (ctx.fcount + ctx.online.fcount) / (ctx.fsample_count + ctx.online.fsample_count)
+    return -math.log10(prob)
 
 def MaxLexEgivenF(ttable):
     def MaxLexEgivenF(ctx):
         fwords = ctx.fphrase.words
         fwords.append('NULL')
-        def score():
+        # Always use this for now
+        if not ctx.online or ctx.online:
+            maxOffScore = 0.0
+            for e in ctx.ephrase.words:
+                maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return maxOffScore
+        else:
+            # For now, straight average
+            maxOffScore = 0.0
+            maxOnScore = 0.0
             for e in ctx.ephrase.words:
-                maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
-                yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-        return sum(score())
+                maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            for e in ctx.ephrase:
+                if not isvar(e):
+                    maxScore = 0.0
+                    for f in ctx.fphrase:
+                        if not isvar(f):
+                            b_f = ctx.online.bilex_f.get(f, 0)
+                            if b_f:
+                                maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e))
+                    maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return (maxOffScore + maxOnScore) / 2
     return MaxLexEgivenF
 
 def MaxLexFgivenE(ttable):
     def MaxLexFgivenE(ctx):
         ewords = ctx.ephrase.words
         ewords.append('NULL')
-        def score():
+        # Always use this for now
+        if not ctx.online or ctx.online:
+            maxOffScore = 0.0
             for f in ctx.fphrase.words:
-                maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
-                yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-        return sum(score())
+                maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return maxOffScore
+        else:
+            # For now, straight average
+            maxOffScore = 0.0
+            maxOnScore = 0.0
+            for f in ctx.fphrase.words:
+                maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
+                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            for f in ctx.fphrase:
+                if not isvar(f):
+                    maxScore = 0.0
+                    for e in ctx.ephrase:
+                        if not isvar(e):
+                            b_e = ctx.online.bilex_e.get(e, 0)
+                            if b_e:
+                                maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_e )
+                    maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
+            return (maxOffScore + maxOnScore) / 2
     return MaxLexFgivenE
 
 def IsSingletonF(ctx):
-    return (ctx.fcount == 1)
+    if not ctx.online:
+        count = ctx.fcount
+    else:
+        count = ctx.fcount + ctx.online.fcount
+    return (count == 1)
 
 def IsSingletonFE(ctx):
-    return (ctx.paircount == 1)
+    if not ctx.online:
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount
+    return (count == 1)
 
 def IsNotSingletonF(ctx):
-    return (ctx.fcount > 1)
+    if not ctx.online:
+        count = ctx.fcount
+    else:
+        count = ctx.fcount + ctx.online.fcount
+    return (count > 1)
 
 def IsNotSingletonFE(ctx):
+    if not ctx.online:
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount
     return (ctx.paircount > 1)
 
 def IsFEGreaterThanZero(ctx):
+    if not ctx.online:
+        count = ctx.paircount
+    else:
+        count = ctx.paircount + ctx.online.paircount
     return (ctx.paircount > 0.01)
+
+def IsSupportedOnline(ctx): # Occurs in online data?
+    if ctx.online:
+        return (ctx.online.paircount > 0.01)
+    else:
+        return False
\ No newline at end of file |