diff options
author | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-06-13 14:42:07 +0200 |
---|---|---|
committer | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-06-13 14:42:07 +0200 |
commit | dcba237fdda20cb32a5d9890c3beae790db446c2 (patch) | |
tree | b09846c7ed1ea3c0f13748ed13ce49efeca7fecc /python/cdec/scfg/features.py | |
parent | d86299e31deb81a836b1b2f2a356a6f4b28eda9e (diff) | |
parent | 34c2c129a376abfc79e76cf2fa1282b89ef605b6 (diff) |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'python/cdec/scfg/features.py')
-rw-r--r-- | python/cdec/scfg/features.py | 62 |
1 files changed, 62 insertions, 0 deletions
# python/cdec/scfg/features.py
# (new file in this commit, blob 6419cdd8; reconstructed from the collapsed
# diff with the leading "+" markers removed)
#
# Scoring features for extracted phrase pairs.  Every feature takes the same
# five arguments: (fphrase, ephrase, paircount, fcount, fsample_count).
from __future__ import division
import math
import sym


def contextless(feature):
    """Tag `feature` as context-free and return it unchanged.

    The function is stored on itself as the attribute
    `compute_contextless_score`, so it can be used as a decorator.
    """
    feature.compute_contextless_score = feature
    return feature


# Cost returned in place of -log10(p) when p is not strictly positive.
MAXSCORE = 99


def _neg_log10(prob):
    """Return -log10(prob), or MAXSCORE when prob is not strictly positive."""
    return -math.log10(prob) if prob > 0 else MAXSCORE


def _neg_log10_ratio(numerator, denominator):
    """Return -log10(numerator / denominator), falling back to MAXSCORE.

    A zero denominator would raise ZeroDivisionError and a non-positive
    ratio would make math.log10 raise ValueError; both cases yield MAXSCORE
    instead, so every ratio-based feature degrades the same way.
    """
    if not denominator:
        return MAXSCORE
    return _neg_log10(numerator / denominator)


def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count):  # p(e|f)
    """Return -log10(paircount / fcount), or MAXSCORE if that is undefined."""
    return _neg_log10_ratio(paircount, fcount)


def CountEF(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return log10(1 + paircount)."""
    return math.log10(1 + paircount)


def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return log10(1 + fsample_count)."""
    return math.log10(1 + fsample_count)


def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return -log10(paircount / fsample_count), or MAXSCORE if undefined."""
    return _neg_log10_ratio(paircount, fsample_count)


def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return -log10(fcount / fsample_count), or MAXSCORE if undefined."""
    return _neg_log10_ratio(fcount, fsample_count)


def MaxLexEgivenF(ttable):
    """Build a lexical feature backed by `ttable`.

    The returned feature sums, over every non-variable symbol e of `ephrase`,
    the cost -log10(max_f ttable.get_score(f, e, 0)), where f ranges over the
    non-variable symbols of `fphrase` plus the extra token 'NULL'.  A maximum
    that is not strictly positive contributes MAXSCORE.
    NOTE(review): the third get_score argument (0) presumably selects the
    p(e|f) direction -- confirm against the ttable implementation.
    """
    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
        fwords = [sym.tostring(w) for w in fphrase if not sym.isvar(w)]
        fwords.append('NULL')
        ewords = (sym.tostring(w) for w in ephrase if not sym.isvar(w))
        return sum(
            _neg_log10(max(ttable.get_score(f, e, 0) for f in fwords))
            for e in ewords)
    return feature


def MaxLexFgivenE(ttable):
    """Build a lexical feature backed by `ttable`.

    The returned feature sums, over every non-variable symbol f of `fphrase`,
    the cost -log10(max_e ttable.get_score(f, e, 1)), where e ranges over the
    non-variable symbols of `ephrase` plus the extra token 'NULL'.  A maximum
    that is not strictly positive contributes MAXSCORE.
    NOTE(review): the third get_score argument (1) presumably selects the
    p(f|e) direction -- confirm against the ttable implementation.
    """
    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
        fwords = (sym.tostring(w) for w in fphrase if not sym.isvar(w))
        ewords = [sym.tostring(w) for w in ephrase if not sym.isvar(w)]
        ewords.append('NULL')
        return sum(
            _neg_log10(max(ttable.get_score(f, e, 1) for e in ewords))
            for f in fwords)
    return feature


def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return True when fcount equals 1."""
    return fcount == 1


def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return True when paircount equals 1."""
    return paircount == 1


def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return True when fcount is greater than 1."""
    return fcount > 1


def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return True when paircount is greater than 1."""
    return paircount > 1


def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return True when paircount exceeds 0.01."""
    return paircount > 0.01