From 8fdc3681fb7551e7faeff9f720102cdd417ba077 Mon Sep 17 00:00:00 2001 From: Victor Chahuneau Date: Fri, 27 Jul 2012 01:16:03 -0400 Subject: [python] Fork of the suffix-array extractor with surface improvements Available as the cdec.sa module, with commande-line helpers: python -m cdec.sa.compile -f ... -e ... -a ... -o sa-out/ -c extract.ini python -m cdec.sa.extract -c extract.ini -g grammars-out/ < input.txt > input.sgml + renamed cdec.scfg -> cdec.sa + Python README --- python/cdec/sa/features.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 python/cdec/sa/features.py (limited to 'python/cdec/sa/features.py') diff --git a/python/cdec/sa/features.py b/python/cdec/sa/features.py new file mode 100644 index 00000000..8d35d8e6 --- /dev/null +++ b/python/cdec/sa/features.py @@ -0,0 +1,60 @@ +from __future__ import division +import math +import cdec.sa + +MAXSCORE = 99 + +def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f) + return -math.log10(paircount/fcount) + +def CountEF(fphrase, ephrase, paircount, fcount, fsample_count): + return math.log10(1 + paircount) + +def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count): + return math.log10(1 + fsample_count) + +def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count): + prob = paircount/fsample_count + return -math.log10(prob) if prob > 0 else MAXSCORE + +def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count): + return -math.log10(fcount/fsample_count) + +def MaxLexEgivenF(ttable): + def feature(fphrase, ephrase, paircount, fcount, fsample_count): + fwords = [cdec.sa.sym_tostring(w) for w in fphrase if not cdec.sa.sym_isvar(w)] + fwords.append('NULL') + ewords = (cdec.sa.sym_tostring(w) for w in ephrase if not cdec.sa.sym_isvar(w)) + def score(): + for e in ewords: + maxScore = max(ttable.get_score(f, e, 0) for f in fwords) + yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE + return sum(score()) + return feature + +def MaxLexFgivenE(ttable): + def feature(fphrase, ephrase, paircount, fcount, fsample_count): + fwords = (cdec.sa.sym_tostring(w) for w in fphrase if not cdec.sa.sym_isvar(w)) + ewords = [cdec.sa.sym_tostring(w) for w in ephrase if not cdec.sa.sym_isvar(w)] + ewords.append('NULL') + def score(): + for f in fwords: + maxScore = max(ttable.get_score(f, e, 1) for e in ewords) + yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE + return sum(score()) + return feature + +def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): + return (fcount == 1) + +def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount == 1) + +def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): + return (fcount > 1) + +def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount > 1) + +def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount > 0.01) -- cgit v1.2.3