summaryrefslogtreecommitdiff
path: root/python/cdec/scfg/features.py
diff options
context:
space:
mode:
authorVictor Chahuneau <vchahune@cs.cmu.edu>2012-07-27 01:16:03 -0400
committerVictor Chahuneau <vchahune@cs.cmu.edu>2012-07-27 01:16:03 -0400
commitb2a8bccb2bd713d9ec081cf3dad0162c2cb492d8 (patch)
treec661044fd2a3943cf2ad12109b916fd7b56a519e /python/cdec/scfg/features.py
parent148b1168c2b07abf0c7757a31141377c28ec3d91 (diff)
[python] Fork of the suffix-array extractor with surface improvements
Available as the cdec.sa module, with command-line helpers: python -m cdec.sa.compile -f ... -e ... -a ... -o sa-out/ -c extract.ini python -m cdec.sa.extract -c extract.ini -g grammars-out/ < input.txt > input.sgml + renamed cdec.scfg -> cdec.sa + Python README
Diffstat (limited to 'python/cdec/scfg/features.py')
-rw-r--r--python/cdec/scfg/features.py62
1 files changed, 0 insertions, 62 deletions
diff --git a/python/cdec/scfg/features.py b/python/cdec/scfg/features.py
deleted file mode 100644
index 6419cdd8..00000000
--- a/python/cdec/scfg/features.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from __future__ import division
-import math
-import sym
-
-def contextless(feature):
- feature.compute_contextless_score = feature
- return feature
-
-MAXSCORE = 99
-
-def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f)
- return -math.log10(paircount/fcount)
-
-def CountEF(fphrase, ephrase, paircount, fcount, fsample_count):
- return math.log10(1 + paircount)
-
-def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count):
- return math.log10(1 + fsample_count)
-
-def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count):
- prob = paircount/fsample_count
- return -math.log10(prob) if prob > 0 else MAXSCORE
-
-def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
- return -math.log10(fcount/fsample_count)
-
-def MaxLexEgivenF(ttable):
- def feature(fphrase, ephrase, paircount, fcount, fsample_count):
- fwords = [sym.tostring(w) for w in fphrase if not sym.isvar(w)] + ['NULL']
- ewords = (sym.tostring(w) for w in ephrase if not sym.isvar(w))
- def score():
- for e in ewords:
- maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
- yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
- return sum(score())
- return feature
-
-def MaxLexFgivenE(ttable):
- def feature(fphrase, ephrase, paircount, fcount, fsample_count):
- fwords = (sym.tostring(w) for w in fphrase if not sym.isvar(w))
- ewords = [sym.tostring(w) for w in ephrase if not sym.isvar(w)] + ['NULL']
- def score():
- for f in fwords:
- maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
- yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
- return sum(score())
- return feature
-
-def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
- return (fcount == 1)
-
-def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
- return (paircount == 1)
-
-def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
- return (fcount > 1)
-
-def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
- return (paircount > 1)
-
-def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count):
- return (paircount > 0.01)