[python] Fork of the suffix-array extractor with surface improvements

Available as the cdec.sa module, with command-line helpers: `python -m cdec.sa.compile -f ... -e ... -a ... -o sa-out/ -c extract.ini` and `python -m cdec.sa.extract -c extract.ini -g grammars-out/ < input.txt > input.sgml` + renamed cdec.scfg -> cdec.sa + Python README
author: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-07-27 01:16:03 -0400
committer: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-07-27 01:16:03 -0400
commit: 8fdc3681fb7551e7faeff9f720102cdd417ba077 (patch)
tree: 1129d2b79a3255c249e181141814cb92b52b4d4d /python/cdec/sa/features.py
parent: 0aac9fd78f1c8b9ba3d91d702f592288075cbbde (diff)
1 files changed, 60 insertions, 0 deletions
diff --git a/python/cdec/sa/features.py b/python/cdec/sa/features.py
new file mode 100644
index 00000000..8d35d8e6
--- /dev/null
+++ b/python/cdec/sa/features.py
@@ -0,0 +1,60 @@
+from __future__ import division
+import math
+import cdec.sa
+
+MAXSCORE = 99 # cost returned by the features below in place of -log10(p) when p is not > 0
+
+def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # -log10 p(e|f), with p(e|f) = paircount/fcount
+    return -math.log10(paircount/fcount) if paircount > 0 else MAXSCORE # log10(0) raises ValueError; cap like EgivenFCoherent
+
+def CountEF(fphrase, ephrase, paircount, fcount, fsample_count): # log10 of the pair count, add-one so that 0 maps to 0
+    return math.log10(paircount + 1)
+
+def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count): # log10 of fsample_count, add-one so that 0 maps to 0
+    return math.log10(fsample_count + 1)
+
+def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count): # -log10(paircount/fsample_count), capped at MAXSCORE
+    ratio = paircount/fsample_count
+    return MAXSCORE if not ratio > 0 else -math.log10(ratio)
+
+def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count): # -log10(fcount/fsample_count); no zero guard here
+    return -math.log10(fcount/fsample_count)
+
+def MaxLexEgivenF(ttable): # feature factory: sum over e words of -log10 max_f ttable.get_score(f, e, 0)
+    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
+        f_tokens = [cdec.sa.sym_tostring(sym) for sym in fphrase if not cdec.sa.sym_isvar(sym)]
+        f_tokens.append('NULL') # 'NULL' is always a candidate, so max() never sees an empty sequence
+        e_tokens = (cdec.sa.sym_tostring(sym) for sym in ephrase if not cdec.sa.sym_isvar(sym))
+        def costs():
+            for e_tok in e_tokens:
+                best = max(ttable.get_score(f_tok, e_tok, 0) for f_tok in f_tokens)
+                yield -math.log10(best) if best > 0 else MAXSCORE
+        return sum(costs())
+    return feature
+
+def MaxLexFgivenE(ttable): # feature factory: sum over f words of -log10 max_e ttable.get_score(f, e, 1)
+    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
+        f_tokens = (cdec.sa.sym_tostring(sym) for sym in fphrase if not cdec.sa.sym_isvar(sym))
+        e_tokens = [cdec.sa.sym_tostring(sym) for sym in ephrase if not cdec.sa.sym_isvar(sym)]
+        e_tokens.append('NULL') # 'NULL' is always a candidate, so max() never sees an empty sequence
+        def costs():
+            for f_tok in f_tokens:
+                best = max(ttable.get_score(f_tok, e_tok, 1) for e_tok in e_tokens)
+                yield -math.log10(best) if best > 0 else MAXSCORE
+        return sum(costs())
+    return feature
+
+def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): # True iff fcount is exactly 1
+    return fcount == 1
+
+def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): # True iff paircount is exactly 1
+    return paircount == 1
+
+def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): # True iff fcount is greater than 1
+    return fcount > 1
+
+def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): # True iff paircount is greater than 1
+    return paircount > 1
+
+def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count): # True iff paircount exceeds the 0.01 threshold
+    return paircount > 0.01
author	Victor Chahuneau <vchahune@cs.cmu.edu>	2012-07-27 01:16:03 -0400
committer	Victor Chahuneau <vchahune@cs.cmu.edu>	2012-07-27 01:16:03 -0400
commit	8fdc3681fb7551e7faeff9f720102cdd417ba077 (patch)
tree	1129d2b79a3255c249e181141814cb92b52b4d4d /python/cdec/sa/features.py
parent	0aac9fd78f1c8b9ba3d91d702f592288075cbbde (diff)