Expose new feature extraction API

author: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-09-05 14:55:11 +0100
committer: Victor Chahuneau <vchahune@cs.cmu.edu> 2012-09-05 14:55:11 +0100
commit: 6fb3cc36cc4113c9f3510d87b3ae3b9c9351bf4e (patch)
tree: ae29f2c831037665ec39e24df0cdf2657dfadc5e /python/src/sa/default_scorer.pxi
parent: 1fd5b40da3bc9c55fd2fba03bb7fdb43eabee63c (diff)
1 files changed, 74 insertions, 0 deletions
diff --git a/python/src/sa/default_scorer.pxi b/python/src/sa/default_scorer.pxi
new file mode 100644
index 00000000..483f4743
--- /dev/null
+++ b/python/src/sa/default_scorer.pxi
@@ -0,0 +1,74 @@
+from libc.stdlib cimport malloc, realloc, free
+from libc.math cimport log10
+
+MAXSCORE = -99  # fallback used instead of -log10(p) when p is not > 0; NOTE(review): negative sign looks suspect -- confirm
+EgivenFCoherent = 0  # the names below are slot numbers into DefaultScorer.fid
+SampleCountF = 1  # slot of the log10(1 + fsample_count) feature
+CountEF = 2  # slot of the log10(1 + paircount) feature
+MaxLexFgivenE = 3  # slot of the lexical feature computed with get_score(f, e, 1)
+MaxLexEgivenF = 4  # slot of the lexical feature computed with get_score(f, e, 0)
+IsSingletonF = 5  # slot of the (fcount == 1) indicator
+IsSingletonFE = 6  # slot of the (paircount == 1) indicator
+NFEATURES = 7  # number of int slots malloc'ed for fid
+
+cdef class DefaultScorer(Scorer):
+    """Scorer that fills the NFEATURES default features of a phrase pair.
+
+    ttable is the BiLex table queried through get_score(); fid[k] holds the
+    FD.index() id under which the k-th feature is stored.
+    """
+    cdef BiLex ttable
+    cdef int* fid
+
+    def __dealloc__(self):
+        free(self.fid)  # free(NULL) is a no-op if __init__ never ran
+
+    def __init__(self, BiLex ttable):
+        """Store ttable and resolve the feature ids via FD.index()."""
+        cdef unsigned i
+        self.ttable = ttable
+        free(self.fid)  # do not leak the old buffer if __init__ runs twice
+        self.fid = <int*> malloc(NFEATURES*sizeof(int))
+        if self.fid == NULL:
+            raise MemoryError()
+        for i, fname in enumerate(('EgivenFCoherent', 'SampleCountF', 'CountEF',
+                'MaxLexFgivenE', 'MaxLexEgivenF', 'IsSingletonF', 'IsSingletonFE')):
+            self.fid[i] = FD.index(fname)
+
+    cdef FeatureVector score(self, Phrase fphrase, Phrase ephrase,
+            unsigned paircount, unsigned fcount, unsigned fsample_count):
+        # Build the FeatureVector for one (fphrase, ephrase) pair from its
+        # counts.  A probability p > 0 is scored -log10(p), else MAXSCORE.
+        cdef FeatureVector scores = FeatureVector()
+        cdef float efc = <float>paircount/fsample_count
+        cdef float mlfe = 0, mlef = 0, max_score
+        # Copy the word lists so that adding 'NULL' never alters the phrases
+        # (matters if `words` hands back an internal list -- not visible here).
+        fwords = list(fphrase.words)
+        ewords = list(ephrase.words)
+        fnull, enull = fwords + ['NULL'], ewords + ['NULL']
+
+        scores.set(self.fid[EgivenFCoherent], -log10(efc) if efc > 0 else MAXSCORE)
+        scores.set(self.fid[SampleCountF], log10(1 + fsample_count))
+        scores.set(self.fid[CountEF], log10(1 + paircount))
+
+        # MaxLexFgivenE: per f word, best get_score over the e words and NULL.
+        for f in fwords:
+            max_score = -1  # reset per word; it used to carry over between words
+            for e in enull:
+                max_score = max(max_score, self.ttable.get_score(f, e, 1))
+            mlfe += -log10(max_score) if max_score > 0 else MAXSCORE
+        scores.set(self.fid[MaxLexFgivenE], mlfe)
+
+        # MaxLexEgivenF: per e word, best get_score over the f words and NULL.
+        for e in ewords:
+            max_score = -1  # reset per word, as above
+            for f in fnull:
+                max_score = max(max_score, self.ttable.get_score(f, e, 0))
+            mlef += -log10(max_score) if max_score > 0 else MAXSCORE
+        scores.set(self.fid[MaxLexEgivenF], mlef)
+
+        # Singleton indicators: 1 when the count is exactly one, else 0.
+        scores.set(self.fid[IsSingletonF], (fcount == 1))
+        scores.set(self.fid[IsSingletonFE], (paircount == 1))
+        return scores
author	Victor Chahuneau <vchahune@cs.cmu.edu>	2012-09-05 14:55:11 +0100
committer	Victor Chahuneau <vchahune@cs.cmu.edu>	2012-09-05 14:55:11 +0100
commit	6fb3cc36cc4113c9f3510d87b3ae3b9c9351bf4e (patch)
tree	ae29f2c831037665ec39e24df0cdf2657dfadc5e /python/src/sa/default_scorer.pxi
parent	1fd5b40da3bc9c55fd2fba03bb7fdb43eabee63c (diff)