From 6fb3cc36cc4113c9f3510d87b3ae3b9c9351bf4e Mon Sep 17 00:00:00 2001
From: Victor Chahuneau <vchahune@cs.cmu.edu>
Date: Wed, 5 Sep 2012 14:55:11 +0100
Subject: Expose new feature extraction API

---
 python/src/sa/default_scorer.pxi | 74 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 python/src/sa/default_scorer.pxi

(limited to 'python/src/sa/default_scorer.pxi')
diff --git a/python/src/sa/default_scorer.pxi b/python/src/sa/default_scorer.pxi
new file mode 100644
index 00000000..483f4743
--- /dev/null
+++ b/python/src/sa/default_scorer.pxi
@@ -0,0 +1,74 @@
+from libc.stdlib cimport malloc, realloc, free
+from libc.math cimport log10
+
+MAXSCORE = -99  # used instead of -log10(p) when p <= 0. NOTE(review): -log10(p) >= 0 for p in (0, 1] but this is negative -- confirm sign
+EgivenFCoherent = 0  # this and the names below are slot positions in DefaultScorer.fid
+SampleCountF = 1
+CountEF = 2
+MaxLexFgivenE = 3
+MaxLexEgivenF = 4
+IsSingletonF = 5
+IsSingletonFE = 6
+NFEATURES = 7  # number of feature slots; length of the DefaultScorer.fid array
+
+cdef class DefaultScorer(Scorer):
+    """Scorer that fills a FeatureVector with the NFEATURES default features."""
+    cdef BiLex ttable  # lexical table, queried through get_score(f, e, flag)
+    cdef int* fid  # malloc'ed array: feature slot -> FD.index(feature name)
+
+    def __dealloc__(self):
+        free(self.fid)  # free(NULL) is a no-op, so a failed malloc is harmless
+
+    def __init__(self, BiLex ttable):
+        """Store `ttable` and record FD.index(name) for every feature name."""
+        cdef unsigned i
+        self.ttable = ttable
+        self.fid = <int*> malloc(NFEATURES*sizeof(int))
+        if self.fid == NULL:
+            raise MemoryError()
+        for i, fname in enumerate(('EgivenFCoherent', 'SampleCountF', 'CountEF',
+                'MaxLexFgivenE', 'MaxLexEgivenF', 'IsSingletonF', 'IsSingletonFE')):
+            self.fid[i] = FD.index(fname)
+
+    cdef FeatureVector score(self, Phrase fphrase, Phrase ephrase,
+            unsigned paircount, unsigned fcount, unsigned fsample_count):
+        """Return the FeatureVector of the phrase pair (fphrase, ephrase).
+
+        The count features use paircount, fcount and fsample_count; the two
+        MaxLex features use self.ttable.get_score() on the phrases' words.
+        """
+        cdef FeatureVector scores = FeatureVector()
+        cdef float efc, mlfe = 0, mlef = 0, max_score
+        ewords = list(ephrase.words) + ['NULL']  # copies: never mutate a phrase
+        fwords = list(fphrase.words) + ['NULL']
+
+        # EgivenFCoherent
+        efc = <float>paircount/fsample_count
+        scores.set(self.fid[EgivenFCoherent], -log10(efc) if efc > 0 else MAXSCORE)
+        # SampleCountF and CountEF: log10 of (1 + count)
+        scores.set(self.fid[SampleCountF], log10(1 + fsample_count))
+        scores.set(self.fid[CountEF], log10(1 + paircount))
+
+        # MaxLexFgivenE TODO typify
+        for f in fphrase.words:
+            max_score = -1  # reset per word so each f gets its own best e
+            for e in ewords:
+                lex = self.ttable.get_score(f, e, 1)
+                if lex > max_score:
+                    max_score = lex
+            mlfe += -log10(max_score) if max_score > 0 else MAXSCORE
+        scores.set(self.fid[MaxLexFgivenE], mlfe)
+
+        # MaxLexEgivenF TODO same
+        for e in ephrase.words:
+            max_score = -1  # reset per word so each e gets its own best f
+            for f in fwords:
+                lex = self.ttable.get_score(f, e, 0)
+                if lex > max_score:
+                    max_score = lex
+            mlef += -log10(max_score) if max_score > 0 else MAXSCORE
+        scores.set(self.fid[MaxLexEgivenF], mlef)
+        # IsSingletonF / IsSingletonFE: set when the respective count is exactly 1
+        scores.set(self.fid[IsSingletonF], (fcount == 1))
+        scores.set(self.fid[IsSingletonFE], (paircount == 1))
+        return scores
-- 
cgit v1.2.3