From a68aaea190197f17f308b91f66ceff222a784460 Mon Sep 17 00:00:00 2001
From: Victor Chahuneau <vchahune@cs.cmu.edu>
Date: Tue, 14 Aug 2012 22:50:37 -0400
Subject: [cdec.sa] Explicit feature names in grammar extractor output

+ sparse features in extractor
+ hg.intersect(string)
+ basestring = str|unicode
---
 python/pkg/cdec/sa/__init__.py  | 2 +-
 python/pkg/cdec/sa/extractor.py | 8 ++++----
 python/pkg/cdec/sa/features.py  | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'python/pkg/cdec')
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
index fd4a4148..ab8be809 100644
--- a/python/pkg/cdec/sa/__init__.py
+++ b/python/pkg/cdec/sa/__init__.py
@@ -1,4 +1,4 @@
 from cdec.sa._sa import sym_fromstring,\
         SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
-        HieroCachingRuleFactory, Sampler
+        HieroCachingRuleFactory, Sampler, Scorer
 from cdec.sa.extractor import GrammarExtractor
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index bb912e16..90cc4c51 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -57,8 +57,8 @@ class GrammarExtractor:
         # lexical weighting tables
         tt = cdec.sa.BiLex(from_binary=config['lex_file'])
 
-        self.models = (EgivenFCoherent, SampleCountF, CountEF, 
-                MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
+        scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, 
+            MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
 
         fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
         edarray = cdec.sa.DataArray(from_binary=config['e_file'])
@@ -67,7 +67,7 @@ class GrammarExtractor:
         # -1 = don't sample, use all data (VERY SLOW!)
         sampler = cdec.sa.Sampler(300, fsarray)
 
-        self.factory.configure(fsarray, edarray, sampler)
+        self.factory.configure(fsarray, edarray, sampler, scorer)
 
     def grammar(self, sentence):
         if isinstance(sentence, unicode):
@@ -75,4 +75,4 @@ class GrammarExtractor:
         cnet = chain(('<s>',), sentence.split(), ('</s>',))
         cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet)
         cnet = tuple(((word, None, 1), ) for word in cnet)
-        return self.factory.input(cnet, self.models)
+        return self.factory.input(cnet)
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index 325b9e13..8fd370cc 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -20,7 +20,7 @@ def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
     return -math.log10(fcount/fsample_count)
 
 def MaxLexEgivenF(ttable):
-    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
+    def MaxLexEgivenF(fphrase, ephrase, paircount, fcount, fsample_count):
         fwords = fphrase.words
         fwords.append('NULL')
         def score():
@@ -28,10 +28,10 @@ def MaxLexEgivenF(ttable):
               maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
               yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
         return sum(score())
-    return feature
+    return MaxLexEgivenF
 
 def MaxLexFgivenE(ttable):
-    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
+    def MaxLexFgivenE(fphrase, ephrase, paircount, fcount, fsample_count):
         ewords = ephrase.words
         ewords.append('NULL')
         def score():
@@ -39,7 +39,7 @@ def MaxLexFgivenE(ttable):
               maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
               yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
         return sum(score())
-    return feature
+    return MaxLexFgivenE
 
 def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
     return (fcount == 1)
-- 
cgit v1.2.3